satellite/audit: better handling of piece fetch errors

We have an alert on `not_enough_shares_for_audit` that fires too
frequently. Every time so far, the cause has been some kind of network
blip on the satellite side.

Satellite operators are expected to have other means in place for
alerting on network problems and fixing them, so the audit framework
does not need to play that role.

Instead, this change adds three new metrics:
`audit_not_enough_nodes_online`, `audit_not_enough_shares_acquired`, and
`audit_suspected_network_problem`. When an audit fails and emits
`not_enough_shares_for_audit`, we now determine whether it looks like we
are having network problems (most of the errors are connection failures,
possibly along with some successful connections that subsequently time
out) or whether something else has happened.
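
For illustration, the heuristic amounts to the following; the condition
is taken from the verifier diff below, but the node counts here are
made up:

    package main

    import "fmt"

    func main() {
    	// Hypothetical audit outcome, for illustration only: we acquired 3
    	// shares, while 25 nodes were offline (connection failures), 1 was
    	// contained (connected, then timed out), 1 returned a bad share,
    	// and 1 failed for an unknown reason.
    	sharesToAudit, failedNodes, unknownNodes := 3, 1, 1
    	offlineNodes, containedNodes := 25, 1

    	// If the network-flavored outcomes (offline and contained nodes)
    	// outnumber all the outcomes where the node actually answered,
    	// the audit failure probably reflects a network problem on our side.
    	if offlineNodes+containedNodes > sharesToAudit+failedNodes+unknownNodes {
    		fmt.Println("increment audit_suspected_network_problem")
    	} else {
    		fmt.Println("increment audit_not_enough_shares_acquired")
    	}
    }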

After this is deployed, we can remove the alert on
`not_enough_shares_for_audit` and add new alerts on
`audit_not_enough_nodes_online` and `audit_not_enough_shares_acquired`.
`audit_suspected_network_problem` does not need an alert.
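
To make those alerts reliable, the verifier also increments each counter
by zero on every run (see the second verifier.go hunk below), so the
time series exist even when nothing fails and an alert can tell "zero
failures" apart from "no data". A minimal sketch of that monkit pattern,
with a hypothetical helper name:

    package audit

    import "github.com/spacemonkeygo/monkit/v3"

    var mon = monkit.Package()

    // touchAuditCounters is a hypothetical helper showing the pattern:
    // incrementing a counter by zero still reports it each cycle, so
    // dashboards see an explicit zero rather than a missing series.
    func touchAuditCounters() {
    	mon.Counter("audit_not_enough_nodes_online").Inc(0)
    	mon.Counter("audit_not_enough_shares_acquired").Inc(0)
    	mon.Counter("audit_suspected_network_problem").Inc(0)
    }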

Refs: https://github.com/storj/storj/issues/4669

Change-Id: Ibb256bc19d2578904f71f5229111ac98e5212fcb
paul cannon, 2022-09-26 18:47:03 -05:00 (committed by Storj Robot)
parent 352e937813
commit 802ff18bd8
2 changed files with 21 additions and 8 deletions

monkit.lock

@@ -13,12 +13,15 @@ storj.io/storj/satellite/audit."audit_contained_percentage" FloatVal
 storj.io/storj/satellite/audit."audit_fail_nodes" IntVal
 storj.io/storj/satellite/audit."audit_fail_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_failed_percentage" FloatVal
+storj.io/storj/satellite/audit."audit_not_enough_nodes_online" Counter
+storj.io/storj/satellite/audit."audit_not_enough_shares_acquired" Counter
 storj.io/storj/satellite/audit."audit_offline_nodes" IntVal
 storj.io/storj/satellite/audit."audit_offline_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_offline_percentage" FloatVal
 storj.io/storj/satellite/audit."audit_success_nodes" IntVal
 storj.io/storj/satellite/audit."audit_success_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_successful_percentage" FloatVal
+storj.io/storj/satellite/audit."audit_suspected_network_problem" Counter
 storj.io/storj/satellite/audit."audit_total_nodes" IntVal
 storj.io/storj/satellite/audit."audit_total_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_total_pointer_nodes" IntVal
@@ -28,6 +31,7 @@ storj.io/storj/satellite/audit."audit_unknown_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_unknown_percentage" FloatVal
 storj.io/storj/satellite/audit."audited_percentage" FloatVal
 storj.io/storj/satellite/audit."could_not_verify_audit_shares" Counter
+storj.io/storj/satellite/audit."not_enough_shares_for_audit" Counter
 storj.io/storj/satellite/audit."reverify_contained" IntVal
 storj.io/storj/satellite/audit."reverify_contained_global" Meter
 storj.io/storj/satellite/audit."reverify_contained_in_segment" IntVal

satellite/audit/verifier.go

@@ -123,7 +123,8 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
 	orderLimits, privateKey, cachedNodesInfo, err := verifier.orders.CreateAuditOrderLimits(ctx, segmentInfo, skip)
 	if err != nil {
 		if orders.ErrDownloadFailedNotEnoughPieces.Has(err) {
-			mon.Counter("not_enough_shares_for_audit").Inc(1)
+			mon.Counter("not_enough_shares_for_audit").Inc(1)   //mon:locked
+			mon.Counter("audit_not_enough_nodes_online").Inc(1) //mon:locked
 			err = ErrNotEnoughShares.Wrap(err)
 		}
 		return Report{}, err
@@ -232,10 +233,15 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
 	total := segmentInfo.Redundancy.TotalShares
 	if len(sharesToAudit) < int(required) {
-		mon.Counter("not_enough_shares_for_audit").Inc(1)
+		mon.Counter("not_enough_shares_for_audit").Inc(1) //mon:locked
 		// if we have reached this point, most likely something went wrong
-		// like a forgotten delete. Don't fail nodes. We have an alert on this.
-		// Check the logs and see what happened.
+		// like a network problem or a forgotten delete. Don't fail nodes.
+		// We have an alert on this. Check the logs and see what happened.
+		if len(offlineNodes)+len(containedNodes) > len(sharesToAudit)+len(failedNodes)+len(unknownNodes) {
+			mon.Counter("audit_suspected_network_problem").Inc(1) //mon:locked
+		} else {
+			mon.Counter("audit_not_enough_shares_acquired").Inc(1) //mon:locked
+		}
 		report := Report{
 			Offlines: offlineNodes,
 			Unknown:  unknownNodes,
@@ -243,14 +249,17 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
 		return report, ErrNotEnoughShares.New("got: %d, required: %d, failed: %d, offline: %d, unknown: %d, contained: %d",
 			len(sharesToAudit), required, len(failedNodes), len(offlineNodes), len(unknownNodes), len(containedNodes))
 	}
-	// ensure we get values, even if only zero values, so that redash can have an alert based on this
-	mon.Counter("not_enough_shares_for_audit").Inc(0)
+	// ensure we get values, even if only zero values, so that redash can have an alert based on these
+	mon.Counter("not_enough_shares_for_audit").Inc(0)      //mon:locked
+	mon.Counter("audit_not_enough_nodes_online").Inc(0)    //mon:locked
+	mon.Counter("audit_not_enough_shares_acquired").Inc(0) //mon:locked
 	mon.Counter("could_not_verify_audit_shares").Inc(0)    //mon:locked
+	mon.Counter("audit_suspected_network_problem").Inc(0)  //mon:locked

 	pieceNums, correctedShares, err := auditShares(ctx, required, total, sharesToAudit)
 	if err != nil {
 		mon.Counter("could_not_verify_audit_shares").Inc(1) //mon:locked
-		verifier.log.Error("could not verify shares", zap.Error(err))
+		verifier.log.Error("could not verify shares", zap.String("Segment", segmentInfoString(segment)), zap.Error(err))
 		return Report{
 			Fails:    failedNodes,
 			Offlines: offlineNodes,