satellite/audit: better handling of piece fetch errors

We have an alert on `not_enough_shares_for_audit` which fires too
frequently. Every time so far, it has been because of a network blip of
some nature on the satellite side.

Satellite operators are expected to have other means in place for
alerting on network problems and fixing them, so it's not necessary for
the audit framework to act in that way.

Instead, in this change, we add three new metrics,
`audit_not_enough_nodes_online`, `audit_not_enough_shares_acquired`, and
`audit_suspected_network_problem`. When an audit fails and emits
`not_enough_shares_for_audit`, we will now determine whether it looks
like we are having network problems (most errors are connection
failures, possibly also some successful connections which subsequently
time out) or whether something else has happened.

After this is deployed, we can remove the alert on
`not_enough_shares_for_audit` and add new alerts on
`audit_not_enough_nodes_online` and `audit_not_enough_shares_acquired`.
`audit_suspected_network_problem` does not need an alert.

Refs: https://github.com/storj/storj/issues/4669

Change-Id: Ibb256bc19d2578904f71f5229111ac98e5212fcb
This commit is contained in:
paul cannon 2022-09-26 18:47:03 -05:00 committed by Storj Robot
parent 352e937813
commit 802ff18bd8
2 changed files with 21 additions and 8 deletions

View File

@@ -13,12 +13,15 @@ storj.io/storj/satellite/audit."audit_contained_percentage" FloatVal
storj.io/storj/satellite/audit."audit_fail_nodes" IntVal storj.io/storj/satellite/audit."audit_fail_nodes" IntVal
storj.io/storj/satellite/audit."audit_fail_nodes_global" Meter storj.io/storj/satellite/audit."audit_fail_nodes_global" Meter
storj.io/storj/satellite/audit."audit_failed_percentage" FloatVal storj.io/storj/satellite/audit."audit_failed_percentage" FloatVal
storj.io/storj/satellite/audit."audit_not_enough_nodes_online" Counter
storj.io/storj/satellite/audit."audit_not_enough_shares_acquired" Counter
storj.io/storj/satellite/audit."audit_offline_nodes" IntVal storj.io/storj/satellite/audit."audit_offline_nodes" IntVal
storj.io/storj/satellite/audit."audit_offline_nodes_global" Meter storj.io/storj/satellite/audit."audit_offline_nodes_global" Meter
storj.io/storj/satellite/audit."audit_offline_percentage" FloatVal storj.io/storj/satellite/audit."audit_offline_percentage" FloatVal
storj.io/storj/satellite/audit."audit_success_nodes" IntVal storj.io/storj/satellite/audit."audit_success_nodes" IntVal
storj.io/storj/satellite/audit."audit_success_nodes_global" Meter storj.io/storj/satellite/audit."audit_success_nodes_global" Meter
storj.io/storj/satellite/audit."audit_successful_percentage" FloatVal storj.io/storj/satellite/audit."audit_successful_percentage" FloatVal
storj.io/storj/satellite/audit."audit_suspected_network_problem" Counter
storj.io/storj/satellite/audit."audit_total_nodes" IntVal storj.io/storj/satellite/audit."audit_total_nodes" IntVal
storj.io/storj/satellite/audit."audit_total_nodes_global" Meter storj.io/storj/satellite/audit."audit_total_nodes_global" Meter
storj.io/storj/satellite/audit."audit_total_pointer_nodes" IntVal storj.io/storj/satellite/audit."audit_total_pointer_nodes" IntVal
@@ -28,6 +31,7 @@ storj.io/storj/satellite/audit."audit_unknown_nodes_global" Meter
storj.io/storj/satellite/audit."audit_unknown_percentage" FloatVal storj.io/storj/satellite/audit."audit_unknown_percentage" FloatVal
storj.io/storj/satellite/audit."audited_percentage" FloatVal storj.io/storj/satellite/audit."audited_percentage" FloatVal
storj.io/storj/satellite/audit."could_not_verify_audit_shares" Counter storj.io/storj/satellite/audit."could_not_verify_audit_shares" Counter
storj.io/storj/satellite/audit."not_enough_shares_for_audit" Counter
storj.io/storj/satellite/audit."reverify_contained" IntVal storj.io/storj/satellite/audit."reverify_contained" IntVal
storj.io/storj/satellite/audit."reverify_contained_global" Meter storj.io/storj/satellite/audit."reverify_contained_global" Meter
storj.io/storj/satellite/audit."reverify_contained_in_segment" IntVal storj.io/storj/satellite/audit."reverify_contained_in_segment" IntVal

View File

@@ -123,7 +123,8 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
orderLimits, privateKey, cachedNodesInfo, err := verifier.orders.CreateAuditOrderLimits(ctx, segmentInfo, skip) orderLimits, privateKey, cachedNodesInfo, err := verifier.orders.CreateAuditOrderLimits(ctx, segmentInfo, skip)
if err != nil { if err != nil {
if orders.ErrDownloadFailedNotEnoughPieces.Has(err) { if orders.ErrDownloadFailedNotEnoughPieces.Has(err) {
mon.Counter("not_enough_shares_for_audit").Inc(1) mon.Counter("not_enough_shares_for_audit").Inc(1) //mon:locked
mon.Counter("audit_not_enough_nodes_online").Inc(1) //mon:locked
err = ErrNotEnoughShares.Wrap(err) err = ErrNotEnoughShares.Wrap(err)
} }
return Report{}, err return Report{}, err
@@ -232,10 +233,15 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
total := segmentInfo.Redundancy.TotalShares total := segmentInfo.Redundancy.TotalShares
if len(sharesToAudit) < int(required) { if len(sharesToAudit) < int(required) {
mon.Counter("not_enough_shares_for_audit").Inc(1) mon.Counter("not_enough_shares_for_audit").Inc(1) //mon:locked
// if we have reached this point, most likely something went wrong // if we have reached this point, most likely something went wrong
// like a forgotten delete. Don't fail nodes. We have an alert on this. // like a network problem or a forgotten delete. Don't fail nodes.
// Check the logs and see what happened. // We have an alert on this. Check the logs and see what happened.
if len(offlineNodes)+len(containedNodes) > len(sharesToAudit)+len(failedNodes)+len(unknownNodes) {
mon.Counter("audit_suspected_network_problem").Inc(1) //mon:locked
} else {
mon.Counter("audit_not_enough_shares_acquired").Inc(1) //mon:locked
}
report := Report{ report := Report{
Offlines: offlineNodes, Offlines: offlineNodes,
Unknown: unknownNodes, Unknown: unknownNodes,
@@ -243,14 +249,17 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
return report, ErrNotEnoughShares.New("got: %d, required: %d, failed: %d, offline: %d, unknown: %d, contained: %d", return report, ErrNotEnoughShares.New("got: %d, required: %d, failed: %d, offline: %d, unknown: %d, contained: %d",
len(sharesToAudit), required, len(failedNodes), len(offlineNodes), len(unknownNodes), len(containedNodes)) len(sharesToAudit), required, len(failedNodes), len(offlineNodes), len(unknownNodes), len(containedNodes))
} }
// ensure we get values, even if only zero values, so that redash can have an alert based on this // ensure we get values, even if only zero values, so that redash can have an alert based on these
mon.Counter("not_enough_shares_for_audit").Inc(0) mon.Counter("not_enough_shares_for_audit").Inc(0) //mon:locked
mon.Counter("could_not_verify_audit_shares").Inc(0) //mon:locked mon.Counter("audit_not_enough_nodes_online").Inc(0) //mon:locked
mon.Counter("audit_not_enough_shares_acquired").Inc(0) //mon:locked
mon.Counter("could_not_verify_audit_shares").Inc(0) //mon:locked
mon.Counter("audit_suspected_network_problem").Inc(0) //mon:locked
pieceNums, correctedShares, err := auditShares(ctx, required, total, sharesToAudit) pieceNums, correctedShares, err := auditShares(ctx, required, total, sharesToAudit)
if err != nil { if err != nil {
mon.Counter("could_not_verify_audit_shares").Inc(1) //mon:locked mon.Counter("could_not_verify_audit_shares").Inc(1) //mon:locked
verifier.log.Error("could not verify shares", zap.Error(err)) verifier.log.Error("could not verify shares", zap.String("Segment", segmentInfoString(segment)), zap.Error(err))
return Report{ return Report{
Fails: failedNodes, Fails: failedNodes,
Offlines: offlineNodes, Offlines: offlineNodes,