satellite/audit: better handling of piece fetch errors
We have an alert on not_enough_shares_for_audit which fires too frequently. Every time so far, it has been because of a network blip of some nature on the satellite side. Satellite operators are expected to have other means in place for alerting on network problems and fixing them, so it's not necessary for the audit framework to act in that way.

Instead, in this change, we add three new metrics: audit_not_enough_nodes_online, audit_not_enough_shares_acquired, and audit_suspected_network_problem. When an audit fails and emits not_enough_shares_for_audit, we will now determine whether it looks like we are having network problems (most errors are connection failures, possibly also some successful connections which subsequently time out) or whether something else has happened.

After this is deployed, we can remove the alert on not_enough_shares_for_audit and add new alerts on audit_not_enough_nodes_online and audit_not_enough_shares_acquired. audit_suspected_network_problem does not need an alert.

Refs: https://github.com/storj/storj/issues/4669
Change-Id: Ibb256bc19d2578904f71f5229111ac98e5212fcb
parent 352e937813
commit 802ff18bd8
@@ -13,12 +13,15 @@ storj.io/storj/satellite/audit."audit_contained_percentage" FloatVal
 storj.io/storj/satellite/audit."audit_fail_nodes" IntVal
 storj.io/storj/satellite/audit."audit_fail_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_failed_percentage" FloatVal
+storj.io/storj/satellite/audit."audit_not_enough_nodes_online" Counter
+storj.io/storj/satellite/audit."audit_not_enough_shares_acquired" Counter
 storj.io/storj/satellite/audit."audit_offline_nodes" IntVal
 storj.io/storj/satellite/audit."audit_offline_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_offline_percentage" FloatVal
 storj.io/storj/satellite/audit."audit_success_nodes" IntVal
 storj.io/storj/satellite/audit."audit_success_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_successful_percentage" FloatVal
+storj.io/storj/satellite/audit."audit_suspected_network_problem" Counter
 storj.io/storj/satellite/audit."audit_total_nodes" IntVal
 storj.io/storj/satellite/audit."audit_total_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_total_pointer_nodes" IntVal
@@ -28,6 +31,7 @@ storj.io/storj/satellite/audit."audit_unknown_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_unknown_percentage" FloatVal
 storj.io/storj/satellite/audit."audited_percentage" FloatVal
 storj.io/storj/satellite/audit."could_not_verify_audit_shares" Counter
+storj.io/storj/satellite/audit."not_enough_shares_for_audit" Counter
 storj.io/storj/satellite/audit."reverify_contained" IntVal
 storj.io/storj/satellite/audit."reverify_contained_global" Meter
 storj.io/storj/satellite/audit."reverify_contained_in_segment" IntVal
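For context, the lock entries above correspond to counters registered through a package-level monkit scope. Below is a minimal sketch, not the actual satellite code, of how such //mon:locked counters are typically emitted, assuming the monkit/v3 API used in this repository; the helper name recordAuditShortfall is hypothetical.

package audit

import "github.com/spacemonkeygo/monkit/v3"

// Package-level monkit scope; metrics registered through it appear under
// the storj.io/storj/satellite/audit prefix seen in the lock entries above.
var mon = monkit.Package()

// recordAuditShortfall is a hypothetical helper showing how the new
// counters might be bumped once a shortfall has been classified.
func recordAuditShortfall(suspectedNetworkProblem bool) {
	if suspectedNetworkProblem {
		mon.Counter("audit_suspected_network_problem").Inc(1) //mon:locked
	} else {
		mon.Counter("audit_not_enough_shares_acquired").Inc(1) //mon:locked
	}
}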
@@ -123,7 +123,8 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
 	orderLimits, privateKey, cachedNodesInfo, err := verifier.orders.CreateAuditOrderLimits(ctx, segmentInfo, skip)
 	if err != nil {
 		if orders.ErrDownloadFailedNotEnoughPieces.Has(err) {
-			mon.Counter("not_enough_shares_for_audit").Inc(1)
+			mon.Counter("not_enough_shares_for_audit").Inc(1) //mon:locked
+			mon.Counter("audit_not_enough_nodes_online").Inc(1) //mon:locked
 			err = ErrNotEnoughShares.Wrap(err)
 		}
 		return Report{}, err
@@ -232,10 +233,15 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
 	total := segmentInfo.Redundancy.TotalShares

 	if len(sharesToAudit) < int(required) {
-		mon.Counter("not_enough_shares_for_audit").Inc(1)
+		mon.Counter("not_enough_shares_for_audit").Inc(1) //mon:locked
 		// if we have reached this point, most likely something went wrong
-		// like a forgotten delete. Don't fail nodes. We have an alert on this.
-		// Check the logs and see what happened.
+		// like a network problem or a forgotten delete. Don't fail nodes.
+		// We have an alert on this. Check the logs and see what happened.
+		if len(offlineNodes)+len(containedNodes) > len(sharesToAudit)+len(failedNodes)+len(unknownNodes) {
+			mon.Counter("audit_suspected_network_problem").Inc(1) //mon:locked
+		} else {
+			mon.Counter("audit_not_enough_shares_acquired").Inc(1) //mon:locked
+		}
 		report := Report{
 			Offlines: offlineNodes,
 			Unknown:  unknownNodes,
@@ -243,14 +249,17 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
 		return report, ErrNotEnoughShares.New("got: %d, required: %d, failed: %d, offline: %d, unknown: %d, contained: %d",
 			len(sharesToAudit), required, len(failedNodes), len(offlineNodes), len(unknownNodes), len(containedNodes))
 	}
-	// ensure we get values, even if only zero values, so that redash can have an alert based on this
-	mon.Counter("not_enough_shares_for_audit").Inc(0)
-	mon.Counter("could_not_verify_audit_shares").Inc(0) //mon:locked
+	// ensure we get values, even if only zero values, so that redash can have an alert based on these
+	mon.Counter("not_enough_shares_for_audit").Inc(0) //mon:locked
+	mon.Counter("audit_not_enough_nodes_online").Inc(0) //mon:locked
+	mon.Counter("audit_not_enough_shares_acquired").Inc(0) //mon:locked
+	mon.Counter("could_not_verify_audit_shares").Inc(0) //mon:locked
+	mon.Counter("audit_suspected_network_problem").Inc(0) //mon:locked

 	pieceNums, correctedShares, err := auditShares(ctx, required, total, sharesToAudit)
 	if err != nil {
 		mon.Counter("could_not_verify_audit_shares").Inc(1) //mon:locked
-		verifier.log.Error("could not verify shares", zap.Error(err))
+		verifier.log.Error("could not verify shares", zap.String("Segment", segmentInfoString(segment)), zap.Error(err))
 		return Report{
 			Fails:    failedNodes,
 			Offlines: offlineNodes,
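To make the classification in the hunk above easier to follow, here is a small self-contained sketch of the same heuristic: when the shortfall is dominated by offline and contained nodes, the failure is attributed to a suspected network problem; otherwise it counts as a genuine failure to acquire enough shares. The function name classifyShortfall and the standalone main are illustrative only, not part of the change.

package main

import "fmt"

// classifyShortfall mirrors the condition used in Verifier.Verify above:
// offline plus contained nodes outnumbering everything else suggests the
// satellite could not reach the nodes, rather than the nodes misbehaving.
func classifyShortfall(acquired, offline, contained, failed, unknown int) string {
	if offline+contained > acquired+failed+unknown {
		return "audit_suspected_network_problem"
	}
	return "audit_not_enough_shares_acquired"
}

func main() {
	// Example: only 2 shares acquired while 20 nodes were offline, so this
	// audit round is counted as a suspected network problem.
	fmt.Println(classifyShortfall(2, 20, 0, 0, 0))
}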