diff --git a/monkit.lock b/monkit.lock index 989b9e834..42fc429cc 100644 --- a/monkit.lock +++ b/monkit.lock @@ -13,12 +13,15 @@ storj.io/storj/satellite/audit."audit_contained_percentage" FloatVal storj.io/storj/satellite/audit."audit_fail_nodes" IntVal storj.io/storj/satellite/audit."audit_fail_nodes_global" Meter storj.io/storj/satellite/audit."audit_failed_percentage" FloatVal +storj.io/storj/satellite/audit."audit_not_enough_nodes_online" Counter +storj.io/storj/satellite/audit."audit_not_enough_shares_acquired" Counter storj.io/storj/satellite/audit."audit_offline_nodes" IntVal storj.io/storj/satellite/audit."audit_offline_nodes_global" Meter storj.io/storj/satellite/audit."audit_offline_percentage" FloatVal storj.io/storj/satellite/audit."audit_success_nodes" IntVal storj.io/storj/satellite/audit."audit_success_nodes_global" Meter storj.io/storj/satellite/audit."audit_successful_percentage" FloatVal +storj.io/storj/satellite/audit."audit_suspected_network_problem" Counter storj.io/storj/satellite/audit."audit_total_nodes" IntVal storj.io/storj/satellite/audit."audit_total_nodes_global" Meter storj.io/storj/satellite/audit."audit_total_pointer_nodes" IntVal @@ -28,6 +31,7 @@ storj.io/storj/satellite/audit."audit_unknown_nodes_global" Meter storj.io/storj/satellite/audit."audit_unknown_percentage" FloatVal storj.io/storj/satellite/audit."audited_percentage" FloatVal storj.io/storj/satellite/audit."could_not_verify_audit_shares" Counter +storj.io/storj/satellite/audit."not_enough_shares_for_audit" Counter storj.io/storj/satellite/audit."reverify_contained" IntVal storj.io/storj/satellite/audit."reverify_contained_global" Meter storj.io/storj/satellite/audit."reverify_contained_in_segment" IntVal diff --git a/satellite/audit/verifier.go b/satellite/audit/verifier.go index 2b6dce10c..516e0a233 100644 --- a/satellite/audit/verifier.go +++ b/satellite/audit/verifier.go @@ -123,7 +123,8 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[ orderLimits, privateKey, cachedNodesInfo, err := verifier.orders.CreateAuditOrderLimits(ctx, segmentInfo, skip) if err != nil { if orders.ErrDownloadFailedNotEnoughPieces.Has(err) { - mon.Counter("not_enough_shares_for_audit").Inc(1) + mon.Counter("not_enough_shares_for_audit").Inc(1) //mon:locked + mon.Counter("audit_not_enough_nodes_online").Inc(1) //mon:locked err = ErrNotEnoughShares.Wrap(err) } return Report{}, err @@ -232,10 +233,15 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[ total := segmentInfo.Redundancy.TotalShares if len(sharesToAudit) < int(required) { - mon.Counter("not_enough_shares_for_audit").Inc(1) + mon.Counter("not_enough_shares_for_audit").Inc(1) //mon:locked // if we have reached this point, most likely something went wrong - // like a forgotten delete. Don't fail nodes. We have an alert on this. - // Check the logs and see what happened. + // like a network problem or a forgotten delete. Don't fail nodes. + // We have an alert on this. Check the logs and see what happened. + if len(offlineNodes)+len(containedNodes) > len(sharesToAudit)+len(failedNodes)+len(unknownNodes) { + mon.Counter("audit_suspected_network_problem").Inc(1) //mon:locked + } else { + mon.Counter("audit_not_enough_shares_acquired").Inc(1) //mon:locked + } report := Report{ Offlines: offlineNodes, Unknown: unknownNodes, @@ -243,14 +249,17 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[ return report, ErrNotEnoughShares.New("got: %d, required: %d, failed: %d, offline: %d, unknown: %d, contained: %d", len(sharesToAudit), required, len(failedNodes), len(offlineNodes), len(unknownNodes), len(containedNodes)) } - // ensure we get values, even if only zero values, so that redash can have an alert based on this - mon.Counter("not_enough_shares_for_audit").Inc(0) - mon.Counter("could_not_verify_audit_shares").Inc(0) //mon:locked + // ensure we get values, even if only zero values, so that redash can have an alert based on these + mon.Counter("not_enough_shares_for_audit").Inc(0) //mon:locked + mon.Counter("audit_not_enough_nodes_online").Inc(0) //mon:locked + mon.Counter("audit_not_enough_shares_acquired").Inc(0) //mon:locked + mon.Counter("could_not_verify_audit_shares").Inc(0) //mon:locked + mon.Counter("audit_suspected_network_problem").Inc(0) //mon:locked pieceNums, correctedShares, err := auditShares(ctx, required, total, sharesToAudit) if err != nil { mon.Counter("could_not_verify_audit_shares").Inc(1) //mon:locked - verifier.log.Error("could not verify shares", zap.Error(err)) + verifier.log.Error("could not verify shares", zap.String("Segment", segmentInfoString(segment)), zap.Error(err)) return Report{ Fails: failedNodes, Offlines: offlineNodes,