satellite/audit: better handling of piece fetch errors
We have an alert on not_enough_shares_for_audit which fires too frequently. Every time so far, it has been because of a network blip of some nature on the satellite side. Satellite operators are expected to have other means in place for alerting on network problems and fixing them, so it's not necessary for the audit framework to act in that way.

Instead, in this change, we add three new metrics: audit_not_enough_nodes_online, audit_not_enough_shares_acquired, and audit_suspected_network_problem. When an audit fails and emits not_enough_shares_for_audit, we will now determine whether it looks like we are having network problems (most errors are connection failures, possibly also some successful connections which subsequently time out) or whether something else has happened.

After this is deployed, we can remove the alert on not_enough_shares_for_audit and add new alerts on audit_not_enough_nodes_online and audit_not_enough_shares_acquired. audit_suspected_network_problem does not need an alert.

Refs: https://github.com/storj/storj/issues/4669
Change-Id: Ibb256bc19d2578904f71f5229111ac98e5212fcb
parent 352e937813
commit 802ff18bd8
@@ -13,12 +13,15 @@ storj.io/storj/satellite/audit."audit_contained_percentage" FloatVal
 storj.io/storj/satellite/audit."audit_fail_nodes" IntVal
 storj.io/storj/satellite/audit."audit_fail_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_failed_percentage" FloatVal
+storj.io/storj/satellite/audit."audit_not_enough_nodes_online" Counter
+storj.io/storj/satellite/audit."audit_not_enough_shares_acquired" Counter
 storj.io/storj/satellite/audit."audit_offline_nodes" IntVal
 storj.io/storj/satellite/audit."audit_offline_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_offline_percentage" FloatVal
 storj.io/storj/satellite/audit."audit_success_nodes" IntVal
 storj.io/storj/satellite/audit."audit_success_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_successful_percentage" FloatVal
+storj.io/storj/satellite/audit."audit_suspected_network_problem" Counter
 storj.io/storj/satellite/audit."audit_total_nodes" IntVal
 storj.io/storj/satellite/audit."audit_total_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_total_pointer_nodes" IntVal
@@ -28,6 +31,7 @@ storj.io/storj/satellite/audit."audit_unknown_nodes_global" Meter
 storj.io/storj/satellite/audit."audit_unknown_percentage" FloatVal
 storj.io/storj/satellite/audit."audited_percentage" FloatVal
 storj.io/storj/satellite/audit."could_not_verify_audit_shares" Counter
+storj.io/storj/satellite/audit."not_enough_shares_for_audit" Counter
 storj.io/storj/satellite/audit."reverify_contained" IntVal
 storj.io/storj/satellite/audit."reverify_contained_global" Meter
 storj.io/storj/satellite/audit."reverify_contained_in_segment" IntVal
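For context, the lock entries above correspond to counters registered through a package-level monkit scope. Below is a minimal sketch, not the actual satellite code, of how such //mon:locked counters are typically emitted, assuming the monkit/v3 API used in this repository; the helper name recordAuditShortfall is hypothetical.

package audit

import "github.com/spacemonkeygo/monkit/v3"

// Package-level monkit scope; metrics registered through it appear under
// the storj.io/storj/satellite/audit prefix seen in the lock entries above.
var mon = monkit.Package()

// recordAuditShortfall is a hypothetical helper showing how the new
// counters might be bumped once a shortfall has been classified.
func recordAuditShortfall(suspectedNetworkProblem bool) {
	if suspectedNetworkProblem {
		mon.Counter("audit_suspected_network_problem").Inc(1) //mon:locked
	} else {
		mon.Counter("audit_not_enough_shares_acquired").Inc(1) //mon:locked
	}
}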
@@ -123,7 +123,8 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
 	orderLimits, privateKey, cachedNodesInfo, err := verifier.orders.CreateAuditOrderLimits(ctx, segmentInfo, skip)
 	if err != nil {
 		if orders.ErrDownloadFailedNotEnoughPieces.Has(err) {
-			mon.Counter("not_enough_shares_for_audit").Inc(1)
+			mon.Counter("not_enough_shares_for_audit").Inc(1) //mon:locked
+			mon.Counter("audit_not_enough_nodes_online").Inc(1) //mon:locked
 			err = ErrNotEnoughShares.Wrap(err)
 		}
 		return Report{}, err
@@ -232,10 +233,15 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
 	total := segmentInfo.Redundancy.TotalShares

 	if len(sharesToAudit) < int(required) {
-		mon.Counter("not_enough_shares_for_audit").Inc(1)
+		mon.Counter("not_enough_shares_for_audit").Inc(1) //mon:locked
 		// if we have reached this point, most likely something went wrong
-		// like a forgotten delete. Don't fail nodes. We have an alert on this.
-		// Check the logs and see what happened.
+		// like a network problem or a forgotten delete. Don't fail nodes.
+		// We have an alert on this. Check the logs and see what happened.
+		if len(offlineNodes)+len(containedNodes) > len(sharesToAudit)+len(failedNodes)+len(unknownNodes) {
+			mon.Counter("audit_suspected_network_problem").Inc(1) //mon:locked
+		} else {
+			mon.Counter("audit_not_enough_shares_acquired").Inc(1) //mon:locked
+		}
 		report := Report{
 			Offlines: offlineNodes,
 			Unknown:  unknownNodes,
@@ -243,14 +249,17 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
 		return report, ErrNotEnoughShares.New("got: %d, required: %d, failed: %d, offline: %d, unknown: %d, contained: %d",
 			len(sharesToAudit), required, len(failedNodes), len(offlineNodes), len(unknownNodes), len(containedNodes))
 	}
-	// ensure we get values, even if only zero values, so that redash can have an alert based on this
-	mon.Counter("not_enough_shares_for_audit").Inc(0)
-	mon.Counter("could_not_verify_audit_shares").Inc(0) //mon:locked
+	// ensure we get values, even if only zero values, so that redash can have an alert based on these
+	mon.Counter("not_enough_shares_for_audit").Inc(0) //mon:locked
+	mon.Counter("audit_not_enough_nodes_online").Inc(0) //mon:locked
+	mon.Counter("audit_not_enough_shares_acquired").Inc(0) //mon:locked
+	mon.Counter("could_not_verify_audit_shares").Inc(0) //mon:locked
+	mon.Counter("audit_suspected_network_problem").Inc(0) //mon:locked

 	pieceNums, correctedShares, err := auditShares(ctx, required, total, sharesToAudit)
 	if err != nil {
 		mon.Counter("could_not_verify_audit_shares").Inc(1) //mon:locked
-		verifier.log.Error("could not verify shares", zap.Error(err))
+		verifier.log.Error("could not verify shares", zap.String("Segment", segmentInfoString(segment)), zap.Error(err))
 		return Report{
 			Fails:    failedNodes,
 			Offlines: offlineNodes,
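To make the classification in the hunk above easier to follow, here is a small self-contained sketch of the same heuristic: when the shortfall is dominated by offline and contained nodes, the failure is attributed to a suspected network problem; otherwise it counts as a genuine failure to acquire enough shares. The function name classifyShortfall and the standalone main are illustrative only, not part of the change.

package main

import "fmt"

// classifyShortfall mirrors the condition used in Verifier.Verify above:
// offline plus contained nodes outnumbering everything else suggests the
// satellite could not reach the nodes, rather than the nodes misbehaving.
func classifyShortfall(acquired, offline, contained, failed, unknown int) string {
	if offline+contained > acquired+failed+unknown {
		return "audit_suspected_network_problem"
	}
	return "audit_not_enough_shares_acquired"
}

func main() {
	// Example: only 2 shares acquired while 20 nodes were offline, so this
	// audit round is counted as a suspected network problem.
	fmt.Println(classifyShortfall(2, 20, 0, 0, 0))
}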