satellite/audit: log error and increment metric if shares cannot be verified

Previously, if we encountered an error during the infectious error
correction, we just added it to the errlist to be logged at the worker level.
We want to make sure we know about this if it happens, so give it its own
error log and increment a monkit metric.

Change-Id: Ie5946ae3cd97b766e3099af8ce160a686135ee27
This commit is contained in:
Cameron Ayer 2021-08-16 16:11:45 -04:00 committed by Cameron Ayer
parent 26f839a445
commit 28cb690618
2 changed files with 4 additions and 0 deletions

View File

@@ -27,6 +27,7 @@ storj.io/storj/satellite/audit."audit_unknown_nodes" IntVal
storj.io/storj/satellite/audit."audit_unknown_nodes_global" Meter
storj.io/storj/satellite/audit."audit_unknown_percentage" FloatVal
storj.io/storj/satellite/audit."audited_percentage" FloatVal
storj.io/storj/satellite/audit."could_not_verify_audit_shares" Counter
storj.io/storj/satellite/audit."reverify_contained" IntVal
storj.io/storj/satellite/audit."reverify_contained_global" Meter
storj.io/storj/satellite/audit."reverify_contained_in_segment" IntVal

View File

@@ -236,9 +236,12 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
}
// ensure we get values, even if only zero values, so that redash can have an alert based on this
mon.Counter("not_enough_shares_for_audit").Inc(0)
mon.Counter("could_not_verify_audit_shares").Inc(0) //mon:locked
pieceNums, correctedShares, err := auditShares(ctx, required, total, sharesToAudit)
if err != nil {
mon.Counter("could_not_verify_audit_shares").Inc(1) //mon:locked
verifier.log.Error("could not verify shares", zap.Error(err))
return Report{
Fails: failedNodes,
Offlines: offlineNodes,