satellite:{audit,repair}: log additional info when we can't download enough pieces

When we can't complete an audit or repair, we need more information about
what happened during each individual share/piece download.

In audit, add the number of offline, unknown, contained, failed nodes to
the error log. In repair, combine the errors from each download and add
them to the error log.

Change-Id: Ic5d2a0f3f291f26cb82662bfb37355dd2b5c89ba
This commit is contained in:
Cameron Ayer 2021-08-06 13:58:22 -04:00 committed by Yingrong Zhao
parent 7b152cddec
commit a8f125c671
3 changed files with 15 additions and 1 deletion

View File

@@ -227,7 +227,8 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
 			Offlines: offlineNodes,
 			Unknown:  unknownNodes,
 		}
-		return report, ErrNotEnoughShares.New("got %d, required %d", len(sharesToAudit), required)
+		return report, ErrNotEnoughShares.New("got: %d, required: %d, failed: %d, offline: %d, unknown: %d, contained: %d",
+			len(sharesToAudit), required, len(failedNodes), len(offlineNodes), len(unknownNodes), len(containedNodes))
 	}
 	// ensure we get values, even if only zero values, so that redash can have an alert based on this
 	mon.Counter("not_enough_shares_for_audit").Inc(0)

View File

@@ -7,6 +7,7 @@ import (
 	"bytes"
 	"context"
 	"errors"
+	"fmt"
 	"io"
 	"io/ioutil"
 	"sort"
@@ -83,6 +84,7 @@ func (ec *ECRepairer) Get(ctx context.Context, limits []*pb.AddressedOrderLimit,
 	limiter := sync2.NewLimiter(es.RequiredCount())
 	cond := sync.NewCond(&sync.Mutex{})
+	errChan := make(chan error, len(limits))

 	for currentLimitIndex, limit := range limits {
 		if limit == nil {
@@ -145,6 +147,7 @@ func (ec *ECRepairer) Get(ctx context.Context, limits []*pb.AddressedOrderLimit,
 					ec.log.Debug("Failed to download pieces for repair",
 						zap.Error(err))
 				}
+				errChan <- fmt.Errorf("node id: %s, error: %w", limit.GetLimit().StorageNodeId.String(), err)
 				return
 			}
@@ -157,12 +160,20 @@ func (ec *ECRepairer) Get(ctx context.Context, limits []*pb.AddressedOrderLimit,
 	}

 	limiter.Wait()
+	close(errChan)

+	var errlist errs.Group
 	if successfulPieces < es.RequiredCount() {
 		mon.Meter("download_failed_not_enough_pieces_repair").Mark(1) //mon:locked
+		for err := range errChan {
+			if err != nil {
+				errlist.Add(err)
+			}
+		}
 		return nil, failedPieces, &irreparableError{
 			piecesAvailable: int32(successfulPieces),
 			piecesRequired:  int32(es.RequiredCount()),
+			errlist:         errlist,
 		}
 	}

View File

@@ -42,6 +42,7 @@ var (
 type irreparableError struct {
 	piecesAvailable int32
 	piecesRequired  int32
+	errlist         []error
 }
 func (ie *irreparableError) Error() string {
@@ -303,6 +304,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 				zap.Uint64("Position", queueSegment.Position.Encode()),
 				zap.Int32("piecesAvailable", irreparableErr.piecesAvailable),
 				zap.Int32("piecesRequired", irreparableErr.piecesRequired),
+				zap.Error(errs.Combine(irreparableErr.errlist...)),
 			)
 			return false, nil
 		}