satellite:{audit,repair}: log additional info when we can't download enough pieces
When we can't complete an audit or repair, we need more information about what happened during each individual share/piece download. In audit, add the number of failed, offline, unknown, and contained nodes to the error log. In repair, combine the errors from each download and add them to the error log.

Change-Id: Ic5d2a0f3f291f26cb82662bfb37355dd2b5c89ba
This commit is contained in:
parent
7b152cddec
commit
a8f125c671
@ -227,7 +227,8 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
|
|||||||
Offlines: offlineNodes,
|
Offlines: offlineNodes,
|
||||||
Unknown: unknownNodes,
|
Unknown: unknownNodes,
|
||||||
}
|
}
|
||||||
return report, ErrNotEnoughShares.New("got %d, required %d", len(sharesToAudit), required)
|
return report, ErrNotEnoughShares.New("got: %d, required: %d, failed: %d, offline: %d, unknown: %d, contained: %d",
|
||||||
|
len(sharesToAudit), required, len(failedNodes), len(offlineNodes), len(unknownNodes), len(containedNodes))
|
||||||
}
|
}
|
||||||
// ensure we get values, even if only zero values, so that redash can have an alert based on this
|
// ensure we get values, even if only zero values, so that redash can have an alert based on this
|
||||||
mon.Counter("not_enough_shares_for_audit").Inc(0)
|
mon.Counter("not_enough_shares_for_audit").Inc(0)
|
||||||
|
@ -7,6 +7,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"sort"
|
"sort"
|
||||||
@ -83,6 +84,7 @@ func (ec *ECRepairer) Get(ctx context.Context, limits []*pb.AddressedOrderLimit,
|
|||||||
|
|
||||||
limiter := sync2.NewLimiter(es.RequiredCount())
|
limiter := sync2.NewLimiter(es.RequiredCount())
|
||||||
cond := sync.NewCond(&sync.Mutex{})
|
cond := sync.NewCond(&sync.Mutex{})
|
||||||
|
errChan := make(chan error, len(limits))
|
||||||
|
|
||||||
for currentLimitIndex, limit := range limits {
|
for currentLimitIndex, limit := range limits {
|
||||||
if limit == nil {
|
if limit == nil {
|
||||||
@ -145,6 +147,7 @@ func (ec *ECRepairer) Get(ctx context.Context, limits []*pb.AddressedOrderLimit,
|
|||||||
ec.log.Debug("Failed to download pieces for repair",
|
ec.log.Debug("Failed to download pieces for repair",
|
||||||
zap.Error(err))
|
zap.Error(err))
|
||||||
}
|
}
|
||||||
|
errChan <- fmt.Errorf("node id: %s, error: %w", limit.GetLimit().StorageNodeId.String(), err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -157,12 +160,20 @@ func (ec *ECRepairer) Get(ctx context.Context, limits []*pb.AddressedOrderLimit,
|
|||||||
}
|
}
|
||||||
|
|
||||||
limiter.Wait()
|
limiter.Wait()
|
||||||
|
close(errChan)
|
||||||
|
|
||||||
|
var errlist errs.Group
|
||||||
if successfulPieces < es.RequiredCount() {
|
if successfulPieces < es.RequiredCount() {
|
||||||
mon.Meter("download_failed_not_enough_pieces_repair").Mark(1) //mon:locked
|
mon.Meter("download_failed_not_enough_pieces_repair").Mark(1) //mon:locked
|
||||||
|
for err := range errChan {
|
||||||
|
if err != nil {
|
||||||
|
errlist.Add(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
return nil, failedPieces, &irreparableError{
|
return nil, failedPieces, &irreparableError{
|
||||||
piecesAvailable: int32(successfulPieces),
|
piecesAvailable: int32(successfulPieces),
|
||||||
piecesRequired: int32(es.RequiredCount()),
|
piecesRequired: int32(es.RequiredCount()),
|
||||||
|
errlist: errlist,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -42,6 +42,7 @@ var (
|
|||||||
type irreparableError struct {
|
type irreparableError struct {
|
||||||
piecesAvailable int32
|
piecesAvailable int32
|
||||||
piecesRequired int32
|
piecesRequired int32
|
||||||
|
errlist []error
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ie *irreparableError) Error() string {
|
func (ie *irreparableError) Error() string {
|
||||||
@ -303,6 +304,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
|
|||||||
zap.Uint64("Position", queueSegment.Position.Encode()),
|
zap.Uint64("Position", queueSegment.Position.Encode()),
|
||||||
zap.Int32("piecesAvailable", irreparableErr.piecesAvailable),
|
zap.Int32("piecesAvailable", irreparableErr.piecesAvailable),
|
||||||
zap.Int32("piecesRequired", irreparableErr.piecesRequired),
|
zap.Int32("piecesRequired", irreparableErr.piecesRequired),
|
||||||
|
zap.Error(errs.Combine(irreparableErr.errlist...)),
|
||||||
)
|
)
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user