cmd/tools/segment-verify: improve logging around common problems

Change-Id: I4f684745df708627f135baee619d17788bc8d63e
paul cannon 2023-10-13 16:15:36 -05:00 committed by Storj Robot
parent 281edfa585
commit 23c5d6c287


@@ -5,6 +5,7 @@ package main
 import (
 	"context"
+	"errors"
 	"sync"

 	"go.uber.org/zap"
@@ -82,24 +83,29 @@ func (service *Service) VerifyBatches(ctx context.Context, batches []*Batch) error {
 	limiter := sync2.NewLimiter(service.config.Concurrency)
 	for _, batch := range batches {
 		batch := batch
+		log := service.log.With(zap.Int("num pieces", batch.Len()))

 		info, err := service.GetNodeInfo(ctx, batch.Alias)
 		if err != nil {
 			if ErrNoSuchNode.Has(err) {
-				service.log.Error("will not verify batch; consider pieces lost",
-					zap.Int("alias", int(batch.Alias)),
-					zap.Error(err))
+				log.Info("node has left the cluster; considering pieces lost",
+					zap.Int("alias", int(batch.Alias)))
 				for _, seg := range batch.Items {
 					seg.Status.MarkNotFound()
 				}
 				continue
 			}
 			return Error.Wrap(err)
 		}
+		log = log.With(zap.Stringer("node ID", info.NodeURL.ID))

 		ignoreThrottle := service.priorityNodes.Contains(batch.Alias)

 		limiter.Go(ctx, func() {
 			verifiedCount, err := service.verifier.Verify(ctx, batch.Alias, info.NodeURL, info.Version, batch.Items, ignoreThrottle)
 			if err != nil {
-				if ErrNodeOffline.Has(err) {
+				switch {
+				case ErrNodeOffline.Has(err):
@@ -110,8 +116,14 @@ func (service *Service) VerifyBatches(ctx context.Context, batches []*Batch) error {
 						}
 					}
 					mu.Unlock()
+					log.Info("node is offline; marking pieces as retryable")
+					return
+				case errors.Is(err, context.DeadlineExceeded):
+					log.Info("request to node timed out; marking pieces as retryable")
+					return
+				default:
+					log.Error("verifying a batch failed", zap.Error(err))
 				}
-				service.log.Error("verifying a batch failed", zap.Error(err))
 			} else {
 				mu.Lock()
 				if service.offlineCount[batch.Alias] > 0 {
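
The core of the change is a pattern worth noting: build a scoped zap logger once per batch with With, then classify failures in a switch so that expected conditions (offline node, timeout) log at Info while only genuinely unexpected ones log at Error. Below is a minimal, self-contained sketch of that pattern, not the actual segment-verify code: ErrNodeOffline is rebuilt here with zeebo/errs (the error-class library this codebase uses elsewhere), and verifyBatch plus the field values are hypothetical stand-ins.

package main

import (
	"context"
	"errors"
	"time"

	"github.com/zeebo/errs"
	"go.uber.org/zap"
)

// ErrNodeOffline stands in for the error class used by the real service.
var ErrNodeOffline = errs.Class("node offline")

// verifyBatch is a hypothetical stand-in for service.verifier.Verify.
func verifyBatch(ctx context.Context) error {
	return ErrNodeOffline.New("dial failed")
}

func main() {
	base := zap.NewExample()

	// Scope the logger once, so every message about this batch carries
	// the same fields: the same idea as "num pieces" and "node ID" above.
	log := base.With(zap.Int("num pieces", 120), zap.String("node ID", "1AbC..."))

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	if err := verifyBatch(ctx); err != nil {
		switch {
		case ErrNodeOffline.Has(err):
			// Expected operational condition: informational, not an error.
			log.Info("node is offline; marking pieces as retryable")
		case errors.Is(err, context.DeadlineExceeded):
			log.Info("request to node timed out; marking pieces as retryable")
		default:
			// Only genuinely unexpected failures land at Error level.
			log.Error("verifying a batch failed", zap.Error(err))
		}
	}
}

The two checks are deliberately different: ErrNodeOffline.Has matches an errs class anywhere in the wrap chain, while errors.Is is the standard-library comparison needed for the context.DeadlineExceeded sentinel.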