From 23c5d6c2872e36d0eb87a05d0cbd60d229fe5126 Mon Sep 17 00:00:00 2001 From: paul cannon Date: Fri, 13 Oct 2023 16:15:36 -0500 Subject: [PATCH] cmd/tools/segment-verify: improve logging around common problems Change-Id: I4f684745df708627f135baee619d17788bc8d63e --- cmd/tools/segment-verify/process.go | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/cmd/tools/segment-verify/process.go b/cmd/tools/segment-verify/process.go index 49990c99a..149f9f1c9 100644 --- a/cmd/tools/segment-verify/process.go +++ b/cmd/tools/segment-verify/process.go @@ -5,6 +5,7 @@ package main import ( "context" + "errors" "sync" "go.uber.org/zap" @@ -82,24 +83,29 @@ func (service *Service) VerifyBatches(ctx context.Context, batches []*Batch) err limiter := sync2.NewLimiter(service.config.Concurrency) for _, batch := range batches { batch := batch + log := service.log.With(zap.Int("num pieces", batch.Len())) info, err := service.GetNodeInfo(ctx, batch.Alias) if err != nil { if ErrNoSuchNode.Has(err) { - service.log.Error("will not verify batch; consider pieces lost", - zap.Int("alias", int(batch.Alias)), - zap.Error(err)) + log.Info("node has left the cluster; considering pieces lost", + zap.Int("alias", int(batch.Alias))) + for _, seg := range batch.Items { + seg.Status.MarkNotFound() + } continue } return Error.Wrap(err) } + log = log.With(zap.Stringer("node ID", info.NodeURL.ID)) ignoreThrottle := service.priorityNodes.Contains(batch.Alias) limiter.Go(ctx, func() { verifiedCount, err := service.verifier.Verify(ctx, batch.Alias, info.NodeURL, info.Version, batch.Items, ignoreThrottle) if err != nil { - if ErrNodeOffline.Has(err) { + switch { + case ErrNodeOffline.Has(err): mu.Lock() if verifiedCount == 0 { service.offlineNodes.Add(batch.Alias) @@ -110,8 +116,14 @@ func (service *Service) VerifyBatches(ctx context.Context, batches []*Batch) err } } mu.Unlock() + log.Info("node is offline; marking pieces as retryable") + return + case errors.Is(err, context.DeadlineExceeded): + log.Info("request to node timed out; marking pieces as retryable") + return + default: + log.Error("verifying a batch failed", zap.Error(err)) } - service.log.Error("verifying a batch failed", zap.Error(err)) } else { mu.Lock() if service.offlineCount[batch.Alias] > 0 {