satellite/audit: fix go1.19 dial timeouts and log more

Change-Id: Ide17c1b8e0ca8c86f305bea1b4ae553cc4cb60d0
This commit is contained in:
JT Olio 2023-02-28 00:08:47 -05:00 committed by Storj Robot
parent ec4b0c8ff6
commit 4362761fc7
2 changed files with 19 additions and 4 deletions

2
go.mod
View File

@ -8,6 +8,7 @@ require (
github.com/blang/semver v3.5.1+incompatible
github.com/calebcase/tmpfile v1.0.3
github.com/cheggaaa/pb/v3 v3.0.5
github.com/davecgh/go-spew v1.1.1
github.com/fatih/color v1.9.0
github.com/go-oauth2/oauth2/v4 v4.4.2
github.com/go-redis/redis/v8 v8.11.5
@ -71,7 +72,6 @@ require (
github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc // indirect
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/cloudfoundry/gosigar v1.1.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
github.com/flynn/noise v1.0.0 // indirect
github.com/fsnotify/fsnotify v1.5.4 // indirect

View File

@ -6,11 +6,13 @@ package audit
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"math/rand"
"time"
"github.com/davecgh/go-spew/spew"
"github.com/spacemonkeygo/monkit/v3"
"github.com/vivint/infectious"
"github.com/zeebo/errs"
@ -172,8 +174,15 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
sharesToAudit[pieceNum] = share
continue
}
// TODO: just because an error came from the common/rpc package
// does not conclusively mean that the problem is related to
// dialing. Instead of trying to guess what different error
// classes mean, we should make GetShare inside DownloadShares
// return more direct information about when the failure
// happened.
if rpc.Error.Has(share.Error) {
if errs.Is(share.Error, context.DeadlineExceeded) {
if errors.Is(share.Error, context.DeadlineExceeded) || errs.Is(share.Error, context.DeadlineExceeded) {
// dial timeout
offlineNodes = append(offlineNodes, share.NodeID)
verifier.log.Debug("Verify: dial timeout (offline)",
@ -184,6 +193,10 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
}
if errs2.IsRPC(share.Error, rpcstatus.Unknown) {
// dial failed -- offline node
// TODO: we should never assume what an unknown error
// means. This should look for an explicit indication
// that dialing failed, rather than assuming dialing
// failed because the RPC status is unknown.
offlineNodes = append(offlineNodes, share.NodeID)
verifier.log.Debug("Verify: dial failed (offline)",
zap.Stringer("Node ID", share.NodeID),
@ -196,7 +209,8 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
verifier.log.Info("Verify: unknown transport error (skipped)",
zap.Stringer("Node ID", share.NodeID),
zap.String("Segment", segmentInfoString(segment)),
zap.Error(share.Error))
zap.Error(share.Error),
zap.String("ErrorType", spew.Sprintf("%#+v", share.Error)))
continue
}
@ -225,7 +239,8 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
verifier.log.Info("Verify: unknown error (skipped)",
zap.Stringer("Node ID", share.NodeID),
zap.String("Segment", segmentInfoString(segment)),
zap.Error(share.Error))
zap.Error(share.Error),
zap.String("ErrorType", spew.Sprintf("%#+v", share.Error)))
}
mon.IntVal("verify_shares_downloaded_successfully").Observe(int64(len(sharesToAudit))) //mon:locked