satellite/audit: fix go1.19 dial timeouts and log more
Change-Id: Ide17c1b8e0ca8c86f305bea1b4ae553cc4cb60d0
This commit is contained in:
parent
ec4b0c8ff6
commit
4362761fc7
2
go.mod
2
go.mod
@ -8,6 +8,7 @@ require (
|
||||
github.com/blang/semver v3.5.1+incompatible
|
||||
github.com/calebcase/tmpfile v1.0.3
|
||||
github.com/cheggaaa/pb/v3 v3.0.5
|
||||
github.com/davecgh/go-spew v1.1.1
|
||||
github.com/fatih/color v1.9.0
|
||||
github.com/go-oauth2/oauth2/v4 v4.4.2
|
||||
github.com/go-redis/redis/v8 v8.11.5
|
||||
@ -71,7 +72,6 @@ require (
|
||||
github.com/boombuler/barcode v1.0.1-0.20190219062509-6c824513bacc // indirect
|
||||
github.com/cespare/xxhash/v2 v2.1.2 // indirect
|
||||
github.com/cloudfoundry/gosigar v1.1.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
|
||||
github.com/flynn/noise v1.0.0 // indirect
|
||||
github.com/fsnotify/fsnotify v1.5.4 // indirect
|
||||
|
@ -6,11 +6,13 @@ package audit
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"math/rand"
|
||||
"time"
|
||||
|
||||
"github.com/davecgh/go-spew/spew"
|
||||
"github.com/spacemonkeygo/monkit/v3"
|
||||
"github.com/vivint/infectious"
|
||||
"github.com/zeebo/errs"
|
||||
@ -172,8 +174,15 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
|
||||
sharesToAudit[pieceNum] = share
|
||||
continue
|
||||
}
|
||||
|
||||
// TODO: just because an error came from the common/rpc package
|
||||
// does not decisively mean that the problem is something to do
|
||||
// with dialing. Instead of trying to guess what different
|
||||
// error classes mean, we should make GetShare inside
|
||||
// DownloadShares return more direct information about when
|
||||
// the failure happened.
|
||||
if rpc.Error.Has(share.Error) {
|
||||
if errs.Is(share.Error, context.DeadlineExceeded) {
|
||||
if errors.Is(share.Error, context.DeadlineExceeded) || errs.Is(share.Error, context.DeadlineExceeded) {
|
||||
// dial timeout
|
||||
offlineNodes = append(offlineNodes, share.NodeID)
|
||||
verifier.log.Debug("Verify: dial timeout (offline)",
|
||||
@ -184,6 +193,10 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
|
||||
}
|
||||
if errs2.IsRPC(share.Error, rpcstatus.Unknown) {
|
||||
// dial failed -- offline node
|
||||
// TODO: we should never assume what an unknown
|
||||
// error means. This should be looking for an explicit
|
||||
// indication that dialing failed, not assuming dialing
|
||||
// failed because the rpc status is unknown.
|
||||
offlineNodes = append(offlineNodes, share.NodeID)
|
||||
verifier.log.Debug("Verify: dial failed (offline)",
|
||||
zap.Stringer("Node ID", share.NodeID),
|
||||
@ -196,7 +209,8 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
|
||||
verifier.log.Info("Verify: unknown transport error (skipped)",
|
||||
zap.Stringer("Node ID", share.NodeID),
|
||||
zap.String("Segment", segmentInfoString(segment)),
|
||||
zap.Error(share.Error))
|
||||
zap.Error(share.Error),
|
||||
zap.String("ErrorType", spew.Sprintf("%#+v", share.Error)))
|
||||
continue
|
||||
}
|
||||
|
||||
@ -225,7 +239,8 @@ func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[
|
||||
verifier.log.Info("Verify: unknown error (skipped)",
|
||||
zap.Stringer("Node ID", share.NodeID),
|
||||
zap.String("Segment", segmentInfoString(segment)),
|
||||
zap.Error(share.Error))
|
||||
zap.Error(share.Error),
|
||||
zap.String("ErrorType", spew.Sprintf("%#+v", share.Error)))
|
||||
}
|
||||
mon.IntVal("verify_shares_downloaded_successfully").Observe(int64(len(sharesToAudit))) //mon:locked
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user