satellite/repair/repairer: make DialTimeout configurable

This change makes the dial timeout configurable and also lowers it from
the default 20s to 5s. The main motivation is that during repair we often
lose a lot of time on dials that eventually fail. The new timeout should
still be long enough to dial, but we will move on to the next node more
quickly if a dial fails.

The timeout is also applied directly as a context timeout in case we
use noise or tcp fast open one day.

Change-Id: I021bf459af49b11241e314fa1a7887c81d5214ea
This commit is contained in:
Michal Niewrzal 2023-06-16 11:42:05 +02:00 committed by Storj Robot
parent dbd575e50b
commit cb9a7bdc71
6 changed files with 22 additions and 5 deletions

View File

@ -114,6 +114,7 @@ func cmdRepairSegment(cmd *cobra.Command, args []string) (err error) {
log.Named("ec-repair"),
dialer,
signing.SigneeFromPeerIdentity(identity.PeerIdentity()),
config.Repairer.DialTimeout,
config.Repairer.DownloadTimeout,
true) // force inmemory download of pieces

View File

@ -2458,6 +2458,7 @@ func ecRepairerWithMockConnector(t testing.TB, sat *testplanet.Satellite, mock *
zaptest.NewLogger(t).Named("a-special-repairer"),
newDialer,
signing.SigneeFromPeerIdentity(sat.Identity.PeerIdentity()),
sat.Config.Repairer.DialTimeout,
sat.Config.Repairer.DownloadTimeout,
sat.Config.Repairer.InMemoryRepair,
)

View File

@ -46,6 +46,7 @@ type ECRepairer struct {
log *zap.Logger
dialer rpc.Dialer
satelliteSignee signing.Signee
dialTimeout time.Duration
downloadTimeout time.Duration
inmemory bool
@ -54,11 +55,12 @@ type ECRepairer struct {
}
// NewECRepairer creates a new repairer for interfacing with storagenodes.
func NewECRepairer(log *zap.Logger, dialer rpc.Dialer, satelliteSignee signing.Signee, downloadTimeout time.Duration, inmemory bool) *ECRepairer {
func NewECRepairer(log *zap.Logger, dialer rpc.Dialer, satelliteSignee signing.Signee, dialTimeout time.Duration, downloadTimeout time.Duration, inmemory bool) *ECRepairer {
return &ECRepairer{
log: log,
dialer: dialer,
satelliteSignee: satelliteSignee,
dialTimeout: dialTimeout,
downloadTimeout: downloadTimeout,
inmemory: inmemory,
}
@ -293,10 +295,10 @@ func (ec *ECRepairer) downloadAndVerifyPiece(ctx context.Context, limit *pb.Addr
defer mon.Task()(&ctx)(&err)
// contact node
downloadCtx, cancel := context.WithTimeout(ctx, ec.downloadTimeout)
defer cancel()
dialCtx, dialCancel := context.WithTimeout(ctx, ec.dialTimeout)
defer dialCancel()
ps, err := ec.dialPiecestore(downloadCtx, storj.NodeURL{
ps, err := ec.dialPiecestore(dialCtx, storj.NodeURL{
ID: limit.GetLimit().StorageNodeId,
Address: address,
})
@ -305,6 +307,9 @@ func (ec *ECRepairer) downloadAndVerifyPiece(ctx context.Context, limit *pb.Addr
}
defer func() { err = errs.Combine(err, ps.Close()) }()
downloadCtx, cancel := context.WithTimeout(ctx, ec.downloadTimeout)
defer cancel()
downloader, err := ps.Download(downloadCtx, limit.GetLimit(), privateKey, 0, pieceSize)
if err != nil {
return nil, nil, nil, err
@ -544,7 +549,11 @@ func (ec *ECRepairer) putPiece(ctx, parent context.Context, limit *pb.AddressedO
storageNodeID := limit.GetLimit().StorageNodeId
pieceID := limit.GetLimit().PieceId
ps, err := ec.dialPiecestore(ctx, storj.NodeURL{
dialCtx, dialCancel := context.WithTimeout(ctx, ec.dialTimeout)
defer dialCancel()
ps, err := ec.dialPiecestore(dialCtx, storj.NodeURL{
ID: storageNodeID,
Address: limit.GetStorageNodeAddress().Address,
})

View File

@ -27,6 +27,7 @@ var (
type Config struct {
MaxRepair int `help:"maximum segments that can be repaired concurrently" releaseDefault:"5" devDefault:"1" testDefault:"10"`
Interval time.Duration `help:"how frequently repairer should try and repair more data" releaseDefault:"5m0s" devDefault:"1m0s" testDefault:"$TESTINTERVAL"`
DialTimeout time.Duration `help:"time limit for dialing storage node" default:"5s"`
Timeout time.Duration `help:"time limit for uploading repaired pieces to new storage nodes" default:"5m0s" testDefault:"1m"`
DownloadTimeout time.Duration `help:"time limit for downloading pieces from a node for repair" default:"5m0s" testDefault:"1m"`
TotalTimeout time.Duration `help:"time limit for an entire repair job, from queue pop to upload completion" default:"45m" testDefault:"10m"`

View File

@ -136,6 +136,7 @@ func NewRepairer(log *zap.Logger, full *identity.FullIdentity,
}
peer.Dialer = rpc.NewDefaultDialer(tlsOptions)
peer.Dialer.DialTimeout = config.Repairer.DialTimeout
}
{ // setup overlay
@ -204,6 +205,7 @@ func NewRepairer(log *zap.Logger, full *identity.FullIdentity,
log.Named("ec-repair"),
peer.Dialer,
signing.SigneeFromPeerIdentity(peer.Identity.PeerIdentity()),
config.Repairer.DialTimeout,
config.Repairer.DownloadTimeout,
config.Repairer.InMemoryRepair)

View File

@ -916,6 +916,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
# ratio where to consider processed count as supicious
# ranged-loop.suspicious-processed-ratio: 0.03
# time limit for dialing storage node
# repairer.dial-timeout: 5s
# repair pieces on the same network to other nodes
# repairer.do-declumping: false