satellite/repair: add timeout for repair download from a single node (#3418)
This commit is contained in:
parent
e96d615013
commit
bfa6699e2c
@ -315,6 +315,7 @@ func (planet *Planet) newSatellites(count int) ([]*SatelliteSystem, error) {
|
||||
MaxRepair: 10,
|
||||
Interval: time.Hour,
|
||||
Timeout: 1 * time.Minute, // Repairs can take up to 10 seconds. Leaving room for outliers
|
||||
DownloadTimeout: 1 * time.Minute,
|
||||
MaxBufferMem: 4 * memory.MiB,
|
||||
MaxExcessRateOptimalThreshold: 0.05,
|
||||
},
|
||||
|
@ -305,6 +305,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, pointerDB metainfo
|
||||
config.Repairer.Timeout,
|
||||
config.Repairer.MaxExcessRateOptimalThreshold,
|
||||
config.Checker.RepairOverride,
|
||||
config.Repairer.DownloadTimeout,
|
||||
signing.SigneeFromPeerIdentity(peer.Identity.PeerIdentity()),
|
||||
)
|
||||
|
||||
|
@ -36,14 +36,16 @@ type ECRepairer struct {
|
||||
log *zap.Logger
|
||||
dialer rpc.Dialer
|
||||
satelliteSignee signing.Signee
|
||||
downloadTimeout time.Duration
|
||||
}
|
||||
|
||||
// NewECRepairer creates a new repairer for interfacing with storagenodes.
|
||||
func NewECRepairer(log *zap.Logger, dialer rpc.Dialer, satelliteSignee signing.Signee) *ECRepairer {
|
||||
func NewECRepairer(log *zap.Logger, dialer rpc.Dialer, satelliteSignee signing.Signee, downloadTimeout time.Duration) *ECRepairer {
|
||||
return &ECRepairer{
|
||||
log: log,
|
||||
dialer: dialer,
|
||||
satelliteSignee: satelliteSignee,
|
||||
downloadTimeout: downloadTimeout,
|
||||
}
|
||||
}
|
||||
|
||||
@ -162,7 +164,10 @@ func (ec *ECRepairer) Get(ctx context.Context, limits []*pb.AddressedOrderLimit,
|
||||
// and expects the hash of the data to match the signed hash provided by the storagenode.
|
||||
func (ec *ECRepairer) downloadAndVerifyPiece(ctx context.Context, limit *pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, pieceSize int64) (data []byte, err error) {
|
||||
// contact node
|
||||
ps, err := ec.dialPiecestore(ctx, &pb.Node{
|
||||
downloadCtx, cancel := context.WithTimeout(ctx, ec.downloadTimeout)
|
||||
defer cancel()
|
||||
|
||||
ps, err := ec.dialPiecestore(downloadCtx, &pb.Node{
|
||||
Id: limit.GetLimit().StorageNodeId,
|
||||
Address: limit.GetStorageNodeAddress(),
|
||||
})
|
||||
@ -171,7 +176,7 @@ func (ec *ECRepairer) downloadAndVerifyPiece(ctx context.Context, limit *pb.Addr
|
||||
}
|
||||
defer func() { err = errs.Combine(err, ps.Close()) }()
|
||||
|
||||
downloader, err := ps.Download(ctx, limit.GetLimit(), privateKey, 0, pieceSize)
|
||||
downloader, err := ps.Download(downloadCtx, limit.GetLimit(), privateKey, 0, pieceSize)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -29,6 +29,7 @@ type Config struct {
|
||||
MaxRepair int `help:"maximum segments that can be repaired concurrently" releaseDefault:"5" devDefault:"1"`
|
||||
Interval time.Duration `help:"how frequently repairer should try and repair more data" releaseDefault:"1h" devDefault:"0h5m0s"`
|
||||
Timeout time.Duration `help:"time limit for uploading repaired pieces to new storage nodes" default:"10m0s"`
|
||||
DownloadTimeout time.Duration `help:"time limit for downloading pieces from a node for repair" default:"5m0s"`
|
||||
MaxBufferMem memory.Size `help:"maximum buffer memory (in bytes) to be allocated for read buffers" default:"4M"`
|
||||
MaxExcessRateOptimalThreshold float64 `help:"ratio applied to the optimal threshold to calculate the excess of the maximum number of repaired pieces to upload" default:"0.05"`
|
||||
}
|
||||
|
@ -51,7 +51,9 @@ type SegmentRepairer struct {
|
||||
func NewSegmentRepairer(
|
||||
log *zap.Logger, metainfo *metainfo.Service, orders *orders.Service,
|
||||
overlay *overlay.Service, dialer rpc.Dialer, timeout time.Duration,
|
||||
excessOptimalThreshold float64, repairOverride int, satelliteSignee signing.Signee,
|
||||
excessOptimalThreshold float64, repairOverride int,
|
||||
downloadTimeout time.Duration,
|
||||
satelliteSignee signing.Signee,
|
||||
) *SegmentRepairer {
|
||||
|
||||
if excessOptimalThreshold < 0 {
|
||||
@ -63,7 +65,7 @@ func NewSegmentRepairer(
|
||||
metainfo: metainfo,
|
||||
orders: orders,
|
||||
overlay: overlay,
|
||||
ec: NewECRepairer(log.Named("ec repairer"), dialer, satelliteSignee),
|
||||
ec: NewECRepairer(log.Named("ec repairer"), dialer, satelliteSignee, downloadTimeout),
|
||||
timeout: timeout,
|
||||
multiplierOptimalThreshold: 1 + excessOptimalThreshold,
|
||||
repairOverride: repairOverride,
|
||||
|
@ -108,6 +108,7 @@ func NewRepairer(log *zap.Logger, full *identity.FullIdentity, pointerDB metainf
|
||||
config.Repairer.Timeout,
|
||||
config.Repairer.MaxExcessRateOptimalThreshold,
|
||||
config.Checker.RepairOverride,
|
||||
config.Repairer.DownloadTimeout,
|
||||
signing.SigneeFromPeerIdentity(peer.Identity.PeerIdentity()),
|
||||
)
|
||||
peer.Repairer = repairer.NewService(log.Named("repairer"), repairQueue, &config.Repairer, peer.SegmentRepairer)
|
||||
|
3
scripts/testdata/satellite-config.yaml.lock
vendored
3
scripts/testdata/satellite-config.yaml.lock
vendored
@ -328,6 +328,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
|
||||
# number of update requests to process per transaction
|
||||
# overlay.update-stats-batch-size: 100
|
||||
|
||||
# time limit for downloading pieces from a node for repair
|
||||
# repairer.download-timeout: 5m0s
|
||||
|
||||
# how frequently repairer should try and repair more data
|
||||
# repairer.interval: 1h0m0s
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user