satellite/repair: add timeout for repair download from a single node (#3418)
This commit is contained in:
parent
e96d615013
commit
bfa6699e2c
@ -315,6 +315,7 @@ func (planet *Planet) newSatellites(count int) ([]*SatelliteSystem, error) {
|
|||||||
MaxRepair: 10,
|
MaxRepair: 10,
|
||||||
Interval: time.Hour,
|
Interval: time.Hour,
|
||||||
Timeout: 1 * time.Minute, // Repairs can take up to 10 seconds. Leaving room for outliers
|
Timeout: 1 * time.Minute, // Repairs can take up to 10 seconds. Leaving room for outliers
|
||||||
|
DownloadTimeout: 1 * time.Minute,
|
||||||
MaxBufferMem: 4 * memory.MiB,
|
MaxBufferMem: 4 * memory.MiB,
|
||||||
MaxExcessRateOptimalThreshold: 0.05,
|
MaxExcessRateOptimalThreshold: 0.05,
|
||||||
},
|
},
|
||||||
|
@ -305,6 +305,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, pointerDB metainfo
|
|||||||
config.Repairer.Timeout,
|
config.Repairer.Timeout,
|
||||||
config.Repairer.MaxExcessRateOptimalThreshold,
|
config.Repairer.MaxExcessRateOptimalThreshold,
|
||||||
config.Checker.RepairOverride,
|
config.Checker.RepairOverride,
|
||||||
|
config.Repairer.DownloadTimeout,
|
||||||
signing.SigneeFromPeerIdentity(peer.Identity.PeerIdentity()),
|
signing.SigneeFromPeerIdentity(peer.Identity.PeerIdentity()),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -36,14 +36,16 @@ type ECRepairer struct {
|
|||||||
log *zap.Logger
|
log *zap.Logger
|
||||||
dialer rpc.Dialer
|
dialer rpc.Dialer
|
||||||
satelliteSignee signing.Signee
|
satelliteSignee signing.Signee
|
||||||
|
downloadTimeout time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewECRepairer creates a new repairer for interfacing with storagenodes.
|
// NewECRepairer creates a new repairer for interfacing with storagenodes.
|
||||||
func NewECRepairer(log *zap.Logger, dialer rpc.Dialer, satelliteSignee signing.Signee) *ECRepairer {
|
func NewECRepairer(log *zap.Logger, dialer rpc.Dialer, satelliteSignee signing.Signee, downloadTimeout time.Duration) *ECRepairer {
|
||||||
return &ECRepairer{
|
return &ECRepairer{
|
||||||
log: log,
|
log: log,
|
||||||
dialer: dialer,
|
dialer: dialer,
|
||||||
satelliteSignee: satelliteSignee,
|
satelliteSignee: satelliteSignee,
|
||||||
|
downloadTimeout: downloadTimeout,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -162,7 +164,10 @@ func (ec *ECRepairer) Get(ctx context.Context, limits []*pb.AddressedOrderLimit,
|
|||||||
// and expects the hash of the data to match the signed hash provided by the storagenode.
|
// and expects the hash of the data to match the signed hash provided by the storagenode.
|
||||||
func (ec *ECRepairer) downloadAndVerifyPiece(ctx context.Context, limit *pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, pieceSize int64) (data []byte, err error) {
|
func (ec *ECRepairer) downloadAndVerifyPiece(ctx context.Context, limit *pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, pieceSize int64) (data []byte, err error) {
|
||||||
// contact node
|
// contact node
|
||||||
ps, err := ec.dialPiecestore(ctx, &pb.Node{
|
downloadCtx, cancel := context.WithTimeout(ctx, ec.downloadTimeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
ps, err := ec.dialPiecestore(downloadCtx, &pb.Node{
|
||||||
Id: limit.GetLimit().StorageNodeId,
|
Id: limit.GetLimit().StorageNodeId,
|
||||||
Address: limit.GetStorageNodeAddress(),
|
Address: limit.GetStorageNodeAddress(),
|
||||||
})
|
})
|
||||||
@ -171,7 +176,7 @@ func (ec *ECRepairer) downloadAndVerifyPiece(ctx context.Context, limit *pb.Addr
|
|||||||
}
|
}
|
||||||
defer func() { err = errs.Combine(err, ps.Close()) }()
|
defer func() { err = errs.Combine(err, ps.Close()) }()
|
||||||
|
|
||||||
downloader, err := ps.Download(ctx, limit.GetLimit(), privateKey, 0, pieceSize)
|
downloader, err := ps.Download(downloadCtx, limit.GetLimit(), privateKey, 0, pieceSize)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -29,6 +29,7 @@ type Config struct {
|
|||||||
MaxRepair int `help:"maximum segments that can be repaired concurrently" releaseDefault:"5" devDefault:"1"`
|
MaxRepair int `help:"maximum segments that can be repaired concurrently" releaseDefault:"5" devDefault:"1"`
|
||||||
Interval time.Duration `help:"how frequently repairer should try and repair more data" releaseDefault:"1h" devDefault:"0h5m0s"`
|
Interval time.Duration `help:"how frequently repairer should try and repair more data" releaseDefault:"1h" devDefault:"0h5m0s"`
|
||||||
Timeout time.Duration `help:"time limit for uploading repaired pieces to new storage nodes" default:"10m0s"`
|
Timeout time.Duration `help:"time limit for uploading repaired pieces to new storage nodes" default:"10m0s"`
|
||||||
|
DownloadTimeout time.Duration `help:"time limit for downloading pieces from a node for repair" default:"5m0s"`
|
||||||
MaxBufferMem memory.Size `help:"maximum buffer memory (in bytes) to be allocated for read buffers" default:"4M"`
|
MaxBufferMem memory.Size `help:"maximum buffer memory (in bytes) to be allocated for read buffers" default:"4M"`
|
||||||
MaxExcessRateOptimalThreshold float64 `help:"ratio applied to the optimal threshold to calculate the excess of the maximum number of repaired pieces to upload" default:"0.05"`
|
MaxExcessRateOptimalThreshold float64 `help:"ratio applied to the optimal threshold to calculate the excess of the maximum number of repaired pieces to upload" default:"0.05"`
|
||||||
}
|
}
|
||||||
|
@ -51,7 +51,9 @@ type SegmentRepairer struct {
|
|||||||
func NewSegmentRepairer(
|
func NewSegmentRepairer(
|
||||||
log *zap.Logger, metainfo *metainfo.Service, orders *orders.Service,
|
log *zap.Logger, metainfo *metainfo.Service, orders *orders.Service,
|
||||||
overlay *overlay.Service, dialer rpc.Dialer, timeout time.Duration,
|
overlay *overlay.Service, dialer rpc.Dialer, timeout time.Duration,
|
||||||
excessOptimalThreshold float64, repairOverride int, satelliteSignee signing.Signee,
|
excessOptimalThreshold float64, repairOverride int,
|
||||||
|
downloadTimeout time.Duration,
|
||||||
|
satelliteSignee signing.Signee,
|
||||||
) *SegmentRepairer {
|
) *SegmentRepairer {
|
||||||
|
|
||||||
if excessOptimalThreshold < 0 {
|
if excessOptimalThreshold < 0 {
|
||||||
@ -63,7 +65,7 @@ func NewSegmentRepairer(
|
|||||||
metainfo: metainfo,
|
metainfo: metainfo,
|
||||||
orders: orders,
|
orders: orders,
|
||||||
overlay: overlay,
|
overlay: overlay,
|
||||||
ec: NewECRepairer(log.Named("ec repairer"), dialer, satelliteSignee),
|
ec: NewECRepairer(log.Named("ec repairer"), dialer, satelliteSignee, downloadTimeout),
|
||||||
timeout: timeout,
|
timeout: timeout,
|
||||||
multiplierOptimalThreshold: 1 + excessOptimalThreshold,
|
multiplierOptimalThreshold: 1 + excessOptimalThreshold,
|
||||||
repairOverride: repairOverride,
|
repairOverride: repairOverride,
|
||||||
|
@ -108,6 +108,7 @@ func NewRepairer(log *zap.Logger, full *identity.FullIdentity, pointerDB metainf
|
|||||||
config.Repairer.Timeout,
|
config.Repairer.Timeout,
|
||||||
config.Repairer.MaxExcessRateOptimalThreshold,
|
config.Repairer.MaxExcessRateOptimalThreshold,
|
||||||
config.Checker.RepairOverride,
|
config.Checker.RepairOverride,
|
||||||
|
config.Repairer.DownloadTimeout,
|
||||||
signing.SigneeFromPeerIdentity(peer.Identity.PeerIdentity()),
|
signing.SigneeFromPeerIdentity(peer.Identity.PeerIdentity()),
|
||||||
)
|
)
|
||||||
peer.Repairer = repairer.NewService(log.Named("repairer"), repairQueue, &config.Repairer, peer.SegmentRepairer)
|
peer.Repairer = repairer.NewService(log.Named("repairer"), repairQueue, &config.Repairer, peer.SegmentRepairer)
|
||||||
|
3
scripts/testdata/satellite-config.yaml.lock
vendored
3
scripts/testdata/satellite-config.yaml.lock
vendored
@ -328,6 +328,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
|
|||||||
# number of update requests to process per transaction
|
# number of update requests to process per transaction
|
||||||
# overlay.update-stats-batch-size: 100
|
# overlay.update-stats-batch-size: 100
|
||||||
|
|
||||||
|
# time limit for downloading pieces from a node for repair
|
||||||
|
# repairer.download-timeout: 5m0s
|
||||||
|
|
||||||
# how frequently repairer should try and repair more data
|
# how frequently repairer should try and repair more data
|
||||||
# repairer.interval: 1h0m0s
|
# repairer.interval: 1h0m0s
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user