satellite/repair: add timeout for repair download from a single node (#3418)

This commit is contained in:
Yingrong Zhao 2019-10-30 16:31:08 -04:00 committed by GitHub
parent e96d615013
commit bfa6699e2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 19 additions and 5 deletions

View File

@ -315,6 +315,7 @@ func (planet *Planet) newSatellites(count int) ([]*SatelliteSystem, error) {
MaxRepair: 10, MaxRepair: 10,
Interval: time.Hour, Interval: time.Hour,
Timeout: 1 * time.Minute, // Repairs can take up to 10 seconds. Leaving room for outliers Timeout: 1 * time.Minute, // Repairs can take up to 10 seconds. Leaving room for outliers
DownloadTimeout: 1 * time.Minute,
MaxBufferMem: 4 * memory.MiB, MaxBufferMem: 4 * memory.MiB,
MaxExcessRateOptimalThreshold: 0.05, MaxExcessRateOptimalThreshold: 0.05,
}, },

View File

@ -305,6 +305,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, pointerDB metainfo
config.Repairer.Timeout, config.Repairer.Timeout,
config.Repairer.MaxExcessRateOptimalThreshold, config.Repairer.MaxExcessRateOptimalThreshold,
config.Checker.RepairOverride, config.Checker.RepairOverride,
config.Repairer.DownloadTimeout,
signing.SigneeFromPeerIdentity(peer.Identity.PeerIdentity()), signing.SigneeFromPeerIdentity(peer.Identity.PeerIdentity()),
) )

View File

@ -36,14 +36,16 @@ type ECRepairer struct {
log *zap.Logger log *zap.Logger
dialer rpc.Dialer dialer rpc.Dialer
satelliteSignee signing.Signee satelliteSignee signing.Signee
downloadTimeout time.Duration
} }
// NewECRepairer creates a new repairer for interfacing with storagenodes. // NewECRepairer creates a new repairer for interfacing with storagenodes.
func NewECRepairer(log *zap.Logger, dialer rpc.Dialer, satelliteSignee signing.Signee) *ECRepairer { func NewECRepairer(log *zap.Logger, dialer rpc.Dialer, satelliteSignee signing.Signee, downloadTimeout time.Duration) *ECRepairer {
return &ECRepairer{ return &ECRepairer{
log: log, log: log,
dialer: dialer, dialer: dialer,
satelliteSignee: satelliteSignee, satelliteSignee: satelliteSignee,
downloadTimeout: downloadTimeout,
} }
} }
@ -162,7 +164,10 @@ func (ec *ECRepairer) Get(ctx context.Context, limits []*pb.AddressedOrderLimit,
// and expects the hash of the data to match the signed hash provided by the storagenode. // and expects the hash of the data to match the signed hash provided by the storagenode.
func (ec *ECRepairer) downloadAndVerifyPiece(ctx context.Context, limit *pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, pieceSize int64) (data []byte, err error) { func (ec *ECRepairer) downloadAndVerifyPiece(ctx context.Context, limit *pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, pieceSize int64) (data []byte, err error) {
// contact node // contact node
ps, err := ec.dialPiecestore(ctx, &pb.Node{ downloadCtx, cancel := context.WithTimeout(ctx, ec.downloadTimeout)
defer cancel()
ps, err := ec.dialPiecestore(downloadCtx, &pb.Node{
Id: limit.GetLimit().StorageNodeId, Id: limit.GetLimit().StorageNodeId,
Address: limit.GetStorageNodeAddress(), Address: limit.GetStorageNodeAddress(),
}) })
@ -171,7 +176,7 @@ func (ec *ECRepairer) downloadAndVerifyPiece(ctx context.Context, limit *pb.Addr
} }
defer func() { err = errs.Combine(err, ps.Close()) }() defer func() { err = errs.Combine(err, ps.Close()) }()
downloader, err := ps.Download(ctx, limit.GetLimit(), privateKey, 0, pieceSize) downloader, err := ps.Download(downloadCtx, limit.GetLimit(), privateKey, 0, pieceSize)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -29,6 +29,7 @@ type Config struct {
MaxRepair int `help:"maximum segments that can be repaired concurrently" releaseDefault:"5" devDefault:"1"` MaxRepair int `help:"maximum segments that can be repaired concurrently" releaseDefault:"5" devDefault:"1"`
Interval time.Duration `help:"how frequently repairer should try and repair more data" releaseDefault:"1h" devDefault:"0h5m0s"` Interval time.Duration `help:"how frequently repairer should try and repair more data" releaseDefault:"1h" devDefault:"0h5m0s"`
Timeout time.Duration `help:"time limit for uploading repaired pieces to new storage nodes" default:"10m0s"` Timeout time.Duration `help:"time limit for uploading repaired pieces to new storage nodes" default:"10m0s"`
DownloadTimeout time.Duration `help:"time limit for downloading pieces from a node for repair" default:"5m0s"`
MaxBufferMem memory.Size `help:"maximum buffer memory (in bytes) to be allocated for read buffers" default:"4M"` MaxBufferMem memory.Size `help:"maximum buffer memory (in bytes) to be allocated for read buffers" default:"4M"`
MaxExcessRateOptimalThreshold float64 `help:"ratio applied to the optimal threshold to calculate the excess of the maximum number of repaired pieces to upload" default:"0.05"` MaxExcessRateOptimalThreshold float64 `help:"ratio applied to the optimal threshold to calculate the excess of the maximum number of repaired pieces to upload" default:"0.05"`
} }

View File

@ -51,7 +51,9 @@ type SegmentRepairer struct {
func NewSegmentRepairer( func NewSegmentRepairer(
log *zap.Logger, metainfo *metainfo.Service, orders *orders.Service, log *zap.Logger, metainfo *metainfo.Service, orders *orders.Service,
overlay *overlay.Service, dialer rpc.Dialer, timeout time.Duration, overlay *overlay.Service, dialer rpc.Dialer, timeout time.Duration,
excessOptimalThreshold float64, repairOverride int, satelliteSignee signing.Signee, excessOptimalThreshold float64, repairOverride int,
downloadTimeout time.Duration,
satelliteSignee signing.Signee,
) *SegmentRepairer { ) *SegmentRepairer {
if excessOptimalThreshold < 0 { if excessOptimalThreshold < 0 {
@ -63,7 +65,7 @@ func NewSegmentRepairer(
metainfo: metainfo, metainfo: metainfo,
orders: orders, orders: orders,
overlay: overlay, overlay: overlay,
ec: NewECRepairer(log.Named("ec repairer"), dialer, satelliteSignee), ec: NewECRepairer(log.Named("ec repairer"), dialer, satelliteSignee, downloadTimeout),
timeout: timeout, timeout: timeout,
multiplierOptimalThreshold: 1 + excessOptimalThreshold, multiplierOptimalThreshold: 1 + excessOptimalThreshold,
repairOverride: repairOverride, repairOverride: repairOverride,

View File

@ -108,6 +108,7 @@ func NewRepairer(log *zap.Logger, full *identity.FullIdentity, pointerDB metainf
config.Repairer.Timeout, config.Repairer.Timeout,
config.Repairer.MaxExcessRateOptimalThreshold, config.Repairer.MaxExcessRateOptimalThreshold,
config.Checker.RepairOverride, config.Checker.RepairOverride,
config.Repairer.DownloadTimeout,
signing.SigneeFromPeerIdentity(peer.Identity.PeerIdentity()), signing.SigneeFromPeerIdentity(peer.Identity.PeerIdentity()),
) )
peer.Repairer = repairer.NewService(log.Named("repairer"), repairQueue, &config.Repairer, peer.SegmentRepairer) peer.Repairer = repairer.NewService(log.Named("repairer"), repairQueue, &config.Repairer, peer.SegmentRepairer)

View File

@ -328,6 +328,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
# number of update requests to process per transaction # number of update requests to process per transaction
# overlay.update-stats-batch-size: 100 # overlay.update-stats-batch-size: 100
# time limit for downloading pieces from a node for repair
# repairer.download-timeout: 5m0s
# how frequently repairer should try and repair more data # how frequently repairer should try and repair more data
# repairer.interval: 1h0m0s # repairer.interval: 1h0m0s