satellite/repair: fix repair concurrency
This new repair timeout (configured as TotalTimeout) will include both the time to download pieces and the time to upload pieces, as well as the time to pop the segment from the repair queue. This is a move from Github PR #3645. Change-Id: I47d618f57285845d8473fcd285f7d9be9b4318c8
This commit is contained in:
parent
f22bddf122
commit
92d86fa044
@ -363,6 +363,7 @@ func (planet *Planet) newSatellites(count int) ([]*SatelliteSystem, error) {
|
||||
Interval: defaultInterval,
|
||||
Timeout: 1 * time.Minute, // Repairs can take up to 10 seconds. Leaving room for outliers
|
||||
DownloadTimeout: 1 * time.Minute,
|
||||
TotalTimeout: 10 * time.Minute,
|
||||
MaxBufferMem: 4 * memory.MiB,
|
||||
MaxExcessRateOptimalThreshold: 0.05,
|
||||
},
|
||||
|
@ -129,7 +129,7 @@ func TestDataRepair(t *testing.T) {
|
||||
satellite.Repair.Repairer.Loop.Restart()
|
||||
satellite.Repair.Repairer.Loop.TriggerWait()
|
||||
satellite.Repair.Repairer.Loop.Pause()
|
||||
satellite.Repair.Repairer.Limiter.Wait()
|
||||
satellite.Repair.Repairer.WaitForPendingRepairs()
|
||||
|
||||
// repaired segment should not contain any piece in the killed and DQ nodes
|
||||
metainfoService := satellite.Metainfo.Service
|
||||
@ -253,7 +253,7 @@ func TestCorruptDataRepair_Failed(t *testing.T) {
|
||||
satellite.Repair.Repairer.Loop.Restart()
|
||||
satellite.Repair.Repairer.Loop.TriggerWait()
|
||||
satellite.Repair.Repairer.Loop.Pause()
|
||||
satellite.Repair.Repairer.Limiter.Wait()
|
||||
satellite.Repair.Repairer.WaitForPendingRepairs()
|
||||
|
||||
// repair should update audit status as fail
|
||||
node, err = overlay.Get(ctx, corruptedNodeID)
|
||||
@ -371,7 +371,7 @@ func TestCorruptDataRepair_Succeed(t *testing.T) {
|
||||
satellite.Repair.Repairer.Loop.Restart()
|
||||
satellite.Repair.Repairer.Loop.TriggerWait()
|
||||
satellite.Repair.Repairer.Loop.Pause()
|
||||
satellite.Repair.Repairer.Limiter.Wait()
|
||||
satellite.Repair.Repairer.WaitForPendingRepairs()
|
||||
|
||||
// repair should update audit status as fail
|
||||
node, err = overlay.Get(ctx, corruptedNodeID)
|
||||
@ -463,7 +463,7 @@ func TestRemoveDeletedSegmentFromQueue(t *testing.T) {
|
||||
satellite.Repair.Repairer.Loop.Restart()
|
||||
satellite.Repair.Repairer.Loop.TriggerWait()
|
||||
satellite.Repair.Repairer.Loop.Pause()
|
||||
satellite.Repair.Repairer.Limiter.Wait()
|
||||
satellite.Repair.Repairer.WaitForPendingRepairs()
|
||||
|
||||
// Verify that the segment was removed
|
||||
count, err = satellite.DB.RepairQueue().Count(ctx)
|
||||
@ -546,7 +546,7 @@ func TestRemoveIrreparableSegmentFromQueue(t *testing.T) {
|
||||
satellite.Repair.Repairer.Loop.Restart()
|
||||
satellite.Repair.Repairer.Loop.TriggerWait()
|
||||
satellite.Repair.Repairer.Loop.Pause()
|
||||
satellite.Repair.Repairer.Limiter.Wait()
|
||||
satellite.Repair.Repairer.WaitForPendingRepairs()
|
||||
|
||||
// Verify that the segment was removed
|
||||
count, err = satellite.DB.RepairQueue().Count(ctx)
|
||||
@ -634,7 +634,7 @@ func TestRepairMultipleDisqualified(t *testing.T) {
|
||||
|
||||
satellite.Repair.Checker.Loop.TriggerWait()
|
||||
satellite.Repair.Repairer.Loop.TriggerWait()
|
||||
satellite.Repair.Repairer.Limiter.Wait()
|
||||
satellite.Repair.Repairer.WaitForPendingRepairs()
|
||||
|
||||
// kill nodes kept alive to ensure repair worked
|
||||
for _, node := range planet.StorageNodes {
|
||||
@ -729,7 +729,7 @@ func TestDataRepairOverride_HigherLimit(t *testing.T) {
|
||||
satellite.Repair.Repairer.Loop.Restart()
|
||||
satellite.Repair.Repairer.Loop.TriggerWait()
|
||||
satellite.Repair.Repairer.Loop.Pause()
|
||||
satellite.Repair.Repairer.Limiter.Wait()
|
||||
satellite.Repair.Repairer.WaitForPendingRepairs()
|
||||
|
||||
// repair should have been done, due to the override
|
||||
metainfoService := satellite.Metainfo.Service
|
||||
@ -815,7 +815,7 @@ func TestDataRepairOverride_LowerLimit(t *testing.T) {
|
||||
satellite.Repair.Repairer.Loop.Restart()
|
||||
satellite.Repair.Repairer.Loop.TriggerWait()
|
||||
satellite.Repair.Repairer.Loop.Pause()
|
||||
satellite.Repair.Repairer.Limiter.Wait()
|
||||
satellite.Repair.Repairer.WaitForPendingRepairs()
|
||||
|
||||
// Increase offline count by the difference to trigger repair
|
||||
toKill += repairThreshold - repairOverride
|
||||
@ -844,7 +844,7 @@ func TestDataRepairOverride_LowerLimit(t *testing.T) {
|
||||
satellite.Repair.Repairer.Loop.Restart()
|
||||
satellite.Repair.Repairer.Loop.TriggerWait()
|
||||
satellite.Repair.Repairer.Loop.Pause()
|
||||
satellite.Repair.Repairer.Limiter.Wait()
|
||||
satellite.Repair.Repairer.WaitForPendingRepairs()
|
||||
|
||||
// repair should have been done, due to the override
|
||||
metainfoService := satellite.Metainfo.Service
|
||||
@ -950,7 +950,7 @@ func TestDataRepairUploadLimit(t *testing.T) {
|
||||
satellite.Repair.Repairer.Loop.Restart()
|
||||
satellite.Repair.Repairer.Loop.TriggerWait()
|
||||
satellite.Repair.Repairer.Loop.Pause()
|
||||
satellite.Repair.Repairer.Limiter.Wait()
|
||||
satellite.Repair.Repairer.WaitForPendingRepairs()
|
||||
|
||||
// Get the pointer after repair to check the nodes where the pieces are
|
||||
// stored
|
||||
|
@ -10,6 +10,7 @@ import (
|
||||
"github.com/spacemonkeygo/monkit/v3"
|
||||
"github.com/zeebo/errs"
|
||||
"go.uber.org/zap"
|
||||
"golang.org/x/sync/semaphore"
|
||||
|
||||
"storj.io/common/memory"
|
||||
"storj.io/common/pb"
|
||||
@ -30,6 +31,7 @@ type Config struct {
|
||||
Interval time.Duration `help:"how frequently repairer should try and repair more data" releaseDefault:"5m0s" devDefault:"1m0s"`
|
||||
Timeout time.Duration `help:"time limit for uploading repaired pieces to new storage nodes" default:"5m0s"`
|
||||
DownloadTimeout time.Duration `help:"time limit for downloading pieces from a node for repair" default:"5m0s"`
|
||||
TotalTimeout time.Duration `help:"time limit for an entire repair job, from queue pop to upload completion" default:"45m"`
|
||||
MaxBufferMem memory.Size `help:"maximum buffer memory (in bytes) to be allocated for read buffers" default:"4M"`
|
||||
MaxExcessRateOptimalThreshold float64 `help:"ratio applied to the optimal threshold to calculate the excess of the maximum number of repaired pieces to upload" default:"0.05"`
|
||||
}
|
||||
@ -38,65 +40,109 @@ type Config struct {
|
||||
//
|
||||
// architecture: Worker
|
||||
type Service struct {
|
||||
log *zap.Logger
|
||||
queue queue.RepairQueue
|
||||
config *Config
|
||||
Limiter *sync2.Limiter
|
||||
Loop *sync2.Cycle
|
||||
repairer *SegmentRepairer
|
||||
log *zap.Logger
|
||||
queue queue.RepairQueue
|
||||
config *Config
|
||||
JobLimiter *semaphore.Weighted
|
||||
Loop *sync2.Cycle
|
||||
repairer *SegmentRepairer
|
||||
}
|
||||
|
||||
// NewService creates repairing service
|
||||
func NewService(log *zap.Logger, queue queue.RepairQueue, config *Config, repairer *SegmentRepairer) *Service {
|
||||
return &Service{
|
||||
log: log,
|
||||
queue: queue,
|
||||
config: config,
|
||||
Limiter: sync2.NewLimiter(config.MaxRepair),
|
||||
Loop: sync2.NewCycle(config.Interval),
|
||||
repairer: repairer,
|
||||
log: log,
|
||||
queue: queue,
|
||||
config: config,
|
||||
JobLimiter: semaphore.NewWeighted(int64(config.MaxRepair)),
|
||||
Loop: sync2.NewCycle(config.Interval),
|
||||
repairer: repairer,
|
||||
}
|
||||
}
|
||||
|
||||
// Close closes resources
|
||||
func (service *Service) Close() error { return nil }
|
||||
|
||||
// WaitForPendingRepairs waits for all ongoing repairs to complete.
|
||||
//
|
||||
// NB: this assumes that service.config.MaxRepair will never be changed once this Service instance
|
||||
// is initialized. If that is not a valid assumption, we should keep a copy of its initial value to
|
||||
// use here instead.
|
||||
func (service *Service) WaitForPendingRepairs() {
|
||||
// Acquire and then release the entire capacity of the semaphore, ensuring that
|
||||
// it is completely empty (or, at least it was empty at some point).
|
||||
//
|
||||
// No error return is possible here; context.Background() can't be canceled
|
||||
_ = service.JobLimiter.Acquire(context.Background(), int64(service.config.MaxRepair))
|
||||
service.JobLimiter.Release(int64(service.config.MaxRepair))
|
||||
}
|
||||
|
||||
// Run runs the repairer service
|
||||
func (service *Service) Run(ctx context.Context) (err error) {
|
||||
defer mon.Task()(&ctx)(&err)
|
||||
|
||||
// wait for all repairs to complete
|
||||
defer service.Limiter.Wait()
|
||||
// Wait for all repairs to complete
|
||||
defer service.WaitForPendingRepairs()
|
||||
|
||||
return service.Loop.Run(ctx, func(ctx context.Context) error {
|
||||
return service.Loop.Run(ctx, service.processWhileQueueHasItems)
|
||||
}
|
||||
|
||||
// processWhileQueueHasItems keeps calling process() until the queue is empty or something
|
||||
// else goes wrong in fetching from the queue.
|
||||
func (service *Service) processWhileQueueHasItems(ctx context.Context) error {
|
||||
for {
|
||||
err := service.process(ctx)
|
||||
if err != nil {
|
||||
if storage.ErrEmptyQueue.Has(err) {
|
||||
return nil
|
||||
}
|
||||
service.log.Error("process", zap.Error(Error.Wrap(err)))
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// process picks items from repair queue and spawns a repair worker
|
||||
func (service *Service) process(ctx context.Context) (err error) {
|
||||
defer mon.Task()(&ctx)(&err)
|
||||
for {
|
||||
seg, err := service.queue.Select(ctx)
|
||||
if err != nil {
|
||||
if storage.ErrEmptyQueue.Has(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
service.log.Info("Retrieved segment from repair queue", zap.Binary("Segment", seg.GetPath()))
|
||||
|
||||
service.Limiter.Go(ctx, func() {
|
||||
err := service.worker(ctx, seg)
|
||||
if err != nil {
|
||||
service.log.Error("repair worker failed:", zap.Binary("Segment", seg.GetPath()), zap.Error(err))
|
||||
}
|
||||
})
|
||||
// wait until we are allowed to spawn a new job
|
||||
if err := service.JobLimiter.Acquire(ctx, 1); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// IMPORTANT: this timeout must be started before service.queue.Select(), in case
|
||||
// service.queue.Select() takes some non-negligible amount of time, so that we can depend on
|
||||
// repair jobs being given up within some set interval after the time in the 'attempted'
|
||||
// column in the queue table.
|
||||
//
|
||||
// This is the reason why we are using a semaphore in this somewhat awkward way instead of
|
||||
// using a simpler sync2.Limiter pattern. We don't want this timeout to include the waiting
|
||||
// time from the semaphore acquisition, but it _must_ include the queue fetch time. At the
|
||||
// same time, we don't want to do the queue pop in a separate goroutine, because we want to
|
||||
// return from service.Run when queue fetch fails.
|
||||
ctx, cancel := context.WithTimeout(ctx, service.config.TotalTimeout)
|
||||
|
||||
seg, err := service.queue.Select(ctx)
|
||||
if err != nil {
|
||||
service.JobLimiter.Release(1)
|
||||
cancel()
|
||||
return err
|
||||
}
|
||||
service.log.Info("Retrieved segment from repair queue", zap.Binary("Segment", seg.GetPath()))
|
||||
|
||||
// this goroutine inherits the JobLimiter semaphore acquisition and is now responsible
|
||||
// for releasing it.
|
||||
go func() {
|
||||
defer service.JobLimiter.Release(1)
|
||||
defer cancel()
|
||||
|
||||
if err := service.worker(ctx, seg); err != nil {
|
||||
service.log.Error("repair worker failed:", zap.Binary("Segment", seg.GetPath()), zap.Error(err))
|
||||
}
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (service *Service) worker(ctx context.Context, seg *pb.InjuredSegment) (err error) {
|
||||
|
3
scripts/testdata/satellite-config.yaml.lock
vendored
3
scripts/testdata/satellite-config.yaml.lock
vendored
@ -444,6 +444,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
|
||||
# time limit for uploading repaired pieces to new storage nodes
|
||||
# repairer.timeout: 5m0s
|
||||
|
||||
# time limit for an entire repair job, from queue pop to upload completion
|
||||
# repairer.total-timeout: 45m0s
|
||||
|
||||
# how often to flush the reported serial rollups to the database
|
||||
# reported-rollup.interval: 5m0s
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user