satellite/repair/checker: buffer repair queue

Integrate previous changes. Speed up the segment loop by batch inserting
into the repair queue.

Change-Id: Ib9f4962d91960d21bad298f7771345b0dd270276
This commit is contained in:
Erik van Velzen 2022-04-30 00:33:08 +02:00
parent 9b49ffe48a
commit db1cc8ca95
4 changed files with 46 additions and 25 deletions

View File

@ -35,15 +35,16 @@ var (
//
// architecture: Chore
type Checker struct {
logger *zap.Logger
repairQueue queue.RepairQueue
metabase *metabase.DB
segmentLoop *segmentloop.Service
nodestate *ReliabilityCache
statsCollector *statsCollector
repairOverrides RepairOverridesMap
nodeFailureRate float64
Loop *sync2.Cycle
logger *zap.Logger
repairQueue queue.RepairQueue
metabase *metabase.DB
segmentLoop *segmentloop.Service
nodestate *ReliabilityCache
statsCollector *statsCollector
repairOverrides RepairOverridesMap
nodeFailureRate float64
repairQueueBatchSize int
Loop *sync2.Cycle
}
// NewChecker creates a new instance of checker.
@ -51,13 +52,14 @@ func NewChecker(logger *zap.Logger, repairQueue queue.RepairQueue, metabase *met
return &Checker{
logger: logger,
repairQueue: repairQueue,
metabase: metabase,
segmentLoop: segmentLoop,
nodestate: NewReliabilityCache(overlay, config.ReliabilityCacheStaleness),
statsCollector: newStatsCollector(),
repairOverrides: config.RepairOverrides.GetMap(),
nodeFailureRate: config.NodeFailureRate,
repairQueue: repairQueue,
metabase: metabase,
segmentLoop: segmentLoop,
nodestate: NewReliabilityCache(overlay, config.ReliabilityCacheStaleness),
statsCollector: newStatsCollector(),
repairOverrides: config.RepairOverrides.GetMap(),
nodeFailureRate: config.NodeFailureRate,
repairQueueBatchSize: config.RepairQueueInsertBatchSize,
Loop: sync2.NewCycle(config.Interval),
}
@ -92,6 +94,10 @@ func (checker *Checker) getNodesEstimate(ctx context.Context) (int, error) {
return totalNumNodes, nil
}
// createInsertBuffer builds a fresh in-memory buffer around the checker's
// repair queue, flushing batches of the configured size.
func (checker *Checker) createInsertBuffer() *queue.InsertBuffer {
	batchSize := checker.repairQueueBatchSize
	return queue.NewInsertBuffer(checker.repairQueue, batchSize)
}
// RefreshReliabilityCache forces refreshing node online status cache.
func (checker *Checker) RefreshReliabilityCache(ctx context.Context) error {
return checker.nodestate.Refresh(ctx)
@ -110,7 +116,7 @@ func (checker *Checker) IdentifyInjuredSegments(ctx context.Context) (err error)
startTime := time.Now()
observer := &checkerObserver{
repairQueue: checker.repairQueue,
repairQueue: checker.createInsertBuffer(),
nodestate: checker.nodestate,
statsCollector: checker.statsCollector,
monStats: aggregateStats{},
@ -127,6 +133,11 @@ func (checker *Checker) IdentifyInjuredSegments(ctx context.Context) (err error)
return nil
}
err = observer.repairQueue.Flush(ctx)
if err != nil {
return Error.Wrap(err)
}
// remove all segments which were not seen as unhealthy by this checker iteration
healthyDeleted, err := checker.repairQueue.Clean(ctx, startTime)
if err != nil {
@ -163,7 +174,7 @@ var _ segmentloop.Observer = (*checkerObserver)(nil)
//
// architecture: Observer
type checkerObserver struct {
repairQueue queue.RepairQueue
repairQueue *queue.InsertBuffer
nodestate *ReliabilityCache
statsCollector *statsCollector
monStats aggregateStats // TODO(cam): once we verify statsCollector reports data correctly, remove this
@ -282,22 +293,22 @@ func (obs *checkerObserver) RemoteSegment(ctx context.Context, segment *segmentl
stats.injuredSegmentHealth.Observe(segmentHealth)
obs.monStats.remoteSegmentsNeedingRepair++
stats.iterationAggregates.remoteSegmentsNeedingRepair++
alreadyInserted, err := obs.repairQueue.Insert(ctx, &queue.InjuredSegment{
err := obs.repairQueue.Insert(ctx, &queue.InjuredSegment{
StreamID: segment.StreamID,
Position: segment.Position,
UpdatedAt: time.Now().UTC(),
SegmentHealth: segmentHealth,
}, func() {
// Counters are increased after the queue has determined
// that the segment wasn't already queued for repair.
obs.monStats.newRemoteSegmentsNeedingRepair++
stats.iterationAggregates.newRemoteSegmentsNeedingRepair++
})
if err != nil {
obs.log.Error("error adding injured segment to queue", zap.Error(err))
return nil
}
if !alreadyInserted {
obs.monStats.newRemoteSegmentsNeedingRepair++
stats.iterationAggregates.newRemoteSegmentsNeedingRepair++
}
// monitor irreparable segments
if numHealthy < required {
if !containsStreamID(obs.monStats.objectsLost, segment.StreamID) {

View File

@ -21,7 +21,8 @@ type Config struct {
RepairOverrides RepairOverrides `help:"comma-separated override values for repair threshold in the format k/o/n-override (min/optimal/total-override)" releaseDefault:"29/80/110-52,29/80/95-52,29/80/130-52" devDefault:""`
// Node failure rate is an estimation based on a 6 hour checker run interval (4 checker iterations per day), a network of about 9200 nodes, and about 2 nodes churning per day.
// This results in `2/9200/4 = 0.00005435` being the probability of any single node going down in the interval of one checker iteration.
NodeFailureRate float64 `help:"the probability of a single node going down within the next checker iteration" default:"0.00005435" `
NodeFailureRate float64 `help:"the probability of a single node going down within the next checker iteration" default:"0.00005435" `
RepairQueueInsertBatchSize int `help:"Number of damaged segments to buffer in-memory before flushing to the repair queue" default:"100" `
}
// RepairOverride is a configuration struct that contains an override repair

View File

@ -5,8 +5,12 @@ package queue
import (
"context"
"github.com/spacemonkeygo/monkit/v3"
)
var mon = monkit.Package()
// InsertBuffer exposes a synchronous API to buffer a batch of segments
// and insert them at once. Not threadsafe. Call Flush() before discarding.
type InsertBuffer struct {
@ -56,6 +60,8 @@ func (r *InsertBuffer) Insert(
// Flush inserts the remaining segments into the database.
func (r *InsertBuffer) Flush(ctx context.Context) (err error) {
defer mon.Task()(&ctx)(&err)
newlyInsertedSegments, err := r.queue.InsertBatch(ctx, r.batch)
if err != nil {
return err

View File

@ -112,6 +112,9 @@
# comma-separated override values for repair threshold in the format k/o/n-override (min/optimal/total-override)
# checker.repair-overrides: 29/80/110-52,29/80/95-52,29/80/130-52
# Number of damaged segments to buffer in-memory before flushing to the repair queue
# checker.repair-queue-insert-batch-size: 100
# percent of held amount disposed to node after leaving withheld
compensation.dispose-percent: 50