satellite/repair: add flag for de-clumping behavior

It seems that the "what pieces are clumped" code does not work correctly,
so this logic is causing repair overload or other repair failures.

Hide it behind a flag while we figure out what is going on, so that
repair can still work in the meantime.

Change-Id: If83ef7895cba870353a67ab13573193d92fff80b
paul cannon 2023-05-18 13:47:23 -05:00
parent c0e7f463fe
commit de737bdee9
6 changed files with 66 additions and 28 deletions
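For orientation: the comments added in the checker and repairer below describe the declumping rule as "if multiple pieces are on the same last_net, keep only the first one; the rest are to be considered retrievable but unhealthy." The following is a minimal, self-contained sketch of that rule using hypothetical types; the real repair.FindClumpedPieces works on metabase.Pieces and storj.NodeID values and may differ in its details.

package main

import "fmt"

// piece stands in for a remote piece; hypothetical type for this sketch only
// (the real code uses metabase.Piece with a storj.NodeID).
type piece struct {
	Number      uint16
	StorageNode string
}

// findClumped keeps the first piece seen on each last_net and returns the
// rest, which the checker and repairer count as retrievable but unhealthy.
func findClumped(pieces []piece, lastNets []string) []piece {
	firstSeen := make(map[string]bool, len(lastNets))
	var clumped []piece
	for i, p := range pieces {
		net := lastNets[i]
		if net == "" {
			continue // unknown network: nothing to clump against
		}
		if firstSeen[net] {
			clumped = append(clumped, p)
			continue
		}
		firstSeen[net] = true
	}
	return clumped
}

func main() {
	pieces := []piece{{0, "nodeA"}, {1, "nodeB"}, {2, "nodeC"}, {3, "nodeD"}}
	lastNets := []string{"10.1.2", "10.1.2", "10.9.9", "10.1.2"}
	// pieces 1 and 3 share last_net 10.1.2 with piece 0, so they are clumped
	fmt.Println(findClumped(pieces, lastNets))
}

With the new flags off, this computation is skipped entirely and the clumped-piece list stays empty, so both the checker's health accounting and the repairer behave exactly as before.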

View File

@ -23,6 +23,7 @@ type Config struct {
// This results in `2/9200/4 = 0.00005435` being the probability of any single node going down in the interval of one checker iteration.
NodeFailureRate float64 `help:"the probability of a single node going down within the next checker iteration" default:"0.00005435" `
RepairQueueInsertBatchSize int `help:"Number of damaged segments to buffer in-memory before flushing to the repair queue" default:"100" `
DoDeclumping bool `help:"Treat pieces on the same network as in need of repair" default:"false"`
}
// RepairOverride is a configuration struct that contains an override repair

View File

@ -39,6 +39,7 @@ type Observer struct {
repairOverrides RepairOverridesMap
nodeFailureRate float64
repairQueueBatchSize int
doDeclumping bool
// the following are reset on each iteration
startTime time.Time
@ -59,6 +60,7 @@ func NewObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *ove
repairOverrides: config.RepairOverrides.GetMap(),
nodeFailureRate: config.NodeFailureRate,
repairQueueBatchSize: config.RepairQueueInsertBatchSize,
doDeclumping: config.DoDeclumping,
statsCollector: make(map[string]*observerRSStats),
}
}
@ -226,6 +228,7 @@ type observerFork struct {
nodeFailureRate float64
getNodesEstimate func(ctx context.Context) (int, error)
log *zap.Logger
doDeclumping bool
lastStreamID uuid.UUID
totalStats aggregateStats
@ -244,6 +247,7 @@ func newObserverFork(observer *Observer) rangedloop.Partial {
nodeFailureRate: observer.nodeFailureRate,
getNodesEstimate: observer.getNodesEstimate,
log: observer.logger,
doDeclumping: observer.doDeclumping,
getObserverStats: observer.getObserverStats,
}
}
@ -332,19 +336,23 @@ func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segme
return Error.New("error getting missing pieces: %w", err)
}
// if multiple pieces are on the same last_net, keep only the first one. The rest are
// to be considered retrievable but unhealthy.
nodeIDs := make([]storj.NodeID, len(pieces))
for i, p := range pieces {
nodeIDs[i] = p.StorageNode
var clumpedPieces metabase.Pieces
var lastNets []string
if fork.doDeclumping {
// if multiple pieces are on the same last_net, keep only the first one. The rest are
// to be considered retrievable but unhealthy.
nodeIDs := make([]storj.NodeID, len(pieces))
for i, p := range pieces {
nodeIDs[i] = p.StorageNode
}
lastNets, err = fork.overlayService.GetNodesNetworkInOrder(ctx, nodeIDs)
if err != nil {
fork.totalStats.remoteSegmentsFailedToCheck++
stats.iterationAggregates.remoteSegmentsFailedToCheck++
return errs.Combine(Error.New("error determining node last_nets"), err)
}
clumpedPieces = repair.FindClumpedPieces(segment.Pieces, lastNets)
}
lastNets, err := fork.overlayService.GetNodesNetworkInOrder(ctx, nodeIDs)
if err != nil {
fork.totalStats.remoteSegmentsFailedToCheck++
stats.iterationAggregates.remoteSegmentsFailedToCheck++
return errs.Combine(Error.New("error determining node last_nets"), err)
}
clumpedPieces := repair.FindClumpedPieces(segment.Pieces, lastNets)
numHealthy := len(pieces) - len(missingPieces) - len(clumpedPieces)
mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces))) //mon:locked
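Concretely, with DoDeclumping enabled the health accounting above subtracts clumped pieces as well: a segment with 8 pieces, 1 missing and 2 clumped yields numHealthy = 8 - 1 - 2 = 5, while with the flag off clumpedPieces stays empty and the same segment counts 8 - 1 = 7 healthy, matching the previous behavior.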

View File

@ -3227,7 +3227,13 @@ func TestRepairClumpedPieces(t *testing.T) {
StorageNodeCount: 6,
UplinkCount: 1,
Reconfigure: testplanet.Reconfigure{
Satellite: testplanet.ReconfigureRS(2, 3, 4, 4),
Satellite: testplanet.Combine(
testplanet.ReconfigureRS(2, 3, 4, 4),
func(log *zap.Logger, index int, config *satellite.Config) {
config.Checker.DoDeclumping = true
config.Repairer.DoDeclumping = true
},
),
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
uplinkPeer := planet.Uplinks[0]

View File

@ -35,6 +35,7 @@ type Config struct {
InMemoryRepair bool `help:"whether to download pieces for repair in memory (true) or download to disk (false)" default:"false"`
ReputationUpdateEnabled bool `help:"whether the audit score of nodes should be updated as a part of repair" default:"false"`
UseRangedLoop bool `help:"whether to enable repair checker observer with ranged loop" default:"true"`
DoDeclumping bool `help:"repair pieces on the same network to other nodes" default:"false"`
}
// Service contains the information needed to run the repair service.

View File

@ -86,6 +86,7 @@ type SegmentRepairer struct {
reporter audit.Reporter
reputationUpdateEnabled bool
doDeclumping bool
// multiplierOptimalThreshold is the value that multiplied by the optimal
// threshold results in the maximum limit of number of nodes to upload
@ -133,6 +134,7 @@ func NewSegmentRepairer(
repairOverrides: repairOverrides.GetMap(),
reporter: reporter,
reputationUpdateEnabled: config.ReputationUpdateEnabled,
doDeclumping: config.DoDeclumping,
nowFn: time.Now,
}
@ -196,20 +198,25 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
return false, overlayQueryError.New("error identifying missing pieces: %w", err)
}
// if multiple pieces are on the same last_net, keep only the first one. The rest are
// to be considered retrievable but unhealthy.
nodeIDs := make([]storj.NodeID, len(pieces))
for i, p := range pieces {
nodeIDs[i] = p.StorageNode
}
lastNets, err := repairer.overlay.GetNodesNetworkInOrder(ctx, nodeIDs)
if err != nil {
return false, metainfoGetError.Wrap(err)
}
clumpedPieces := repair.FindClumpedPieces(segment.Pieces, lastNets)
clumpedPiecesSet := make(map[uint16]bool)
for _, clumpedPiece := range clumpedPieces {
clumpedPiecesSet[clumpedPiece.Number] = true
var clumpedPieces metabase.Pieces
var clumpedPiecesSet map[uint16]bool
if repairer.doDeclumping {
// if multiple pieces are on the same last_net, keep only the first one. The rest are
// to be considered retrievable but unhealthy.
nodeIDs := make([]storj.NodeID, len(pieces))
for i, p := range pieces {
nodeIDs[i] = p.StorageNode
}
lastNets, err := repairer.overlay.GetNodesNetworkInOrder(ctx, nodeIDs)
if err != nil {
return false, metainfoGetError.Wrap(err)
}
clumpedPieces = repair.FindClumpedPieces(segment.Pieces, lastNets)
clumpedPiecesSet = make(map[uint16]bool)
for _, clumpedPiece := range clumpedPieces {
clumpedPiecesSet[clumpedPiece.Number] = true
}
}
numRetrievable := len(pieces) - len(missingPieces)
@ -258,7 +265,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
if numHealthy-numHealthyInExcludedCountries > int(repairThreshold) {
mon.Meter("repair_unnecessary").Mark(1) //mon:locked
stats.repairUnnecessary.Mark(1)
repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold))
repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold), zap.Int("numClumped", len(clumpedPieces)))
return true, nil
}
@ -598,6 +605,15 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
mon.IntVal("segment_repair_count").Observe(repairCount) //mon:locked
stats.segmentRepairCount.Observe(repairCount)
repairer.log.Debug("repaired segment",
zap.Stringer("Stream ID", segment.StreamID),
zap.Uint64("Position", segment.Position.Encode()),
zap.Int("clumped pieces", len(clumpedPieces)),
zap.Int("in excluded countries", numHealthyInExcludedCountries),
zap.Int("removed pieces", len(toRemove)),
zap.Int("repaired pieces", len(repairedPieces)),
zap.Int("healthy before repair", numHealthy),
zap.Int("healthy after repair", healthyAfterRepair))
return true, nil
}
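The repairer mirrors the checker's gating but additionally collects the clumped pieces into clumpedPiecesSet, keyed by piece number; a map[uint16]bool like this gives the rest of Repair (not shown in this excerpt) a constant-time way to ask whether a given piece is clumped. A trivial, self-contained illustration of that lookup pattern, with made-up piece numbers:

package main

import "fmt"

func main() {
	// hypothetical piece numbers that were found to be clumped
	clumpedPiecesSet := map[uint16]bool{3: true, 7: true}
	for _, num := range []uint16{1, 3, 5, 7} {
		if clumpedPiecesSet[num] {
			fmt.Printf("piece %d is clumped (retrievable but unhealthy)\n", num)
		}
	}
}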

View File

@ -88,6 +88,9 @@
# number of workers to run audits on segments
# audit.worker-concurrency: 2
# Treat pieces on the same network as in need of repair
# checker.do-declumping: false
# how frequently checker should check for bad segments
# checker.interval: 30s
@ -925,6 +928,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
# ratio where to consider processed count as suspicious
# ranged-loop.suspicious-processed-ratio: 0.03
# repair pieces on the same network to other nodes
# repairer.do-declumping: false
# time limit for downloading pieces from a node for repair
# repairer.download-timeout: 5m0s
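Both keys ship commented out with their defaults, so declumping stays off until it is explicitly enabled. Enabling it end to end (detection in the checker, moving the clumped pieces in the repairer) means setting both keys in the satellite config, for example:

checker.do-declumping: true
repairer.do-declumping: true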