satellite/repair: add flag for de-clumping behavior

It seems that the "what pieces are clumped" detection code does not work
correctly, and this logic is causing repair overload or other repair failures.

Hide it behind a flag while we figure out what is going on, so that
repair can still work in the meantime.

Change-Id: If83ef7895cba870353a67ab13573193d92fff80b
paul cannon 2023-05-18 13:47:23 -05:00
parent c0e7f463fe
commit de737bdee9
6 changed files with 66 additions and 28 deletions
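
For reference, the de-clumping check that this flag gates treats pieces whose storage nodes share a last_net as clumped: only the first piece seen on each network counts as healthy, and the rest become candidates for repair. A minimal, self-contained sketch of that idea in Go (Piece and findClumped are illustrative stand-ins, not the repair package's actual types or API):

package main

import "fmt"

// Piece is a hypothetical stand-in for a segment piece held by one storage node.
type Piece struct {
	Number      uint16
	StorageNode string
}

// findClumped returns every piece after the first one seen on each last_net;
// lastNets[i] is the network of the node holding pieces[i].
func findClumped(pieces []Piece, lastNets []string) []Piece {
	seen := make(map[string]bool, len(lastNets))
	var clumped []Piece
	for i, p := range pieces {
		net := lastNets[i]
		if net == "" {
			continue // unknown network: do not treat as clumped
		}
		if seen[net] {
			clumped = append(clumped, p)
			continue
		}
		seen[net] = true
	}
	return clumped
}

func main() {
	pieces := []Piece{{Number: 0, StorageNode: "n1"}, {Number: 1, StorageNode: "n2"},
		{Number: 2, StorageNode: "n3"}, {Number: 3, StorageNode: "n4"}}
	lastNets := []string{"10.1.2", "10.1.2", "10.9.9", "10.1.2"}
	fmt.Println(findClumped(pieces, lastNets)) // [{1 n2} {3 n4}]: pieces 1 and 3 share 10.1.2 with piece 0
}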

View File

@@ -23,6 +23,7 @@ type Config struct {
 	// This results in `2/9200/4 = 0.00005435` being the probability of any single node going down in the interval of one checker iteration.
 	NodeFailureRate float64 `help:"the probability of a single node going down within the next checker iteration" default:"0.00005435" `
 	RepairQueueInsertBatchSize int `help:"Number of damaged segments to buffer in-memory before flushing to the repair queue" default:"100" `
+	DoDeclumping bool `help:"Treat pieces on the same network as in need of repair" default:"false"`
 }
 
 // RepairOverride is a configuration struct that contains an override repair

View File

@@ -39,6 +39,7 @@ type Observer struct {
 	repairOverrides RepairOverridesMap
 	nodeFailureRate float64
 	repairQueueBatchSize int
+	doDeclumping bool
 
 	// the following are reset on each iteration
 	startTime time.Time
@@ -59,6 +60,7 @@ func NewObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *ove
 		repairOverrides: config.RepairOverrides.GetMap(),
 		nodeFailureRate: config.NodeFailureRate,
 		repairQueueBatchSize: config.RepairQueueInsertBatchSize,
+		doDeclumping: config.DoDeclumping,
 		statsCollector: make(map[string]*observerRSStats),
 	}
 }
@@ -226,6 +228,7 @@ type observerFork struct {
 	nodeFailureRate float64
 	getNodesEstimate func(ctx context.Context) (int, error)
 	log *zap.Logger
+	doDeclumping bool
 
 	lastStreamID uuid.UUID
 	totalStats aggregateStats
@@ -244,6 +247,7 @@ func newObserverFork(observer *Observer) rangedloop.Partial {
 		nodeFailureRate: observer.nodeFailureRate,
 		getNodesEstimate: observer.getNodesEstimate,
 		log: observer.logger,
+		doDeclumping: observer.doDeclumping,
 		getObserverStats: observer.getObserverStats,
 	}
 }
@@ -332,19 +336,23 @@ func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segme
 		return Error.New("error getting missing pieces: %w", err)
 	}
 
+	var clumpedPieces metabase.Pieces
+	var lastNets []string
+	if fork.doDeclumping {
 		// if multiple pieces are on the same last_net, keep only the first one. The rest are
 		// to be considered retrievable but unhealthy.
 		nodeIDs := make([]storj.NodeID, len(pieces))
 		for i, p := range pieces {
 			nodeIDs[i] = p.StorageNode
 		}
-		lastNets, err := fork.overlayService.GetNodesNetworkInOrder(ctx, nodeIDs)
+		lastNets, err = fork.overlayService.GetNodesNetworkInOrder(ctx, nodeIDs)
 		if err != nil {
 			fork.totalStats.remoteSegmentsFailedToCheck++
 			stats.iterationAggregates.remoteSegmentsFailedToCheck++
 			return errs.Combine(Error.New("error determining node last_nets"), err)
 		}
-		clumpedPieces := repair.FindClumpedPieces(segment.Pieces, lastNets)
+		clumpedPieces = repair.FindClumpedPieces(segment.Pieces, lastNets)
+	}
 
 	numHealthy := len(pieces) - len(missingPieces) - len(clumpedPieces)
 	mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces))) //mon:locked

View File

@@ -3227,7 +3227,13 @@ func TestRepairClumpedPieces(t *testing.T) {
 		StorageNodeCount: 6,
 		UplinkCount: 1,
 		Reconfigure: testplanet.Reconfigure{
-			Satellite: testplanet.ReconfigureRS(2, 3, 4, 4),
+			Satellite: testplanet.Combine(
+				testplanet.ReconfigureRS(2, 3, 4, 4),
+				func(log *zap.Logger, index int, config *satellite.Config) {
+					config.Checker.DoDeclumping = true
+					config.Repairer.DoDeclumping = true
+				},
+			),
 		},
 	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
 		uplinkPeer := planet.Uplinks[0]
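
testplanet.Combine stacks the RS reconfiguration with a function that turns both new flags on, so this test keeps exercising the gated de-clumping path even though the flags default to false.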

View File

@@ -35,6 +35,7 @@ type Config struct {
 	InMemoryRepair bool `help:"whether to download pieces for repair in memory (true) or download to disk (false)" default:"false"`
 	ReputationUpdateEnabled bool `help:"whether the audit score of nodes should be updated as a part of repair" default:"false"`
 	UseRangedLoop bool `help:"whether to enable repair checker observer with ranged loop" default:"true"`
+	DoDeclumping bool `help:"repair pieces on the same network to other nodes" default:"false"`
 }
 
 // Service contains the information needed to run the repair service.

View File

@@ -86,6 +86,7 @@ type SegmentRepairer struct {
 	reporter audit.Reporter
 	reputationUpdateEnabled bool
+	doDeclumping bool
 
 	// multiplierOptimalThreshold is the value that multiplied by the optimal
 	// threshold results in the maximum limit of number of nodes to upload
@@ -133,6 +134,7 @@ func NewSegmentRepairer(
 		repairOverrides: repairOverrides.GetMap(),
 		reporter: reporter,
 		reputationUpdateEnabled: config.ReputationUpdateEnabled,
+		doDeclumping: config.DoDeclumping,
 		nowFn: time.Now,
 	}
@@ -196,6 +198,9 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 		return false, overlayQueryError.New("error identifying missing pieces: %w", err)
 	}
 
+	var clumpedPieces metabase.Pieces
+	var clumpedPiecesSet map[uint16]bool
+	if repairer.doDeclumping {
 		// if multiple pieces are on the same last_net, keep only the first one. The rest are
 		// to be considered retrievable but unhealthy.
 		nodeIDs := make([]storj.NodeID, len(pieces))
@@ -206,12 +211,14 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 		if err != nil {
 			return false, metainfoGetError.Wrap(err)
 		}
-		clumpedPieces := repair.FindClumpedPieces(segment.Pieces, lastNets)
-		clumpedPiecesSet := make(map[uint16]bool)
+		clumpedPieces = repair.FindClumpedPieces(segment.Pieces, lastNets)
+		clumpedPiecesSet = make(map[uint16]bool)
 		for _, clumpedPiece := range clumpedPieces {
 			clumpedPiecesSet[clumpedPiece.Number] = true
 		}
+	}
+
 	numRetrievable := len(pieces) - len(missingPieces)
 	numHealthy := len(pieces) - len(missingPieces) - len(clumpedPieces)
 	// irreparable segment
@@ -258,7 +265,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	if numHealthy-numHealthyInExcludedCountries > int(repairThreshold) {
 		mon.Meter("repair_unnecessary").Mark(1) //mon:locked
 		stats.repairUnnecessary.Mark(1)
-		repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold))
+		repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold), zap.Int("numClumped", len(clumpedPieces)))
 		return true, nil
 	}
@@ -598,6 +605,15 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	mon.IntVal("segment_repair_count").Observe(repairCount) //mon:locked
 	stats.segmentRepairCount.Observe(repairCount)
 
+	repairer.log.Debug("repaired segment",
+		zap.Stringer("Stream ID", segment.StreamID),
+		zap.Uint64("Position", segment.Position.Encode()),
+		zap.Int("clumped pieces", len(clumpedPieces)),
+		zap.Int("in excluded countries", numHealthyInExcludedCountries),
+		zap.Int("removed pieces", len(toRemove)),
+		zap.Int("repaired pieces", len(repairedPieces)),
+		zap.Int("healthy before repair", numHealthy),
+		zap.Int("healthy after repair", healthyAfterRepair))
 	return true, nil
 }
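
Alongside the clumped list (used for counts and logging above), the repairer builds clumpedPiecesSet keyed by piece number, so later classification of pieces can presumably exclude clumped pieces from the healthy set with a map lookup. A hypothetical, self-contained sketch of that lookup pattern (healthyPieceNumbers is not a function in this repo, just an illustration):

package main

import "fmt"

// healthyPieceNumbers shows how a map keyed by piece number can be used to
// exclude clumped (and missing) pieces from the healthy set.
func healthyPieceNumbers(all, missing []uint16, clumped map[uint16]bool) []uint16 {
	missingSet := make(map[uint16]bool, len(missing))
	for _, n := range missing {
		missingSet[n] = true
	}
	var healthy []uint16
	for _, n := range all {
		if missingSet[n] || clumped[n] {
			continue // lost, or retrievable but on a shared last_net
		}
		healthy = append(healthy, n)
	}
	return healthy
}

func main() {
	all := []uint16{0, 1, 2, 3, 4}
	missing := []uint16{2}
	clumped := map[uint16]bool{3: true}
	fmt.Println(healthyPieceNumbers(all, missing, clumped)) // [0 1 4]
}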

View File

@@ -88,6 +88,9 @@
 # number of workers to run audits on segments
 # audit.worker-concurrency: 2
 
+# Treat pieces on the same network as in need of repair
+# checker.do-declumping: false
+
 # how frequently checker should check for bad segments
 # checker.interval: 30s
@@ -925,6 +928,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
 # ratio where to consider processed count as supicious
 # ranged-loop.suspicious-processed-ratio: 0.03
 
+# repair pieces on the same network to other nodes
+# repairer.do-declumping: false
+
 # time limit for downloading pieces from a node for repair
 # repairer.download-timeout: 5m0s
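
Both defaults stay false, matching the commit's intent of keeping the behavior off until the clump detection is trusted again. Presumably, once it is, an operator would enable it by uncommenting both keys and setting them to true:

checker.do-declumping: true
repairer.do-declumping: true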