satellite/repair: add flag for de-clumping behavior
It seems that the "what pieces are clumped" code does not work right, so this logic is causing repair overload or other repair failures. Hide it behind a flag while we figure out what is going on, so that repair can still work in the meantime.

Change-Id: If83ef7895cba870353a67ab13573193d92fff80b
parent c0e7f463fe
commit de737bdee9
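Both new config fields added below (Checker.DoDeclumping and Repairer.DoDeclumping) default to false, so the checker and repairer keep their previous behavior unless declumping is explicitly enabled. As a rough illustration of the gating (using a stand-in Piece type rather than the real metabase.Pieces), with the flag off the clumped-pieces slice stays nil, and the health calculation in the checker hunk below reduces to the pre-flag formula:

package main

import "fmt"

// Illustration only: with DoDeclumping=false the checker skips clumped-piece
// detection, so clumpedPieces stays nil and the health count reduces to the
// old calculation. Piece is a stand-in type for metabase.Pieces entries.
type Piece struct{ Number uint16 }

func main() {
	pieces := make([]Piece, 10)       // pieces the segment has
	missingPieces := make([]Piece, 2) // pieces confirmed missing
	var clumpedPieces []Piece         // stays nil when DoDeclumping is false

	// Same shape as the checker's calculation in this commit.
	numHealthy := len(pieces) - len(missingPieces) - len(clumpedPieces)
	fmt.Println(numHealthy) // 8: clumping no longer lowers health when the flag is off
}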
@@ -23,6 +23,7 @@ type Config struct {
 	// This results in `2/9200/4 = 0.00005435` being the probability of any single node going down in the interval of one checker iteration.
 	NodeFailureRate float64 `help:"the probability of a single node going down within the next checker iteration" default:"0.00005435" `
 	RepairQueueInsertBatchSize int `help:"Number of damaged segments to buffer in-memory before flushing to the repair queue" default:"100" `
+	DoDeclumping bool `help:"Treat pieces on the same network as in need of repair" default:"false"`
 }

 // RepairOverride is a configuration struct that contains an override repair
@@ -39,6 +39,7 @@ type Observer struct {
 	repairOverrides RepairOverridesMap
 	nodeFailureRate float64
 	repairQueueBatchSize int
+	doDeclumping bool

 	// the following are reset on each iteration
 	startTime time.Time
@@ -59,6 +60,7 @@ func NewObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *ove
 		repairOverrides: config.RepairOverrides.GetMap(),
 		nodeFailureRate: config.NodeFailureRate,
 		repairQueueBatchSize: config.RepairQueueInsertBatchSize,
+		doDeclumping: config.DoDeclumping,
 		statsCollector: make(map[string]*observerRSStats),
 	}
 }
@@ -226,6 +228,7 @@ type observerFork struct {
 	nodeFailureRate float64
 	getNodesEstimate func(ctx context.Context) (int, error)
 	log *zap.Logger
+	doDeclumping bool
 	lastStreamID uuid.UUID
 	totalStats aggregateStats

@@ -244,6 +247,7 @@ func newObserverFork(observer *Observer) rangedloop.Partial {
 		nodeFailureRate: observer.nodeFailureRate,
 		getNodesEstimate: observer.getNodesEstimate,
 		log: observer.logger,
+		doDeclumping: observer.doDeclumping,
 		getObserverStats: observer.getObserverStats,
 	}
 }
@@ -332,19 +336,23 @@ func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segme
 		return Error.New("error getting missing pieces: %w", err)
 	}

-	// if multiple pieces are on the same last_net, keep only the first one. The rest are
-	// to be considered retrievable but unhealthy.
-	nodeIDs := make([]storj.NodeID, len(pieces))
-	for i, p := range pieces {
-		nodeIDs[i] = p.StorageNode
-	}
-	lastNets, err := fork.overlayService.GetNodesNetworkInOrder(ctx, nodeIDs)
-	if err != nil {
-		fork.totalStats.remoteSegmentsFailedToCheck++
-		stats.iterationAggregates.remoteSegmentsFailedToCheck++
-		return errs.Combine(Error.New("error determining node last_nets"), err)
-	}
-	clumpedPieces := repair.FindClumpedPieces(segment.Pieces, lastNets)
+	var clumpedPieces metabase.Pieces
+	var lastNets []string
+	if fork.doDeclumping {
+		// if multiple pieces are on the same last_net, keep only the first one. The rest are
+		// to be considered retrievable but unhealthy.
+		nodeIDs := make([]storj.NodeID, len(pieces))
+		for i, p := range pieces {
+			nodeIDs[i] = p.StorageNode
+		}
+		lastNets, err = fork.overlayService.GetNodesNetworkInOrder(ctx, nodeIDs)
+		if err != nil {
+			fork.totalStats.remoteSegmentsFailedToCheck++
+			stats.iterationAggregates.remoteSegmentsFailedToCheck++
+			return errs.Combine(Error.New("error determining node last_nets"), err)
+		}
+		clumpedPieces = repair.FindClumpedPieces(segment.Pieces, lastNets)
+	}

 	numHealthy := len(pieces) - len(missingPieces) - len(clumpedPieces)
 	mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces))) //mon:locked
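For context, the comment in the gated block above describes the clumping rule: among pieces whose nodes share a last_net, only the first is counted as healthy. A hypothetical stand-in for repair.FindClumpedPieces (the real function operates on metabase.Pieces and its implementation is not shown in this diff) might look like:

package main

import "fmt"

// findClumped is a sketch of the rule described in the checker comment:
// lastNets[i] is the network of the node holding piece i, and every piece
// after the first one seen on a given network is reported as clumped.
func findClumped(pieceNumbers []uint16, lastNets []string) (clumped []uint16) {
	seen := make(map[string]bool)
	for i, net := range lastNets {
		if seen[net] {
			clumped = append(clumped, pieceNumbers[i])
			continue
		}
		seen[net] = true
	}
	return clumped
}

func main() {
	pieces := []uint16{0, 1, 2, 3}
	lastNets := []string{"10.0.0.0", "10.0.0.0", "10.0.1.0", "10.0.0.0"}
	fmt.Println(findClumped(pieces, lastNets)) // [1 3]: later pieces on 10.0.0.0 count as clumped
}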
@@ -3227,7 +3227,13 @@ func TestRepairClumpedPieces(t *testing.T) {
 		StorageNodeCount: 6,
 		UplinkCount: 1,
 		Reconfigure: testplanet.Reconfigure{
-			Satellite: testplanet.ReconfigureRS(2, 3, 4, 4),
+			Satellite: testplanet.Combine(
+				testplanet.ReconfigureRS(2, 3, 4, 4),
+				func(log *zap.Logger, index int, config *satellite.Config) {
+					config.Checker.DoDeclumping = true
+					config.Repairer.DoDeclumping = true
+				},
+			),
 		},
 	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
 		uplinkPeer := planet.Uplinks[0]
@@ -35,6 +35,7 @@ type Config struct {
 	InMemoryRepair bool `help:"whether to download pieces for repair in memory (true) or download to disk (false)" default:"false"`
 	ReputationUpdateEnabled bool `help:"whether the audit score of nodes should be updated as a part of repair" default:"false"`
 	UseRangedLoop bool `help:"whether to enable repair checker observer with ranged loop" default:"true"`
+	DoDeclumping bool `help:"repair pieces on the same network to other nodes" default:"false"`
 }

 // Service contains the information needed to run the repair service.
@@ -86,6 +86,7 @@ type SegmentRepairer struct {
 	reporter audit.Reporter

 	reputationUpdateEnabled bool
+	doDeclumping bool

 	// multiplierOptimalThreshold is the value that multiplied by the optimal
 	// threshold results in the maximum limit of number of nodes to upload
@@ -133,6 +134,7 @@ func NewSegmentRepairer(
 		repairOverrides: repairOverrides.GetMap(),
 		reporter: reporter,
 		reputationUpdateEnabled: config.ReputationUpdateEnabled,
+		doDeclumping: config.DoDeclumping,

 		nowFn: time.Now,
 	}
@@ -196,20 +198,25 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 		return false, overlayQueryError.New("error identifying missing pieces: %w", err)
 	}

-	// if multiple pieces are on the same last_net, keep only the first one. The rest are
-	// to be considered retrievable but unhealthy.
-	nodeIDs := make([]storj.NodeID, len(pieces))
-	for i, p := range pieces {
-		nodeIDs[i] = p.StorageNode
-	}
-	lastNets, err := repairer.overlay.GetNodesNetworkInOrder(ctx, nodeIDs)
-	if err != nil {
-		return false, metainfoGetError.Wrap(err)
-	}
-	clumpedPieces := repair.FindClumpedPieces(segment.Pieces, lastNets)
-	clumpedPiecesSet := make(map[uint16]bool)
-	for _, clumpedPiece := range clumpedPieces {
-		clumpedPiecesSet[clumpedPiece.Number] = true
-	}
+	var clumpedPieces metabase.Pieces
+	var clumpedPiecesSet map[uint16]bool
+	if repairer.doDeclumping {
+		// if multiple pieces are on the same last_net, keep only the first one. The rest are
+		// to be considered retrievable but unhealthy.
+		nodeIDs := make([]storj.NodeID, len(pieces))
+		for i, p := range pieces {
+			nodeIDs[i] = p.StorageNode
+		}
+		lastNets, err := repairer.overlay.GetNodesNetworkInOrder(ctx, nodeIDs)
+		if err != nil {
+			return false, metainfoGetError.Wrap(err)
+		}
+		clumpedPieces = repair.FindClumpedPieces(segment.Pieces, lastNets)
+		clumpedPiecesSet = make(map[uint16]bool)
+		for _, clumpedPiece := range clumpedPieces {
+			clumpedPiecesSet[clumpedPiece.Number] = true
+		}
+
+	}

 	numRetrievable := len(pieces) - len(missingPieces)
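Note that on the repairer side clumpedPiecesSet is only allocated inside the flag check; when doDeclumping is false it stays a nil map, and later lookups on it return false, which appears to be what leaves the rest of Repair unchanged. A tiny sketch of that nil-map behavior:

package main

import "fmt"

// Illustration of the nil-map behavior the gated code seems to rely on: when
// repairer.doDeclumping is false, clumpedPiecesSet is never allocated, and a
// read from a nil map returns the zero value, so no piece is treated as clumped.
func main() {
	var clumpedPiecesSet map[uint16]bool // nil: the flag-off case
	fmt.Println(clumpedPiecesSet[3])     // false: reading a nil map is safe in Go
}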
@@ -258,7 +265,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	if numHealthy-numHealthyInExcludedCountries > int(repairThreshold) {
 		mon.Meter("repair_unnecessary").Mark(1) //mon:locked
 		stats.repairUnnecessary.Mark(1)
-		repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold))
+		repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold), zap.Int("numClumped", len(clumpedPieces)))
 		return true, nil
 	}

@@ -598,6 +605,15 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	mon.IntVal("segment_repair_count").Observe(repairCount) //mon:locked
 	stats.segmentRepairCount.Observe(repairCount)

+	repairer.log.Debug("repaired segment",
+		zap.Stringer("Stream ID", segment.StreamID),
+		zap.Uint64("Position", segment.Position.Encode()),
+		zap.Int("clumped pieces", len(clumpedPieces)),
+		zap.Int("in excluded countries", numHealthyInExcludedCountries),
+		zap.Int("removed pieces", len(toRemove)),
+		zap.Int("repaired pieces", len(repairedPieces)),
+		zap.Int("healthy before repair", numHealthy),
+		zap.Int("healthy after repair", healthyAfterRepair))
 	return true, nil
 }

scripts/testdata/satellite-config.yaml.lock (vendored)
@@ -88,6 +88,9 @@
 # number of workers to run audits on segments
 # audit.worker-concurrency: 2

+# Treat pieces on the same network as in need of repair
+# checker.do-declumping: false
+
 # how frequently checker should check for bad segments
 # checker.interval: 30s

@@ -925,6 +928,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
 # ratio where to consider processed count as supicious
 # ranged-loop.suspicious-processed-ratio: 0.03

+# repair pieces on the same network to other nodes
+# repairer.do-declumping: false
+
 # time limit for downloading pieces from a node for repair
 # repairer.download-timeout: 5m0s
