satellite/repair: add flag for de-clumping behavior

It seems that the "what pieces are clumped" detection code does not work
correctly, and this logic is causing repair overload or other repair failures.

Hide it behind a flag while we figure out what is going on, so that
repair can still work in the meantime.

Change-Id: If83ef7895cba870353a67ab13573193d92fff80b
paul cannon 2023-05-18 13:47:23 -05:00
parent c0e7f463fe
commit de737bdee9
6 changed files with 66 additions and 28 deletions
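
For reference, the de-clumping check that this flag gates treats pieces whose storage nodes share a last_net as clumped: only the first piece seen on each network counts as healthy, and the rest become candidates for repair. A minimal, self-contained sketch of that idea in Go (Piece and findClumped are illustrative stand-ins, not the repair package's actual types or API):

package main

import "fmt"

// Piece is a hypothetical stand-in for a segment piece held by one storage node.
type Piece struct {
	Number      uint16
	StorageNode string
}

// findClumped returns every piece after the first one seen on each last_net;
// lastNets[i] is the network of the node holding pieces[i].
func findClumped(pieces []Piece, lastNets []string) []Piece {
	seen := make(map[string]bool, len(lastNets))
	var clumped []Piece
	for i, p := range pieces {
		net := lastNets[i]
		if net == "" {
			continue // unknown network: do not treat as clumped
		}
		if seen[net] {
			clumped = append(clumped, p)
			continue
		}
		seen[net] = true
	}
	return clumped
}

func main() {
	pieces := []Piece{{Number: 0, StorageNode: "n1"}, {Number: 1, StorageNode: "n2"},
		{Number: 2, StorageNode: "n3"}, {Number: 3, StorageNode: "n4"}}
	lastNets := []string{"10.1.2", "10.1.2", "10.9.9", "10.1.2"}
	fmt.Println(findClumped(pieces, lastNets)) // [{1 n2} {3 n4}]: pieces 1 and 3 share 10.1.2 with piece 0
}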

View File

@@ -23,6 +23,7 @@ type Config struct {
 	// This results in `2/9200/4 = 0.00005435` being the probability of any single node going down in the interval of one checker iteration.
 	NodeFailureRate float64 `help:"the probability of a single node going down within the next checker iteration" default:"0.00005435" `
 	RepairQueueInsertBatchSize int `help:"Number of damaged segments to buffer in-memory before flushing to the repair queue" default:"100" `
+	DoDeclumping bool `help:"Treat pieces on the same network as in need of repair" default:"false"`
 }
 
 // RepairOverride is a configuration struct that contains an override repair

View File

@@ -39,6 +39,7 @@ type Observer struct {
 	repairOverrides RepairOverridesMap
 	nodeFailureRate float64
 	repairQueueBatchSize int
+	doDeclumping bool
 
 	// the following are reset on each iteration
 	startTime time.Time
@@ -59,6 +60,7 @@ func NewObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *ove
 		repairOverrides: config.RepairOverrides.GetMap(),
 		nodeFailureRate: config.NodeFailureRate,
 		repairQueueBatchSize: config.RepairQueueInsertBatchSize,
+		doDeclumping: config.DoDeclumping,
 		statsCollector: make(map[string]*observerRSStats),
 	}
 }
@@ -226,6 +228,7 @@ type observerFork struct {
 	nodeFailureRate float64
 	getNodesEstimate func(ctx context.Context) (int, error)
 	log *zap.Logger
+	doDeclumping bool
 
 	lastStreamID uuid.UUID
 	totalStats aggregateStats
@@ -244,6 +247,7 @@ func newObserverFork(observer *Observer) rangedloop.Partial {
 		nodeFailureRate: observer.nodeFailureRate,
 		getNodesEstimate: observer.getNodesEstimate,
 		log: observer.logger,
+		doDeclumping: observer.doDeclumping,
 		getObserverStats: observer.getObserverStats,
 	}
 }
@@ -332,19 +336,23 @@ func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segme
 		return Error.New("error getting missing pieces: %w", err)
 	}
 
+	var clumpedPieces metabase.Pieces
+	var lastNets []string
+	if fork.doDeclumping {
 		// if multiple pieces are on the same last_net, keep only the first one. The rest are
 		// to be considered retrievable but unhealthy.
 		nodeIDs := make([]storj.NodeID, len(pieces))
 		for i, p := range pieces {
 			nodeIDs[i] = p.StorageNode
 		}
-		lastNets, err := fork.overlayService.GetNodesNetworkInOrder(ctx, nodeIDs)
+		lastNets, err = fork.overlayService.GetNodesNetworkInOrder(ctx, nodeIDs)
 		if err != nil {
 			fork.totalStats.remoteSegmentsFailedToCheck++
 			stats.iterationAggregates.remoteSegmentsFailedToCheck++
 			return errs.Combine(Error.New("error determining node last_nets"), err)
 		}
-		clumpedPieces := repair.FindClumpedPieces(segment.Pieces, lastNets)
+		clumpedPieces = repair.FindClumpedPieces(segment.Pieces, lastNets)
+	}
 
 	numHealthy := len(pieces) - len(missingPieces) - len(clumpedPieces)
 	mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces))) //mon:locked

View File

@@ -3227,7 +3227,13 @@ func TestRepairClumpedPieces(t *testing.T) {
 		StorageNodeCount: 6,
 		UplinkCount: 1,
 		Reconfigure: testplanet.Reconfigure{
-			Satellite: testplanet.ReconfigureRS(2, 3, 4, 4),
+			Satellite: testplanet.Combine(
+				testplanet.ReconfigureRS(2, 3, 4, 4),
+				func(log *zap.Logger, index int, config *satellite.Config) {
+					config.Checker.DoDeclumping = true
+					config.Repairer.DoDeclumping = true
+				},
+			),
 		},
 	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
 		uplinkPeer := planet.Uplinks[0]
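
testplanet.Combine stacks the RS reconfiguration with a function that turns both new flags on, so this test keeps exercising the gated de-clumping path even though the flags default to false.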

View File

@@ -35,6 +35,7 @@ type Config struct {
 	InMemoryRepair bool `help:"whether to download pieces for repair in memory (true) or download to disk (false)" default:"false"`
 	ReputationUpdateEnabled bool `help:"whether the audit score of nodes should be updated as a part of repair" default:"false"`
 	UseRangedLoop bool `help:"whether to enable repair checker observer with ranged loop" default:"true"`
+	DoDeclumping bool `help:"repair pieces on the same network to other nodes" default:"false"`
 }
 
 // Service contains the information needed to run the repair service.

View File

@@ -86,6 +86,7 @@ type SegmentRepairer struct {
 	reporter audit.Reporter
 	reputationUpdateEnabled bool
+	doDeclumping bool
 
 	// multiplierOptimalThreshold is the value that multiplied by the optimal
 	// threshold results in the maximum limit of number of nodes to upload
@@ -133,6 +134,7 @@ func NewSegmentRepairer(
 		repairOverrides: repairOverrides.GetMap(),
 		reporter: reporter,
 		reputationUpdateEnabled: config.ReputationUpdateEnabled,
+		doDeclumping: config.DoDeclumping,
 		nowFn: time.Now,
 	}
@@ -196,6 +198,9 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 		return false, overlayQueryError.New("error identifying missing pieces: %w", err)
 	}
 
+	var clumpedPieces metabase.Pieces
+	var clumpedPiecesSet map[uint16]bool
+	if repairer.doDeclumping {
 		// if multiple pieces are on the same last_net, keep only the first one. The rest are
 		// to be considered retrievable but unhealthy.
 		nodeIDs := make([]storj.NodeID, len(pieces))
@@ -206,12 +211,14 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 		if err != nil {
 			return false, metainfoGetError.Wrap(err)
 		}
-		clumpedPieces := repair.FindClumpedPieces(segment.Pieces, lastNets)
-		clumpedPiecesSet := make(map[uint16]bool)
+		clumpedPieces = repair.FindClumpedPieces(segment.Pieces, lastNets)
+		clumpedPiecesSet = make(map[uint16]bool)
 		for _, clumpedPiece := range clumpedPieces {
 			clumpedPiecesSet[clumpedPiece.Number] = true
 		}
+	}
+
 	numRetrievable := len(pieces) - len(missingPieces)
 	numHealthy := len(pieces) - len(missingPieces) - len(clumpedPieces)
 	// irreparable segment
@@ -258,7 +265,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	if numHealthy-numHealthyInExcludedCountries > int(repairThreshold) {
 		mon.Meter("repair_unnecessary").Mark(1) //mon:locked
 		stats.repairUnnecessary.Mark(1)
-		repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold))
+		repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold), zap.Int("numClumped", len(clumpedPieces)))
 		return true, nil
 	}
@@ -598,6 +605,15 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	mon.IntVal("segment_repair_count").Observe(repairCount) //mon:locked
 	stats.segmentRepairCount.Observe(repairCount)
 
+	repairer.log.Debug("repaired segment",
+		zap.Stringer("Stream ID", segment.StreamID),
+		zap.Uint64("Position", segment.Position.Encode()),
+		zap.Int("clumped pieces", len(clumpedPieces)),
+		zap.Int("in excluded countries", numHealthyInExcludedCountries),
+		zap.Int("removed pieces", len(toRemove)),
+		zap.Int("repaired pieces", len(repairedPieces)),
+		zap.Int("healthy before repair", numHealthy),
+		zap.Int("healthy after repair", healthyAfterRepair))
 	return true, nil
 }
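
Alongside the clumped list (used for counts and logging above), the repairer builds clumpedPiecesSet keyed by piece number, so later classification of pieces can presumably exclude clumped pieces from the healthy set with a map lookup. A hypothetical, self-contained sketch of that lookup pattern (healthyPieceNumbers is not a function in this repo, just an illustration):

package main

import "fmt"

// healthyPieceNumbers shows how a map keyed by piece number can be used to
// exclude clumped (and missing) pieces from the healthy set.
func healthyPieceNumbers(all, missing []uint16, clumped map[uint16]bool) []uint16 {
	missingSet := make(map[uint16]bool, len(missing))
	for _, n := range missing {
		missingSet[n] = true
	}
	var healthy []uint16
	for _, n := range all {
		if missingSet[n] || clumped[n] {
			continue // lost, or retrievable but on a shared last_net
		}
		healthy = append(healthy, n)
	}
	return healthy
}

func main() {
	all := []uint16{0, 1, 2, 3, 4}
	missing := []uint16{2}
	clumped := map[uint16]bool{3: true}
	fmt.Println(healthyPieceNumbers(all, missing, clumped)) // [0 1 4]
}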

View File

@@ -88,6 +88,9 @@
 # number of workers to run audits on segments
 # audit.worker-concurrency: 2
 
+# Treat pieces on the same network as in need of repair
+# checker.do-declumping: false
+
 # how frequently checker should check for bad segments
 # checker.interval: 30s
@@ -925,6 +928,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
 # ratio where to consider processed count as supicious
 # ranged-loop.suspicious-processed-ratio: 0.03
 
+# repair pieces on the same network to other nodes
+# repairer.do-declumping: false
+
 # time limit for downloading pieces from a node for repair
 # repairer.download-timeout: 5m0s
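
Both defaults stay false, matching the commit's intent of keeping the behavior off until the clump detection is trusted again. Presumably, once it is, an operator would enable it by uncommenting both keys and setting them to true:

checker.do-declumping: true
repairer.do-declumping: true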