satellite/repair: add flag for de-clumping behavior
It seems that the "what pieces are clumped" code does not work right, so this logic is causing repair overload or other repair failures. Hide it behind a flag while we figure out what is going on, so that repair can still work in the meantime.

Change-Id: If83ef7895cba870353a67ab13573193d92fff80b
parent c0e7f463fe
commit de737bdee9
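Both new config fields added below (Checker.DoDeclumping and Repairer.DoDeclumping) default to false, so the checker and repairer keep their previous behavior unless declumping is explicitly enabled. As a rough illustration of the gating (using a stand-in Piece type rather than the real metabase.Pieces), with the flag off the clumped-pieces slice stays nil, and the health calculation in the checker hunk below reduces to the pre-flag formula:

package main

import "fmt"

// Illustration only: with DoDeclumping=false the checker skips clumped-piece
// detection, so clumpedPieces stays nil and the health count reduces to the
// old calculation. Piece is a stand-in type for metabase.Pieces entries.
type Piece struct{ Number uint16 }

func main() {
	pieces := make([]Piece, 10)       // pieces the segment has
	missingPieces := make([]Piece, 2) // pieces confirmed missing
	var clumpedPieces []Piece         // stays nil when DoDeclumping is false

	// Same shape as the checker's calculation in this commit.
	numHealthy := len(pieces) - len(missingPieces) - len(clumpedPieces)
	fmt.Println(numHealthy) // 8: clumping no longer lowers health when the flag is off
}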
@@ -23,6 +23,7 @@ type Config struct {
 	// This results in `2/9200/4 = 0.00005435` being the probability of any single node going down in the interval of one checker iteration.
 	NodeFailureRate float64 `help:"the probability of a single node going down within the next checker iteration" default:"0.00005435" `
 	RepairQueueInsertBatchSize int `help:"Number of damaged segments to buffer in-memory before flushing to the repair queue" default:"100" `
+	DoDeclumping bool `help:"Treat pieces on the same network as in need of repair" default:"false"`
 }

 // RepairOverride is a configuration struct that contains an override repair
@@ -39,6 +39,7 @@ type Observer struct {
 	repairOverrides RepairOverridesMap
 	nodeFailureRate float64
 	repairQueueBatchSize int
+	doDeclumping bool

 	// the following are reset on each iteration
 	startTime time.Time
@@ -59,6 +60,7 @@ func NewObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *ove
 		repairOverrides: config.RepairOverrides.GetMap(),
 		nodeFailureRate: config.NodeFailureRate,
 		repairQueueBatchSize: config.RepairQueueInsertBatchSize,
+		doDeclumping: config.DoDeclumping,
 		statsCollector: make(map[string]*observerRSStats),
 	}
 }
@@ -226,6 +228,7 @@ type observerFork struct {
 	nodeFailureRate float64
 	getNodesEstimate func(ctx context.Context) (int, error)
 	log *zap.Logger
+	doDeclumping bool
 	lastStreamID uuid.UUID
 	totalStats aggregateStats

@@ -244,6 +247,7 @@ func newObserverFork(observer *Observer) rangedloop.Partial {
 		nodeFailureRate: observer.nodeFailureRate,
 		getNodesEstimate: observer.getNodesEstimate,
 		log: observer.logger,
+		doDeclumping: observer.doDeclumping,
 		getObserverStats: observer.getObserverStats,
 	}
 }
@@ -332,19 +336,23 @@ func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segme
 		return Error.New("error getting missing pieces: %w", err)
 	}

-	// if multiple pieces are on the same last_net, keep only the first one. The rest are
-	// to be considered retrievable but unhealthy.
-	nodeIDs := make([]storj.NodeID, len(pieces))
-	for i, p := range pieces {
-		nodeIDs[i] = p.StorageNode
-	}
-	lastNets, err := fork.overlayService.GetNodesNetworkInOrder(ctx, nodeIDs)
-	if err != nil {
-		fork.totalStats.remoteSegmentsFailedToCheck++
-		stats.iterationAggregates.remoteSegmentsFailedToCheck++
-		return errs.Combine(Error.New("error determining node last_nets"), err)
-	}
-	clumpedPieces := repair.FindClumpedPieces(segment.Pieces, lastNets)
+	var clumpedPieces metabase.Pieces
+	var lastNets []string
+	if fork.doDeclumping {
+		// if multiple pieces are on the same last_net, keep only the first one. The rest are
+		// to be considered retrievable but unhealthy.
+		nodeIDs := make([]storj.NodeID, len(pieces))
+		for i, p := range pieces {
+			nodeIDs[i] = p.StorageNode
+		}
+		lastNets, err = fork.overlayService.GetNodesNetworkInOrder(ctx, nodeIDs)
+		if err != nil {
+			fork.totalStats.remoteSegmentsFailedToCheck++
+			stats.iterationAggregates.remoteSegmentsFailedToCheck++
+			return errs.Combine(Error.New("error determining node last_nets"), err)
+		}
+		clumpedPieces = repair.FindClumpedPieces(segment.Pieces, lastNets)
+	}

 	numHealthy := len(pieces) - len(missingPieces) - len(clumpedPieces)
 	mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces))) //mon:locked
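For context, the comment in the gated block above describes the clumping rule: among pieces whose nodes share a last_net, only the first is counted as healthy. A hypothetical stand-in for repair.FindClumpedPieces (the real function operates on metabase.Pieces and its implementation is not shown in this diff) might look like:

package main

import "fmt"

// findClumped is a sketch of the rule described in the checker comment:
// lastNets[i] is the network of the node holding piece i, and every piece
// after the first one seen on a given network is reported as clumped.
func findClumped(pieceNumbers []uint16, lastNets []string) (clumped []uint16) {
	seen := make(map[string]bool)
	for i, net := range lastNets {
		if seen[net] {
			clumped = append(clumped, pieceNumbers[i])
			continue
		}
		seen[net] = true
	}
	return clumped
}

func main() {
	pieces := []uint16{0, 1, 2, 3}
	lastNets := []string{"10.0.0.0", "10.0.0.0", "10.0.1.0", "10.0.0.0"}
	fmt.Println(findClumped(pieces, lastNets)) // [1 3]: later pieces on 10.0.0.0 count as clumped
}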
@@ -3227,7 +3227,13 @@ func TestRepairClumpedPieces(t *testing.T) {
 		StorageNodeCount: 6,
 		UplinkCount: 1,
 		Reconfigure: testplanet.Reconfigure{
-			Satellite: testplanet.ReconfigureRS(2, 3, 4, 4),
+			Satellite: testplanet.Combine(
+				testplanet.ReconfigureRS(2, 3, 4, 4),
+				func(log *zap.Logger, index int, config *satellite.Config) {
+					config.Checker.DoDeclumping = true
+					config.Repairer.DoDeclumping = true
+				},
+			),
 		},
 	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
 		uplinkPeer := planet.Uplinks[0]
@@ -35,6 +35,7 @@ type Config struct {
 	InMemoryRepair bool `help:"whether to download pieces for repair in memory (true) or download to disk (false)" default:"false"`
 	ReputationUpdateEnabled bool `help:"whether the audit score of nodes should be updated as a part of repair" default:"false"`
 	UseRangedLoop bool `help:"whether to enable repair checker observer with ranged loop" default:"true"`
+	DoDeclumping bool `help:"repair pieces on the same network to other nodes" default:"false"`
 }

 // Service contains the information needed to run the repair service.
@@ -86,6 +86,7 @@ type SegmentRepairer struct {
 	reporter audit.Reporter

 	reputationUpdateEnabled bool
+	doDeclumping bool

 	// multiplierOptimalThreshold is the value that multiplied by the optimal
 	// threshold results in the maximum limit of number of nodes to upload
@@ -133,6 +134,7 @@ func NewSegmentRepairer(
 		repairOverrides: repairOverrides.GetMap(),
 		reporter: reporter,
 		reputationUpdateEnabled: config.ReputationUpdateEnabled,
+		doDeclumping: config.DoDeclumping,

 		nowFn: time.Now,
 	}
@@ -196,20 +198,25 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 		return false, overlayQueryError.New("error identifying missing pieces: %w", err)
 	}

-	// if multiple pieces are on the same last_net, keep only the first one. The rest are
-	// to be considered retrievable but unhealthy.
-	nodeIDs := make([]storj.NodeID, len(pieces))
-	for i, p := range pieces {
-		nodeIDs[i] = p.StorageNode
-	}
-	lastNets, err := repairer.overlay.GetNodesNetworkInOrder(ctx, nodeIDs)
-	if err != nil {
-		return false, metainfoGetError.Wrap(err)
-	}
-	clumpedPieces := repair.FindClumpedPieces(segment.Pieces, lastNets)
-	clumpedPiecesSet := make(map[uint16]bool)
-	for _, clumpedPiece := range clumpedPieces {
-		clumpedPiecesSet[clumpedPiece.Number] = true
-	}
+	var clumpedPieces metabase.Pieces
+	var clumpedPiecesSet map[uint16]bool
+	if repairer.doDeclumping {
+		// if multiple pieces are on the same last_net, keep only the first one. The rest are
+		// to be considered retrievable but unhealthy.
+		nodeIDs := make([]storj.NodeID, len(pieces))
+		for i, p := range pieces {
+			nodeIDs[i] = p.StorageNode
+		}
+		lastNets, err := repairer.overlay.GetNodesNetworkInOrder(ctx, nodeIDs)
+		if err != nil {
+			return false, metainfoGetError.Wrap(err)
+		}
+		clumpedPieces = repair.FindClumpedPieces(segment.Pieces, lastNets)
+		clumpedPiecesSet = make(map[uint16]bool)
+		for _, clumpedPiece := range clumpedPieces {
+			clumpedPiecesSet[clumpedPiece.Number] = true
+		}
+
+	}

 	numRetrievable := len(pieces) - len(missingPieces)
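Note that on the repairer side clumpedPiecesSet is only allocated inside the flag check; when doDeclumping is false it stays a nil map, and later lookups on it return false, which appears to be what leaves the rest of Repair unchanged. A tiny sketch of that nil-map behavior:

package main

import "fmt"

// Illustration of the nil-map behavior the gated code seems to rely on: when
// repairer.doDeclumping is false, clumpedPiecesSet is never allocated, and a
// read from a nil map returns the zero value, so no piece is treated as clumped.
func main() {
	var clumpedPiecesSet map[uint16]bool // nil: the flag-off case
	fmt.Println(clumpedPiecesSet[3])     // false: reading a nil map is safe in Go
}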
@@ -258,7 +265,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	if numHealthy-numHealthyInExcludedCountries > int(repairThreshold) {
 		mon.Meter("repair_unnecessary").Mark(1) //mon:locked
 		stats.repairUnnecessary.Mark(1)
-		repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold))
+		repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold), zap.Int("numClumped", len(clumpedPieces)))
 		return true, nil
 	}

@@ -598,6 +605,15 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	mon.IntVal("segment_repair_count").Observe(repairCount) //mon:locked
 	stats.segmentRepairCount.Observe(repairCount)

+	repairer.log.Debug("repaired segment",
+		zap.Stringer("Stream ID", segment.StreamID),
+		zap.Uint64("Position", segment.Position.Encode()),
+		zap.Int("clumped pieces", len(clumpedPieces)),
+		zap.Int("in excluded countries", numHealthyInExcludedCountries),
+		zap.Int("removed pieces", len(toRemove)),
+		zap.Int("repaired pieces", len(repairedPieces)),
+		zap.Int("healthy before repair", numHealthy),
+		zap.Int("healthy after repair", healthyAfterRepair))
 	return true, nil
 }

scripts/testdata/satellite-config.yaml.lock (vendored)
@@ -88,6 +88,9 @@
 # number of workers to run audits on segments
 # audit.worker-concurrency: 2

+# Treat pieces on the same network as in need of repair
+# checker.do-declumping: false
+
 # how frequently checker should check for bad segments
 # checker.interval: 30s

@@ -925,6 +928,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
 # ratio where to consider processed count as supicious
 # ranged-loop.suspicious-processed-ratio: 0.03

+# repair pieces on the same network to other nodes
+# repairer.do-declumping: false
+
 # time limit for downloading pieces from a node for repair
 # repairer.download-timeout: 5m0s
