satellite/repair: don't reuse allNodeIDs

We were reusing a slice to save on allocations, but it turns out the
function using it was being called in multiple goroutines at the same
time.
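
For illustration only (hypothetical names, not code from this repo), here is a minimal sketch of the pattern that goes wrong: a method reuses a slice stored on its receiver to avoid allocations, and several goroutines call that method at once. Running it with the race detector (go run -race) reports the conflicting reads and writes on the shared backing array.

package main

import (
	"fmt"
	"sync"
)

// worker keeps a reusable buffer to avoid allocating on every call.
// This is only safe if process is never called concurrently.
type worker struct {
	buf []int
}

func (w *worker) process(vals []int) int {
	// reuse buf if it's large enough
	if cap(w.buf) < len(vals) {
		w.buf = make([]int, len(vals))
	} else {
		w.buf = w.buf[:len(vals)]
	}
	sum := 0
	for i, v := range vals {
		w.buf[i] = v // data race: every goroutine writes to the same backing array
		sum += w.buf[i]
	}
	return sum
}

func main() {
	w := &worker{}
	var wg sync.WaitGroup
	for g := 1; g <= 4; g++ {
		g := g
		wg.Add(1)
		go func() {
			defer wg.Done()
			// each call should return g*3, but another goroutine may
			// overwrite buf (or resize it) in the middle of the loop
			fmt.Println(w.process([]int{g, g, g}))
		}()
	}
	wg.Wait()
}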

This is definitely a problem in repairer/segments.go. I'm not 100%
sure whether it is also a problem in checker/observer.go, but I'm making
the change there as well to be on the safe side for now.

Repair workers ran with this bug only on testing satellites, and it
looks like the worst that could have happened was that we mistakenly
repaired pieces off of well-behaved, non-clumped, in-placement nodes.
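
To make that failure mode concrete (again with hypothetical names), the sketch below replays the interleaving sequentially: one caller fills the shared slice with its segment's node IDs, a second caller overwrites the same backing array before the first caller's declumping/placement checks run, so those checks end up looking at the wrong segment's nodes.

package main

import "fmt"

func main() {
	// shared stands in for the reused allNodeIDs buffer.
	shared := make([]string, 2)

	fill := func(ids ...string) []string {
		copy(shared, ids)
		return shared
	}

	// Caller 1 prepares the node list for its segment...
	segment1Nodes := fill("node-A1", "node-A2")

	// ...but before caller 1 runs its checks, caller 2 reuses the same
	// backing array for a different segment.
	_ = fill("node-B1", "node-B2")

	// Caller 1's declumping / out-of-placement checks now see segment 2's
	// nodes instead of its own, so healthy, correctly placed pieces can be
	// flagged and repaired away.
	fmt.Println("checking placement for:", segment1Nodes) // [node-B1 node-B2]
}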

Change-Id: I33c112b05941b63d066caab6a34a543840c6b85d
paul cannon 2023-06-06 10:28:04 -05:00
parent c858479ef0
commit 25a5df9752
2 changed files with 9 additions and 24 deletions

checker/observer.go

@@ -234,7 +234,6 @@ type observerFork struct {
 	doPlacementCheck bool
 	lastStreamID     uuid.UUID
 	totalStats       aggregateStats
-	allNodeIDs       []storj.NodeID
 	getObserverStats func(string) *observerRSStats
 }
@@ -341,15 +340,9 @@ func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segme
 		return Error.New("error getting missing pieces: %w", err)
 	}
 
-	// reuse allNodeIDs slice if its large enough
-	if cap(fork.allNodeIDs) < len(pieces) {
-		fork.allNodeIDs = make([]storj.NodeID, len(pieces))
-	} else {
-		fork.allNodeIDs = fork.allNodeIDs[:len(pieces)]
-	}
-
+	allNodeIDs := make([]storj.NodeID, len(pieces))
 	for i, p := range pieces {
-		fork.allNodeIDs[i] = p.StorageNode
+		allNodeIDs[i] = p.StorageNode
 	}
 
 	var clumpedPieces metabase.Pieces
@@ -357,7 +350,7 @@ func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segme
 	if fork.doDeclumping {
 		// if multiple pieces are on the same last_net, keep only the first one. The rest are
 		// to be considered retrievable but unhealthy.
-		lastNets, err = fork.overlayService.GetNodesNetworkInOrder(ctx, fork.allNodeIDs)
+		lastNets, err = fork.overlayService.GetNodesNetworkInOrder(ctx, allNodeIDs)
 		if err != nil {
 			fork.totalStats.remoteSegmentsFailedToCheck++
 			stats.iterationAggregates.remoteSegmentsFailedToCheck++
@@ -368,7 +361,7 @@ func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segme
 	numPiecesOutOfPlacement := 0
 	if fork.doPlacementCheck && segment.Placement != storj.EveryCountry {
-		outOfPlacementNodes, err := fork.overlayService.GetNodesOutOfPlacement(ctx, fork.allNodeIDs, segment.Placement)
+		outOfPlacementNodes, err := fork.overlayService.GetNodesOutOfPlacement(ctx, allNodeIDs, segment.Placement)
 		if err != nil {
 			fork.totalStats.remoteSegmentsFailedToCheck++
 			stats.iterationAggregates.remoteSegmentsFailedToCheck++

repairer/segments.go

@@ -99,8 +99,6 @@ type SegmentRepairer struct {
 	// repairOverrides is the set of values configured by the checker to override the repair threshold for various RS schemes.
 	repairOverrides checker.RepairOverridesMap
 
-	allNodeIDs []storj.NodeID
-
 	nowFn                            func() time.Time
 	OnTestingCheckSegmentAlteredHook func()
 	OnTestingPiecesReportHook        func(pieces FetchResultReport)
@@ -199,18 +197,12 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	pieces := segment.Pieces
 
-	// reuse allNodeIDs slice if its large enough
-	if cap(repairer.allNodeIDs) < len(pieces) {
-		repairer.allNodeIDs = make([]storj.NodeID, len(pieces))
-	} else {
-		repairer.allNodeIDs = repairer.allNodeIDs[:len(pieces)]
-	}
-
+	allNodeIDs := make([]storj.NodeID, len(pieces))
 	for i, p := range pieces {
-		repairer.allNodeIDs[i] = p.StorageNode
+		allNodeIDs[i] = p.StorageNode
 	}
 
-	excludeNodeIDs := repairer.allNodeIDs
+	excludeNodeIDs := allNodeIDs
 
 	missingPieces, err := repairer.overlay.GetMissingPieces(ctx, pieces)
 	if err != nil {
@@ -222,7 +214,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	if repairer.doDeclumping {
 		// if multiple pieces are on the same last_net, keep only the first one. The rest are
 		// to be considered retrievable but unhealthy.
-		lastNets, err := repairer.overlay.GetNodesNetworkInOrder(ctx, repairer.allNodeIDs)
+		lastNets, err := repairer.overlay.GetNodesNetworkInOrder(ctx, allNodeIDs)
 		if err != nil {
 			return false, metainfoGetError.Wrap(err)
 		}
@@ -237,7 +229,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	var outOfPlacementPiecesSet map[uint16]bool
 	if repairer.doPlacementCheck && segment.Placement != storj.EveryCountry {
 		var err error
-		outOfPlacementNodes, err := repairer.overlay.GetNodesOutOfPlacement(ctx, repairer.allNodeIDs, segment.Placement)
+		outOfPlacementNodes, err := repairer.overlay.GetNodesOutOfPlacement(ctx, allNodeIDs, segment.Placement)
 		if err != nil {
 			return false, metainfoGetError.Wrap(err)
 		}