satellite/repair: don't reuse allNodeIDs
We were reusing a slice to save on allocations, but it turns out the function using it was being called in multiple goroutines at the same time.

This is definitely a problem with repairer/segments.go. I'm not 100% sure whether it is also a problem with checker/observer.go, but I'm making the change there as well to be on the safe side for now.

Repair workers only ran with this bug on testing satellites, and it looks like the worst that could have happened was that we repaired pieces off of well-behaved, non-clumped, in-placement nodes by mistake.

Change-Id: I33c112b05941b63d066caab6a34a543840c6b85d
Parent: c858479ef0
Commit: 25a5df9752
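The failure mode is easier to see in isolation. Below is a minimal, hypothetical sketch (a generic worker with ints instead of the satellite's observerFork/SegmentRepairer and storj.NodeID): a slice kept on a shared struct and reused as scratch space races as soon as the method runs in several goroutines, while allocating a fresh slice per call, as this change does, keeps each call independent.

package main

import (
	"fmt"
	"sync"
)

// worker stands in for a shared component such as observerFork or SegmentRepairer.
type worker struct {
	scratch []int // shared scratch buffer: racy if used from concurrent calls
}

// fillShared mimics the old pattern: reuse the struct-level slice between calls.
func (w *worker) fillShared(input []int) []int {
	if cap(w.scratch) < len(input) {
		w.scratch = make([]int, len(input))
	} else {
		w.scratch = w.scratch[:len(input)]
	}
	for i, v := range input {
		w.scratch[i] = v // concurrent calls write into the same backing array
	}
	return w.scratch
}

// fillLocal mimics the fix: allocate a fresh slice on every call.
func (w *worker) fillLocal(input []int) []int {
	out := make([]int, len(input))
	for i, v := range input {
		out[i] = v
	}
	return out
}

func main() {
	w := &worker{}
	var wg sync.WaitGroup
	for g := 0; g < 4; g++ {
		g := g
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Swapping fillLocal for fillShared here makes `go run -race`
			// report a data race, and callers can observe values written
			// by other goroutines; fillLocal gives each call its own array.
			_ = w.fillLocal([]int{g, g, g})
		}()
	}
	wg.Wait()
	fmt.Println("done")
}

The per-call allocation costs a little extra garbage, but it removes any need to reason about which callers share the struct, which is why the diff below simply drops the allNodeIDs field from both types.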
checker/observer.go
@@ -234,7 +234,6 @@ type observerFork struct {
 	doPlacementCheck bool
 	lastStreamID uuid.UUID
 	totalStats aggregateStats
-	allNodeIDs []storj.NodeID

 	getObserverStats func(string) *observerRSStats
 }
@@ -341,15 +340,9 @@ func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segme
 		return Error.New("error getting missing pieces: %w", err)
 	}

-	// reuse allNodeIDs slice if its large enough
-	if cap(fork.allNodeIDs) < len(pieces) {
-		fork.allNodeIDs = make([]storj.NodeID, len(pieces))
-	} else {
-		fork.allNodeIDs = fork.allNodeIDs[:len(pieces)]
-	}
-
+	allNodeIDs := make([]storj.NodeID, len(pieces))
 	for i, p := range pieces {
-		fork.allNodeIDs[i] = p.StorageNode
+		allNodeIDs[i] = p.StorageNode
 	}

 	var clumpedPieces metabase.Pieces
@@ -357,7 +350,7 @@ func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segme
 	if fork.doDeclumping {
 		// if multiple pieces are on the same last_net, keep only the first one. The rest are
 		// to be considered retrievable but unhealthy.
-		lastNets, err = fork.overlayService.GetNodesNetworkInOrder(ctx, fork.allNodeIDs)
+		lastNets, err = fork.overlayService.GetNodesNetworkInOrder(ctx, allNodeIDs)
 		if err != nil {
 			fork.totalStats.remoteSegmentsFailedToCheck++
 			stats.iterationAggregates.remoteSegmentsFailedToCheck++
@@ -368,7 +361,7 @@ func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segme

 	numPiecesOutOfPlacement := 0
 	if fork.doPlacementCheck && segment.Placement != storj.EveryCountry {
-		outOfPlacementNodes, err := fork.overlayService.GetNodesOutOfPlacement(ctx, fork.allNodeIDs, segment.Placement)
+		outOfPlacementNodes, err := fork.overlayService.GetNodesOutOfPlacement(ctx, allNodeIDs, segment.Placement)
 		if err != nil {
 			fork.totalStats.remoteSegmentsFailedToCheck++
 			stats.iterationAggregates.remoteSegmentsFailedToCheck++

repairer/segments.go
@@ -99,8 +99,6 @@ type SegmentRepairer struct {
 	// repairOverrides is the set of values configured by the checker to override the repair threshold for various RS schemes.
 	repairOverrides checker.RepairOverridesMap

-	allNodeIDs []storj.NodeID
-
 	nowFn func() time.Time
 	OnTestingCheckSegmentAlteredHook func()
 	OnTestingPiecesReportHook func(pieces FetchResultReport)
@@ -199,18 +197,12 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue

 	pieces := segment.Pieces

-	// reuse allNodeIDs slice if its large enough
-	if cap(repairer.allNodeIDs) < len(pieces) {
-		repairer.allNodeIDs = make([]storj.NodeID, len(pieces))
-	} else {
-		repairer.allNodeIDs = repairer.allNodeIDs[:len(pieces)]
-	}
-
+	allNodeIDs := make([]storj.NodeID, len(pieces))
 	for i, p := range pieces {
-		repairer.allNodeIDs[i] = p.StorageNode
+		allNodeIDs[i] = p.StorageNode
 	}

-	excludeNodeIDs := repairer.allNodeIDs
+	excludeNodeIDs := allNodeIDs

 	missingPieces, err := repairer.overlay.GetMissingPieces(ctx, pieces)
 	if err != nil {
@@ -222,7 +214,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	if repairer.doDeclumping {
 		// if multiple pieces are on the same last_net, keep only the first one. The rest are
 		// to be considered retrievable but unhealthy.
-		lastNets, err := repairer.overlay.GetNodesNetworkInOrder(ctx, repairer.allNodeIDs)
+		lastNets, err := repairer.overlay.GetNodesNetworkInOrder(ctx, allNodeIDs)
 		if err != nil {
 			return false, metainfoGetError.Wrap(err)
 		}
@@ -237,7 +229,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
 	var outOfPlacementPiecesSet map[uint16]bool
 	if repairer.doPlacementCheck && segment.Placement != storj.EveryCountry {
 		var err error
-		outOfPlacementNodes, err := repairer.overlay.GetNodesOutOfPlacement(ctx, repairer.allNodeIDs, segment.Placement)
+		outOfPlacementNodes, err := repairer.overlay.GetNodesOutOfPlacement(ctx, allNodeIDs, segment.Placement)
 		if err != nil {
 			return false, metainfoGetError.Wrap(err)
 		}