From c856d45cc049132cb1caa0b8951d5a264bd65dcf Mon Sep 17 00:00:00 2001
From: paul cannon
Date: Thu, 18 May 2023 16:34:31 -0500
Subject: [PATCH] satellite/overlay: fix GetNodesNetworkInOrder

We were using the UploadSelectionCache previously, which does _not_
have all nodes, or even all online nodes, in it. So all nodes with
less than MinimumVersion, or with less than MinimumDiskSpace, or nodes
suspended for unknown audit errors, or nodes that have started graceful
exit, were all missing, and ended up having empty last_nets.

Even with all that, I'm kind of surprised how many nodes this involved,
but using the upload selection cache was definitely wrong.

This change uses the download selection cache instead, which excludes
nodes only when they are disqualified, gracefully exited (completely),
or offline.

Change-Id: Iaa07c988aa29c1eb05796ac48a6f19d69f5826c1
---
 satellite/nodeselection/uploadselection/state.go | 16 ----------------
 satellite/overlay/service.go                     | 14 +++++++++++++-
 satellite/overlay/uploadselection.go             | 10 ----------
 satellite/repair/checker/observer_test.go        |  4 ++--
 satellite/repair/repair_test.go                  |  2 +-
 5 files changed, 16 insertions(+), 30 deletions(-)

diff --git a/satellite/nodeselection/uploadselection/state.go b/satellite/nodeselection/uploadselection/state.go
index 30da0efb7..ad8857743 100644
--- a/satellite/nodeselection/uploadselection/state.go
+++ b/satellite/nodeselection/uploadselection/state.go
@@ -129,22 +129,6 @@ func (state *State) Select(ctx context.Context, request Request) (_ []*Node, err
 	return selected, nil
 }
 
-// GetNodesNetwork returns the cached network for each given node ID.
-func (state *State) GetNodesNetwork(ctx context.Context, nodeIDs []storj.NodeID) (nets []string) {
-	defer mon.Task()(&ctx)(nil)
-
-	state.mu.RLock()
-	defer state.mu.RUnlock()
-
-	nets = make([]string, len(nodeIDs))
-	for i, nodeID := range nodeIDs {
-		if net, ok := state.netByID[nodeID]; ok {
-			nets[i] = net
-		}
-	}
-	return nets
-}
-
 // Stats returns state information.
 func (state *State) Stats() Stats {
 	state.mu.RLock()
diff --git a/satellite/overlay/service.go b/satellite/overlay/service.go
index e2e579900..2ccf8c620 100644
--- a/satellite/overlay/service.go
+++ b/satellite/overlay/service.go
@@ -430,7 +430,19 @@ func (service *Service) IsOnline(node *NodeDossier) bool {
 // requested node is not in the database, an empty string will be returned corresponding
 // to that node's last_net.
 func (service *Service) GetNodesNetworkInOrder(ctx context.Context, nodeIDs []storj.NodeID) (lastNets []string, err error) {
-	return service.UploadSelectionCache.GetNodesNetwork(ctx, nodeIDs)
+	defer mon.Task()(&ctx)(nil)
+
+	nodes, err := service.DownloadSelectionCache.GetNodes(ctx, nodeIDs)
+	if err != nil {
+		return nil, err
+	}
+	lastNets = make([]string, len(nodeIDs))
+	for i, nodeID := range nodeIDs {
+		if selectedNode, ok := nodes[nodeID]; ok {
+			lastNets[i] = selectedNode.LastNet
+		}
+	}
+	return lastNets, nil
 }
 
 // FindStorageNodesForGracefulExit searches the overlay network for nodes that meet the provided requirements for graceful-exit requests.
diff --git a/satellite/overlay/uploadselection.go b/satellite/overlay/uploadselection.go
index 306696059..dec3ebb61 100644
--- a/satellite/overlay/uploadselection.go
+++ b/satellite/overlay/uploadselection.go
@@ -10,7 +10,6 @@ import (
 	"go.uber.org/zap"
 
 	"storj.io/common/pb"
-	"storj.io/common/storj"
 	"storj.io/common/sync2"
 	"storj.io/storj/satellite/nodeselection/uploadselection"
 )
@@ -117,15 +116,6 @@ func (cache *UploadSelectionCache) Size(ctx context.Context) (reputableNodeCount
 	return stats.Reputable, stats.New, nil
 }
 
-// GetNodesNetwork returns the cached network for each given node ID.
-func (cache *UploadSelectionCache) GetNodesNetwork(ctx context.Context, nodeIDs []storj.NodeID) (nets []string, err error) {
-	state, err := cache.cache.Get(ctx, time.Now())
-	if err != nil {
-		return nil, Error.Wrap(err)
-	}
-	return state.GetNodesNetwork(ctx, nodeIDs), nil
-}
-
 func convNodesToSelectedNodes(nodes []*uploadselection.Node) (xs []*SelectedNode) {
 	for _, n := range nodes {
 		xs = append(xs, &SelectedNode{
diff --git a/satellite/repair/checker/observer_test.go b/satellite/repair/checker/observer_test.go
index c6ffd1a51..bdfba53d7 100644
--- a/satellite/repair/checker/observer_test.go
+++ b/satellite/repair/checker/observer_test.go
@@ -302,7 +302,7 @@ func TestCleanRepairQueueObserver(t *testing.T) {
 	}
 
 	require.NoError(t, observer.RefreshReliabilityCache(ctx))
-	require.NoError(t, planet.Satellites[0].RangedLoop.Overlay.Service.UploadSelectionCache.Refresh(ctx))
+	require.NoError(t, planet.Satellites[0].RangedLoop.Overlay.Service.DownloadSelectionCache.Refresh(ctx))
 
 	// check that repair queue is empty to avoid false positive
 	count, err := repairQueue.Count(ctx)
@@ -324,7 +324,7 @@ func TestCleanRepairQueueObserver(t *testing.T) {
 	}
 
 	require.NoError(t, observer.RefreshReliabilityCache(ctx))
-	require.NoError(t, planet.Satellites[0].RangedLoop.Overlay.Service.UploadSelectionCache.Refresh(ctx))
+	require.NoError(t, planet.Satellites[0].RangedLoop.Overlay.Service.DownloadSelectionCache.Refresh(ctx))
 
 	// The checker will not insert/update the now healthy segments causing
 	// them to be removed from the queue at the end of the checker iteration
diff --git a/satellite/repair/repair_test.go b/satellite/repair/repair_test.go
index 3d6e39c57..de98c7e22 100644
--- a/satellite/repair/repair_test.go
+++ b/satellite/repair/repair_test.go
@@ -3285,7 +3285,7 @@ func TestRepairClumpedPieces(t *testing.T) {
 	}
 	err = satellite.DB.OverlayCache().UpdateCheckIn(ctx, checkInInfo, time.Now().UTC(), overlay.NodeSelectionConfig{})
 	require.NoError(t, err)
-	err = satellite.RangedLoop.Overlay.Service.UploadSelectionCache.Refresh(ctx)
+	err = satellite.RangedLoop.Overlay.Service.DownloadSelectionCache.Refresh(ctx)
 	require.NoError(t, err)
 
 	// running repair checker again should put the segment into the repair queue
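
For readers outside the codebase, the contract this patch fixes is easy to
state in isolation. Below is a minimal standalone sketch of it, not the
satellite's real code: NodeID and SelectedNode here are simplified stand-ins
for storj.NodeID and overlay.SelectedNode, and the map stands in for the
download selection cache. The point is the ordered-lookup behavior of
GetNodesNetworkInOrder: results come back in the same order as the input IDs,
and an ID absent from the cache yields an empty string rather than an error.

	package main

	import "fmt"

	// NodeID and SelectedNode are simplified stand-ins for the satellite's
	// types; this sketch is illustrative only.
	type NodeID string

	type SelectedNode struct {
		ID      NodeID
		LastNet string
	}

	// getNodesNetworkInOrder mirrors the patched method's contract: the
	// result slice preserves the order of nodeIDs, and an ID missing from
	// the cache (in the real download selection cache: disqualified, fully
	// exited, or offline) leaves an empty string in its slot.
	func getNodesNetworkInOrder(cache map[NodeID]*SelectedNode, nodeIDs []NodeID) []string {
		lastNets := make([]string, len(nodeIDs))
		for i, id := range nodeIDs {
			if node, ok := cache[id]; ok {
				lastNets[i] = node.LastNet
			}
		}
		return lastNets
	}

	func main() {
		cache := map[NodeID]*SelectedNode{
			"node-a": {ID: "node-a", LastNet: "10.1.2.0"},
			"node-c": {ID: "node-c", LastNet: "10.3.4.0"},
		}
		// "node-b" is absent from the cache, so its slot stays "".
		fmt.Println(getNodesNetworkInOrder(cache, []NodeID{"node-a", "node-b", "node-c"}))
		// prints: [10.1.2.0  10.3.4.0]
	}

The empty-string-for-misses behavior is why the choice of cache matters: as
the touched repair tests suggest, callers compare last_nets positionally, so
a cache that silently omits healthy-but-unselectable nodes (as the upload
selection cache did) could make many distinct nodes appear to share one
empty network.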