satellite/overlay: fix GetNodesNetworkInOrder

We were using the UploadSelectionCache previously, which does _not_ have
all nodes, or even all online nodes, in it. So all nodes with less than
MinimumVersion, or with less than MinimumDiskSpace, or nodes suspended
for unknown audit errors, or nodes that have started graceful exit, were
all missing, and ended up having empty last_nets. Even with all that,
I'm kind of surprised how many nodes this involved, but using the upload
selection cache was definitely wrong.

This change uses the download selection cache instead, which excludes
nodes only when they are disqualified, gracefully exited (completely),
or offline.

Change-Id: Iaa07c988aa29c1eb05796ac48a6f19d69f5826c1
This commit is contained in:
paul cannon 2023-05-18 16:34:31 -05:00 committed by Michal Niewrzal
parent 9b3488276d
commit c856d45cc0
5 changed files with 16 additions and 30 deletions

View File

@ -129,22 +129,6 @@ func (state *State) Select(ctx context.Context, request Request) (_ []*Node, err
return selected, nil
}
// GetNodesNetwork returns the cached network for each given node ID.
//
// The returned slice has the same length and order as nodeIDs. An ID that has
// no entry in state.netByID yields an empty string at its position.
func (state *State) GetNodesNetwork(ctx context.Context, nodeIDs []storj.NodeID) (nets []string) {
// This method returns no error, so the task is given nil.
defer mon.Task()(&ctx)(nil)
// Hold the read lock for the whole lookup.
state.mu.RLock()
defer state.mu.RUnlock()
nets = make([]string, len(nodeIDs))
for i, nodeID := range nodeIDs {
// IDs that are not found keep the zero-value empty string from make.
if net, ok := state.netByID[nodeID]; ok {
nets[i] = net
}
}
return nets
}
// Stats returns state information.
func (state *State) Stats() Stats {
state.mu.RLock()

View File

@ -430,7 +430,19 @@ func (service *Service) IsOnline(node *NodeDossier) bool {
// requested node is not in the database, an empty string will be returned corresponding
// to that node's last_net.
func (service *Service) GetNodesNetworkInOrder(ctx context.Context, nodeIDs []storj.NodeID) (lastNets []string, err error) {
// Pre-change body, removed by this commit: it delegated to the upload
// selection cache, which the commit message says does not hold all nodes.
return service.UploadSelectionCache.GetNodesNetwork(ctx, nodeIDs)
// Post-change body, added by this commit, starts here.
// NOTE(review): the task is given nil rather than &err, so it presumably
// cannot observe the returned error — confirm this is intentional.
defer mon.Task()(&ctx)(nil)
// Look the requested nodes up in the download selection cache instead.
nodes, err := service.DownloadSelectionCache.GetNodes(ctx, nodeIDs)
if err != nil {
return nil, err
}
// lastNets is index-aligned with nodeIDs; entries for IDs missing from the
// lookup result keep the zero value, an empty string.
lastNets = make([]string, len(nodeIDs))
for i, nodeID := range nodeIDs {
if selectedNode, ok := nodes[nodeID]; ok {
lastNets[i] = selectedNode.LastNet
}
}
return lastNets, nil
}
// FindStorageNodesForGracefulExit searches the overlay network for nodes that meet the provided requirements for graceful-exit requests.

View File

@ -10,7 +10,6 @@ import (
"go.uber.org/zap"
"storj.io/common/pb"
"storj.io/common/storj"
"storj.io/common/sync2"
"storj.io/storj/satellite/nodeselection/uploadselection"
)
@ -117,15 +116,6 @@ func (cache *UploadSelectionCache) Size(ctx context.Context) (reputableNodeCount
return stats.Reputable, stats.New, nil
}
// GetNodesNetwork returns the cached network for each given node ID.
//
// It fetches the state from cache.cache as of time.Now() and delegates the
// lookup to that state's GetNodesNetwork. If fetching the state fails, it
// returns nil nets and the error wrapped with Error.Wrap.
func (cache *UploadSelectionCache) GetNodesNetwork(ctx context.Context, nodeIDs []storj.NodeID) (nets []string, err error) {
state, err := cache.cache.Get(ctx, time.Now())
if err != nil {
return nil, Error.Wrap(err)
}
// The state lookup itself returns no error, so nil is returned alongside it.
return state.GetNodesNetwork(ctx, nodeIDs), nil
}
func convNodesToSelectedNodes(nodes []*uploadselection.Node) (xs []*SelectedNode) {
for _, n := range nodes {
xs = append(xs, &SelectedNode{

View File

@ -302,7 +302,7 @@ func TestCleanRepairQueueObserver(t *testing.T) {
}
require.NoError(t, observer.RefreshReliabilityCache(ctx))
require.NoError(t, planet.Satellites[0].RangedLoop.Overlay.Service.UploadSelectionCache.Refresh(ctx))
require.NoError(t, planet.Satellites[0].RangedLoop.Overlay.Service.DownloadSelectionCache.Refresh(ctx))
// check that repair queue is empty to avoid false positive
count, err := repairQueue.Count(ctx)
@ -324,7 +324,7 @@ func TestCleanRepairQueueObserver(t *testing.T) {
}
require.NoError(t, observer.RefreshReliabilityCache(ctx))
require.NoError(t, planet.Satellites[0].RangedLoop.Overlay.Service.UploadSelectionCache.Refresh(ctx))
require.NoError(t, planet.Satellites[0].RangedLoop.Overlay.Service.DownloadSelectionCache.Refresh(ctx))
// The checker will not insert/update the now healthy segments causing
// them to be removed from the queue at the end of the checker iteration

View File

@ -3285,7 +3285,7 @@ func TestRepairClumpedPieces(t *testing.T) {
}
err = satellite.DB.OverlayCache().UpdateCheckIn(ctx, checkInInfo, time.Now().UTC(), overlay.NodeSelectionConfig{})
require.NoError(t, err)
err = satellite.RangedLoop.Overlay.Service.UploadSelectionCache.Refresh(ctx)
err = satellite.RangedLoop.Overlay.Service.DownloadSelectionCache.Refresh(ctx)
require.NoError(t, err)
// running repair checker again should put the segment into the repair queue