satellite/overlay: refactor Reliable to be used with repair checker

Currently we use Reliable to determine missing pieces for the repair
checker. The issue is that the checker now looks at more than just
missing pieces (clumped and out-of-placement pieces), so node IDs alone
are not enough. We also have an issue where offline nodes are skipped
by the clumped and out-of-placement pieces checks.

Reliable was refactored to return data (e.g. country, lastNet) about all
reliable nodes, with the list split into online and offline nodes. This
data will be cached for quick use by the repair checker, and it will also
be possible to inspect node metadata like country code or lastNet.

We are also slowly moving the `RepairExcludedCountryCodes` config from
overlay to repair, where it makes more sense.
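
A simplified sketch of where the exclusion now happens (mirroring the
ReliabilityCache changes further down in this diff; excludedCountries is
the RepairExcludedCountryCodes value passed to NewObserver):

    excludedCountryCodes := make(map[location.CountryCode]struct{})
    for _, code := range excludedCountries {
        if cc := location.ToCountryCode(code); cc != location.None {
            excludedCountryCodes[cc] = struct{}{}
        }
    }
    reliable := make(map[storj.NodeID]struct{}, len(online))
    for _, node := range online {
        // Online nodes from excluded countries are treated as unreliable by the checker.
        if _, excluded := excludedCountryCodes[node.CountryCode]; !excluded {
            reliable[node.ID] = struct{}{}
        }
    }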

This is the first part of the changes.

https://github.com/storj/storj/issues/5998

Change-Id: If534342488c0e440affc2894a8fbda6507b8959d
Michal Niewrzal 2023-06-29 10:38:47 +02:00
parent 500b6244f8
commit f2cd7b0928
12 changed files with 162 additions and 123 deletions

View File

@@ -105,9 +105,9 @@ func TestDownloadWithSomeNodesOffline(t *testing.T) {
}
// confirm that we marked the correct number of storage nodes as offline
nodes, err := satellite.Overlay.Service.Reliable(ctx)
online, _, err := satellite.Overlay.Service.Reliable(ctx)
require.NoError(t, err)
require.Len(t, nodes, len(planet.StorageNodes)-toKill)
require.Len(t, online, len(planet.StorageNodes)-toKill)
// we should be able to download data without any of the original nodes
newData, err := ul.Download(ctx, satellite, "testbucket", "test/path")

View File

@@ -426,6 +426,7 @@ func TestAllInOne(t *testing.T) {
satellite.DB.RepairQueue(),
satellite.Overlay.Service,
satellite.Config.Checker,
[]string{},
),
})

View File

@@ -66,8 +66,8 @@ type DB interface {
KnownReliableInExcludedCountries(context.Context, *NodeCriteria, storj.NodeIDList) (storj.NodeIDList, error)
// KnownReliable filters a set of nodes to reliable (online and qualified) nodes.
KnownReliable(ctx context.Context, nodeIDs storj.NodeIDList, onlineWindow, asOfSystemInterval time.Duration) (online []uploadselection.SelectedNode, offline []uploadselection.SelectedNode, err error)
// Reliable returns all nodes that are reliable
Reliable(context.Context, *NodeCriteria) (storj.NodeIDList, error)
// Reliable returns all nodes that are reliable, online and offline.
Reliable(ctx context.Context, onlineWindow, asOfSystemInterval time.Duration) (online []uploadselection.SelectedNode, offline []uploadselection.SelectedNode, err error)
// UpdateReputation updates the DB columns for all reputation fields in ReputationStatus.
UpdateReputation(ctx context.Context, id storj.NodeID, request ReputationUpdate) error
// UpdateNodeInfo updates node dossier with info requested from the node itself like node type, email, wallet, capacity, and version.
@@ -577,15 +577,12 @@ func (service *Service) KnownReliable(ctx context.Context, nodeIDs storj.NodeIDL
return service.db.KnownReliable(ctx, nodeIDs, service.config.Node.OnlineWindow, 0)
}
// Reliable filters a set of nodes that are reliable, independent of new.
func (service *Service) Reliable(ctx context.Context) (nodes storj.NodeIDList, err error) {
// Reliable returns all nodes that are reliable, online and offline.
func (service *Service) Reliable(ctx context.Context) (online []uploadselection.SelectedNode, offline []uploadselection.SelectedNode, err error) {
defer mon.Task()(&ctx)(&err)
criteria := &NodeCriteria{
OnlineWindow: service.config.Node.OnlineWindow,
}
criteria.ExcludedCountries = service.config.RepairExcludedCountryCodes
return service.db.Reliable(ctx, criteria)
// TODO add as of system time
return service.db.Reliable(ctx, service.config.Node.OnlineWindow, 0)
}
// UpdateReputation updates the DB columns for any of the reputation fields.

View File

@@ -818,28 +818,6 @@ func TestVetAndUnvetNode(t *testing.T) {
})
}
func TestReliable(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 2, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
service := planet.Satellites[0].Overlay.Service
node := planet.StorageNodes[0]
nodes, err := service.Reliable(ctx)
require.NoError(t, err)
require.Len(t, nodes, 2)
err = planet.Satellites[0].Overlay.Service.TestNodeCountryCode(ctx, node.ID(), "FR")
require.NoError(t, err)
// first node should be excluded from Reliable result because of country code
nodes, err = service.Reliable(ctx)
require.NoError(t, err)
require.Len(t, nodes, 1)
require.NotEqual(t, node.ID(), nodes[0])
})
}
func TestKnownReliableInExcludedCountries(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 2, UplinkCount: 0,
@@ -847,15 +825,15 @@ func TestKnownReliableInExcludedCountries(t *testing.T) {
service := planet.Satellites[0].Overlay.Service
node := planet.StorageNodes[0]
nodes, err := service.Reliable(ctx)
onlineNodes, _, err := service.Reliable(ctx)
require.NoError(t, err)
require.Len(t, nodes, 2)
require.Len(t, onlineNodes, 2)
err = planet.Satellites[0].Overlay.Service.TestNodeCountryCode(ctx, node.ID(), "FR")
require.NoError(t, err)
// first node should be excluded from Reliable result because of country code
nodes, err = service.KnownReliableInExcludedCountries(ctx, nodes)
nodes, err := service.KnownReliableInExcludedCountries(ctx, storj.NodeIDList{onlineNodes[0].ID, onlineNodes[1].ID})
require.NoError(t, err)
require.Len(t, nodes, 1)
require.Equal(t, node.ID(), nodes[0])

View File

@@ -102,18 +102,13 @@ func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
storj.NodeID{7}, storj.NodeID{8},
storj.NodeID{9},
}
criteria := &overlay.NodeCriteria{
OnlineWindow: time.Hour,
ExcludedCountries: []string{"FR", "BE"},
}
contains := func(nodeID storj.NodeID) func(node uploadselection.SelectedNode) bool {
return func(node uploadselection.SelectedNode) bool {
return node.ID == nodeID
}
}
online, offline, err := cache.KnownReliable(ctx, nodeIds, criteria.OnlineWindow, criteria.AsOfSystemInterval)
online, offline, err := cache.KnownReliable(ctx, nodeIds, time.Hour, 0)
require.NoError(t, err)
// unrealiable nodes shouldn't be in results
@@ -124,19 +119,26 @@ func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
require.False(t, slices.ContainsFunc(append(online, offline...), contains(storj.NodeID{9}))) // not in db
require.True(t, slices.ContainsFunc(offline, contains(storj.NodeID{4}))) // offline
// KnownReliable is not excluding by country anymore
require.True(t, slices.ContainsFunc(online, contains(storj.NodeID{7}))) // excluded country
require.Len(t, append(online, offline...), 4)
valid, err := cache.Reliable(ctx, criteria)
online, offline, err = cache.Reliable(ctx, time.Hour, 0)
require.NoError(t, err)
require.NotContains(t, valid, storj.NodeID{2}) // disqualified
require.NotContains(t, valid, storj.NodeID{3}) // unknown audit suspended
require.NotContains(t, valid, storj.NodeID{4}) // offline
require.NotContains(t, valid, storj.NodeID{5}) // gracefully exited
require.NotContains(t, valid, storj.NodeID{6}) // offline suspended
require.NotContains(t, valid, storj.NodeID{7}) // excluded country
require.NotContains(t, valid, storj.NodeID{9}) // not in db
require.Len(t, valid, 2)
require.False(t, slices.ContainsFunc(append(online, offline...), contains(storj.NodeID{2}))) // disqualified
require.False(t, slices.ContainsFunc(append(online, offline...), contains(storj.NodeID{3}))) // unknown audit suspended
require.False(t, slices.ContainsFunc(append(online, offline...), contains(storj.NodeID{5}))) // gracefully exited
require.False(t, slices.ContainsFunc(append(online, offline...), contains(storj.NodeID{6}))) // offline suspended
require.False(t, slices.ContainsFunc(append(online, offline...), contains(storj.NodeID{9}))) // not in db
require.True(t, slices.ContainsFunc(offline, contains(storj.NodeID{4}))) // offline
// Reliable is not excluding by country anymore
require.True(t, slices.ContainsFunc(online, contains(storj.NodeID{7}))) // excluded country
require.Len(t, append(online, offline...), 4)
}
{ // TestUpdateOperator

View File

@@ -156,6 +156,7 @@ func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Conf
peer.DB.RepairQueue(),
peer.Overlay.Service,
config.Checker,
config.Overlay.RepairExcludedCountryCodes,
)
}

View File

@@ -51,12 +51,13 @@ type Observer struct {
}
// NewObserver creates new checker observer instance.
func NewObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *overlay.Service, config Config) *Observer {
// TODO move excludedCountries into config but share it somehow with segment repairer.
func NewObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *overlay.Service, config Config, excludedCountries []string) *Observer {
return &Observer{
logger: logger,
repairQueue: repairQueue,
nodestate: NewReliabilityCache(overlay, config.ReliabilityCacheStaleness),
nodestate: NewReliabilityCache(overlay, config.ReliabilityCacheStaleness, excludedCountries),
overlayService: overlay,
repairOverrides: config.RepairOverrides.GetMap(),
nodeFailureRate: config.NodeFailureRate,

View File

@@ -554,7 +554,7 @@ func BenchmarkRemoteSegment(b *testing.B) {
}
observer := checker.NewObserver(zap.NewNop(), planet.Satellites[0].DB.RepairQueue(),
planet.Satellites[0].Auditor.Overlay, planet.Satellites[0].Config.Checker)
planet.Satellites[0].Auditor.Overlay, planet.Satellites[0].Config.Checker, []string{})
segments, err := planet.Satellites[0].Metabase.DB.TestingAllSegments(ctx)
require.NoError(b, err)

View File

@@ -10,6 +10,7 @@ import (
"time"
"storj.io/common/storj"
"storj.io/common/storj/location"
"storj.io/storj/satellite/metabase"
"storj.io/storj/satellite/overlay"
)
@@ -19,10 +20,11 @@ import (
//
// architecture: Service
type ReliabilityCache struct {
overlay *overlay.Service
staleness time.Duration
mu sync.Mutex
state atomic.Value // contains immutable *reliabilityState
overlay *overlay.Service
staleness time.Duration
excludedCountryCodes map[location.CountryCode]struct{}
mu sync.Mutex
state atomic.Value // contains immutable *reliabilityState
}
// reliabilityState.
@@ -32,10 +34,18 @@ type reliabilityState struct {
}
// NewReliabilityCache creates a new reliability checking cache.
func NewReliabilityCache(overlay *overlay.Service, staleness time.Duration) *ReliabilityCache {
func NewReliabilityCache(overlay *overlay.Service, staleness time.Duration, excludedCountries []string) *ReliabilityCache {
excludedCountryCodes := make(map[location.CountryCode]struct{})
for _, countryCode := range excludedCountries {
if cc := location.ToCountryCode(countryCode); cc != location.None {
excludedCountryCodes[location.ToCountryCode(countryCode)] = struct{}{}
}
}
return &ReliabilityCache{
overlay: overlay,
staleness: staleness,
overlay: overlay,
staleness: staleness,
excludedCountryCodes: excludedCountryCodes,
}
}
@@ -114,17 +124,19 @@ func (cache *ReliabilityCache) Refresh(ctx context.Context) (err error) {
func (cache *ReliabilityCache) refreshLocked(ctx context.Context) (_ *reliabilityState, err error) {
defer mon.Task()(&ctx)(&err)
nodes, err := cache.overlay.Reliable(ctx)
online, _, err := cache.overlay.Reliable(ctx)
if err != nil {
return nil, Error.Wrap(err)
}
state := &reliabilityState{
created: time.Now(),
reliable: make(map[storj.NodeID]struct{}, len(nodes)),
reliable: make(map[storj.NodeID]struct{}, len(online)),
}
for _, id := range nodes {
state.reliable[id] = struct{}{}
for _, node := range online {
if _, ok := cache.excludedCountryCodes[node.CountryCode]; !ok {
state.reliable[node.ID] = struct{}{}
}
}
cache.state.Store(state)

View File

@@ -12,11 +12,11 @@ import (
"go.uber.org/zap"
"golang.org/x/sync/errgroup"
"storj.io/common/storj"
"storj.io/common/testcontext"
"storj.io/common/testrand"
"storj.io/storj/satellite/metabase"
"storj.io/storj/satellite/nodeevents"
"storj.io/storj/satellite/nodeselection/uploadselection"
"storj.io/storj/satellite/overlay"
)
@@ -35,7 +35,7 @@ func TestReliabilityCache_Concurrent(t *testing.T) {
ctx.Go(func() error { return overlayCache.Run(cacheCtx) })
defer ctx.Check(overlayCache.Close)
cache := NewReliabilityCache(overlayCache, time.Millisecond)
cache := NewReliabilityCache(overlayCache, time.Millisecond, []string{})
var group errgroup.Group
for i := 0; i < 10; i++ {
group.Go(func() error {
@@ -55,11 +55,11 @@ func TestReliabilityCache_Concurrent(t *testing.T) {
type fakeOverlayDB struct{ overlay.DB }
type fakeNodeEvents struct{ nodeevents.DB }
func (fakeOverlayDB) Reliable(context.Context, *overlay.NodeCriteria) (storj.NodeIDList, error) {
return storj.NodeIDList{
testrand.NodeID(),
testrand.NodeID(),
testrand.NodeID(),
testrand.NodeID(),
}, nil
func (fakeOverlayDB) Reliable(context.Context, time.Duration, time.Duration) ([]uploadselection.SelectedNode, []uploadselection.SelectedNode, error) {
return []uploadselection.SelectedNode{
{ID: testrand.NodeID()},
{ID: testrand.NodeID()},
{ID: testrand.NodeID()},
{ID: testrand.NodeID()},
}, nil, nil
}

View File

@@ -483,19 +483,11 @@ func (cache *overlaycache) knownReliable(ctx context.Context, nodeIDs storj.Node
`, pgutil.NodeIDArray(nodeIDs), time.Now().Add(-onlineWindow),
))(func(rows tagsql.Rows) error {
for rows.Next() {
var onlineNode bool
var node uploadselection.SelectedNode
node.Address = &pb.NodeAddress{}
var lastIPPort sql.NullString
err = rows.Scan(&node.ID, &node.Address.Address, &node.LastNet, &lastIPPort, &node.CountryCode, &onlineNode)
node, onlineNode, err := scanSelectedNode(rows)
if err != nil {
return err
}
if lastIPPort.Valid {
node.LastIPPort = lastIPPort.String
}
if onlineNode {
online = append(online, node)
} else {
@@ -508,63 +500,67 @@ func (cache *overlaycache) knownReliable(ctx context.Context, nodeIDs storj.Node
return online, offline, Error.Wrap(err)
}
// Reliable returns all reliable nodes.
func (cache *overlaycache) Reliable(ctx context.Context, criteria *overlay.NodeCriteria) (nodes storj.NodeIDList, err error) {
// Reliable returns all nodes that are reliable, online and offline.
func (cache *overlaycache) Reliable(ctx context.Context, onlineWindow, asOfSystemInterval time.Duration) (online []uploadselection.SelectedNode, offline []uploadselection.SelectedNode, err error) {
for {
nodes, err = cache.reliable(ctx, criteria)
online, offline, err = cache.reliable(ctx, onlineWindow, asOfSystemInterval)
if err != nil {
if cockroachutil.NeedsRetry(err) {
continue
}
return nodes, err
return nil, nil, err
}
break
}
return nodes, err
return online, offline, nil
}
func (cache *overlaycache) reliable(ctx context.Context, criteria *overlay.NodeCriteria) (nodes storj.NodeIDList, err error) {
args := []interface{}{
time.Now().Add(-criteria.OnlineWindow),
}
func (cache *overlaycache) reliable(ctx context.Context, onlineWindow, asOfSystemInterval time.Duration) (online []uploadselection.SelectedNode, offline []uploadselection.SelectedNode, err error) {
defer mon.Task()(&ctx)(&err)
// When this config is not set, it's a string slice with one empty string. I added some sanity checks to make sure we don't
// dereference a nil pointer or index an element that doesn't exist.
var excludedCountriesCondition string
if criteria.ExcludedCountries != nil && len(criteria.ExcludedCountries) != 0 && criteria.ExcludedCountries[0] != "" {
excludedCountriesCondition = "AND country_code NOT IN (SELECT UNNEST($2::TEXT[]))"
args = append(args, pgutil.TextArray(criteria.ExcludedCountries))
}
// get reliable and online nodes
rows, err := cache.db.Query(ctx, cache.db.Rebind(`
SELECT id
err = withRows(cache.db.Query(ctx, `
SELECT id, address, last_net, last_ip_port, country_code, last_contact_success > $1 as online
FROM nodes
`+cache.db.impl.AsOfSystemInterval(criteria.AsOfSystemInterval)+`
`+cache.db.impl.AsOfSystemInterval(asOfSystemInterval)+`
WHERE disqualified IS NULL
AND unknown_audit_suspended IS NULL
AND offline_suspended IS NULL
AND exit_finished_at IS NULL
AND last_contact_success > $1
`+excludedCountriesCondition+`
`), args...)
if err != nil {
return nil, err
}
defer func() {
err = errs.Combine(err, rows.Close())
}()
AND unknown_audit_suspended IS NULL
AND offline_suspended IS NULL
AND exit_finished_at IS NULL
`, time.Now().Add(-onlineWindow),
))(func(rows tagsql.Rows) error {
for rows.Next() {
node, onlineNode, err := scanSelectedNode(rows)
if err != nil {
return err
}
for rows.Next() {
var id storj.NodeID
err = rows.Scan(&id)
if err != nil {
return nil, err
if onlineNode {
online = append(online, node)
} else {
offline = append(offline, node)
}
}
nodes = append(nodes, id)
return nil
})
return online, offline, Error.Wrap(err)
}
func scanSelectedNode(rows tagsql.Rows) (uploadselection.SelectedNode, bool, error) {
var onlineNode bool
var node uploadselection.SelectedNode
node.Address = &pb.NodeAddress{}
var lastIPPort sql.NullString
err := rows.Scan(&node.ID, &node.Address.Address, &node.LastNet, &lastIPPort, &node.CountryCode, &onlineNode)
if err != nil {
return uploadselection.SelectedNode{}, false, err
}
return nodes, Error.Wrap(rows.Err())
if lastIPPort.Valid {
node.LastIPPort = lastIPPort.String
}
return node, onlineNode, nil
}
// UpdateReputation updates the DB columns for any of the reputation fields in ReputationUpdate.

View File

@@ -496,8 +496,59 @@ func TestOverlayCache_KnownReliable(t *testing.T) {
require.ElementsMatch(t, tc.Offline, offline)
}
// test empty id list
_, _, err := cache.KnownReliable(ctx, storj.NodeIDList{}, 1*time.Hour, 0)
require.Error(t, err)
// test as of system time
_, _, err = cache.KnownReliable(ctx, ids(allNodes...), 1*time.Hour, -1*time.Microsecond)
require.NoError(t, err)
})
}
func TestOverlayCache_Reliable(t *testing.T) {
satellitedbtest.Run(t, func(ctx *testcontext.Context, t *testing.T, db satellite.DB) {
cache := db.OverlayCache()
allNodes := []uploadselection.SelectedNode{
addNode(ctx, t, cache, "online", "127.0.0.1", true, false, false, false, false),
addNode(ctx, t, cache, "offline", "127.0.0.2", false, false, false, false, false),
addNode(ctx, t, cache, "disqalified", "127.0.0.3", false, true, false, false, false),
addNode(ctx, t, cache, "audit-suspended", "127.0.0.4", false, false, true, false, false),
addNode(ctx, t, cache, "offline-suspended", "127.0.0.5", false, false, false, true, false),
addNode(ctx, t, cache, "exited", "127.0.0.6", false, false, false, false, true),
}
type testCase struct {
OnlineWindow time.Duration
Online []uploadselection.SelectedNode
Offline []uploadselection.SelectedNode
}
for i, tc := range []testCase{
{
OnlineWindow: 1 * time.Hour,
Online: []uploadselection.SelectedNode{allNodes[0]},
Offline: []uploadselection.SelectedNode{allNodes[1]},
},
{
OnlineWindow: 20 * time.Hour,
Online: []uploadselection.SelectedNode{allNodes[0], allNodes[1]},
},
{
OnlineWindow: 1 * time.Microsecond,
Offline: []uploadselection.SelectedNode{allNodes[0], allNodes[1]},
},
} {
online, offline, err := cache.Reliable(ctx, tc.OnlineWindow, 0)
require.NoError(t, err)
require.ElementsMatch(t, tc.Online, online, "#%d", i)
require.ElementsMatch(t, tc.Offline, offline, "#%d", i)
}
// test as of system time
_, _, err := cache.Reliable(ctx, 1*time.Hour, -1*time.Microsecond)
require.NoError(t, err)
})
}