discovery: remove graveyard (#2145)
commit 43d4f3daf5 (parent a55df84bf7)
@@ -458,7 +458,6 @@ func (planet *Planet) newSatellites(count int) ([]*satellite.Peer, error) {
             },
         },
         Discovery: discovery.Config{
-            GraveyardInterval: 1 * time.Second,
             DiscoveryInterval: 1 * time.Second,
             RefreshInterval:   1 * time.Second,
             RefreshLimit:      100,
@@ -31,7 +31,6 @@ func TestDataRepair(t *testing.T) {
     // stop discovery service so that we do not get a race condition when we delete nodes from overlay cache
     satellite.Discovery.Service.Discovery.Stop()
     satellite.Discovery.Service.Refresh.Stop()
-    satellite.Discovery.Service.Graveyard.Stop()

     satellite.Repair.Checker.Loop.Pause()
     satellite.Repair.Repairer.Loop.Pause()
@@ -29,7 +29,6 @@ var (
 // Config loads on the configuration values for the cache
 type Config struct {
     RefreshInterval   time.Duration `help:"the interval at which the cache refreshes itself in seconds" default:"1s"`
-    GraveyardInterval time.Duration `help:"the interval at which the the graveyard tries to resurrect nodes" default:"30s"`
     DiscoveryInterval time.Duration `help:"the interval at which the satellite attempts to find new nodes via random node ID lookups" default:"1s"`
     RefreshLimit      int           `help:"the amount of nodes refreshed at each interval" default:"100"`
 }
@@ -45,7 +44,6 @@ type Discovery struct {
     refreshLimit int

     Refresh   sync2.Cycle
-    Graveyard sync2.Cycle
     Discovery sync2.Cycle
 }

@@ -61,7 +59,6 @@ func New(logger *zap.Logger, ol *overlay.Cache, kad *kademlia.Kademlia, config C
     }

     discovery.Refresh.SetInterval(config.RefreshInterval)
-    discovery.Graveyard.SetInterval(config.GraveyardInterval)
     discovery.Discovery.SetInterval(config.DiscoveryInterval)

     return discovery
@@ -70,7 +67,6 @@ func New(logger *zap.Logger, ol *overlay.Cache, kad *kademlia.Kademlia, config C
 // Close closes resources
 func (discovery *Discovery) Close() error {
     discovery.Refresh.Close()
-    discovery.Graveyard.Close()
     discovery.Discovery.Close()
     return nil
 }
@@ -87,13 +83,6 @@ func (discovery *Discovery) Run(ctx context.Context) (err error) {
         }
         return nil
     })
-    discovery.Graveyard.Start(ctx, &group, func(ctx context.Context) error {
-        err := discovery.searchGraveyard(ctx)
-        if err != nil {
-            discovery.log.Error("graveyard resurrection failed: ", zap.Error(err))
-        }
-        return nil
-    })
     discovery.Discovery.Start(ctx, &group, func(ctx context.Context) error {
        err := discovery.discover(ctx)
        if err != nil {
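For readers unfamiliar with the cycle plumbing that Run keeps using after this change: each sync2.Cycle is given an interval, started against an error group, and closed on shutdown, which are exactly the calls visible in the hunks above. The standalone sketch below only illustrates that lifecycle; the import path storj.io/storj/internal/sync2 and the errgroup.Group type of `group` are assumptions, not something this diff shows.

package main

import (
    "context"
    "fmt"
    "time"

    "golang.org/x/sync/errgroup"

    "storj.io/storj/internal/sync2" // assumed import path for sync2.Cycle
)

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
    defer cancel()

    var refresh sync2.Cycle          // zero value, like the Refresh field on Discovery
    refresh.SetInterval(time.Second) // as in New()

    var group errgroup.Group // assumed type of the `group` passed to Start in Run
    refresh.Start(ctx, &group, func(ctx context.Context) error {
        fmt.Println("refresh tick") // the real callback calls discovery.refresh(ctx)
        return nil
    })

    _ = group.Wait() // returns once ctx expires
    refresh.Close()  // as in (*Discovery).Close()
}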
@@ -110,13 +99,6 @@ func (discovery *Discovery) Run(ctx context.Context) (err error) {
 func (discovery *Discovery) refresh(ctx context.Context) (err error) {
     defer mon.Task()(&ctx)(&err)

-    nodes := discovery.kad.Seen()
-    for _, v := range nodes {
-        if err := discovery.cache.Put(ctx, v.Id, *v); err != nil {
-            return err
-        }
-    }
-
     list, more, err := discovery.cache.Paginate(ctx, discovery.refreshOffset, discovery.refreshLimit)
     if err != nil {
         return Error.Wrap(err)
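With the kad.Seen() pre-pass gone, refresh is left with the cache pagination shown above. The fragment below is only a generic sketch of that offset/limit walking pattern under a hypothetical pager interface; it is not the satellite's refresh body (the per-node work and the refreshOffset bookkeeping sit outside this hunk).

package discoverysketch

import "context"

// pager and walkAll are hypothetical stand-ins; the real code calls
// discovery.cache.Paginate and keeps its offset in discovery.refreshOffset.
type pager interface {
    Paginate(ctx context.Context, offset int64, limit int) (page []string, more bool, err error)
}

func walkAll(ctx context.Context, p pager, limit int) error {
    var offset int64
    for {
        page, more, err := p.Paginate(ctx, offset, limit)
        if err != nil {
            return err
        }
        for range page {
            // per-node work (ping, uptime update) would happen here
        }
        if !more {
            return nil
        }
        offset += int64(len(page))
    }
}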
@@ -169,46 +151,6 @@ func (discovery *Discovery) refresh(ctx context.Context) (err error) {
     return nil
 }

-// graveyard attempts to ping all nodes in the Seen() map from Kademlia and adds them to the cache
-// if they respond. This is an attempt to resurrect nodes that may have gone offline in the last hour
-// and were removed from the cache due to an unsuccessful response.
-func (discovery *Discovery) searchGraveyard(ctx context.Context) (err error) {
-    defer mon.Task()(&ctx)(&err)
-
-    seen := discovery.kad.Seen()
-
-    var errors errs.Group
-    for _, n := range seen {
-        if ctx.Err() != nil {
-            return ctx.Err()
-        }
-
-        ping, err := discovery.kad.Ping(ctx, *n)
-        if err != nil {
-            discovery.log.Debug("could not ping node in graveyard check")
-            // we don't want to report the ping error to ErrorGroup because it's to be expected here.
-            continue
-        }
-
-        if ctx.Err() != nil {
-            return ctx.Err()
-        }
-
-        err = discovery.cache.Put(ctx, ping.Id, ping)
-        if err != nil {
-            discovery.log.Warn("could not update node uptime")
-            errors.Add(err)
-        }
-
-        _, err = discovery.cache.UpdateUptime(ctx, ping.Id, true)
-        if err != nil {
-            discovery.log.Warn("could not update node uptime")
-            errors.Add(err)
-        }
-    }
-    return errors.Err()
-}
-
 // Discovery runs lookups for random node ID's to find new nodes in the network
 func (discovery *Discovery) discover(ctx context.Context) (err error) {
     defer mon.Task()(&ctx)(&err)
@@ -27,7 +27,7 @@ func TestCache_Refresh(t *testing.T) {
     })
 }

-func TestCache_Graveyard(t *testing.T) {
+func TestCache_Discovery(t *testing.T) {
     testplanet.Run(t, testplanet.Config{
         SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
     }, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
@@ -38,7 +38,6 @@ func TestCache_Graveyard(t *testing.T) {
     satellite.Kademlia.Service.RefreshBuckets.Pause()

     satellite.Discovery.Service.Refresh.Pause()
-    satellite.Discovery.Service.Graveyard.Pause()
     satellite.Discovery.Service.Discovery.Pause()

     overlay := satellite.Overlay.Service
@@ -51,7 +50,7 @@ func TestCache_Graveyard(t *testing.T) {
     assert.NoError(t, err)
     assert.False(t, overlay.IsOnline(node))

-    satellite.Discovery.Service.Graveyard.TriggerWait()
+    satellite.Discovery.Service.Discovery.TriggerWait()

     found, err := overlay.Get(ctx, offlineID)
     assert.NoError(t, err)
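The test drives the remaining cycles by hand: Pause stops the timer-driven runs, and TriggerWait forces a single run and waits for it to finish before the assertions. Below is a minimal sketch of that pattern, under the same assumed import paths as the earlier cycle sketch; the sync/atomic counter is only there to keep the example race-free.

package main

import (
    "context"
    "fmt"
    "sync/atomic"
    "time"

    "golang.org/x/sync/errgroup"

    "storj.io/storj/internal/sync2" // assumed import path for sync2.Cycle
)

func main() {
    var cycle sync2.Cycle
    cycle.SetInterval(time.Hour) // interval is irrelevant once the cycle is driven manually
    defer cycle.Close()

    var runs int32
    var group errgroup.Group
    cycle.Start(context.Background(), &group, func(ctx context.Context) error {
        atomic.AddInt32(&runs, 1) // the real callback is discovery.discover(ctx)
        return nil
    })

    cycle.Pause()       // like satellite.Discovery.Service.Discovery.Pause()
    cycle.TriggerWait() // force exactly one pass before asserting, as the test does
    fmt.Println("passes:", atomic.LoadInt32(&runs))
}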
@@ -327,17 +327,6 @@ func (k *Kademlia) lookup(ctx context.Context, nodeID storj.NodeID, isBootstrap
     return *target, nil
 }

-// Seen returns all nodes that this kademlia instance has successfully communicated with
-func (k *Kademlia) Seen() []*pb.Node {
-    nodes := []*pb.Node{}
-    k.routingTable.mutex.Lock()
-    for _, v := range k.routingTable.seen {
-        nodes = append(nodes, pb.CopyNode(v))
-    }
-    k.routingTable.mutex.Unlock()
-    return nodes
-}
-
 // GetNodesWithinKBucket returns all the routing nodes in the specified k-bucket
 func (k *Kademlia) GetNodesWithinKBucket(bID bucketID) ([]*pb.Node, error) {
     return k.routingTable.getUnmarshaledNodesFromBucket(bID)
@@ -27,7 +27,10 @@ func TestLookupNodes(t *testing.T) {
     k := planet.Satellites[0].Kademlia.Service
     k.WaitForBootstrap() // redundant, but leaving here to be clear

-    seen := k.Seen()
+    seen, err := k.DumpNodes(ctx)
+    if err != nil {
+        t.Fatal(err)
+    }
     assert.NotEqual(t, len(seen), 0)
     assert.NotNil(t, seen)

@@ -62,7 +62,6 @@ type RoutingTable struct {
     transport        *pb.NodeTransport
     mutex            *sync.Mutex
     rcMutex          *sync.Mutex
-    seen             map[storj.NodeID]*pb.Node
     replacementCache map[bucketID][]*pb.Node
     bucketSize       int // max number of nodes stored in a kbucket = 20 (k)
     rcBucketSize     int // replacementCache bucket max length
@@ -87,7 +86,6 @@ func NewRoutingTable(logger *zap.Logger, localNode *overlay.NodeDossier, kdb, nd

         mutex:            &sync.Mutex{},
         rcMutex:          &sync.Mutex{},
-        seen:             make(map[storj.NodeID]*pb.Node),
         replacementCache: make(map[bucketID][]*pb.Node),

         bucketSize:   config.BucketSize,
@@ -227,9 +225,6 @@ func (rt *RoutingTable) ConnectionSuccess(node *pb.Node) (err error) {
         return nil
     }

-    rt.mutex.Lock()
-    rt.seen[node.Id] = node
-    rt.mutex.Unlock()
     v, err := rt.nodeBucketDB.Get(ctx, storage.Key(node.Id.Bytes()))
     if err != nil && !storage.ErrKeyNotFound.Has(err) {
         return RoutingErr.New("could not get node %s", err)
@@ -45,7 +45,6 @@ func newTestRoutingTable(local *overlay.NodeDossier, opts routingTableOpts) (*Ro

         mutex:            &sync.Mutex{},
         rcMutex:          &sync.Mutex{},
-        seen:             make(map[storj.NodeID]*pb.Node),
         replacementCache: make(map[bucketID][]*pb.Node),

         bucketSize:   opts.bucketSize,
scripts/testdata/satellite-config.yaml.lock (vendored, 3 lines changed)
@@ -61,9 +61,6 @@ defaults: "release"
 # the interval at which the satellite attempts to find new nodes via random node ID lookups
 # discovery.discovery-interval: 1s

-# the interval at which the the graveyard tries to resurrect nodes
-# discovery.graveyard-interval: 30s
-
 # the interval at which the cache refreshes itself in seconds
 # discovery.refresh-interval: 1s

|
Loading…
Reference in New Issue
Block a user