discovery: remove graveyard (#2145)
commit 43d4f3daf5 (parent a55df84bf7)
@@ -458,7 +458,6 @@ func (planet *Planet) newSatellites(count int) ([]*satellite.Peer, error) {
 				},
 			},
 			Discovery: discovery.Config{
-				GraveyardInterval: 1 * time.Second,
 				DiscoveryInterval: 1 * time.Second,
 				RefreshInterval:   1 * time.Second,
 				RefreshLimit:      100,
@@ -31,7 +31,6 @@ func TestDataRepair(t *testing.T) {
 		// stop discovery service so that we do not get a race condition when we delete nodes from overlay cache
 		satellite.Discovery.Service.Discovery.Stop()
 		satellite.Discovery.Service.Refresh.Stop()
-		satellite.Discovery.Service.Graveyard.Stop()

 		satellite.Repair.Checker.Loop.Pause()
 		satellite.Repair.Repairer.Loop.Pause()
@@ -29,7 +29,6 @@ var (
 // Config loads on the configuration values for the cache
 type Config struct {
 	RefreshInterval   time.Duration `help:"the interval at which the cache refreshes itself in seconds" default:"1s"`
-	GraveyardInterval time.Duration `help:"the interval at which the the graveyard tries to resurrect nodes" default:"30s"`
 	DiscoveryInterval time.Duration `help:"the interval at which the satellite attempts to find new nodes via random node ID lookups" default:"1s"`
 	RefreshLimit      int           `help:"the amount of nodes refreshed at each interval" default:"100"`
 }
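The remaining `Config` fields carry `help` and `default` struct tags, which the satellite's configuration machinery presumably turns into flags and into the commented defaults visible in the `satellite-config.yaml.lock` hunk at the bottom of this diff. As a rough illustration only (this is not the project's actual config loader, just a sketch of how such tags can be consumed), the tags can be read with plain reflection:

```go
package main

import (
	"fmt"
	"reflect"
	"time"
)

// Config mirrors the discovery config after this change (graveyard field removed).
type Config struct {
	RefreshInterval   time.Duration `help:"the interval at which the cache refreshes itself in seconds" default:"1s"`
	DiscoveryInterval time.Duration `help:"the interval at which the satellite attempts to find new nodes via random node ID lookups" default:"1s"`
	RefreshLimit      int           `help:"the amount of nodes refreshed at each interval" default:"100"`
}

func main() {
	// Walk the struct tags the way a config loader might: each field exposes a
	// default value plus the help text that ends up as a comment in the
	// generated config file.
	t := reflect.TypeOf(Config{})
	for i := 0; i < t.NumField(); i++ {
		f := t.Field(i)
		fmt.Printf("%-17s default=%-4s help=%q\n", f.Name, f.Tag.Get("default"), f.Tag.Get("help"))
	}
}
```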
@@ -45,7 +44,6 @@ type Discovery struct {
 	refreshLimit int

 	Refresh   sync2.Cycle
-	Graveyard sync2.Cycle
 	Discovery sync2.Cycle
 }

@@ -61,7 +59,6 @@ func New(logger *zap.Logger, ol *overlay.Cache, kad *kademlia.Kademlia, config C
 	}

 	discovery.Refresh.SetInterval(config.RefreshInterval)
-	discovery.Graveyard.SetInterval(config.GraveyardInterval)
 	discovery.Discovery.SetInterval(config.DiscoveryInterval)

 	return discovery
@@ -70,7 +67,6 @@ func New(logger *zap.Logger, ol *overlay.Cache, kad *kademlia.Kademlia, config C
 // Close closes resources
 func (discovery *Discovery) Close() error {
 	discovery.Refresh.Close()
-	discovery.Graveyard.Close()
 	discovery.Discovery.Close()
 	return nil
 }
@@ -87,13 +83,6 @@ func (discovery *Discovery) Run(ctx context.Context) (err error) {
 		}
 		return nil
 	})
-	discovery.Graveyard.Start(ctx, &group, func(ctx context.Context) error {
-		err := discovery.searchGraveyard(ctx)
-		if err != nil {
-			discovery.log.Error("graveyard resurrection failed: ", zap.Error(err))
-		}
-		return nil
-	})
 	discovery.Discovery.Start(ctx, &group, func(ctx context.Context) error {
 		err := discovery.discover(ctx)
 		if err != nil {
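For context, `sync2.Cycle` is storj's own periodic-task helper; the hunk above only shows `Run` wiring the remaining refresh and discovery cycles into an errgroup. The following stdlib-only sketch is an assumption about roughly what `Cycle.Start` does (the real helper also supports `SetInterval`, `Pause`, `TriggerWait`, `Stop` and `Close`), and `runCycle` is a hypothetical name used only for illustration:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"golang.org/x/sync/errgroup"
)

// runCycle invokes fn every interval until ctx is canceled. It is a rough
// stand-in for the cycle-start pattern visible in the diff, not storj's code.
func runCycle(ctx context.Context, group *errgroup.Group, interval time.Duration, fn func(context.Context) error) {
	group.Go(func() error {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return nil
			case <-ticker.C:
				if err := fn(ctx); err != nil {
					return err
				}
			}
		}
	})
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 350*time.Millisecond)
	defer cancel()

	// After this commit, only the refresh and discovery cycles remain;
	// the graveyard cycle is gone.
	var group errgroup.Group
	runCycle(ctx, &group, 100*time.Millisecond, func(ctx context.Context) error {
		fmt.Println("refresh tick")
		return nil
	})
	runCycle(ctx, &group, 100*time.Millisecond, func(ctx context.Context) error {
		fmt.Println("discover tick")
		return nil
	})
	_ = group.Wait()
}
```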
@@ -110,13 +99,6 @@ func (discovery *Discovery) Run(ctx context.Context) (err error) {
 func (discovery *Discovery) refresh(ctx context.Context) (err error) {
 	defer mon.Task()(&ctx)(&err)

-	nodes := discovery.kad.Seen()
-	for _, v := range nodes {
-		if err := discovery.cache.Put(ctx, v.Id, *v); err != nil {
-			return err
-		}
-	}
-
 	list, more, err := discovery.cache.Paginate(ctx, discovery.refreshOffset, discovery.refreshLimit)
 	if err != nil {
 		return Error.Wrap(err)
@@ -169,46 +151,6 @@ func (discovery *Discovery) refresh(ctx context.Context) (err error) {
 	return nil
 }

-// graveyard attempts to ping all nodes in the Seen() map from Kademlia and adds them to the cache
-// if they respond. This is an attempt to resurrect nodes that may have gone offline in the last hour
-// and were removed from the cache due to an unsuccessful response.
-func (discovery *Discovery) searchGraveyard(ctx context.Context) (err error) {
-	defer mon.Task()(&ctx)(&err)
-
-	seen := discovery.kad.Seen()
-
-	var errors errs.Group
-	for _, n := range seen {
-		if ctx.Err() != nil {
-			return ctx.Err()
-		}
-
-		ping, err := discovery.kad.Ping(ctx, *n)
-		if err != nil {
-			discovery.log.Debug("could not ping node in graveyard check")
-			// we don't want to report the ping error to ErrorGroup because it's to be expected here.
-			continue
-		}
-
-		if ctx.Err() != nil {
-			return ctx.Err()
-		}
-
-		err = discovery.cache.Put(ctx, ping.Id, ping)
-		if err != nil {
-			discovery.log.Warn("could not update node uptime")
-			errors.Add(err)
-		}
-
-		_, err = discovery.cache.UpdateUptime(ctx, ping.Id, true)
-		if err != nil {
-			discovery.log.Warn("could not update node uptime")
-			errors.Add(err)
-		}
-	}
-	return errors.Err()
-}
-
 // Discovery runs lookups for random node ID's to find new nodes in the network
 func (discovery *Discovery) discover(ctx context.Context) (err error) {
 	defer mon.Task()(&ctx)(&err)
@@ -27,7 +27,7 @@ func TestCache_Refresh(t *testing.T) {
 	})
 }

-func TestCache_Graveyard(t *testing.T) {
+func TestCache_Discovery(t *testing.T) {
 	testplanet.Run(t, testplanet.Config{
 		SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
 	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
@@ -38,7 +38,6 @@ func TestCache_Graveyard(t *testing.T) {
 		satellite.Kademlia.Service.RefreshBuckets.Pause()

 		satellite.Discovery.Service.Refresh.Pause()
-		satellite.Discovery.Service.Graveyard.Pause()
 		satellite.Discovery.Service.Discovery.Pause()

 		overlay := satellite.Overlay.Service
@@ -51,7 +50,7 @@ func TestCache_Graveyard(t *testing.T) {
 		assert.NoError(t, err)
 		assert.False(t, overlay.IsOnline(node))

-		satellite.Discovery.Service.Graveyard.TriggerWait()
+		satellite.Discovery.Service.Discovery.TriggerWait()

 		found, err := overlay.Get(ctx, offlineID)
 		assert.NoError(t, err)
@@ -327,17 +327,6 @@ func (k *Kademlia) lookup(ctx context.Context, nodeID storj.NodeID, isBootstrap
 	return *target, nil
 }

-// Seen returns all nodes that this kademlia instance has successfully communicated with
-func (k *Kademlia) Seen() []*pb.Node {
-	nodes := []*pb.Node{}
-	k.routingTable.mutex.Lock()
-	for _, v := range k.routingTable.seen {
-		nodes = append(nodes, pb.CopyNode(v))
-	}
-	k.routingTable.mutex.Unlock()
-	return nodes
-}
-
 // GetNodesWithinKBucket returns all the routing nodes in the specified k-bucket
 func (k *Kademlia) GetNodesWithinKBucket(bID bucketID) ([]*pb.Node, error) {
 	return k.routingTable.getUnmarshaledNodesFromBucket(bID)
@@ -27,7 +27,10 @@ func TestLookupNodes(t *testing.T) {
 		k := planet.Satellites[0].Kademlia.Service
 		k.WaitForBootstrap() // redundant, but leaving here to be clear

-		seen := k.Seen()
+		seen, err := k.DumpNodes(ctx)
+		if err != nil {
+			t.Fatal(err)
+		}
 		assert.NotEqual(t, len(seen), 0)
 		assert.NotNil(t, seen)

@@ -62,7 +62,6 @@ type RoutingTable struct {
 	transport        *pb.NodeTransport
 	mutex            *sync.Mutex
 	rcMutex          *sync.Mutex
-	seen             map[storj.NodeID]*pb.Node
 	replacementCache map[bucketID][]*pb.Node
 	bucketSize       int // max number of nodes stored in a kbucket = 20 (k)
 	rcBucketSize     int // replacementCache bucket max length
@@ -87,7 +86,6 @@ func NewRoutingTable(logger *zap.Logger, localNode *overlay.NodeDossier, kdb, nd

 		mutex:            &sync.Mutex{},
 		rcMutex:          &sync.Mutex{},
-		seen:             make(map[storj.NodeID]*pb.Node),
 		replacementCache: make(map[bucketID][]*pb.Node),

 		bucketSize:   config.BucketSize,
@@ -227,9 +225,6 @@ func (rt *RoutingTable) ConnectionSuccess(node *pb.Node) (err error) {
 		return nil
 	}

-	rt.mutex.Lock()
-	rt.seen[node.Id] = node
-	rt.mutex.Unlock()
 	v, err := rt.nodeBucketDB.Get(ctx, storage.Key(node.Id.Bytes()))
 	if err != nil && !storage.ErrKeyNotFound.Has(err) {
 		return RoutingErr.New("could not get node %s", err)
@@ -45,7 +45,6 @@ func newTestRoutingTable(local *overlay.NodeDossier, opts routingTableOpts) (*Ro

 		mutex:            &sync.Mutex{},
 		rcMutex:          &sync.Mutex{},
-		seen:             make(map[storj.NodeID]*pb.Node),
 		replacementCache: make(map[bucketID][]*pb.Node),

 		bucketSize:   opts.bucketSize,
scripts/testdata/satellite-config.yaml.lock (vendored)
@@ -61,9 +61,6 @@ defaults: "release"
 # the interval at which the satellite attempts to find new nodes via random node ID lookups
 # discovery.discovery-interval: 1s

-# the interval at which the the graveyard tries to resurrect nodes
-# discovery.graveyard-interval: 30s
-
 # the interval at which the cache refreshes itself in seconds
 # discovery.refresh-interval: 1s
