discovery: remove graveyard (#2145)

Authored by JT Olio on 2019-06-06 23:40:51 -06:00; committed by Egon Elbre
parent a55df84bf7
commit 43d4f3daf5
9 changed files with 6 additions and 84 deletions

View File

@@ -458,7 +458,6 @@ func (planet *Planet) newSatellites(count int) ([]*satellite.Peer, error) {
             },
         },
         Discovery: discovery.Config{
-            GraveyardInterval: 1 * time.Second,
             DiscoveryInterval: 1 * time.Second,
             RefreshInterval:   1 * time.Second,
             RefreshLimit:      100,
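With the graveyard cycle gone, the test planet configures discovery with only three knobs. A minimal sketch of the resulting literal, using the field names from the Config struct further below (the 1s values are this hunk's test overrides):

    cfg := discovery.Config{
        DiscoveryInterval: 1 * time.Second, // random node ID lookups
        RefreshInterval:   1 * time.Second, // cache refresh pass
        RefreshLimit:      100,             // nodes refreshed per pass
    }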

View File

@@ -31,7 +31,6 @@ func TestDataRepair(t *testing.T) {
     // stop discovery service so that we do not get a race condition when we delete nodes from overlay cache
     satellite.Discovery.Service.Discovery.Stop()
     satellite.Discovery.Service.Refresh.Stop()
-    satellite.Discovery.Service.Graveyard.Stop()
     satellite.Repair.Checker.Loop.Pause()
     satellite.Repair.Repairer.Loop.Pause()

View File

@@ -29,7 +29,6 @@ var (
 // Config loads on the configuration values for the cache
 type Config struct {
     RefreshInterval   time.Duration `help:"the interval at which the cache refreshes itself in seconds" default:"1s"`
-    GraveyardInterval time.Duration `help:"the interval at which the the graveyard tries to resurrect nodes" default:"30s"`
     DiscoveryInterval time.Duration `help:"the interval at which the satellite attempts to find new nodes via random node ID lookups" default:"1s"`
     RefreshLimit      int           `help:"the amount of nodes refreshed at each interval" default:"100"`
 }
@@ -45,7 +44,6 @@ type Discovery struct {
     refreshLimit int
     Refresh   sync2.Cycle
-    Graveyard sync2.Cycle
     Discovery sync2.Cycle
 }
@@ -61,7 +59,6 @@ func New(logger *zap.Logger, ol *overlay.Cache, kad *kademlia.Kademlia, config C
     }
     discovery.Refresh.SetInterval(config.RefreshInterval)
-    discovery.Graveyard.SetInterval(config.GraveyardInterval)
     discovery.Discovery.SetInterval(config.DiscoveryInterval)
     return discovery
@@ -70,7 +67,6 @@ func New(logger *zap.Logger, ol *overlay.Cache, kad *kademlia.Kademlia, config C
 // Close closes resources
 func (discovery *Discovery) Close() error {
     discovery.Refresh.Close()
-    discovery.Graveyard.Close()
     discovery.Discovery.Close()
     return nil
 }
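Everything this commit deletes is bookkeeping around a sync2.Cycle. A minimal sketch of the cycle lifecycle, limited to calls that actually appear in this diff (SetInterval, Start, Pause, TriggerWait, Close); the errgroup wiring mirrors how the surrounding Run function uses it:

    var cycle sync2.Cycle
    cycle.SetInterval(time.Second) // how often the callback fires

    var group errgroup.Group
    cycle.Start(ctx, &group, func(ctx context.Context) error {
        // periodic work; returning nil keeps the cycle alive
        return nil
    })

    cycle.TriggerWait() // force one iteration and wait for it (used by tests)
    cycle.Pause()       // suspend ticking without tearing down (used by tests)
    cycle.Close()       // release resources, as Close above does per cycle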
@@ -87,13 +83,6 @@ func (discovery *Discovery) Run(ctx context.Context) (err error) {
         }
         return nil
     })
-    discovery.Graveyard.Start(ctx, &group, func(ctx context.Context) error {
-        err := discovery.searchGraveyard(ctx)
-        if err != nil {
-            discovery.log.Error("graveyard resurrection failed: ", zap.Error(err))
-        }
-        return nil
-    })
     discovery.Discovery.Start(ctx, &group, func(ctx context.Context) error {
         err := discovery.discover(ctx)
         if err != nil {
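After the deletion, Run starts only the two surviving cycles. A hedged sketch of its remaining shape, reconstructed from the context lines above; the log messages and the final group.Wait() are assumptions, not taken from the diff:

    func (discovery *Discovery) Run(ctx context.Context) (err error) {
        defer mon.Task()(&ctx)(&err)

        var group errgroup.Group
        discovery.Refresh.Start(ctx, &group, func(ctx context.Context) error {
            if err := discovery.refresh(ctx); err != nil {
                discovery.log.Error("refresh failed: ", zap.Error(err)) // message assumed
            }
            return nil
        })
        discovery.Discovery.Start(ctx, &group, func(ctx context.Context) error {
            if err := discovery.discover(ctx); err != nil {
                discovery.log.Error("discovery failed: ", zap.Error(err)) // message assumed
            }
            return nil
        })
        return group.Wait()
    }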
@@ -110,13 +99,6 @@ func (discovery *Discovery) Run(ctx context.Context) (err error) {
 func (discovery *Discovery) refresh(ctx context.Context) (err error) {
     defer mon.Task()(&ctx)(&err)
-    nodes := discovery.kad.Seen()
-    for _, v := range nodes {
-        if err := discovery.cache.Put(ctx, v.Id, *v); err != nil {
-            return err
-        }
-    }
     list, more, err := discovery.cache.Paginate(ctx, discovery.refreshOffset, discovery.refreshLimit)
     if err != nil {
         return Error.Wrap(err)
@@ -169,46 +151,6 @@ func (discovery *Discovery) refresh(ctx context.Context) (err error) {
     return nil
 }
-// graveyard attempts to ping all nodes in the Seen() map from Kademlia and adds them to the cache
-// if they respond. This is an attempt to resurrect nodes that may have gone offline in the last hour
-// and were removed from the cache due to an unsuccessful response.
-func (discovery *Discovery) searchGraveyard(ctx context.Context) (err error) {
-    defer mon.Task()(&ctx)(&err)
-    seen := discovery.kad.Seen()
-    var errors errs.Group
-    for _, n := range seen {
-        if ctx.Err() != nil {
-            return ctx.Err()
-        }
-        ping, err := discovery.kad.Ping(ctx, *n)
-        if err != nil {
-            discovery.log.Debug("could not ping node in graveyard check")
-            // we don't want to report the ping error to ErrorGroup because it's to be expected here.
-            continue
-        }
-        if ctx.Err() != nil {
-            return ctx.Err()
-        }
-        err = discovery.cache.Put(ctx, ping.Id, ping)
-        if err != nil {
-            discovery.log.Warn("could not update node uptime")
-            errors.Add(err)
-        }
-        _, err = discovery.cache.UpdateUptime(ctx, ping.Id, true)
-        if err != nil {
-            discovery.log.Warn("could not update node uptime")
-            errors.Add(err)
-        }
-    }
-    return errors.Err()
-}
 // Discovery runs lookups for random node ID's to find new nodes in the network
 func (discovery *Discovery) discover(ctx context.Context) (err error) {
     defer mon.Task()(&ctx)(&err)
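The liveness job the graveyard performed now falls to refresh alone: rather than replaying kademlia's in-memory seen map, refresh pages through the overlay cache and pings each node. A hedged sketch of that loop; Paginate, Ping, Put, and UpdateUptime are taken from this diff, while the control flow between them and the field accesses are illustrative:

    list, more, err := discovery.cache.Paginate(ctx, discovery.refreshOffset, discovery.refreshLimit)
    if err != nil {
        return Error.Wrap(err)
    }
    if !more {
        discovery.refreshOffset = 0 // wrap around once the cache is exhausted (assumed)
    }
    for _, node := range list {
        ping, err := discovery.kad.Ping(ctx, node.Node) // node.Node field is assumed
        if err != nil {
            _, _ = discovery.cache.UpdateUptime(ctx, node.Id, false) // record the miss
            continue
        }
        if err := discovery.cache.Put(ctx, ping.Id, ping); err != nil {
            return err
        }
        if _, err := discovery.cache.UpdateUptime(ctx, ping.Id, true); err != nil {
            return err
        }
    }
    return nil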

View File

@@ -27,7 +27,7 @@ func TestCache_Refresh(t *testing.T) {
     })
 }
-func TestCache_Graveyard(t *testing.T) {
+func TestCache_Discovery(t *testing.T) {
     testplanet.Run(t, testplanet.Config{
         SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
     }, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
@@ -38,7 +38,6 @@ func TestCache_Graveyard(t *testing.T) {
     satellite.Kademlia.Service.RefreshBuckets.Pause()
     satellite.Discovery.Service.Refresh.Pause()
-    satellite.Discovery.Service.Graveyard.Pause()
     satellite.Discovery.Service.Discovery.Pause()
     overlay := satellite.Overlay.Service
@@ -51,7 +50,7 @@ func TestCache_Graveyard(t *testing.T) {
     assert.NoError(t, err)
     assert.False(t, overlay.IsOnline(node))
-    satellite.Discovery.Service.Graveyard.TriggerWait()
+    satellite.Discovery.Service.Discovery.TriggerWait()
     found, err := overlay.Get(ctx, offlineID)
     assert.NoError(t, err)

View File

@@ -327,17 +327,6 @@ func (k *Kademlia) lookup(ctx context.Context, nodeID storj.NodeID, isBootstrap
     return *target, nil
 }
-// Seen returns all nodes that this kademlia instance has successfully communicated with
-func (k *Kademlia) Seen() []*pb.Node {
-    nodes := []*pb.Node{}
-    k.routingTable.mutex.Lock()
-    for _, v := range k.routingTable.seen {
-        nodes = append(nodes, pb.CopyNode(v))
-    }
-    k.routingTable.mutex.Unlock()
-    return nodes
-}
 // GetNodesWithinKBucket returns all the routing nodes in the specified k-bucket
 func (k *Kademlia) GetNodesWithinKBucket(bID bucketID) ([]*pb.Node, error) {
     return k.routingTable.getUnmarshaledNodesFromBucket(bID)
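With Seen() removed, the only remaining way to enumerate known nodes is through the routing table; the test below migrates to DumpNodes accordingly. The replacement call shape, taken from that test (the error handling here is illustrative):

    nodes, err := k.DumpNodes(ctx) // reads the routing table instead of a seen map
    if err != nil {
        return err
    }
    for _, node := range nodes {
        _ = node.Id // entries are *pb.Node, matching the old Seen() return type
    }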

View File

@@ -27,7 +27,10 @@ func TestLookupNodes(t *testing.T) {
     k := planet.Satellites[0].Kademlia.Service
     k.WaitForBootstrap() // redundant, but leaving here to be clear
-    seen := k.Seen()
+    seen, err := k.DumpNodes(ctx)
+    if err != nil {
+        t.Fatal(err)
+    }
     assert.NotEqual(t, len(seen), 0)
     assert.NotNil(t, seen)

View File

@@ -62,7 +62,6 @@ type RoutingTable struct {
     transport        *pb.NodeTransport
     mutex            *sync.Mutex
     rcMutex          *sync.Mutex
-    seen             map[storj.NodeID]*pb.Node
     replacementCache map[bucketID][]*pb.Node
     bucketSize       int // max number of nodes stored in a kbucket = 20 (k)
     rcBucketSize     int // replacementCache bucket max length
@@ -87,7 +86,6 @@ func NewRoutingTable(logger *zap.Logger, localNode *overlay.NodeDossier, kdb, nd
     mutex:            &sync.Mutex{},
     rcMutex:          &sync.Mutex{},
-    seen:             make(map[storj.NodeID]*pb.Node),
     replacementCache: make(map[bucketID][]*pb.Node),
     bucketSize:       config.BucketSize,
@@ -227,9 +225,6 @@ func (rt *RoutingTable) ConnectionSuccess(node *pb.Node) (err error) {
     return nil
 }
-    rt.mutex.Lock()
-    rt.seen[node.Id] = node
-    rt.mutex.Unlock()
     v, err := rt.nodeBucketDB.Get(ctx, storage.Key(node.Id.Bytes()))
     if err != nil && !storage.ErrKeyNotFound.Has(err) {
         return RoutingErr.New("could not get node %s", err)

View File

@@ -45,7 +45,6 @@ func newTestRoutingTable(local *overlay.NodeDossier, opts routingTableOpts) (*Ro
     mutex:            &sync.Mutex{},
     rcMutex:          &sync.Mutex{},
-    seen:             make(map[storj.NodeID]*pb.Node),
     replacementCache: make(map[bucketID][]*pb.Node),
     bucketSize:       opts.bucketSize,

View File

@@ -61,9 +61,6 @@ defaults: "release"
 # the interval at which the satellite attempts to find new nodes via random node ID lookups
 # discovery.discovery-interval: 1s
-# the interval at which the the graveyard tries to resurrect nodes
-# discovery.graveyard-interval: 30s
 # the interval at which the cache refreshes itself in seconds
 # discovery.refresh-interval: 1s