discovery: remove graveyard (#2145)

JT Olio 2019-06-06 23:40:51 -06:00 committed by Egon Elbre
parent a55df84bf7
commit 43d4f3daf5
9 changed files with 6 additions and 84 deletions


@@ -458,7 +458,6 @@ func (planet *Planet) newSatellites(count int) ([]*satellite.Peer, error) {
},
},
Discovery: discovery.Config{
GraveyardInterval: 1 * time.Second,
DiscoveryInterval: 1 * time.Second,
RefreshInterval: 1 * time.Second,
RefreshLimit: 100,
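For reference, the testplanet satellite wiring after this change sets only the three remaining discovery values; a minimal sketch of the resulting literal (field order as in the hunk above):

Discovery: discovery.Config{
	DiscoveryInterval: 1 * time.Second,
	RefreshInterval:   1 * time.Second,
	RefreshLimit:      100,
},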


@@ -31,7 +31,6 @@ func TestDataRepair(t *testing.T) {
// stop discovery service so that we do not get a race condition when we delete nodes from overlay cache
satellite.Discovery.Service.Discovery.Stop()
satellite.Discovery.Service.Refresh.Stop()
satellite.Discovery.Service.Graveyard.Stop()
satellite.Repair.Checker.Loop.Pause()
satellite.Repair.Repairer.Loop.Pause()


@@ -29,7 +29,6 @@ var (
// Config loads on the configuration values for the cache
type Config struct {
RefreshInterval time.Duration `help:"the interval at which the cache refreshes itself in seconds" default:"1s"`
GraveyardInterval time.Duration `help:"the interval at which the the graveyard tries to resurrect nodes" default:"30s"`
DiscoveryInterval time.Duration `help:"the interval at which the satellite attempts to find new nodes via random node ID lookups" default:"1s"`
RefreshLimit int `help:"the amount of nodes refreshed at each interval" default:"100"`
}
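After dropping GraveyardInterval, three configuration fields remain; for reference, a sketch of the resulting struct assembled from the hunk above:

type Config struct {
	RefreshInterval   time.Duration `help:"the interval at which the cache refreshes itself in seconds" default:"1s"`
	DiscoveryInterval time.Duration `help:"the interval at which the satellite attempts to find new nodes via random node ID lookups" default:"1s"`
	RefreshLimit      int           `help:"the amount of nodes refreshed at each interval" default:"100"`
}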
@@ -45,7 +44,6 @@ type Discovery struct {
refreshLimit int
Refresh sync2.Cycle
Graveyard sync2.Cycle
Discovery sync2.Cycle
}
@@ -61,7 +59,6 @@ func New(logger *zap.Logger, ol *overlay.Cache, kad *kademlia.Kademlia, config C
}
discovery.Refresh.SetInterval(config.RefreshInterval)
discovery.Graveyard.SetInterval(config.GraveyardInterval)
discovery.Discovery.SetInterval(config.DiscoveryInterval)
return discovery
@@ -70,7 +67,6 @@ func New(logger *zap.Logger, ol *overlay.Cache, kad *kademlia.Kademlia, config C
// Close closes resources
func (discovery *Discovery) Close() error {
discovery.Refresh.Close()
discovery.Graveyard.Close()
discovery.Discovery.Close()
return nil
}
@@ -87,13 +83,6 @@ func (discovery *Discovery) Run(ctx context.Context) (err error) {
}
return nil
})
discovery.Graveyard.Start(ctx, &group, func(ctx context.Context) error {
err := discovery.searchGraveyard(ctx)
if err != nil {
discovery.log.Error("graveyard resurrection failed: ", zap.Error(err))
}
return nil
})
discovery.Discovery.Start(ctx, &group, func(ctx context.Context) error {
err := discovery.discover(ctx)
if err != nil {
@@ -110,13 +99,6 @@ func (discovery *Discovery) Run(ctx context.Context) (err error) {
func (discovery *Discovery) refresh(ctx context.Context) (err error) {
defer mon.Task()(&ctx)(&err)
nodes := discovery.kad.Seen()
for _, v := range nodes {
if err := discovery.cache.Put(ctx, v.Id, *v); err != nil {
return err
}
}
list, more, err := discovery.cache.Paginate(ctx, discovery.refreshOffset, discovery.refreshLimit)
if err != nil {
return Error.Wrap(err)
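With the Seen() warm-up loop removed, refresh goes straight to paginating the overlay cache. A sketch of the top of the function after this change (the elided remainder is untouched by this commit):

func (discovery *Discovery) refresh(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	list, more, err := discovery.cache.Paginate(ctx, discovery.refreshOffset, discovery.refreshLimit)
	if err != nil {
		return Error.Wrap(err)
	}
	// ... remainder of refresh (not shown in this diff) ...
	return nil
}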
@@ -169,46 +151,6 @@ func (discovery *Discovery) refresh(ctx context.Context) (err error) {
return nil
}
// graveyard attempts to ping all nodes in the Seen() map from Kademlia and adds them to the cache
// if they respond. This is an attempt to resurrect nodes that may have gone offline in the last hour
// and were removed from the cache due to an unsuccessful response.
func (discovery *Discovery) searchGraveyard(ctx context.Context) (err error) {
defer mon.Task()(&ctx)(&err)
seen := discovery.kad.Seen()
var errors errs.Group
for _, n := range seen {
if ctx.Err() != nil {
return ctx.Err()
}
ping, err := discovery.kad.Ping(ctx, *n)
if err != nil {
discovery.log.Debug("could not ping node in graveyard check")
// we don't want to report the ping error to ErrorGroup because it's to be expected here.
continue
}
if ctx.Err() != nil {
return ctx.Err()
}
err = discovery.cache.Put(ctx, ping.Id, ping)
if err != nil {
discovery.log.Warn("could not update node uptime")
errors.Add(err)
}
_, err = discovery.cache.UpdateUptime(ctx, ping.Id, true)
if err != nil {
discovery.log.Warn("could not update node uptime")
errors.Add(err)
}
}
return errors.Err()
}
// Discovery runs lookups for random node ID's to find new nodes in the network
func (discovery *Discovery) discover(ctx context.Context) (err error) {
defer mon.Task()(&ctx)(&err)
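Taken together, the Run hunks above leave only the Refresh and Discovery cycles. A minimal sketch of the resulting method, reconstructed from those hunks (the exact log messages and the group.Wait() tail are not shown in this diff and are assumptions):

func (discovery *Discovery) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	var group errgroup.Group
	discovery.Refresh.Start(ctx, &group, func(ctx context.Context) error {
		err := discovery.refresh(ctx)
		if err != nil {
			discovery.log.Error("error with cache refresh: ", zap.Error(err)) // message assumed
		}
		return nil
	})
	discovery.Discovery.Start(ctx, &group, func(ctx context.Context) error {
		err := discovery.discover(ctx)
		if err != nil {
			discovery.log.Error("error with cache discovery: ", zap.Error(err)) // message assumed
		}
		return nil
	})
	return group.Wait() // tail assumed
}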


@@ -27,7 +27,7 @@ func TestCache_Refresh(t *testing.T) {
})
}
func TestCache_Graveyard(t *testing.T) {
func TestCache_Discovery(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
@@ -38,7 +38,6 @@ func TestCache_Graveyard(t *testing.T) {
satellite.Kademlia.Service.RefreshBuckets.Pause()
satellite.Discovery.Service.Refresh.Pause()
satellite.Discovery.Service.Graveyard.Pause()
satellite.Discovery.Service.Discovery.Pause()
overlay := satellite.Overlay.Service
@@ -51,7 +50,7 @@ func TestCache_Graveyard(t *testing.T) {
assert.NoError(t, err)
assert.False(t, overlay.IsOnline(node))
satellite.Discovery.Service.Graveyard.TriggerWait()
satellite.Discovery.Service.Discovery.TriggerWait()
found, err := overlay.Get(ctx, offlineID)
assert.NoError(t, err)
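The renamed TestCache_Discovery relies on sync2.Cycle's manual controls: Pause stops the timer-driven loop while the test arranges state, and TriggerWait runs the cycle function once and blocks until it finishes. A minimal usage sketch of that pattern (offline-node setup elided):

satellite.Discovery.Service.Discovery.Pause()       // no background runs while arranging state
// ... mark a node offline in the overlay cache (elided) ...
satellite.Discovery.Service.Discovery.TriggerWait() // run the discovery cycle once and wait for it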


@@ -327,17 +327,6 @@ func (k *Kademlia) lookup(ctx context.Context, nodeID storj.NodeID, isBootstrap
return *target, nil
}
// Seen returns all nodes that this kademlia instance has successfully communicated with
func (k *Kademlia) Seen() []*pb.Node {
nodes := []*pb.Node{}
k.routingTable.mutex.Lock()
for _, v := range k.routingTable.seen {
nodes = append(nodes, pb.CopyNode(v))
}
k.routingTable.mutex.Unlock()
return nodes
}
// GetNodesWithinKBucket returns all the routing nodes in the specified k-bucket
func (k *Kademlia) GetNodesWithinKBucket(bID bucketID) ([]*pb.Node, error) {
return k.routingTable.getUnmarshaledNodesFromBucket(bID)
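With Seen removed, callers are expected to go through Kademlia's DumpNodes instead, which takes a context and can return an error (see the updated test below). A sketch of the migration pattern, assuming DumpNodes returns the same []*pb.Node shape that Seen did:

// before:
//   nodes := k.Seen()
// after:
nodes, err := k.DumpNodes(ctx)
if err != nil {
	return err // or handle the error as the caller prefers
}
for _, node := range nodes {
	_ = node // use the copied node as before
}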


@@ -27,7 +27,10 @@ func TestLookupNodes(t *testing.T) {
k := planet.Satellites[0].Kademlia.Service
k.WaitForBootstrap() // redundant, but leaving here to be clear
seen := k.Seen()
seen, err := k.DumpNodes(ctx)
if err != nil {
t.Fatal(err)
}
assert.NotEqual(t, len(seen), 0)
assert.NotNil(t, seen)


@@ -62,7 +62,6 @@ type RoutingTable struct {
transport *pb.NodeTransport
mutex *sync.Mutex
rcMutex *sync.Mutex
seen map[storj.NodeID]*pb.Node
replacementCache map[bucketID][]*pb.Node
bucketSize int // max number of nodes stored in a kbucket = 20 (k)
rcBucketSize int // replacementCache bucket max length
@@ -87,7 +86,6 @@ func NewRoutingTable(logger *zap.Logger, localNode *overlay.NodeDossier, kdb, nd
mutex: &sync.Mutex{},
rcMutex: &sync.Mutex{},
seen: make(map[storj.NodeID]*pb.Node),
replacementCache: make(map[bucketID][]*pb.Node),
bucketSize: config.BucketSize,
@@ -227,9 +225,6 @@ func (rt *RoutingTable) ConnectionSuccess(node *pb.Node) (err error) {
return nil
}
rt.mutex.Lock()
rt.seen[node.Id] = node
rt.mutex.Unlock()
v, err := rt.nodeBucketDB.Get(ctx, storage.Key(node.Id.Bytes()))
if err != nil && !storage.ErrKeyNotFound.Has(err) {
return RoutingErr.New("could not get node %s", err)
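For orientation, the RoutingTable no longer tracks a seen map at all; a sketch of the struct after this change (unrelated fields elided, comments as in the hunk above):

type RoutingTable struct {
	// ... log, local node, and kademlia bucket DB fields elided ...
	transport        *pb.NodeTransport
	mutex            *sync.Mutex
	rcMutex          *sync.Mutex
	replacementCache map[bucketID][]*pb.Node
	bucketSize       int // max number of nodes stored in a kbucket = 20 (k)
	rcBucketSize     int // replacementCache bucket max length
}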


@@ -45,7 +45,6 @@ func newTestRoutingTable(local *overlay.NodeDossier, opts routingTableOpts) (*Ro
mutex: &sync.Mutex{},
rcMutex: &sync.Mutex{},
seen: make(map[storj.NodeID]*pb.Node),
replacementCache: make(map[bucketID][]*pb.Node),
bucketSize: opts.bucketSize,


@@ -61,9 +61,6 @@ defaults: "release"
# the interval at which the satellite attempts to find new nodes via random node ID lookups
# discovery.discovery-interval: 1s
# the interval at which the the graveyard tries to resurrect nodes
# discovery.graveyard-interval: 30s
# the interval at which the cache refreshes itself in seconds
# discovery.refresh-interval: 1s