storj/pkg/kademlia/kademlia.go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package kademlia

import (
	"context"
	"math/rand"
	"sync/atomic"
	"time"

	"github.com/zeebo/errs"
	"go.uber.org/zap"

	"storj.io/storj/internal/sync2"
	"storj.io/storj/pkg/identity"
	"storj.io/storj/pkg/pb"
	"storj.io/storj/pkg/storj"
	"storj.io/storj/pkg/transport"
	"storj.io/storj/storage"
)

var (
	// NodeErr is the class for all errors pertaining to node operations
	NodeErr = errs.Class("node error")
	// BootstrapErr is the class for all errors pertaining to bootstrapping a node
	BootstrapErr = errs.Class("bootstrap node error")
	// NodeNotFound is returned when a lookup can not produce the requested node
	NodeNotFound = errs.Class("node not found")

	// TODO: shouldn't default to TCP but not sure what to do yet
	defaultTransport = pb.NodeTransport_TCP_TLS_GRPC
	defaultRetries   = 3
)

// discoveryOptions controls the behavior of a single node discovery run.
type discoveryOptions struct {
	concurrency    int
	retries        int
	bootstrap      bool
	bootstrapNodes []pb.Node
}

// Kademlia is an implementation of kademlia adhering to the DHT interface.
type Kademlia struct {
	log            *zap.Logger
	alpha          int // alpha is a system wide concurrency parameter
	routingTable   *RoutingTable
	bootstrapNodes []pb.Node
	dialer         *Dialer
	lookups        sync2.WorkGroup

	bootstrapFinished sync2.Fence

	refreshThreshold int64
	RefreshBuckets   sync2.Cycle
}

// NewService returns a newly configured Kademlia instance
func NewService(log *zap.Logger, self pb.Node, bootstrapNodes []pb.Node, transport transport.Client, alpha int, rt *RoutingTable) (*Kademlia, error) {
	k := &Kademlia{
		log:              log,
		alpha:            alpha,
		routingTable:     rt,
		bootstrapNodes:   bootstrapNodes,
		dialer:           NewDialer(log.Named("dialer"), transport),
		refreshThreshold: int64(time.Minute),
	}

	return k, nil
}
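
// A rough usage sketch (illustrative only; the routing table, transport
// client, context, and the alpha value 5 are assumed to come from elsewhere):
//
//	kad, err := NewService(log, self, bootstrapNodes, transportClient, 5, rt)
//	if err != nil { ... }
//	go func() { _ = kad.Run(ctx) }() // periodically refreshes stale buckets
//	if err := kad.Bootstrap(ctx); err != nil { ... }
//	defer func() { _ = kad.Close() }()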

// Close closes all kademlia connections and prevents new ones from being created.
func (k *Kademlia) Close() error {
	dialerErr := k.dialer.Close()
	k.lookups.Close()
	k.lookups.Wait()
	return dialerErr
}

// FindNear returns all nodes from the local routing table near the starting node,
// up to a maximum limit, filtered by the specified restrictions
func (k *Kademlia) FindNear(ctx context.Context, start storj.NodeID, limit int, restrictions ...pb.Restriction) ([]*pb.Node, error) {
	return k.routingTable.FindNear(start, limit, restrictions...)
}

// GetBucketIds returns a storage.Keys type of bucket IDs in the Kademlia instance
func (k *Kademlia) GetBucketIds() (storage.Keys, error) {
	return k.routingTable.GetBucketIds()
}

// Local returns the local node
func (k *Kademlia) Local() pb.Node {
	return k.routingTable.Local()
}

// SetBootstrapNodes sets the bootstrap nodes.
// Must be called before anything starts using kademlia.
func (k *Kademlia) SetBootstrapNodes(nodes []pb.Node) { k.bootstrapNodes = nodes }

// GetBootstrapNodes gets the bootstrap nodes.
func (k *Kademlia) GetBootstrapNodes() []pb.Node { return k.bootstrapNodes }

// Bootstrap contacts one of a set of pre-defined trusted nodes on the network and
// begins populating the local Kademlia node
func (k *Kademlia) Bootstrap(ctx context.Context) error {
	defer k.bootstrapFinished.Release()

	if !k.lookups.Start() {
		return context.Canceled
	}
	defer k.lookups.Done()

	if len(k.bootstrapNodes) == 0 {
		k.log.Warn("No bootstrap address specified.")
		return nil
	}
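
	// Ping each configured bootstrap node in order; a single successful ping
	// clears any accumulated errors and is enough to proceed to the
	// self-lookup below.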
	var errs errs.Group
	for _, node := range k.bootstrapNodes {
		if ctx.Err() != nil {
			errs.Add(ctx.Err())
			return errs.Err()
		}

		_, err := k.dialer.Ping(ctx, node)
		if err == nil {
			// We have successfully pinged one bootstrap node.
			// Clear any errors and break the cycle.
			errs = nil
			break
		}
		errs.Add(err)
	}
	err := errs.Err()
	if err != nil {
		return err
	}

	// find nodes most similar to self
	k.routingTable.mutex.Lock()
	id := k.routingTable.self.Id
	k.routingTable.mutex.Unlock()
	_, err = k.lookup(ctx, id, true)

	// TODO(dylan): We do not currently handle this last bit of behavior.
	// ```
	// Finally, u refreshes all k-buckets further away than its closest neighbor.
	// During the refreshes, u both populates its own k-buckets and inserts
	// itself into other nodes' k-buckets as necessary.
	// ```
	return err
}

// WaitForBootstrap blocks until bootstrap pinging has completed.
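// It returns once Bootstrap has released its fence, regardless of whether
// bootstrapping succeeded.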
func (k *Kademlia) WaitForBootstrap() {
	k.bootstrapFinished.Wait()
}

// FetchPeerIdentity connects to a node and returns its peer identity
func (k *Kademlia) FetchPeerIdentity(ctx context.Context, nodeID storj.NodeID) (*identity.PeerIdentity, error) {
	if !k.lookups.Start() {
		return nil, context.Canceled
	}
	defer k.lookups.Done()

	node, err := k.FindNode(ctx, nodeID)
	if err != nil {
		return nil, err
	}
	return k.dialer.FetchPeerIdentity(ctx, node)
}

// Ping checks that the provided node is still accessible on the network
func (k *Kademlia) Ping(ctx context.Context, node pb.Node) (pb.Node, error) {
	if !k.lookups.Start() {
		return pb.Node{}, context.Canceled
	}
	defer k.lookups.Done()

	ok, err := k.dialer.Ping(ctx, node)
	if err != nil {
		return pb.Node{}, NodeErr.Wrap(err)
	}
	if !ok {
		return pb.Node{}, NodeErr.New("Failed pinging node")
	}
	return node, nil
}

// FetchInfo connects to a node address and returns the node info
func (k *Kademlia) FetchInfo(ctx context.Context, address *pb.NodeAddress) (*identity.PeerIdentity, *pb.InfoResponse, error) {
	if !k.lookups.Start() {
		return nil, nil, context.Canceled
	}
	defer k.lookups.Done()

	id, info, err := k.dialer.FetchInfo(ctx, address)
	if err != nil {
		return nil, nil, NodeErr.Wrap(err)
	}
	return id, info, nil
}

// FindNode looks up the provided NodeID first in the local node, and if it is not found
// begins searching the network for the NodeID. Returns an error if the node was not found
func (k *Kademlia) FindNode(ctx context.Context, ID storj.NodeID) (pb.Node, error) {
	if !k.lookups.Start() {
		return pb.Node{}, context.Canceled
	}
	defer k.lookups.Done()

	return k.lookup(ctx, ID, false)
}

// lookup initiates a kademlia node lookup for the given target ID.
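// It seeds the search with either the bootstrap nodes (when isBootstrap is true)
// or the nearest nodes from the local routing table, delegates the iterative
// search to newPeerDiscovery with up to alpha concurrent requests, and marks
// the target's k-bucket as refreshed on the way out.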
func (k *Kademlia) lookup(ctx context.Context, ID storj.NodeID, isBootstrap bool) (pb.Node, error) {
	if !k.lookups.Start() {
		return pb.Node{}, context.Canceled
	}
	defer k.lookups.Done()

	kb := k.routingTable.K()
	var nodes []*pb.Node
	if isBootstrap {
		// copy each node before taking its address, so the slice does not
		// end up with multiple pointers to the same loop variable
		for i := range k.bootstrapNodes {
			node := k.bootstrapNodes[i]
			nodes = append(nodes, &node)
		}
	} else {
		var err error
		nodes, err = k.routingTable.FindNear(ID, kb)
		if err != nil {
			return pb.Node{}, err
		}
	}

	lookup := newPeerDiscovery(k.log, k.routingTable.Local(), nodes, k.dialer, ID, discoveryOptions{
		concurrency: k.alpha, retries: defaultRetries, bootstrap: isBootstrap, bootstrapNodes: k.bootstrapNodes,
	})
	target, err := lookup.Run(ctx)
	if err != nil {
		return pb.Node{}, err
	}

	bucket, err := k.routingTable.getKBucketID(ID)
	if err != nil {
		k.log.Warn("Error getting k-bucket ID in kad lookup")
	} else {
		err = k.routingTable.SetBucketTimestamp(bucket[:], time.Now())
		if err != nil {
			k.log.Warn("Error updating bucket timestamp in kad lookup")
		}
	}

	if target == nil {
		if isBootstrap {
			return pb.Node{}, nil
		}
		return pb.Node{}, NodeNotFound.New("")
	}
	return *target, nil
}

// Seen returns all nodes that this kademlia instance has successfully communicated with
func (k *Kademlia) Seen() []*pb.Node {
	nodes := []*pb.Node{}
	k.routingTable.mutex.Lock()
	for _, v := range k.routingTable.seen {
		nodes = append(nodes, pb.CopyNode(v))
	}
	k.routingTable.mutex.Unlock()
	return nodes
}

// SetBucketRefreshThreshold changes the threshold when buckets are considered stale and need refreshing.
func (k *Kademlia) SetBucketRefreshThreshold(threshold time.Duration) {
	atomic.StoreInt64(&k.refreshThreshold, int64(threshold))
}

// Run occasionally refreshes stale kad buckets
func (k *Kademlia) Run(ctx context.Context) error {
	if !k.lookups.Start() {
		return context.Canceled
	}
	defer k.lookups.Done()

	k.RefreshBuckets.SetInterval(5 * time.Minute)
	return k.RefreshBuckets.Run(ctx, func(ctx context.Context) error {
		// the threshold is loaded atomically, so SetBucketRefreshThreshold
		// can adjust it while the cycle is running
		threshold := time.Duration(atomic.LoadInt64(&k.refreshThreshold))
		err := k.refresh(ctx, threshold)
		if err != nil {
			k.log.Warn("bucket refresh failed", zap.Error(err))
		}
		return nil
	})
}

// refresh updates each Kademlia bucket not contacted within the given threshold
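// by looking up a random node ID inside each stale bucket's key range.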
func (k *Kademlia) refresh(ctx context.Context, threshold time.Duration) error {
	bIDs, err := k.routingTable.GetBucketIds()
	if err != nil {
		return Error.Wrap(err)
	}
	now := time.Now()
	startID := bucketID{}
	var errors errs.Group
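	// Buckets are walked in ascending key order; each bucket spans the range
	// (startID..endID], where endID is the bucket's own ID and startID is the
	// previous bucket's ID.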
	for _, bID := range bIDs {
		endID := keyToBucketID(bID)
		ts, tErr := k.routingTable.GetBucketTimestamp(bID)
		if tErr != nil {
			errors.Add(tErr)
		} else if now.After(ts.Add(threshold)) {
			rID, _ := randomIDInRange(startID, endID)
			_, _ = k.FindNode(ctx, rID) // ignore node not found
		}
		startID = endID
	}
	return Error.Wrap(errors.Err())
}

// randomIDInRange finds a random node ID within the range (start..end]
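// by choosing each byte independently: once a byte is strictly greater than
// start's byte, the lower bound no longer constrains later bytes, and once a
// byte is strictly less than end's byte, the upper bound no longer constrains
// later bytes.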
func randomIDInRange(start, end bucketID) (storj.NodeID, error) {
	randID := storj.NodeID{}
	divergedHigh := false
	divergedLow := false
	for x := 0; x < len(randID); x++ {
		s := byte(0)
		if !divergedLow {
			s = start[x]
		}
		e := byte(255)
		if !divergedHigh {
			e = end[x]
		}
		if s > e {
			return storj.NodeID{}, errs.New("Random id range was invalid")
		}
		if s == e {
			randID[x] = s
		} else {
			r := s + byte(rand.Intn(int(e-s))) + 1
			if r < e {
				divergedHigh = true
			}
			if r > s {
				divergedLow = true
			}
			randID[x] = r
		}
	}
	if !divergedLow {
		if !divergedHigh { // start == end
			return storj.NodeID{}, errs.New("Random id range was invalid")
		} else if randID[len(randID)-1] == start[len(randID)-1] { // start == randID
			randID[len(randID)-1] = start[len(randID)-1] + 1
		}
	}
	return randID, nil
}

// Restrict is used to limit nodes returned that don't match the minimum storage requirements
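//
// For example, keeping only nodes that report at least minDisk bytes free
// might look like this (illustrative only; minDisk and nodes are assumed to
// exist, and the pb.Restriction field names are inferred from its getters):
//
//	free := Restrict(pb.Restriction{
//		Operand:  pb.Restriction_FREE_DISK,
//		Operator: pb.Restriction_GTE,
//		Value:    minDisk,
//	}, nodes)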
func Restrict(r pb.Restriction, n []*pb.Node) []*pb.Node {
	oper := r.GetOperand()
	op := r.GetOperator()
	val := r.GetValue()
	var comp int64

	results := []*pb.Node{}
	for _, v := range n {
		switch oper {
		case pb.Restriction_FREE_BANDWIDTH:
			comp = v.GetRestrictions().GetFreeBandwidth()
		case pb.Restriction_FREE_DISK:
			comp = v.GetRestrictions().GetFreeDisk()
		}

		switch op {
		case pb.Restriction_EQ:
			if comp == val {
				results = append(results, v)
				continue
			}
		case pb.Restriction_LT:
			if comp < val {
				results = append(results, v)
				continue
			}
		case pb.Restriction_LTE:
			if comp <= val {
				results = append(results, v)
				continue
			}
		case pb.Restriction_GT:
			if comp > val {
				results = append(results, v)
				continue
			}
		case pb.Restriction_GTE:
			if comp >= val {
				results = append(results, v)
				continue
			}
		}
	}
	return results
}
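
// meetsRestrictions reports whether the node n satisfies every restriction in rs.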
func meetsRestrictions(rs []pb.Restriction, n pb.Node) bool {
	for _, r := range rs {
		oper := r.GetOperand()
		op := r.GetOperator()
		val := r.GetValue()
		var comp int64
		switch oper {
		case pb.Restriction_FREE_BANDWIDTH:
			comp = n.GetRestrictions().GetFreeBandwidth()
		case pb.Restriction_FREE_DISK:
			comp = n.GetRestrictions().GetFreeDisk()
		}
		switch op {
		case pb.Restriction_EQ:
			if comp != val {
				return false
			}
		case pb.Restriction_LT:
			if comp >= val {
				return false
			}
		case pb.Restriction_LTE:
			if comp > val {
				return false
			}
		case pb.Restriction_GT:
			if comp <= val {
				return false
			}
		case pb.Restriction_GTE:
			if comp < val {
				return false
			}
		}
	}
	return true
}