storj/pkg/overlay/cache.go

375 lines
12 KiB
Go
Raw Normal View History

2019-01-24 20:15:10 +00:00
// Copyright (C) 2019 Storj Labs, Inc.
2018-04-18 17:55:28 +01:00
// See LICENSE for copying information.
package overlay
2018-04-18 16:34:15 +01:00
import (
"context"
"errors"
2018-04-18 16:34:15 +01:00
"github.com/zeebo/errs"
"go.uber.org/zap"
2018-11-16 16:31:14 +00:00
"storj.io/storj/pkg/pb"
2018-11-30 13:40:13 +00:00
"storj.io/storj/pkg/storj"
"storj.io/storj/storage"
2018-04-18 16:34:15 +01:00
)
// OverlayBucket names the bucket used for a bolt-backed overlay dht cache.
const OverlayBucket = "overlay"
2018-12-17 18:47:26 +00:00
// ErrEmptyNode is returned when the nodeID is empty
var ErrEmptyNode = errs.New("empty node ID")
// ErrNodeNotFound is returned if a node does not exist in database
var ErrNodeNotFound = errs.Class("node not found")
2018-12-17 18:47:26 +00:00
// ErrBucketNotFound is returned if a bucket is unable to be found in the routing table
var ErrBucketNotFound = errs.New("bucket not found")
// ErrNotEnoughNodes is when selecting nodes failed with the given parameters
var ErrNotEnoughNodes = errs.Class("not enough nodes")
// OverlayError creates class of errors for stack traces
var OverlayError = errs.Class("overlay error")
Cache (#67) * add reference to dht to overlay client struct * wip * wip * Implement FindNode * get nodes * WIP * Merge in Dennis kademlia code, get it working with our code * ping and moar * WIP trying to get cache working with kademlia * WIP more wiring up * WIP * Update service cli commands * WIP * added GetNodes * added nodes to Kbucket * default transport changed to TCP * GetBuckets interface changed * filling in more routing * timestamp methods * removed store * Added initial network overlay explorer page * Updating and building with dockerfile * Working on adding bootstrap node code * WIP merging in dennis' code * WIP * connects cache to pkg/kademlia implementation * WIP redis cache * testing * Add bootstrap network function for CLI usage * cleanup * call bootstrap on init network * Add BootstrapNetwork function to interface * Merge in dennis kad code * WIP updates to redis/overlay client interface * WIP trying to get the DHT connected to the cache * go mod & test * deps * Bootstrap node now setting up correctly - Need to pass it through CLI commands better * WIP adding refresh and walk functions, added cli flags - added cli flags for custom bootstrap port and ip * PR comments addressed * adding FindStorageNodes to overlay cache * fix GetBucket * using SplitHostPort * Use JoinHostPort * updates to findstoragenodes response and request * WIP merge in progress, having issues with a panic * wip * adjustments * update port for dht bootstrap test * Docker * wip * dockerfile * fixes * makefile changes * Update port in NewKademlia call * Update local kademlia DHT config * kubernetes yaml * cleanup * making tests pass * k8s yaml * lint issues * Edit cli flags to allow for configurable bootstrap IP and Port args * cleanup * cache walking the network now * Rough prototype of Walk function laid out * Move walk function into bootstrap function * Update dht.go * changes to yaml * goimports
2018-06-05 22:06:37 +01:00
// DB implements the database for overlay.Cache.
//
// Cache delegates its node lookups, node selection and stats bookkeeping to
// the DB it was constructed with.
type DB interface {
	// SelectStorageNodes looks up nodes based on criteria.
	// Cache passes the number of nodes it wants as count and reports
	// ErrNotEnoughNodes when fewer nodes come back.
	SelectStorageNodes(ctx context.Context, count int, criteria *NodeCriteria) ([]*pb.Node, error)
	// SelectNewStorageNodes looks up nodes based on new node criteria.
	SelectNewStorageNodes(ctx context.Context, count int, criteria *NewNodeCriteria) ([]*pb.Node, error)

	// Get looks up the node by nodeID.
	Get(ctx context.Context, nodeID storj.NodeID) (*pb.Node, error)
	// GetAll looks up nodes based on the ids from the overlay cache.
	// NOTE(review): Cache.OfflineNodes treats a nil entry at index i as "node i
	// is offline", so the result is presumably index-aligned with nodeIDs —
	// confirm against the implementation.
	GetAll(ctx context.Context, nodeIDs storj.NodeIDList) ([]*pb.Node, error)
	// List lists nodes starting from cursor, limit being the requested page size.
	List(ctx context.Context, cursor storj.NodeID, limit int) ([]*pb.Node, error)
	// Paginate will page through the database nodes.
	// NOTE(review): the bool result presumably signals whether more nodes
	// remain after this page — confirm against the implementation.
	Paginate(ctx context.Context, offset int64, limit int) ([]*pb.Node, bool, error)
	// Update updates node information.
	Update(ctx context.Context, value *pb.Node) error
	// Delete deletes node based on id.
	Delete(ctx context.Context, id storj.NodeID) error

	// CreateStats initializes the stats for node.
	CreateStats(ctx context.Context, nodeID storj.NodeID, initial *NodeStats) (stats *NodeStats, err error)
	// GetStats returns node stats.
	GetStats(ctx context.Context, nodeID storj.NodeID) (stats *NodeStats, err error)
	// FindInvalidNodes finds a subset of storagenodes that have stats below provided reputation requirements.
	FindInvalidNodes(ctx context.Context, nodeIDs storj.NodeIDList, maxStats *NodeStats) (invalid storj.NodeIDList, err error)
	// UpdateStats updates all parts of a single storagenode's stats.
	UpdateStats(ctx context.Context, request *UpdateRequest) (stats *NodeStats, err error)
	// UpdateOperator updates the email and wallet for a given node ID for satellite payments.
	UpdateOperator(ctx context.Context, node storj.NodeID, updatedOperator pb.NodeOperator) (stats *NodeStats, err error)
	// UpdateUptime updates a single storagenode's uptime stats.
	UpdateUptime(ctx context.Context, nodeID storj.NodeID, isUp bool) (stats *NodeStats, err error)
	// UpdateBatch updates multiple storage nodes' stats.
	// NOTE(review): failed presumably holds the requests that could not be
	// applied — confirm against the implementation.
	UpdateBatch(ctx context.Context, requests []*UpdateRequest) (statslist []*NodeStats, failed []*UpdateRequest, err error)
	// CreateEntryIfNotExists creates a node stats entry if it didn't already exist.
	// Cache.Put uses the returned stats to fill in the node's reputation.
	CreateEntryIfNotExists(ctx context.Context, value *pb.Node) (stats *NodeStats, err error)
}
2019-03-23 08:06:11 +00:00
// FindStorageNodesRequest defines easy request parameters.
type FindStorageNodesRequest struct {
	// MinimumRequiredNodes is the number of reputable nodes to select; when it
	// is zero or negative, RequestedCount is used instead.
	MinimumRequiredNodes int
	// RequestedCount is the fallback node count used when MinimumRequiredNodes
	// is not positive.
	RequestedCount int
	// FreeBandwidth is copied into the selection criteria handed to the DB.
	FreeBandwidth int64
	// FreeDisk is copied into the selection criteria handed to the DB.
	FreeDisk int64
	// ExcludedNodes is handed to the DB as the Excluded list of the criteria.
	ExcludedNodes []storj.NodeID
}
// NodeCriteria are the requirements for selecting nodes.
//
// This file only fills the fields in; how each value is compared against a
// node is decided by the DB implementation.
type NodeCriteria struct {
	// FreeBandwidth is taken from the request's FreeBandwidth.
	FreeBandwidth int64
	// FreeDisk is taken from the request's FreeDisk.
	FreeDisk int64
	// AuditCount is set to the larger of the configured audit count and the
	// new-node audit threshold.
	AuditCount int64
	// AuditSuccessRatio is taken from the node selection preferences.
	AuditSuccessRatio float64
	// UptimeCount is taken from the node selection preferences.
	UptimeCount int64
	// UptimeSuccessRatio is taken from the preferences' UptimeRatio.
	UptimeSuccessRatio float64
	// Excluded is taken from the request's ExcludedNodes.
	Excluded []storj.NodeID
}
// NewNodeCriteria are the requirements for selecting new nodes.
//
// This file only fills the fields in; how each value is compared against a
// node is decided by the DB implementation.
type NewNodeCriteria struct {
	// FreeBandwidth is taken from the request's FreeBandwidth.
	FreeBandwidth int64
	// FreeDisk is taken from the request's FreeDisk.
	FreeDisk int64
	// AuditThreshold is taken from the preferences' NewNodeAuditThreshold.
	AuditThreshold int64
	// Excluded is taken from the request's ExcludedNodes.
	Excluded []storj.NodeID
}
// UpdateRequest is used to update a node status.
type UpdateRequest struct {
	// NodeID identifies the node the update applies to.
	NodeID storj.NodeID
	// AuditSuccess carries the audit outcome to record.
	AuditSuccess bool
	// IsUp carries the uptime outcome to record.
	IsUp bool
}
// NodeStats contains statistics about a node.
//
// Cache.Put copies the audit and uptime fields into the pb.NodeStats stored
// as the node's reputation; NodeID and Operator are not part of that copy.
type NodeStats struct {
	// NodeID identifies the node these stats belong to.
	NodeID storj.NodeID
	// Audit statistics.
	AuditSuccessRatio float64
	AuditSuccessCount int64
	AuditCount        int64
	// Uptime statistics.
	UptimeRatio        float64
	UptimeSuccessCount int64
	UptimeCount        int64
	// Operator holds the node operator information (see DB.UpdateOperator).
	Operator pb.NodeOperator
}
2019-03-23 08:06:11 +00:00
// Cache is used to store and handle node information
type Cache struct {
2019-03-23 08:06:11 +00:00
log *zap.Logger
db DB
preferences NodeSelectionConfig
2018-04-18 16:34:15 +01:00
}
// NewCache returns a new Cache
func NewCache(log *zap.Logger, db DB, preferences NodeSelectionConfig) *Cache {
2019-03-23 08:06:11 +00:00
return &Cache{
log: log,
db: db,
preferences: preferences,
}
}
Satellite Peer (#1034) * add satellite peer * Add overlay * reorganize kademlia * add RunRefresh * add refresh to storagenode.Peer * add discovery * add agreements and metainfo * rename * add datarepair checker * add repair * add todo notes for audit * add testing interface * add into testplanet * fixes * fix compilation errors * fix compilation errors * make testplanet run * remove audit refrences * ensure that audit tests run * dev * checker tests compilable * fix discovery * fix compilation * fix * fix * dev * fix * disable auth * fixes * revert go.mod/sum * fix linter errors * fix * fix copyright * Add address param for SN dashboard (#1076) * Rename storj-sdk to storj-sim (#1078) * Storagenode logs and config improvements (#1075) * Add more info to SN logs * remove config-dir from user config * add output where config was stored * add message for successful connection * fix linter * remove storage.path from user config * resolve config path * move success message to info * log improvements * Remove captplanet (#1070) * pkg/server: include production cert (#1082) Change-Id: Ie8e6fe78550be83c3bd797db7a1e58d37c684792 * Generate Payments Report (#1079) * memory.Size: autoformat sizes based on value entropy (#1081) * Jj/bytes (#1085) * run tally and rollup * sets dev default tally and rollup intervals * nonessential storj-sim edits (#1086) * Closing context doesn't stop storage node (#1084) * Print when cancelled * Close properly * Don't log nil * Don't print error when closing dashboard * Fix panic in inspector if ping fails (#1088) * Consolidate identity management to identity cli commands (#1083) * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. * fixes * linters * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. 
* fixes * sava backups before saving signed certs * add "-prebuilt-test-cmds" test flag * linters * prepare cli tests for travis * linter fixes * more fixes * linter gods * sp/sdk/sim * remove ca.difficulty * remove unused difficulty * return setup to its rightful place * wip travis * Revert "wip travis" This reverts commit 56834849dcf066d3cc0a4f139033fc3f6d7188ca. * typo in travis.yaml * remove tests * remove more * make it only create one identity at a time for consistency * add config-dir for consitency * add identity creation to storj-sim * add flags * simplify * fix nolint and compile * prevent overwrite and pass difficulty, concurrency, and parent creds * goimports
2019-01-18 13:54:08 +00:00
// Close closes resources. The cache currently holds nothing that needs
// releasing, so it always returns nil.
func (cache *Cache) Close() error {
	return nil
}
// Inspect is meant to list a limited number of items in the cache.
// It is not implemented yet and always returns a "not implemented" error.
func (cache *Cache) Inspect(ctx context.Context) (storage.Keys, error) {
	// TODO: implement inspection tools
	var keys storage.Keys
	return keys, errors.New("not implemented")
}
// List returns a list of nodes from the cache DB
2019-03-23 08:06:11 +00:00
func (cache *Cache) List(ctx context.Context, cursor storj.NodeID, limit int) (_ []*pb.Node, err error) {
defer mon.Task()(&ctx)(&err)
return cache.db.List(ctx, cursor, limit)
}
// Paginate returns a list of `limit` nodes starting from `start` offset.
2019-03-23 08:06:11 +00:00
func (cache *Cache) Paginate(ctx context.Context, offset int64, limit int) (_ []*pb.Node, _ bool, err error) {
defer mon.Task()(&ctx)(&err)
return cache.db.Paginate(ctx, offset, limit)
}
// Get looks up the provided nodeID from the overlay cache
2019-03-23 08:06:11 +00:00
func (cache *Cache) Get(ctx context.Context, nodeID storj.NodeID) (_ *pb.Node, err error) {
defer mon.Task()(&ctx)(&err)
2018-12-17 18:47:26 +00:00
if nodeID.IsZero() {
return nil, ErrEmptyNode
}
return cache.db.Get(ctx, nodeID)
2018-04-18 16:34:15 +01:00
}
2019-03-23 08:06:11 +00:00
// OfflineNodes returns indices of the nodes that are offline
func (cache *Cache) OfflineNodes(ctx context.Context, nodes []storj.NodeID) (offline []int, err error) {
defer mon.Task()(&ctx)(&err)
2019-03-23 08:06:11 +00:00
// TODO: optimize
results, err := cache.GetAll(ctx, nodes)
if err != nil {
return nil, err
}
for i, r := range results {
if r == nil {
offline = append(offline, i)
}
}
return offline, nil
}
// FindStorageNodes searches the overlay network for nodes that meet the
// provided requirements, using the preferences the cache was created with.
func (cache *Cache) FindStorageNodes(ctx context.Context, req FindStorageNodesRequest) ([]*pb.Node, error) {
	defaults := &cache.preferences
	return cache.FindStorageNodesWithPreferences(ctx, req, defaults)
}
// FindStorageNodesWithPreferences searches the overlay network for nodes that meet the provided criteria
func (cache *Cache) FindStorageNodesWithPreferences(ctx context.Context, req FindStorageNodesRequest, preferences *NodeSelectionConfig) (_ []*pb.Node, err error) {
defer mon.Task()(&ctx)(&err)
// TODO: verify logic
// TODO: add sanity limits to requested node count
// TODO: add sanity limits to excluded nodes
2019-03-23 08:06:11 +00:00
reputableNodeCount := req.MinimumRequiredNodes
if reputableNodeCount <= 0 {
2019-03-23 08:06:11 +00:00
reputableNodeCount = req.RequestedCount
}
auditCount := preferences.AuditCount
if auditCount < preferences.NewNodeAuditThreshold {
auditCount = preferences.NewNodeAuditThreshold
}
reputableNodes, err := cache.db.SelectStorageNodes(ctx, reputableNodeCount, &NodeCriteria{
2019-03-23 08:06:11 +00:00
FreeBandwidth: req.FreeBandwidth,
FreeDisk: req.FreeDisk,
AuditCount: auditCount,
AuditSuccessRatio: preferences.AuditSuccessRatio,
UptimeCount: preferences.UptimeCount,
UptimeSuccessRatio: preferences.UptimeRatio,
2019-03-23 08:06:11 +00:00
Excluded: req.ExcludedNodes,
})
if err != nil {
return nil, err
}
newNodeCount := int64(float64(reputableNodeCount) * preferences.NewNodePercentage)
newNodes, err := cache.db.SelectNewStorageNodes(ctx, int(newNodeCount), &NewNodeCriteria{
2019-03-23 08:06:11 +00:00
FreeBandwidth: req.FreeBandwidth,
FreeDisk: req.FreeDisk,
AuditThreshold: preferences.NewNodeAuditThreshold,
2019-03-23 08:06:11 +00:00
Excluded: req.ExcludedNodes,
})
if err != nil {
return nil, err
}
nodes := []*pb.Node{}
nodes = append(nodes, newNodes...)
nodes = append(nodes, reputableNodes...)
if len(reputableNodes) < reputableNodeCount {
return nodes, ErrNotEnoughNodes.New("requested %d found %d", reputableNodeCount, len(reputableNodes))
}
return nodes, nil
}
// GetAll looks up the provided ids from the overlay cache
2019-03-23 08:06:11 +00:00
func (cache *Cache) GetAll(ctx context.Context, ids storj.NodeIDList) (_ []*pb.Node, err error) {
defer mon.Task()(&ctx)(&err)
if len(ids) == 0 {
return nil, OverlayError.New("no ids provided")
}
return cache.db.GetAll(ctx, ids)
}
2019-03-23 08:06:11 +00:00
// Put adds a node id and proto definition into the overlay cache and stat db
func (cache *Cache) Put(ctx context.Context, nodeID storj.NodeID, value pb.Node) (err error) {
defer mon.Task()(&ctx)(&err)
// If we get a Node without an ID (i.e. bootstrap node)
// we don't want to add to the routing tbale
2018-12-17 18:47:26 +00:00
if nodeID.IsZero() {
return nil
}
if nodeID != value.Id {
return errors.New("invalid request")
}
// get existing node rep, or create a new overlay node with 0 rep
stats, err := cache.db.CreateEntryIfNotExists(ctx, &value)
if err != nil {
return err
}
value.Reputation = &pb.NodeStats{
AuditSuccessRatio: stats.AuditSuccessRatio,
AuditSuccessCount: stats.AuditSuccessCount,
AuditCount: stats.AuditCount,
UptimeRatio: stats.UptimeRatio,
UptimeSuccessCount: stats.UptimeSuccessCount,
UptimeCount: stats.UptimeCount,
}
return cache.db.Update(ctx, &value)
2018-04-18 16:34:15 +01:00
}
// Delete will remove the node from the cache. Used when a node hard disconnects or fails
// to pass a PING multiple times.
2019-03-23 08:06:11 +00:00
func (cache *Cache) Delete(ctx context.Context, id storj.NodeID) (err error) {
defer mon.Task()(&ctx)(&err)
if id.IsZero() {
return ErrEmptyNode
}
2019-03-23 08:06:11 +00:00
return cache.db.Delete(ctx, id)
}
// Create adds a new stats entry for node, seeded with initial, by delegating
// to the DB's CreateStats.
func (cache *Cache) Create(ctx context.Context, nodeID storj.NodeID, initial *NodeStats) (stats *NodeStats, err error) {
	defer mon.Task()(&ctx)(&err)

	stats, err = cache.db.CreateStats(ctx, nodeID, initial)
	return stats, err
}
// GetStats returns the stats the DB holds for nodeID.
func (cache *Cache) GetStats(ctx context.Context, nodeID storj.NodeID) (stats *NodeStats, err error) {
	defer mon.Task()(&ctx)(&err)

	stats, err = cache.db.GetStats(ctx, nodeID)
	return stats, err
}
// FindInvalidNodes finds the subset of nodeIDs whose stats are below the
// reputation requirements given in maxStats, as determined by the DB.
func (cache *Cache) FindInvalidNodes(ctx context.Context, nodeIDs storj.NodeIDList, maxStats *NodeStats) (invalid storj.NodeIDList, err error) {
	defer mon.Task()(&ctx)(&err)

	invalid, err = cache.db.FindInvalidNodes(ctx, nodeIDs, maxStats)
	return invalid, err
}
// UpdateStats updates all parts of a single storagenode's stats according to
// request and returns the stats reported by the DB.
func (cache *Cache) UpdateStats(ctx context.Context, request *UpdateRequest) (stats *NodeStats, err error) {
	defer mon.Task()(&ctx)(&err)

	stats, err = cache.db.UpdateStats(ctx, request)
	return stats, err
}
// UpdateOperator updates the email and wallet for a given node ID for
// satellite payments and returns the stats reported by the DB.
func (cache *Cache) UpdateOperator(ctx context.Context, node storj.NodeID, updatedOperator pb.NodeOperator) (stats *NodeStats, err error) {
	defer mon.Task()(&ctx)(&err)

	stats, err = cache.db.UpdateOperator(ctx, node, updatedOperator)
	return stats, err
}
// UpdateUptime records an uptime result (isUp) for a single storagenode and
// returns the stats reported by the DB.
func (cache *Cache) UpdateUptime(ctx context.Context, nodeID storj.NodeID, isUp bool) (stats *NodeStats, err error) {
	defer mon.Task()(&ctx)(&err)

	stats, err = cache.db.UpdateUptime(ctx, nodeID, isUp)
	return stats, err
}
// ConnFailure implements the Transport Observer `ConnFailure` function
func (cache *Cache) ConnFailure(ctx context.Context, node *pb.Node, failureError error) {
2019-03-23 08:06:11 +00:00
var err error
defer mon.Task()(&ctx)(&err)
// TODO: Kademlia paper specifies 5 unsuccessful PINGs before removing the node
// from our routing table, but this is the cache so maybe we want to treat
// it differently.
_, err = cache.db.UpdateUptime(ctx, node.Id, false)
if err != nil {
zap.L().Debug("error updating uptime for node", zap.Error(err))
}
}
// ConnSuccess implements the Transport Observer `ConnSuccess` function
func (cache *Cache) ConnSuccess(ctx context.Context, node *pb.Node) {
2019-03-23 08:06:11 +00:00
var err error
defer mon.Task()(&ctx)(&err)
err = cache.Put(ctx, node.Id, *node)
if err != nil {
zap.L().Debug("error updating uptime for node", zap.Error(err))
}
_, err = cache.db.UpdateUptime(ctx, node.Id, true)
if err != nil {
zap.L().Debug("error updating node connection info", zap.Error(err))
}
}