bae4c820ee
* Initial Webserver Draft for Version Controlling
* Rename type to avoid confusion
* Move Function Calls into Version Package
* Fix Linting and Language Typos
* Fix Linting and Spelling Mistakes
* Include Copyright
* Include Copyright
* Adjust Version-Control Server to return list of Versions
* Linting
* Improve Request Handling and Readability
* Add Configuration File Option Add Systemd Service file
* Add Logging to File
* Smaller Changes
* Add Semantic Versioning and refuses outdated Software from Startup (#1612)
* implements internal Semantic Version library
* adds version logging + reporting to process
* Advance SemVer struct for easier handling
* Add Accepted Version Store
* Fix Function
* Restructure
* Type Conversion
* Handle Version String properly
* Add Note about array index
* Set temporary Default Version
* Add Copyright
* Adding Version to Dashboard
* Adding Version Info Log
* Renaming and adding CheckerProcess
* Iteration Sync
* Iteration V2
* linting
* made LogAndReportVersion a go routine
* Refactor to Go Routine
* Add Context to Go Routine and allow Operation if Lookup to Control Server fails
* Handle Unmarshal properly
* Linting
* Relocate Version Checks
* Relocating Version Check and specified default Version for now
* Linting Error Prevention
* Refuse Startup on outdated Version
* Add Startup Check Function
* Straighten Logging
* Don't force Shutdown if --dev flag is set
* Create full Service/Peer Structure for ControlServer
* Linting
* Straightening Naming
* Finish VersionControl Service Layout
* Improve Error Handling
* Change Listening Address
* Move Checker Function
* Remove VersionControl Peer
* Linting
* Linting
* Create VersionClient Service
* Renaming
* Add Version Client to Peer Definitions
* Linting and Renaming
* Linting
* Remove Transport Checks for now
* Move to Client Side Flag
* Remove check
* Linting
* Transport Client Version Intro
* Adding Version Client to Transport Client
* Add missing parameter
* Adding Version Check, to set Allowed = true
* Set Default to true, testing
* Restructuring Code
* Uplink Changes
* Add more proper Defaults
* Renaming of Version struct
* Don't pass Service use Pointer
* Set Defaults for Versioning Checks
* Put HTTP Server in go routine
* Add Versioncontrol to Storj-Sim
* Testplanet Fixes
* Linting
* Add Error Handling and new Server Struct
* Move Lock slightly
* Reduce Race Potentials
* Remove unnecessary files
* Linting
* Add Proper Transport Handling
* small fixes
* add fence for allowed check
* Add Startup Version Check and Service Naming
* make errormessage private
* Add Comments about VersionedClient
* Linting
* Remove Checks that refuse outgoing connections
* Remove release cmd
* Add Release Script
* Linting
* Update to use correct Values
* Change Timestamp handling
* Adding Protobuf changes back in
* Adding SatelliteDB Changes and adding Storj Node Version to PB
* Add Migration Table
* Add Default Stats for Creation
* Move to BigInt
* Proper SQL Migration
* Ensure minimum Version is passed to the node selection
* Linting...
* Remove VersionedClient and adjust smaller changes from prior merge
* Linting
* Fix PB Message Handling and Query for Node Selection
* some future-proofing type changes Change-Id: I3cb5018dcccdbc9739fe004d859065992720caaf
* fix a compiler error Change-Id: If66bb92d8b98e31cd618ecec9c6448ab9b037fa5
* Comment on Constant for Overlay
* Remove NOT NULL and add epoch call as function
* add versions to bootstrap and satellites Change-Id: I436944589ea5f21600cdd997742a84fe0b16e47b
* Change Update Migration
* Fix DB Migration
* Increase Timeout temporarily, to see what's going on
* Remove unnecessary const and vars Cleanup Function calls from deprecated NodeVersion struct
* Updated Protobuf, removed deprecated Code from Inspector
* Implement NodeVersion into InfoResponse
* Regenerated locked.go
* Linting
* Fix Tests
* Remove unnecessary constant
* Update Function and Flag Description
* Remove Empty Stat Creation
* return properly with error
* Remove unnecessary struct
* simplify migration step
* Update Inspector to return Version Info
* Update local Endpoint Version Handling
* Reset Travis Timeout
* Add Default for CommitHash
* single quotes
236 lines
6.5 KiB
Go
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package discovery

import (
	"context"
	"crypto/rand"
	"time"

	"github.com/zeebo/errs"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"

	"storj.io/storj/internal/sync2"
	"storj.io/storj/pkg/kademlia"
	"storj.io/storj/pkg/overlay"
	"storj.io/storj/pkg/storj"
)

var (
	// mon = monkit.Package() //TODO: check whether this needs monitoring

	// Error is a general error class of this package
	Error = errs.Class("discovery error")
)

// Config holds the configuration values for the cache
type Config struct {
	RefreshInterval   time.Duration `help:"the interval at which the cache refreshes itself in seconds" default:"1s"`
	GraveyardInterval time.Duration `help:"the interval at which the graveyard tries to resurrect nodes" default:"30s"`
	DiscoveryInterval time.Duration `help:"the interval at which the satellite attempts to find new nodes via random node ID lookups" default:"1s"`
	RefreshLimit      int           `help:"the number of nodes refreshed at each interval" default:"100"`
}
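
// Note: the help/default struct tags above are read by the configuration
// system when building CLI flags and config entries; a field such as
// RefreshInterval would surface roughly as --discovery.refresh-interval
// (the exact flag name depends on where this Config is embedded).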

// Discovery holds the cache, Kademlia instance, and cycle state for the discovery service
type Discovery struct {
	log   *zap.Logger
	cache *overlay.Cache
	kad   *kademlia.Kademlia

	// refreshOffset tracks the offset of the current refresh cycle
	refreshOffset int64
	refreshLimit  int

	Refresh   sync2.Cycle
	Graveyard sync2.Cycle
	Discovery sync2.Cycle
}

// New returns a new discovery service.
func New(logger *zap.Logger, ol *overlay.Cache, kad *kademlia.Kademlia, config Config) *Discovery {
	discovery := &Discovery{
		log:   logger,
		cache: ol,
		kad:   kad,

		refreshOffset: 0,
		refreshLimit:  config.RefreshLimit,
	}

	discovery.Refresh.SetInterval(config.RefreshInterval)
	discovery.Graveyard.SetInterval(config.GraveyardInterval)
	discovery.Discovery.SetInterval(config.DiscoveryInterval)

	return discovery
}

// Close closes resources
func (discovery *Discovery) Close() error {
	discovery.Refresh.Close()
	discovery.Graveyard.Close()
	discovery.Discovery.Close()
	return nil
}

// Run runs the discovery service
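// It starts the refresh, graveyard, and discovery cycles concurrently and
// blocks until the cycles stop (normally when ctx is canceled); errors from
// individual passes are logged rather than returned, so a single failed pass
// does not stop the service.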
func (discovery *Discovery) Run(ctx context.Context) error {
	var group errgroup.Group
	discovery.Refresh.Start(ctx, &group, func(ctx context.Context) error {
		err := discovery.refresh(ctx)
		if err != nil {
			discovery.log.Error("error with cache refresh: ", zap.Error(err))
		}
		return nil
	})
	discovery.Graveyard.Start(ctx, &group, func(ctx context.Context) error {
		err := discovery.searchGraveyard(ctx)
		if err != nil {
			discovery.log.Error("graveyard resurrection failed: ", zap.Error(err))
		}
		return nil
	})
	discovery.Discovery.Start(ctx, &group, func(ctx context.Context) error {
		err := discovery.discover(ctx)
		if err != nil {
			discovery.log.Error("error with cache discovery: ", zap.Error(err))
		}
		return nil
	})
	return group.Wait()
}

// refresh updates the cache db with the current DHT.
// We currently do not penalize nodes that are unresponsive,
// but should in the future.
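// Each pass puts recently seen Kademlia nodes into the cache, pages through
// refreshLimit cached nodes starting at refreshOffset, pings each one, and
// updates its uptime and node info accordingly.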
func (discovery *Discovery) refresh(ctx context.Context) error {
	nodes := discovery.kad.Seen()
	for _, v := range nodes {
		if err := discovery.cache.Put(ctx, v.Id, *v); err != nil {
			return err
		}
	}

	list, more, err := discovery.cache.Paginate(ctx, discovery.refreshOffset, discovery.refreshLimit)
	if err != nil {
		return Error.Wrap(err)
	}

	// more means there are more rows to page through in the cache
	if !more {
		// the cache has been fully paged through; start the next pass from the beginning
		discovery.refreshOffset = 0
	} else {
		discovery.refreshOffset += int64(len(list))
	}

	for _, node := range list {
		if ctx.Err() != nil {
			return ctx.Err()
		}

		ping, err := discovery.kad.Ping(ctx, node.Node)
		if err != nil {
			discovery.log.Info("could not ping node", zap.String("ID", node.Id.String()), zap.Error(err))
			_, err := discovery.cache.UpdateUptime(ctx, node.Id, false)
			if err != nil {
				discovery.log.Error("could not update node uptime in cache", zap.String("ID", node.Id.String()), zap.Error(err))
			}
			continue
		}

		if ctx.Err() != nil {
			return ctx.Err()
		}

		_, err = discovery.cache.UpdateUptime(ctx, ping.Id, true)
		if err != nil {
			discovery.log.Error("could not update node uptime in cache", zap.String("ID", ping.Id.String()), zap.Error(err))
		}
		err = discovery.cache.Put(ctx, ping.Id, ping)
		if err != nil {
			discovery.log.Error("could not put node into cache", zap.String("ID", ping.Id.String()), zap.Error(err))
		}

		// update cached node info (including the operator wallet) with fresh values from the node
		info, err := discovery.kad.FetchInfo(ctx, node.Node)
		if err != nil {
			discovery.log.Warn("could not fetch node info", zap.String("ID", ping.Id.String()))
			continue
		}

		_, err = discovery.cache.UpdateNodeInfo(ctx, ping.Id, info)
		if err != nil {
			discovery.log.Warn("could not update node info", zap.String("ID", ping.Id.String()))
		}
	}

	return nil
}

// searchGraveyard attempts to ping all nodes in the Seen() map from Kademlia and adds them to the cache
// if they respond. This is an attempt to resurrect nodes that may have gone offline in the last hour
// and were removed from the cache due to an unsuccessful response.
func (discovery *Discovery) searchGraveyard(ctx context.Context) error {
	seen := discovery.kad.Seen()

	var errors errs.Group
	for _, n := range seen {
		if ctx.Err() != nil {
			return ctx.Err()
		}

		ping, err := discovery.kad.Ping(ctx, *n)
		if err != nil {
			discovery.log.Debug("could not ping node in graveyard check")
			// we don't want to report the ping error to ErrorGroup because it's to be expected here.
			continue
		}

		if ctx.Err() != nil {
			return ctx.Err()
		}

		err = discovery.cache.Put(ctx, ping.Id, ping)
		if err != nil {
			discovery.log.Warn("could not put node into cache")
			errors.Add(err)
		}

		_, err = discovery.cache.UpdateUptime(ctx, ping.Id, true)
		if err != nil {
			discovery.log.Warn("could not update node uptime")
			errors.Add(err)
		}
	}
	return errors.Err()
}

// discover runs lookups for random node IDs to find new nodes in the network
func (discovery *Discovery) discover(ctx context.Context) error {
	r, err := randomID()
	if err != nil {
		return Error.Wrap(err)
	}
	_, err = discovery.kad.FindNode(ctx, r)
	if err != nil && !kademlia.NodeNotFound.Has(err) {
		return Error.Wrap(err)
	}
	return nil
}

// walk iterates over each node in each bucket to traverse the network
func (discovery *Discovery) walk(ctx context.Context) error {
	// TODO: This should walk the cache, rather than be a duplicate of refresh
	return nil
}

func randomID() (storj.NodeID, error) {
	b := make([]byte, 32)
	_, err := rand.Read(b)
	if err != nil {
		return storj.NodeID{}, Error.Wrap(err)
	}
	return storj.NodeIDFromBytes(b)
}
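
// Usage sketch: a minimal illustration of wiring up the discovery service,
// assuming the caller already holds a *zap.Logger, an *overlay.Cache, and a
// *kademlia.Kademlia; the variable names below are illustrative only.
//
//	service := discovery.New(logger, cache, kad, discovery.Config{
//		RefreshInterval:   1 * time.Second,
//		GraveyardInterval: 30 * time.Second,
//		DiscoveryInterval: 1 * time.Second,
//		RefreshLimit:      100,
//	})
//	defer func() { _ = service.Close() }()
//
//	// Run blocks until ctx is canceled.
//	if err := service.Run(ctx); err != nil {
//		logger.Error("discovery stopped", zap.Error(err))
//	}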