storj/pkg/discovery/service.go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package discovery

import (
	"context"
	"crypto/rand"
	"time"

	"github.com/zeebo/errs"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"

	"storj.io/storj/internal/sync2"
	"storj.io/storj/pkg/kademlia"
	"storj.io/storj/pkg/overlay"
	"storj.io/storj/pkg/storj"
)

var (
	// mon = monkit.Package() //TODO: check whether this needs monitoring

	// Error is a general error class of this package
	Error = errs.Class("discovery error")
)

// Config holds the configuration values for the discovery cache.
type Config struct {
	RefreshInterval   time.Duration `help:"the interval at which the cache refreshes itself in seconds" default:"1s"`
	GraveyardInterval time.Duration `help:"the interval at which the graveyard tries to resurrect nodes" default:"30s"`
	DiscoveryInterval time.Duration `help:"the interval at which the satellite attempts to find new nodes via random node ID lookups" default:"1s"`
	RefreshLimit      int           `help:"the number of nodes refreshed at each interval" default:"100"`
}
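
// Note (assumption, not stated in this file): the `help` and `default` struct
// tags above are read by the config loader, so these fields typically surface
// as settings along the lines of --discovery.refresh-interval=1s or
// --discovery.refresh-limit=100; the exact flag names depend on how the peer
// binds this struct.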

// Discovery holds the overlay cache, the Kademlia instance, and the cycles
// that keep the cache up to date.
type Discovery struct {
	log   *zap.Logger
	cache *overlay.Cache
	kad   *kademlia.Kademlia

	// refreshOffset tracks the offset of the current refresh cycle
	refreshOffset int64
	refreshLimit  int

	Refresh   sync2.Cycle
	Graveyard sync2.Cycle
	Discovery sync2.Cycle
}
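
// As used below, each sync2.Cycle repeatedly invokes a callback at its
// configured interval once started, and stops when it is closed or when its
// context is canceled.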

// New returns a new discovery service.
func New(logger *zap.Logger, ol *overlay.Cache, kad *kademlia.Kademlia, config Config) *Discovery {
	discovery := &Discovery{
		log:   logger,
		cache: ol,
		kad:   kad,

		refreshOffset: 0,
		refreshLimit:  config.RefreshLimit,
	}

	discovery.Refresh.SetInterval(config.RefreshInterval)
	discovery.Graveyard.SetInterval(config.GraveyardInterval)
	discovery.Discovery.SetInterval(config.DiscoveryInterval)

	return discovery
}
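
// Illustrative wiring (an assumption, not part of this file): a peer would
// typically construct the service and drive it from its run group, e.g.
//
//	disc := discovery.New(log, overlayCache, kad, discoveryConfig)
//	group.Go(func() error { return disc.Run(ctx) })
//	// and call disc.Close() during shutdown to stop the cycles.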

// Close closes resources
func (discovery *Discovery) Close() error {
	discovery.Refresh.Close()
	discovery.Graveyard.Close()
	discovery.Discovery.Close()
	return nil
}

// Run runs the discovery service
func (discovery *Discovery) Run(ctx context.Context) error {
	var group errgroup.Group

	discovery.Refresh.Start(ctx, &group, func(ctx context.Context) error {
		err := discovery.refresh(ctx)
		if err != nil {
			discovery.log.Error("error with cache refresh: ", zap.Error(err))
		}
		return nil
	})
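
	// Each cycle logs and swallows its own error (the callbacks always return
	// nil), so a transient failure in one cycle does not stop the others; Run
	// only returns once the context is canceled or the cycles are closed.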

	discovery.Graveyard.Start(ctx, &group, func(ctx context.Context) error {
		err := discovery.searchGraveyard(ctx)
		if err != nil {
			discovery.log.Error("graveyard resurrection failed: ", zap.Error(err))
		}
		return nil
	})

	discovery.Discovery.Start(ctx, &group, func(ctx context.Context) error {
		err := discovery.discover(ctx)
		if err != nil {
			discovery.log.Error("error with cache discovery: ", zap.Error(err))
		}
		return nil
	})

	return group.Wait()
}

// refresh updates the cache db with the current DHT.
// We currently do not penalize nodes that are unresponsive,
// but should in the future.
func (discovery *Discovery) refresh(ctx context.Context) error {
	nodes := discovery.kad.Seen()
	for _, v := range nodes {
		if err := discovery.cache.Put(ctx, v.Id, *v); err != nil {
			return err
		}
	}

	list, more, err := discovery.cache.Paginate(ctx, discovery.refreshOffset, discovery.refreshLimit)
	if err != nil {
		return Error.Wrap(err)
	}

	// more means there are more rows to page through in the cache
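	// When the cache reports no more rows, the offset wraps back to zero so the
	// next refresh pass starts over from the beginning; otherwise it advances by
	// the number of rows just returned.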
	if !more {
		discovery.refreshOffset = 0
	} else {
		discovery.refreshOffset += int64(len(list))
	}

	for _, node := range list {
		if ctx.Err() != nil {
			return ctx.Err()
		}

		ping, err := discovery.kad.Ping(ctx, node.Node)
		if err != nil {
			discovery.log.Info("could not ping node", zap.String("ID", node.Id.String()), zap.Error(err))
			_, err := discovery.cache.UpdateUptime(ctx, node.Id, false)
			if err != nil {
				discovery.log.Error("could not update node uptime in cache", zap.String("ID", node.Id.String()), zap.Error(err))
			}
			continue
		}

		if ctx.Err() != nil {
			return ctx.Err()
		}

		_, err = discovery.cache.UpdateUptime(ctx, ping.Id, true)
		if err != nil {
			discovery.log.Error("could not update node uptime in cache", zap.String("ID", ping.Id.String()), zap.Error(err))
		}

		err = discovery.cache.Put(ctx, ping.Id, ping)
		if err != nil {
			discovery.log.Error("could not put node into cache", zap.String("ID", ping.Id.String()), zap.Error(err))
		}

		// fetch the node's info so the cache has up-to-date operator details (e.g. wallet)
		info, err := discovery.kad.FetchInfo(ctx, node.Node)
		if err != nil {
			discovery.log.Warn("could not fetch node info", zap.String("ID", ping.GetAddress().String()))
			continue
		}

		_, err = discovery.cache.UpdateNodeInfo(ctx, ping.Id, info)
		if err != nil {
			discovery.log.Warn("could not update node operator", zap.String("ID", ping.GetAddress().String()))
		}
	}

	return nil
}

// searchGraveyard attempts to ping all nodes in Kademlia's Seen() list and puts
// them back into the cache if they respond. This is an attempt to resurrect nodes
// that may have gone offline in the last hour and were removed from the cache due
// to an unsuccessful response.
func (discovery *Discovery) searchGraveyard(ctx context.Context) error {
	seen := discovery.kad.Seen()

	var errors errs.Group
	for _, n := range seen {
		if ctx.Err() != nil {
			return ctx.Err()
		}

		ping, err := discovery.kad.Ping(ctx, *n)
		if err != nil {
			discovery.log.Debug("could not ping node in graveyard check")
			// we don't report the ping error to the error group because it is expected here.
			continue
		}

		if ctx.Err() != nil {
			return ctx.Err()
		}

		err = discovery.cache.Put(ctx, ping.Id, ping)
		if err != nil {
			discovery.log.Warn("could not put node into cache")
			errors.Add(err)
		}

		_, err = discovery.cache.UpdateUptime(ctx, ping.Id, true)
		if err != nil {
			discovery.log.Warn("could not update node uptime")
			errors.Add(err)
		}
	}
	return errors.Err()
}

// discover runs lookups for random node IDs to find new nodes in the network.
func (discovery *Discovery) discover(ctx context.Context) error {
	r, err := randomID()
	if err != nil {
		return Error.Wrap(err)
	}
	_, err = discovery.kad.FindNode(ctx, r)
	if err != nil && !kademlia.NodeNotFound.Has(err) {
		return Error.Wrap(err)
	}
	return nil
}
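
// Looking up a random target exercises Kademlia's iterative lookup: the routing
// table is walked toward the random ID, and any previously unseen nodes contacted
// along the way become visible to the next refresh pass. That is why a "node not
// found" result above is not treated as an error.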

// walk iterates over each node in each bucket to traverse the network
func (discovery *Discovery) walk(ctx context.Context) error {
	// TODO: This should walk the cache, rather than be a duplicate of refresh
	return nil
}
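
// randomID returns a cryptographically random node ID (storj node IDs are 32
// bytes), used by discover as a lookup target.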
func randomID() (storj.NodeID, error) {
	b := make([]byte, 32)
	_, err := rand.Read(b)
	if err != nil {
		return storj.NodeID{}, Error.Wrap(err)
	}
	return storj.NodeIDFromBytes(b)
}