59b37db670
The current implementation blocks the the startup until one or none of the trusted satellites is able to reach the node via QUIC. This can cause delayed startup. Also, the quic check is done once during startup, and if there is a misconfiguration later, snos would have to restart to node. In this change, we reuse the contact service which pings the satellite periodically for node checkin. During checkin the satellite tries pinging the node back via both TCP and QUIC and reports both statuses. WIth this, we are able to get a periodic update of the QUIC status without restarting the node. Also adds the time the node was last pinged via QUIC to the tooltip on the QUIC status tab. Resolves https://github.com/storj/storj/issues/4398 Change-Id: I18aa2a8e8d44e8187f8f2eb51f398fa6073882a4
238 lines
6.6 KiB
Go
238 lines
6.6 KiB
Go
// Copyright (C) 2019 Storj Labs, Inc.
|
|
// See LICENSE for copying information.
|
|
|
|
package contact
|
|
|
|
import (
|
|
"context"
|
|
"math/rand"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/spacemonkeygo/monkit/v3"
|
|
"github.com/zeebo/errs"
|
|
"go.uber.org/zap"
|
|
"golang.org/x/sync/errgroup"
|
|
|
|
"storj.io/common/pb"
|
|
"storj.io/common/rpc"
|
|
"storj.io/common/storj"
|
|
"storj.io/common/sync2"
|
|
"storj.io/storj/storagenode/trust"
|
|
)
|
|
|
|
var (
|
|
mon = monkit.Package()
|
|
|
|
// Error is the default error class for contact package.
|
|
Error = errs.Class("contact")
|
|
|
|
errPingSatellite = errs.Class("ping satellite")
|
|
)
|
|
|
|
const initialBackOff = time.Second
|
|
|
|
// Config contains configurable values for contact service.
|
|
type Config struct {
|
|
ExternalAddress string `user:"true" help:"the public address of the node, useful for nodes behind NAT" default:""`
|
|
|
|
// Chore config values
|
|
Interval time.Duration `help:"how frequently the node contact chore should run" releaseDefault:"1h" devDefault:"30s"`
|
|
}
|
|
|
|
// NodeInfo contains information necessary for introducing storagenode to satellite.
|
|
type NodeInfo struct {
|
|
ID storj.NodeID
|
|
Address string
|
|
Version pb.NodeVersion
|
|
Capacity pb.NodeCapacity
|
|
Operator pb.NodeOperator
|
|
}
|
|
|
|
// Service is the contact service between storage nodes and satellites.
|
|
type Service struct {
|
|
log *zap.Logger
|
|
rand *rand.Rand
|
|
dialer rpc.Dialer
|
|
|
|
mu sync.Mutex
|
|
self NodeInfo
|
|
|
|
trust *trust.Pool
|
|
quicStats *QUICStats
|
|
|
|
initialized sync2.Fence
|
|
}
|
|
|
|
// NewService creates a new contact service.
|
|
func NewService(log *zap.Logger, dialer rpc.Dialer, self NodeInfo, trust *trust.Pool, quicStats *QUICStats) *Service {
|
|
return &Service{
|
|
log: log,
|
|
rand: rand.New(rand.NewSource(time.Now().UnixNano())),
|
|
dialer: dialer,
|
|
trust: trust,
|
|
self: self,
|
|
quicStats: quicStats,
|
|
}
|
|
}
|
|
|
|
// PingSatellites attempts to ping all satellites in trusted list until backoff reaches maxInterval.
|
|
func (service *Service) PingSatellites(ctx context.Context, maxInterval time.Duration) (err error) {
|
|
defer mon.Task()(&ctx)(&err)
|
|
satellites := service.trust.GetSatellites(ctx)
|
|
var group errgroup.Group
|
|
for _, satellite := range satellites {
|
|
satellite := satellite
|
|
group.Go(func() error {
|
|
return service.pingSatellite(ctx, satellite, maxInterval)
|
|
})
|
|
}
|
|
return group.Wait()
|
|
}
|
|
|
|
func (service *Service) pingSatellite(ctx context.Context, satellite storj.NodeID, maxInterval time.Duration) error {
|
|
interval := initialBackOff
|
|
attempts := 0
|
|
for {
|
|
|
|
mon.Meter("satellite_contact_request").Mark(1) //mon:locked
|
|
|
|
err := service.pingSatelliteOnce(ctx, satellite)
|
|
attempts++
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
service.log.Error("ping satellite failed ", zap.Stringer("Satellite ID", satellite), zap.Int("attempts", attempts), zap.Error(err))
|
|
|
|
// Sleeps until interval times out, then continue. Returns if context is cancelled.
|
|
if !sync2.Sleep(ctx, interval) {
|
|
service.log.Info("context cancelled", zap.Stringer("Satellite ID", satellite))
|
|
return nil
|
|
}
|
|
interval *= 2
|
|
if interval >= maxInterval {
|
|
service.log.Info("retries timed out for this cycle", zap.Stringer("Satellite ID", satellite))
|
|
return nil
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
func (service *Service) pingSatelliteOnce(ctx context.Context, id storj.NodeID) (err error) {
|
|
defer mon.Task()(&ctx, id)(&err)
|
|
|
|
conn, err := service.dialSatellite(ctx, id)
|
|
if err != nil {
|
|
return errPingSatellite.Wrap(err)
|
|
}
|
|
defer func() { err = errs.Combine(err, conn.Close()) }()
|
|
|
|
self := service.Local()
|
|
resp, err := pb.NewDRPCNodeClient(conn).CheckIn(ctx, &pb.CheckInRequest{
|
|
Address: self.Address,
|
|
Version: &self.Version,
|
|
Capacity: &self.Capacity,
|
|
Operator: &self.Operator,
|
|
})
|
|
service.quicStats.SetStatus(false)
|
|
if err != nil {
|
|
return errPingSatellite.Wrap(err)
|
|
}
|
|
if resp != nil {
|
|
service.quicStats.SetStatus(resp.PingNodeSuccessQuic)
|
|
|
|
if !resp.PingNodeSuccess {
|
|
return errPingSatellite.New("%s", resp.PingErrorMessage)
|
|
}
|
|
}
|
|
if resp.PingErrorMessage != "" {
|
|
service.log.Warn("Your node is still considered to be online but encountered an error.", zap.Stringer("Satellite ID", id), zap.String("Error", resp.GetPingErrorMessage()))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// RequestPingMeQUIC sends pings request to satellite for a pingBack via QUIC.
|
|
func (service *Service) RequestPingMeQUIC(ctx context.Context) (stats *QUICStats, err error) {
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
stats = NewQUICStats(true)
|
|
|
|
satellites := service.trust.GetSatellites(ctx)
|
|
if len(satellites) < 1 {
|
|
return nil, errPingSatellite.New("no trusted satellite available")
|
|
}
|
|
|
|
// Shuffle the satellites
|
|
// All the Storagenodes get a default list of trusted satellites (The Storj DCS ones) and
|
|
// most of the SN operators don't change the list, hence if it always starts with
|
|
// the same satellite we are going to put always more pressure on the first trusted
|
|
// satellite on the list. So we iterate over the list of trusted satellites in a
|
|
// random order to avoid putting pressure on the first trusted on the list
|
|
service.rand.Shuffle(len(satellites), func(i, j int) {
|
|
satellites[i], satellites[j] = satellites[j], satellites[i]
|
|
})
|
|
|
|
for _, satellite := range satellites {
|
|
err = service.requestPingMeOnce(ctx, satellite)
|
|
if err != nil {
|
|
stats.SetStatus(false)
|
|
// log warning and try the next trusted satellite
|
|
service.log.Warn("failed PingMe request to satellite", zap.Stringer("Satellite ID", satellite), zap.Error(err))
|
|
continue
|
|
}
|
|
|
|
stats.SetStatus(true)
|
|
|
|
return stats, nil
|
|
}
|
|
|
|
return stats, errPingSatellite.New("failed to ping storage node using QUIC: %q", err)
|
|
}
|
|
|
|
func (service *Service) requestPingMeOnce(ctx context.Context, satellite storj.NodeID) (err error) {
|
|
defer mon.Task()(&ctx, satellite)(&err)
|
|
|
|
conn, err := service.dialSatellite(ctx, satellite)
|
|
if err != nil {
|
|
return errPingSatellite.Wrap(err)
|
|
}
|
|
defer func() { err = errs.Combine(err, conn.Close()) }()
|
|
|
|
node := service.Local()
|
|
_, err = pb.NewDRPCNodeClient(conn).PingMe(ctx, &pb.PingMeRequest{
|
|
Address: node.Address,
|
|
Transport: pb.NodeTransport_QUIC_GRPC,
|
|
})
|
|
if err != nil {
|
|
return errPingSatellite.Wrap(err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (service *Service) dialSatellite(ctx context.Context, id storj.NodeID) (*rpc.Conn, error) {
|
|
nodeurl, err := service.trust.GetNodeURL(ctx, id)
|
|
if err != nil {
|
|
return nil, errPingSatellite.Wrap(err)
|
|
}
|
|
|
|
return service.dialer.DialNodeURL(ctx, nodeurl)
|
|
}
|
|
|
|
// Local returns the storagenode info.
|
|
func (service *Service) Local() NodeInfo {
|
|
service.mu.Lock()
|
|
defer service.mu.Unlock()
|
|
return service.self
|
|
}
|
|
|
|
// UpdateSelf updates the local node with the capacity.
|
|
func (service *Service) UpdateSelf(capacity *pb.NodeCapacity) {
|
|
service.mu.Lock()
|
|
defer service.mu.Unlock()
|
|
if capacity != nil {
|
|
service.self.Capacity = *capacity
|
|
}
|
|
service.initialized.Release()
|
|
}
|