From 59b37db67019590c5a1008d73d23f644cd0dd745 Mon Sep 17 00:00:00 2001 From: Clement Sam Date: Tue, 25 Jan 2022 10:51:40 +0000 Subject: [PATCH] storagenode: overhaul QUIC check implementation The current implementation blocks the startup until one or none of the trusted satellites is able to reach the node via QUIC. This can cause delayed startup. Also, the QUIC check is done once during startup, and if there is a misconfiguration later, SNOs would have to restart the node. In this change, we reuse the contact service which pings the satellite periodically for node checkin. During checkin the satellite tries pinging the node back via both TCP and QUIC and reports both statuses. With this, we are able to get a periodic update of the QUIC status without restarting the node. Also adds the time the node was last pinged via QUIC to the tooltip on the QUIC status tab. Resolves https://github.com/storj/storj/issues/4398 Change-Id: I18aa2a8e8d44e8187f8f2eb51f398fa6073882a4 --- storagenode/console/service.go | 19 ++--- storagenode/contact/contact_test.go | 4 +- storagenode/contact/network.go | 75 +++++++++++++++++++ storagenode/contact/service.go | 39 ++++++---- storagenode/peer.go | 43 +++-------- .../src/app/components/SNOContentTitle.vue | 49 +++++++++--- web/storagenode/src/app/store/modules/node.ts | 9 ++- .../src/storagenode/api/storagenode.ts | 2 +- web/storagenode/src/storagenode/sno/sno.ts | 6 +- .../unit/components/DiskStatChart.spec.ts | 5 +- .../payments/EstimationPeriodDropdown.spec.ts | 5 +- web/storagenode/tests/unit/store/node.spec.ts | 11 ++- 12 files changed, 187 insertions(+), 80 deletions(-) create mode 100644 storagenode/contact/network.go diff --git a/storagenode/console/service.go b/storagenode/console/service.go index 2e3db7271..ecee54616 100644 --- a/storagenode/console/service.go +++ b/storagenode/console/service.go @@ -62,7 +62,7 @@ type Service struct { startedAt time.Time versionInfo version.Info - quicEnabled bool + quicStats 
*contact.QUICStats configuredPort string } @@ -71,7 +71,7 @@ func NewService(log *zap.Logger, bandwidth bandwidth.DB, pieceStore *pieces.Stor allocatedDiskSpace memory.Size, walletAddress string, versionInfo version.Info, trust *trust.Pool, reputationDB reputation.DB, storageUsageDB storageusage.DB, pricingDB pricing.DB, satelliteDB satellites.DB, pingStats *contact.PingStats, contact *contact.Service, estimation *estimatedpayouts.Service, usageCache *pieces.BlobsUsageCache, - walletFeatures operator.WalletFeatures, port string, quicEnabled bool) (*Service, error) { + walletFeatures operator.WalletFeatures, port string, quicStats *contact.QUICStats) (*Service, error) { if log == nil { return nil, errs.New("log can't be nil") } @@ -123,7 +123,7 @@ func NewService(log *zap.Logger, bandwidth bandwidth.DB, pieceStore *pieces.Stor startedAt: time.Now(), versionInfo: versionInfo, walletFeatures: walletFeatures, - quicEnabled: quicEnabled, + quicStats: quicStats, configuredPort: port, }, nil } @@ -156,8 +156,9 @@ type Dashboard struct { StartedAt time.Time `json:"startedAt"` - ConfiguredPort string `json:"configuredPort"` - QUICEnabled bool `json:"quicEnabled"` + ConfiguredPort string `json:"configuredPort"` + QUICStatus string `json:"quicStatus"` + LastQUICPingedAt time.Time `json:"lastQuicPingedAt"` } // GetDashboardData returns stale dashboard data. @@ -174,7 +175,8 @@ func (s *Service) GetDashboardData(ctx context.Context) (_ *Dashboard, err error data.LastPinged = s.pingStats.WhenLastPinged() data.AllowedVersion, data.UpToDate = s.version.IsAllowed(ctx) - data.QUICEnabled = s.quicEnabled + data.QUICStatus = s.quicStats.Status() + data.LastQUICPingedAt = s.quicStats.WhenLastPinged() data.ConfiguredPort = s.configuredPort stats, err := s.reputationDB.All(ctx) @@ -476,8 +478,3 @@ func (s *Service) VerifySatelliteID(ctx context.Context, satelliteID storj.NodeI return nil } - -// SetQUICEnabled sets QUIC status for the SNO dashboard. 
-func (s *Service) SetQUICEnabled(enabled bool) { - s.quicEnabled = enabled -} diff --git a/storagenode/contact/contact_test.go b/storagenode/contact/contact_test.go index 0d395f30d..ea62d18ec 100644 --- a/storagenode/contact/contact_test.go +++ b/storagenode/contact/contact_test.go @@ -19,6 +19,7 @@ import ( "storj.io/common/testcontext" "storj.io/storj/private/testplanet" "storj.io/storj/satellite" + "storj.io/storj/storagenode/contact" ) func TestStoragenodeContactEndpoint(t *testing.T) { @@ -168,7 +169,8 @@ func TestServiceRequestPingMeQUIC(t *testing.T) { node := planet.StorageNodes[0] node.Contact.Chore.Pause(ctx) - err := node.Contact.Service.RequestPingMeQUIC(ctx) + quicStats, err := node.Contact.Service.RequestPingMeQUIC(ctx) require.NoError(t, err) + require.Equal(t, contact.NetworkStatusOk, quicStats.Status()) }) } diff --git a/storagenode/contact/network.go b/storagenode/contact/network.go new file mode 100644 index 000000000..bc3504c39 --- /dev/null +++ b/storagenode/contact/network.go @@ -0,0 +1,75 @@ +// Copyright (C) 2021 Storj Labs, Inc. +// See LICENSE for copying information. + +package contact + +import ( + "sync" + "time" +) + +const ( + // NetworkStatusOk represents node successfully pinged. + NetworkStatusOk = "OK" + // NetworkStatusMisconfigured means satellite could not ping + // back node due to misconfiguration on the node host. + NetworkStatusMisconfigured = "Misconfigured" + // NetworkStatusDisabled means QUIC is disabled by config. + NetworkStatusDisabled = "Disabled" + // NetworkStatusRefreshing means QUIC check is in progress. + NetworkStatusRefreshing = "Refreshing" +) + +// QUICStats contains information regarding QUIC status of the node. +type QUICStats struct { + status string + enabled bool + + mu sync.Mutex + lastPinged time.Time +} + +// NewQUICStats returns a new QUICStats. 
+func NewQUICStats(enabled bool) *QUICStats { + stats := &QUICStats{ + enabled: enabled, + status: NetworkStatusRefreshing, + } + + if !enabled { + stats.status = NetworkStatusDisabled + } + return stats +} + +// SetStatus sets the QUIC status during PingMe request. +func (q *QUICStats) SetStatus(pingSuccess bool) { + q.mu.Lock() + defer q.mu.Unlock() + + q.lastPinged = time.Now() + if pingSuccess { + q.status = NetworkStatusOk + return + } + + q.status = NetworkStatusMisconfigured +} + +// Status returns the quic status gathered in a PingMe request. +func (q *QUICStats) Status() string { + q.mu.Lock() + defer q.mu.Unlock() + + if !q.enabled { + return NetworkStatusDisabled + } + return q.status +} + +// WhenLastPinged returns last time someone pinged this node via QUIC. +func (q *QUICStats) WhenLastPinged() (when time.Time) { + q.mu.Lock() + defer q.mu.Unlock() + return q.lastPinged +} diff --git a/storagenode/contact/service.go b/storagenode/contact/service.go index 9d0ec4961..7caa23f4a 100644 --- a/storagenode/contact/service.go +++ b/storagenode/contact/service.go @@ -58,19 +58,21 @@ type Service struct { mu sync.Mutex self NodeInfo - trust *trust.Pool + trust *trust.Pool + quicStats *QUICStats initialized sync2.Fence } // NewService creates a new contact service. 
-func NewService(log *zap.Logger, dialer rpc.Dialer, self NodeInfo, trust *trust.Pool) *Service { +func NewService(log *zap.Logger, dialer rpc.Dialer, self NodeInfo, trust *trust.Pool, quicStats *QUICStats) *Service { return &Service{ - log: log, - rand: rand.New(rand.NewSource(time.Now().UnixNano())), - dialer: dialer, - trust: trust, - self: self, + log: log, + rand: rand.New(rand.NewSource(time.Now().UnixNano())), + dialer: dialer, + trust: trust, + self: self, + quicStats: quicStats, } } @@ -132,11 +134,16 @@ func (service *Service) pingSatelliteOnce(ctx context.Context, id storj.NodeID) Capacity: &self.Capacity, Operator: &self.Operator, }) + service.quicStats.SetStatus(false) if err != nil { return errPingSatellite.Wrap(err) } - if resp != nil && !resp.PingNodeSuccess { - return errPingSatellite.New("%s", resp.PingErrorMessage) + if resp != nil { + service.quicStats.SetStatus(resp.PingNodeSuccessQuic) + + if !resp.PingNodeSuccess { + return errPingSatellite.New("%s", resp.PingErrorMessage) + } } if resp.PingErrorMessage != "" { service.log.Warn("Your node is still considered to be online but encountered an error.", zap.Stringer("Satellite ID", id), zap.String("Error", resp.GetPingErrorMessage())) @@ -145,12 +152,14 @@ func (service *Service) pingSatelliteOnce(ctx context.Context, id storj.NodeID) } // RequestPingMeQUIC sends pings request to satellite for a pingBack via QUIC. 
-func (service *Service) RequestPingMeQUIC(ctx context.Context) (err error) { +func (service *Service) RequestPingMeQUIC(ctx context.Context) (stats *QUICStats, err error) { defer mon.Task()(&ctx)(&err) + stats = NewQUICStats(true) + satellites := service.trust.GetSatellites(ctx) if len(satellites) < 1 { - return errPingSatellite.New("no trusted satellite available") + return nil, errPingSatellite.New("no trusted satellite available") } // Shuffle the satellites @@ -166,14 +175,18 @@ func (service *Service) RequestPingMeQUIC(ctx context.Context) (err error) { for _, satellite := range satellites { err = service.requestPingMeOnce(ctx, satellite) if err != nil { + stats.SetStatus(false) // log warning and try the next trusted satellite service.log.Warn("failed PingMe request to satellite", zap.Stringer("Satellite ID", satellite), zap.Error(err)) continue } - return nil + + stats.SetStatus(true) + + return stats, nil } - return errPingSatellite.New("failed to ping storage node using QUIC: %q", err) + return stats, errPingSatellite.New("failed to ping storage node using QUIC: %q", err) } func (service *Service) requestPingMeOnce(ctx context.Context, satellite storj.NodeID) (err error) { diff --git a/storagenode/peer.go b/storagenode/peer.go index fdc39e93c..db90652cc 100644 --- a/storagenode/peer.go +++ b/storagenode/peer.go @@ -237,6 +237,7 @@ type Peer struct { Chore *contact.Chore Endpoint *contact.Endpoint PingStats *contact.PingStats + QUICStats *contact.QUICStats } Estimation struct { @@ -433,7 +434,8 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, revocationDB exten Version: *pbVersion, } peer.Contact.PingStats = new(contact.PingStats) - peer.Contact.Service = contact.NewService(peer.Log.Named("contact:service"), peer.Dialer, self, peer.Storage2.Trust) + peer.Contact.QUICStats = contact.NewQUICStats(peer.Server.IsQUICEnabled()) + peer.Contact.Service = contact.NewService(peer.Log.Named("contact:service"), peer.Dialer, self, peer.Storage2.Trust, 
peer.Contact.QUICStats) peer.Contact.Chore = contact.NewChore(peer.Log.Named("contact:chore"), config.Contact.Interval, peer.Contact.Service) peer.Services.Add(lifecycle.Item{ @@ -679,7 +681,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, revocationDB exten peer.Storage2.BlobsCache, config.Operator.WalletFeatures, port, - false, + peer.Contact.QUICStats, ) if err != nil { return nil, errs.Combine(err, peer.Close()) @@ -707,7 +709,13 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, revocationDB exten peer.Payout.Service, peer.Console.Listener, ) - // NOTE: Console service is added to peer services during peer run to allow for QUIC checkins + + // add console service to peer services + peer.Services.Add(lifecycle.Item{ + Name: "console:endpoint", + Run: peer.Console.Endpoint.Run, + Close: peer.Console.Endpoint.Close, + }) } { // setup storage inspector @@ -859,32 +867,6 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, revocationDB exten return peer, nil } -// addConsoleService completes the SNO dashboard setup and adds the console service -// to the peer services. -func (peer *Peer) addConsoleService(ctx context.Context) { - // perform QUIC checks - quicEnabled := peer.Server.IsQUICEnabled() - if quicEnabled { - if err := peer.Contact.Service.RequestPingMeQUIC(ctx); err != nil { - peer.Log.Warn("failed QUIC check", zap.Error(err)) - quicEnabled = false - } else { - peer.Log.Debug("QUIC check success") - } - } else { - peer.Log.Warn("UDP Port not configured for QUIC") - } - - peer.Console.Service.SetQUICEnabled(quicEnabled) - - // add console service to peer services - peer.Services.Add(lifecycle.Item{ - Name: "console:endpoint", - Run: peer.Console.Endpoint.Run, - Close: peer.Console.Endpoint.Close, - }) -} - // Run runs storage node until it's either closed or it errors. 
func (peer *Peer) Run(ctx context.Context) (err error) { defer mon.Task()(&ctx)(&err) @@ -903,9 +885,6 @@ func (peer *Peer) Run(ctx context.Context) (err error) { group, ctx := errgroup.WithContext(ctx) peer.Servers.Run(ctx, group) - // complete SNO dashboard setup and add console service to peer services - peer.addConsoleService(ctx) - // run peer services peer.Services.Run(ctx, group) return group.Wait() diff --git a/web/storagenode/src/app/components/SNOContentTitle.vue b/web/storagenode/src/app/components/SNOContentTitle.vue index 715a0a91b..42a321185 100644 --- a/web/storagenode/src/app/components/SNOContentTitle.vue +++ b/web/storagenode/src/app/components/SNOContentTitle.vue @@ -19,18 +19,29 @@
-

QUIC

-

OK

+

{{ quicStatusRefreshing }}

-
+ +
+

QUIC

+

{{ quicStatusOk }}

+
+
+

QUIC

@@ -81,7 +92,7 @@