storagenode: overhaul QUIC check implementation

The current implementation blocks the startup until either one of the
trusted satellites is able to reach the node via QUIC or none of them can.
This can cause delayed startup. Also, the QUIC check is done only
once during startup, so if there is a misconfiguration later,
SNOs would have to restart the node to find out.

In this change, we reuse the contact service which pings the satellite
periodically for node checkin. During checkin the satellite tries
pinging the node back via both TCP and QUIC and reports both statuses.
With this, we are able to get a periodic update of the QUIC status
without restarting the node.

Also adds the time the node was last pinged via QUIC to the tooltip
on the QUIC status tab.

Resolves https://github.com/storj/storj/issues/4398

Change-Id: I18aa2a8e8d44e8187f8f2eb51f398fa6073882a4
This commit is contained in:
Clement Sam 2022-01-25 10:51:40 +00:00 committed by Clement Sam
parent 7b13af1184
commit 59b37db670
12 changed files with 187 additions and 80 deletions

View File

@ -62,7 +62,7 @@ type Service struct {
startedAt time.Time
versionInfo version.Info
quicEnabled bool
quicStats *contact.QUICStats
configuredPort string
}
@ -71,7 +71,7 @@ func NewService(log *zap.Logger, bandwidth bandwidth.DB, pieceStore *pieces.Stor
allocatedDiskSpace memory.Size, walletAddress string, versionInfo version.Info, trust *trust.Pool,
reputationDB reputation.DB, storageUsageDB storageusage.DB, pricingDB pricing.DB, satelliteDB satellites.DB,
pingStats *contact.PingStats, contact *contact.Service, estimation *estimatedpayouts.Service, usageCache *pieces.BlobsUsageCache,
walletFeatures operator.WalletFeatures, port string, quicEnabled bool) (*Service, error) {
walletFeatures operator.WalletFeatures, port string, quicStats *contact.QUICStats) (*Service, error) {
if log == nil {
return nil, errs.New("log can't be nil")
}
@ -123,7 +123,7 @@ func NewService(log *zap.Logger, bandwidth bandwidth.DB, pieceStore *pieces.Stor
startedAt: time.Now(),
versionInfo: versionInfo,
walletFeatures: walletFeatures,
quicEnabled: quicEnabled,
quicStats: quicStats,
configuredPort: port,
}, nil
}
@ -156,8 +156,9 @@ type Dashboard struct {
StartedAt time.Time `json:"startedAt"`
ConfiguredPort string `json:"configuredPort"`
QUICEnabled bool `json:"quicEnabled"`
ConfiguredPort string `json:"configuredPort"`
QUICStatus string `json:"quicStatus"`
LastQUICPingedAt time.Time `json:"lastQuicPingedAt"`
}
// GetDashboardData returns stale dashboard data.
@ -174,7 +175,8 @@ func (s *Service) GetDashboardData(ctx context.Context) (_ *Dashboard, err error
data.LastPinged = s.pingStats.WhenLastPinged()
data.AllowedVersion, data.UpToDate = s.version.IsAllowed(ctx)
data.QUICEnabled = s.quicEnabled
data.QUICStatus = s.quicStats.Status()
data.LastQUICPingedAt = s.quicStats.WhenLastPinged()
data.ConfiguredPort = s.configuredPort
stats, err := s.reputationDB.All(ctx)
@ -476,8 +478,3 @@ func (s *Service) VerifySatelliteID(ctx context.Context, satelliteID storj.NodeI
return nil
}
// SetQUICEnabled sets QUIC status for the SNO dashboard.
func (s *Service) SetQUICEnabled(enabled bool) {
s.quicEnabled = enabled
}

View File

@ -19,6 +19,7 @@ import (
"storj.io/common/testcontext"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite"
"storj.io/storj/storagenode/contact"
)
func TestStoragenodeContactEndpoint(t *testing.T) {
@ -168,7 +169,8 @@ func TestServiceRequestPingMeQUIC(t *testing.T) {
node := planet.StorageNodes[0]
node.Contact.Chore.Pause(ctx)
err := node.Contact.Service.RequestPingMeQUIC(ctx)
quicStats, err := node.Contact.Service.RequestPingMeQUIC(ctx)
require.NoError(t, err)
require.Equal(t, contact.NetworkStatusOk, quicStats.Status())
})
}

View File

@ -0,0 +1,75 @@
// Copyright (C) 2021 Storj Labs, Inc.
// See LICENSE for copying information.
package contact
import (
"sync"
"time"
)
// Network status values reported by QUICStats.Status.
const (
	// NetworkStatusOk represents node successfully pinged.
	NetworkStatusOk = "OK"
	// NetworkStatusMisconfigured means satellite could not ping
	// back node due to misconfiguration on the node host.
	NetworkStatusMisconfigured = "Misconfigured"
	// NetworkStatusDisabled means QUIC is disabled by config.
	NetworkStatusDisabled = "Disabled"
	// NetworkStatusRefreshing means QUIC check is in progress.
	NetworkStatusRefreshing = "Refreshing"
)

// QUICStats contains information regarding QUIC status of the node.
//
// Every method takes mu, so a single *QUICStats can be shared between
// the goroutine recording ping results and the ones reading them.
type QUICStats struct {
	// enabled is fixed by NewQUICStats and never modified afterwards.
	enabled bool

	// mu guards the fields below.
	mu sync.Mutex
	// status holds one of the NetworkStatus* values.
	status string
	// lastPinged is the time of the most recent successful ping; it stays
	// at the zero time until the first success is recorded.
	lastPinged time.Time
}

// NewQUICStats returns a new QUICStats.
//
// The initial status is NetworkStatusRefreshing when enabled is true and
// NetworkStatusDisabled otherwise.
func NewQUICStats(enabled bool) *QUICStats {
	status := NetworkStatusRefreshing
	if !enabled {
		status = NetworkStatusDisabled
	}

	return &QUICStats{
		enabled: enabled,
		status:  status,
	}
}

// SetStatus sets the QUIC status during PingMe request.
//
// A successful ping sets the status to NetworkStatusOk and records the
// current time as the last ping time. A failed ping sets the status to
// NetworkStatusMisconfigured and leaves the last ping time untouched:
// nothing reached the node, so there is no new ping to timestamp.
func (q *QUICStats) SetStatus(pingSuccess bool) {
	q.mu.Lock()
	defer q.mu.Unlock()

	if !pingSuccess {
		q.status = NetworkStatusMisconfigured
		return
	}

	q.status = NetworkStatusOk
	q.lastPinged = time.Now()
}

// Status returns the quic status gathered in a PingMe request.
//
// It always reports NetworkStatusDisabled when the stats were created with
// QUIC disabled, regardless of any results recorded through SetStatus.
func (q *QUICStats) Status() string {
	q.mu.Lock()
	defer q.mu.Unlock()

	if !q.enabled {
		return NetworkStatusDisabled
	}
	return q.status
}

// WhenLastPinged returns last time someone pinged this node via QUIC.
//
// The zero time is returned if no successful ping has been recorded yet.
func (q *QUICStats) WhenLastPinged() (when time.Time) {
	q.mu.Lock()
	defer q.mu.Unlock()

	return q.lastPinged
}

View File

@ -58,19 +58,21 @@ type Service struct {
mu sync.Mutex
self NodeInfo
trust *trust.Pool
trust *trust.Pool
quicStats *QUICStats
initialized sync2.Fence
}
// NewService creates a new contact service.
func NewService(log *zap.Logger, dialer rpc.Dialer, self NodeInfo, trust *trust.Pool) *Service {
func NewService(log *zap.Logger, dialer rpc.Dialer, self NodeInfo, trust *trust.Pool, quicStats *QUICStats) *Service {
return &Service{
log: log,
rand: rand.New(rand.NewSource(time.Now().UnixNano())),
dialer: dialer,
trust: trust,
self: self,
log: log,
rand: rand.New(rand.NewSource(time.Now().UnixNano())),
dialer: dialer,
trust: trust,
self: self,
quicStats: quicStats,
}
}
@ -132,11 +134,16 @@ func (service *Service) pingSatelliteOnce(ctx context.Context, id storj.NodeID)
Capacity: &self.Capacity,
Operator: &self.Operator,
})
service.quicStats.SetStatus(false)
if err != nil {
return errPingSatellite.Wrap(err)
}
if resp != nil && !resp.PingNodeSuccess {
return errPingSatellite.New("%s", resp.PingErrorMessage)
if resp != nil {
service.quicStats.SetStatus(resp.PingNodeSuccessQuic)
if !resp.PingNodeSuccess {
return errPingSatellite.New("%s", resp.PingErrorMessage)
}
}
if resp.PingErrorMessage != "" {
service.log.Warn("Your node is still considered to be online but encountered an error.", zap.Stringer("Satellite ID", id), zap.String("Error", resp.GetPingErrorMessage()))
@ -145,12 +152,14 @@ func (service *Service) pingSatelliteOnce(ctx context.Context, id storj.NodeID)
}
// RequestPingMeQUIC sends a ping request to a satellite, asking it to ping the node back via QUIC.
func (service *Service) RequestPingMeQUIC(ctx context.Context) (err error) {
func (service *Service) RequestPingMeQUIC(ctx context.Context) (stats *QUICStats, err error) {
defer mon.Task()(&ctx)(&err)
stats = NewQUICStats(true)
satellites := service.trust.GetSatellites(ctx)
if len(satellites) < 1 {
return errPingSatellite.New("no trusted satellite available")
return nil, errPingSatellite.New("no trusted satellite available")
}
// Shuffle the satellites
@ -166,14 +175,18 @@ func (service *Service) RequestPingMeQUIC(ctx context.Context) (err error) {
for _, satellite := range satellites {
err = service.requestPingMeOnce(ctx, satellite)
if err != nil {
stats.SetStatus(false)
// log warning and try the next trusted satellite
service.log.Warn("failed PingMe request to satellite", zap.Stringer("Satellite ID", satellite), zap.Error(err))
continue
}
return nil
stats.SetStatus(true)
return stats, nil
}
return errPingSatellite.New("failed to ping storage node using QUIC: %q", err)
return stats, errPingSatellite.New("failed to ping storage node using QUIC: %q", err)
}
func (service *Service) requestPingMeOnce(ctx context.Context, satellite storj.NodeID) (err error) {

View File

@ -237,6 +237,7 @@ type Peer struct {
Chore *contact.Chore
Endpoint *contact.Endpoint
PingStats *contact.PingStats
QUICStats *contact.QUICStats
}
Estimation struct {
@ -433,7 +434,8 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, revocationDB exten
Version: *pbVersion,
}
peer.Contact.PingStats = new(contact.PingStats)
peer.Contact.Service = contact.NewService(peer.Log.Named("contact:service"), peer.Dialer, self, peer.Storage2.Trust)
peer.Contact.QUICStats = contact.NewQUICStats(peer.Server.IsQUICEnabled())
peer.Contact.Service = contact.NewService(peer.Log.Named("contact:service"), peer.Dialer, self, peer.Storage2.Trust, peer.Contact.QUICStats)
peer.Contact.Chore = contact.NewChore(peer.Log.Named("contact:chore"), config.Contact.Interval, peer.Contact.Service)
peer.Services.Add(lifecycle.Item{
@ -679,7 +681,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, revocationDB exten
peer.Storage2.BlobsCache,
config.Operator.WalletFeatures,
port,
false,
peer.Contact.QUICStats,
)
if err != nil {
return nil, errs.Combine(err, peer.Close())
@ -707,7 +709,13 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, revocationDB exten
peer.Payout.Service,
peer.Console.Listener,
)
// NOTE: Console service is added to peer services during peer run to allow for QUIC checkins
// add console service to peer services
peer.Services.Add(lifecycle.Item{
Name: "console:endpoint",
Run: peer.Console.Endpoint.Run,
Close: peer.Console.Endpoint.Close,
})
}
{ // setup storage inspector
@ -859,32 +867,6 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, revocationDB exten
return peer, nil
}
// addConsoleService completes the SNO dashboard setup and adds the console service
// to the peer services.
func (peer *Peer) addConsoleService(ctx context.Context) {
// perform QUIC checks
quicEnabled := peer.Server.IsQUICEnabled()
if quicEnabled {
if err := peer.Contact.Service.RequestPingMeQUIC(ctx); err != nil {
peer.Log.Warn("failed QUIC check", zap.Error(err))
quicEnabled = false
} else {
peer.Log.Debug("QUIC check success")
}
} else {
peer.Log.Warn("UDP Port not configured for QUIC")
}
peer.Console.Service.SetQUICEnabled(quicEnabled)
// add console service to peer services
peer.Services.Add(lifecycle.Item{
Name: "console:endpoint",
Run: peer.Console.Endpoint.Run,
Close: peer.Console.Endpoint.Close,
})
}
// Run runs storage node until it's either closed or it errors.
func (peer *Peer) Run(ctx context.Context) (err error) {
defer mon.Task()(&ctx)(&err)
@ -903,9 +885,6 @@ func (peer *Peer) Run(ctx context.Context) (err error) {
group, ctx := errgroup.WithContext(ctx)
peer.Servers.Run(ctx, group)
// complete SNO dashboard setup and add console service to peer services
peer.addConsoleService(ctx)
// run peer services
peer.Services.Run(ctx, group)
return group.Wait()

View File

@ -19,18 +19,29 @@
</div>
<div class="title-area-divider" />
<div
v-if="info.quicEnabled"
<VInfo
v-if="info.quicStatus === quicStatusRefreshing"
text="Testing connection to node via QUIC"
>
<div class="title-area__info-container__info-item">
<p class="title-area__info-container__info-item__title">QUIC</p>
<p class="title-area__info-container__info-item__content online-status">OK</p>
<p class="title-area__info-container__info-item__content">{{ quicStatusRefreshing }}</p>
</div>
</div>
</VInfo>
<VInfo
v-if="!info.quicEnabled"
v-if="info.quicStatus === quicStatusOk"
:text="'QUIC is configured to use UDP port ' + info.configuredPort"
:bold-text="'Last pinged ' + lastQuicPingedAt + ' ago'"
>
<div class="title-area__info-container__info-item">
<p class="title-area__info-container__info-item__title">QUIC</p>
<p class="title-area__info-container__info-item__content online-status">{{ quicStatusOk }}</p>
</div>
</VInfo>
<VInfo
v-if="info.quicStatus === quicStatusMisconfigured"
:text="'QUIC is misconfigured. You must forward port ' + info.configuredPort + ' for both TCP and UDP to enable QUIC.'"
bold-text="See https://docs.storj.io/node/dependencies/port-forwarding on how to do this."
bold-text="See https://docs.storj.io/node/dependencies/port-forwarding on how to do this"
>
<div class="title-area__info-container__info-item">
<p class="title-area__info-container__info-item__title">QUIC</p>
@ -81,7 +92,7 @@
<script lang="ts">
import { Component, Vue } from 'vue-property-decorator';
import { StatusOnline } from '@/app/store/modules/node';
import { StatusOnline, QUIC_STATUS } from '@/app/store/modules/node';
import { Duration, millisecondsInSecond, minutesInHour, secondsInHour, secondsInMinute } from '@/app/utils/duration';
import VInfo from '@/app/components/VInfo.vue';
@ -98,17 +109,17 @@ class NodeInfo {
public allowedVersion: string;
public wallet: string;
public isLastVersion: boolean;
public quicEnabled: boolean;
public quicStatus: string;
public configuredPort: string;
public constructor(id: string, status: string, version: string, allowedVersion: string, wallet: string, isLastVersion: boolean, quicEnabled: boolean, port: string) {
public constructor(id: string, status: string, version: string, allowedVersion: string, wallet: string, isLastVersion: boolean, quicStatus: string, port: string) {
this.id = id;
this.status = status;
this.version = this.toVersionString(version);
this.allowedVersion = this.toVersionString(allowedVersion);
this.wallet = wallet;
this.isLastVersion = isLastVersion;
this.quicEnabled = quicEnabled;
this.quicStatus = quicStatus;
this.configuredPort = port;
}
@ -141,13 +152,25 @@ export default class SNOContentTitle extends Vue {
const nodeInfo = this.$store.state.node.info;
return new NodeInfo(nodeInfo.id, nodeInfo.status, nodeInfo.version, nodeInfo.allowedVersion, nodeInfo.wallet,
nodeInfo.isLastVersion, nodeInfo.quicEnabled, nodeInfo.configuredPort);
nodeInfo.isLastVersion, nodeInfo.quicStatus, nodeInfo.configuredPort);
}
public get online(): boolean {
return this.$store.state.node.info.status === StatusOnline;
}
public get quicStatusOk(): string {
return QUIC_STATUS.StatusOk;
}
public get quicStatusRefreshing(): string {
return QUIC_STATUS.StatusRefreshing;
}
public get quicStatusMisconfigured(): string {
return QUIC_STATUS.StatusMisconfigured;
}
public get uptime(): string {
return this.timePassed(this.$store.state.node.info.startedAt);
}
@ -156,6 +179,10 @@ export default class SNOContentTitle extends Vue {
return this.timePassed(this.$store.state.node.info.lastPinged);
}
public get lastQuicPingedAt(): string {
return this.timePassed(this.$store.state.node.info.lastQuicPingedAt);
}
public get currentMonth(): string {
const monthNames = ['January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December',

View File

@ -30,6 +30,12 @@ export const NODE_ACTIONS = {
export const StatusOnline = 'Online';
export const StatusOffline = 'Offline';
export const QUIC_STATUS = {
StatusOk: 'OK',
StatusMisconfigured: 'Misconfigured',
StatusRefreshing: 'Refreshing',
};
const {
POPULATE_STORE,
SELECT_SATELLITE,
@ -61,8 +67,9 @@ export function newNodeModule(service: StorageNodeService): StoreModule<StorageN
nodeInfo.wallet,
nodeInfo.walletFeatures,
nodeInfo.isUpToDate,
nodeInfo.quicEnabled,
nodeInfo.quicStatus,
nodeInfo.configuredPort,
nodeInfo.lastQuicPingedAt,
);
state.utilization = new Utilization(

View File

@ -45,7 +45,7 @@ export class StorageNodeApi {
const bandwidth: Traffic = new Traffic(data.bandwidth.used);
return new Dashboard(data.nodeID, data.wallet, data.walletFeatures || [], satellites, diskSpace, bandwidth,
new Date(data.lastPinged), new Date(data.startedAt), data.version, data.allowedVersion, data.upToDate, data.quicEnabled, data.configuredPort);
new Date(data.lastPinged), new Date(data.startedAt), data.version, data.allowedVersion, data.upToDate, data.quicStatus, data.configuredPort, new Date(data.lastQuicPingedAt));
}
/**

View File

@ -17,8 +17,9 @@ export class Node {
public wallet: string = '',
public walletFeatures: string[] = [],
public isLastVersion: boolean = false,
public quicEnabled: boolean = false,
public quicStatus: string = '',
public configuredPort: string = '',
public lastQuicPingedAt: Date = new Date(),
) {}
}
@ -75,8 +76,9 @@ export class Dashboard {
public version: string,
public allowedVersion: string,
public isUpToDate: boolean,
public quicEnabled: boolean,
public quicStatus: string,
public configuredPort: string,
public lastQuicPingedAt: Date,
) { }
}

View File

@ -4,7 +4,7 @@
import Vuex from 'vuex';
import { createLocalVue, shallowMount } from '@vue/test-utils';
import { newNodeModule, NODE_ACTIONS } from '@/app/store/modules/node';
import { newNodeModule, NODE_ACTIONS, QUIC_STATUS } from '@/app/store/modules/node';
import { Size } from '@/private/memory/size';
import { StorageNodeApi } from '@/storagenode/api/storagenode';
import { StorageNodeService } from '@/storagenode/sno/service';
@ -58,8 +58,9 @@ describe('DiskStatChart', (): void => {
'0.1.1',
'0.2.2',
false,
true,
QUIC_STATUS.StatusOk,
'13000',
new Date(2022, 11, 8),
),
),
);

View File

@ -6,7 +6,7 @@ import Vuex from 'vuex';
import { createLocalVue, shallowMount } from '@vue/test-utils';
import { appStateModule } from '@/app/store/modules/appState';
import { newNodeModule, NODE_MUTATIONS } from '@/app/store/modules/node';
import { newNodeModule, NODE_MUTATIONS, QUIC_STATUS } from '@/app/store/modules/node';
import { StorageNodeApi } from '@/storagenode/api/storagenode';
import { StorageNodeService } from '@/storagenode/sno/service';
import {
@ -73,8 +73,9 @@ describe('EstimationPeriodDropdown', (): void => {
'0.1.1',
'0.2.2',
false,
true,
QUIC_STATUS.StatusOk,
'13000',
new Date(),
);
store.commit(NODE_MUTATIONS.POPULATE_STORE, dashboardInfo);

View File

@ -4,7 +4,7 @@
import Vuex from 'vuex';
import { createLocalVue } from '@vue/test-utils';
import { newNodeModule, NODE_ACTIONS, NODE_MUTATIONS, StatusOnline } from '@/app/store/modules/node';
import { newNodeModule, NODE_ACTIONS, NODE_MUTATIONS, QUIC_STATUS, StatusOnline } from '@/app/store/modules/node';
import { StorageNodeApi } from '@/storagenode/api/storagenode';
import { StorageNodeService } from '@/storagenode/sno/service';
import {
@ -53,8 +53,9 @@ describe('mutations', () => {
'0.1.1',
'0.2.2',
false,
true,
QUIC_STATUS.StatusOk,
'13000',
new Date(2022, 11, 8),
);
store.commit(NODE_MUTATIONS.POPULATE_STORE, dashboardInfo);
@ -193,8 +194,9 @@ describe('actions', () => {
'0.1.1',
'0.2.2',
false,
true,
QUIC_STATUS.StatusOk,
'13000',
new Date(2022, 11, 8),
),
),
);
@ -339,8 +341,9 @@ describe('getters', () => {
'0.1.1',
'0.2.2',
false,
true,
QUIC_STATUS.StatusOk,
'13000',
new Date(2022, 11, 8),
);
store.commit(NODE_MUTATIONS.POPULATE_STORE, dashboardInfo);