satellite/{overlay,satellitedb}: add flag to toggle suspending nodes for offline audits
This change introduces a new config flag, --overlay.audit-history.offline-suspension-enabled, to toggle suspending nodes for offline audits. If the flag is set to true, nodes will be suspended when their online score meets the suspension requirements. If the flag is set to false, nodes will not be suspended for offline audits; any node that is already offline-suspended and/or under review will have those states cleared the next time its stats are updated. The online score itself is always updated, regardless of the flag. Change-Id: Ibeba759c42d6e504f6b7598120d4fd4dab85ca74
This commit is contained in:
parent
eb44dc21b4
commit
1a51049ac0
@ -56,11 +56,12 @@ type NodeSelectionConfig struct {
|
||||
// AuditHistoryConfig is a configuration struct defining time periods and thresholds for penalizing nodes for being offline.
// It is used for downtime suspension and disqualification.
//
// Audits are grouped into windows of WindowSize, and windows are tracked over
// TrackingPeriod. A node whose online score falls below OfflineThreshold is
// subject to penalization; GracePeriod is the time suspended operators are
// given to fix the cause of the downtime. OfflineDQEnabled and
// OfflineSuspensionEnabled independently toggle disqualification and
// suspension for low online scores.
type AuditHistoryConfig struct {
	WindowSize               time.Duration `help:"The length of time spanning a single audit window" releaseDefault:"12h" devDefault:"5m"`
	TrackingPeriod           time.Duration `help:"The length of time to track audit windows for node suspension and disqualification" releaseDefault:"720h" devDefault:"1h"`
	GracePeriod              time.Duration `help:"The length of time to give suspended SNOs to diagnose and fix issues causing downtime. Afterwards, they will have one tracking period to reach the minimum online score before disqualification" releaseDefault:"168h" devDefault:"1h"`
	OfflineThreshold         float64       `help:"The point below which a node is punished for offline audits. Determined by calculating the ratio of online/total audits within each window and finding the average across windows within the tracking period." default:"0.6"`
	OfflineDQEnabled         bool          `help:"whether nodes will be disqualified if they have low online score after a review period" releaseDefault:"false" devDefault:"true"`
	OfflineSuspensionEnabled bool          `help:"whether nodes will be suspended if they have low online score" releaseDefault:"true" devDefault:"true"`
}
|
||||
|
||||
func (aost *AsOfSystemTimeConfig) isValid() error {
|
||||
|
@ -258,6 +258,74 @@ func TestAuditSuspendDQDisabled(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestOfflineAuditSuspensionDisabled ensures that a node is not suspended if the offline suspension enabled flag is false.
|
||||
func TestOfflineAuditSuspensionDisabled(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
|
||||
Reconfigure: testplanet.Reconfigure{
|
||||
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Overlay.AuditHistory.OfflineSuspensionEnabled = false
|
||||
config.Overlay.AuditHistory.WindowSize = time.Hour
|
||||
config.Overlay.AuditHistory.TrackingPeriod = 2 * time.Hour
|
||||
},
|
||||
},
|
||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||
nodeID := planet.StorageNodes[0].ID()
|
||||
oc := planet.Satellites[0].Overlay.DB
|
||||
config := planet.Satellites[0].Config.Overlay.AuditHistory
|
||||
|
||||
node, err := oc.Get(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
require.Nil(t, node.OfflineSuspended)
|
||||
require.Nil(t, node.OfflineUnderReview)
|
||||
require.Nil(t, node.Disqualified)
|
||||
|
||||
req := &overlay.UpdateRequest{
|
||||
NodeID: nodeID,
|
||||
AuditOutcome: overlay.AuditOffline,
|
||||
AuditHistory: config,
|
||||
}
|
||||
windowSize := config.WindowSize
|
||||
trackingPeriodLength := config.TrackingPeriod
|
||||
currentWindow := time.Now()
|
||||
|
||||
// check that unsuspended node does not get suspended
|
||||
for i := 0; i <= int(trackingPeriodLength/windowSize); i++ {
|
||||
_, err = planet.Satellites[0].DB.OverlayCache().BatchUpdateStats(ctx, []*overlay.UpdateRequest{req}, 1, currentWindow)
|
||||
require.NoError(t, err)
|
||||
currentWindow = currentWindow.Add(windowSize)
|
||||
}
|
||||
|
||||
n, err := oc.Get(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
require.Less(t, n.Reputation.OnlineScore, config.OfflineThreshold)
|
||||
require.Nil(t, n.OfflineSuspended)
|
||||
require.Nil(t, n.OfflineUnderReview)
|
||||
|
||||
// check that enabling flag suspends the node
|
||||
req.AuditHistory.OfflineSuspensionEnabled = true
|
||||
_, err = planet.Satellites[0].DB.OverlayCache().BatchUpdateStats(ctx, []*overlay.UpdateRequest{req}, 1, currentWindow)
|
||||
require.NoError(t, err)
|
||||
|
||||
n, err = oc.Get(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
require.Less(t, n.Reputation.OnlineScore, config.OfflineThreshold)
|
||||
require.NotNil(t, n.OfflineSuspended)
|
||||
require.NotNil(t, n.OfflineUnderReview)
|
||||
|
||||
// check that disabling flag clears suspension and under review
|
||||
req.AuditHistory.OfflineSuspensionEnabled = false
|
||||
_, err = planet.Satellites[0].DB.OverlayCache().BatchUpdateStats(ctx, []*overlay.UpdateRequest{req}, 1, currentWindow)
|
||||
require.NoError(t, err)
|
||||
|
||||
n, err = oc.Get(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
require.Less(t, n.Reputation.OnlineScore, config.OfflineThreshold)
|
||||
require.Nil(t, n.OfflineSuspended)
|
||||
require.Nil(t, n.OfflineUnderReview)
|
||||
})
|
||||
}
|
||||
|
||||
// TestAuditSuspendBatchUpdateStats ensures that suspension and alpha/beta fields are properly updated from batch update stats.
|
||||
func TestAuditSuspendBatchUpdateStats(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
@ -340,11 +408,12 @@ func TestOfflineSuspend(t *testing.T) {
|
||||
NodeID: nodeID,
|
||||
AuditOutcome: overlay.AuditOffline,
|
||||
AuditHistory: overlay.AuditHistoryConfig{
|
||||
WindowSize: time.Hour,
|
||||
TrackingPeriod: 2 * time.Hour,
|
||||
GracePeriod: time.Hour,
|
||||
OfflineThreshold: 0.6,
|
||||
OfflineDQEnabled: true,
|
||||
WindowSize: time.Hour,
|
||||
TrackingPeriod: 2 * time.Hour,
|
||||
GracePeriod: time.Hour,
|
||||
OfflineThreshold: 0.6,
|
||||
OfflineDQEnabled: true,
|
||||
OfflineSuspensionEnabled: true,
|
||||
},
|
||||
|
||||
AuditLambda: 0.95,
|
||||
|
@ -1438,6 +1438,20 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq *
|
||||
// Updating node stats always exits it from containment mode
|
||||
updateFields.Contained = boolField{set: true, value: false}
|
||||
|
||||
// always update online score
|
||||
updateFields.OnlineScore = float64Field{set: true, value: auditHistoryResponse.NewScore}
|
||||
|
||||
// if suspension not enabled, skip penalization and unsuspend node if applicable
|
||||
if !updateReq.AuditHistory.OfflineSuspensionEnabled {
|
||||
if dbNode.OfflineSuspended != nil {
|
||||
updateFields.OfflineSuspended = timeField{set: true, isNil: true}
|
||||
}
|
||||
if dbNode.UnderReview != nil {
|
||||
updateFields.OfflineUnderReview = timeField{set: true, isNil: true}
|
||||
}
|
||||
return updateFields
|
||||
}
|
||||
|
||||
// only penalize node if online score is below threshold and
|
||||
// if it has enough completed windows to fill a tracking period
|
||||
penalizeOfflineNode := false
|
||||
@ -1445,9 +1459,6 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq *
|
||||
penalizeOfflineNode = true
|
||||
}
|
||||
|
||||
// always update online score
|
||||
updateFields.OnlineScore = float64Field{set: true, value: auditHistoryResponse.NewScore}
|
||||
|
||||
// Suspension and disqualification for offline nodes
|
||||
if dbNode.UnderReview != nil {
|
||||
// move node in and out of suspension as needed during review period
|
||||
|
3
scripts/testdata/satellite-config.yaml.lock
vendored
3
scripts/testdata/satellite-config.yaml.lock
vendored
@ -439,6 +439,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
|
||||
# whether nodes will be disqualified if they have low online score after a review period
|
||||
# overlay.audit-history.offline-dq-enabled: false
|
||||
|
||||
# whether nodes will be suspended if they have low online score
|
||||
# overlay.audit-history.offline-suspension-enabled: true
|
||||
|
||||
# The point below which a node is punished for offline audits. Determined by calculating the ratio of online/total audits within each window and finding the average across windows within the tracking period.
|
||||
# overlay.audit-history.offline-threshold: 0.6
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user