From 7c3afe164bac8db0c955f8cd4536e60fef0e7b2c Mon Sep 17 00:00:00 2001 From: Moby von Briesen Date: Thu, 15 Oct 2020 12:00:08 -0400 Subject: [PATCH] satellite/overlay: uncomment dq for offline and disable with feature flag Change-Id: Ib39e2be32e880b822a94eddfb81af99a38843a27 --- monkit.lock | 3 +++ satellite/overlay/config.go | 1 + satellite/overlay/suspension_test.go | 12 +++--------- satellite/satellitedb/overlaycache.go | 12 ++++++------ scripts/testdata/satellite-config.yaml.lock | 3 +++ 5 files changed, 16 insertions(+), 15 deletions(-) mode change 100644 => 100755 scripts/testdata/satellite-config.yaml.lock diff --git a/monkit.lock b/monkit.lock index 76cbc2315..fd29869a0 100644 --- a/monkit.lock +++ b/monkit.lock @@ -96,9 +96,12 @@ storj.io/storj/satellite/repair/repairer."time_since_checker_queue" FloatVal storj.io/storj/satellite/satellitedb."audit_online_score" FloatVal storj.io/storj/satellite/satellitedb."audit_reputation_alpha" FloatVal storj.io/storj/satellite/satellitedb."audit_reputation_beta" FloatVal +storj.io/storj/satellite/satellitedb."bad_audit_dqs" Meter storj.io/storj/satellite/satellitedb."nodetallies.totalsum" IntVal +storj.io/storj/satellite/satellitedb."offline_dqs" Meter storj.io/storj/satellite/satellitedb."unknown_audit_reputation_alpha" FloatVal storj.io/storj/satellite/satellitedb."unknown_audit_reputation_beta" FloatVal +storj.io/storj/satellite/satellitedb."unknown_suspension_dqs" Meter storj.io/storj/storage/filestore."open_file_in_trash" Meter storj.io/storj/storagenode/contact."satellite_contact_request" Meter storj.io/storj/storagenode/gracefulexit."satellite_gracefulexit_request" Meter diff --git a/satellite/overlay/config.go b/satellite/overlay/config.go index e6ca1a068..50667e9ba 100644 --- a/satellite/overlay/config.go +++ b/satellite/overlay/config.go @@ -53,4 +53,5 @@ type AuditHistoryConfig struct { TrackingPeriod time.Duration `help:"The length of time to track audit windows for node suspension and disqualification" releaseDefault:"720h" devDefault:"1h"` GracePeriod time.Duration `help:"The length of time to give suspended SNOs to diagnose and fix issues causing downtime. Afterwards, they will have one tracking period to reach the minimum online score before disqualification" releaseDefault:"168h" devDefault:"1h"` OfflineThreshold float64 `help:"The point below which a node is punished for offline audits. Determined by calculating the ratio of online/total audits within each window and finding the average across windows within the tracking period." default:"0.6"` + OfflineDQEnabled bool `help:"whether nodes will be disqualified if they have low online score after a review period" releaseDefault:"false" devDefault:"true"` } diff --git a/satellite/overlay/suspension_test.go b/satellite/overlay/suspension_test.go index ae8891281..04ab30be3 100644 --- a/satellite/overlay/suspension_test.go +++ b/satellite/overlay/suspension_test.go @@ -325,7 +325,7 @@ func TestAuditSuspendBatchUpdateStats(t *testing.T) { // TestOfflineSuspend tests that a node enters offline suspension and "under review" when online score passes below threshold. // The node should be able to enter and exit suspension while remaining under review. // The node should be reinstated if it has a good online score after the review period. -// (TODO) The node should be disqualified if it has a bad online score after the review period. +// The node should be disqualified if it has a bad online score after the review period. func TestOfflineSuspend(t *testing.T) { testplanet.Run(t, testplanet.Config{ SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0, @@ -349,6 +349,7 @@ func TestOfflineSuspend(t *testing.T) { TrackingPeriod: 2 * time.Hour, GracePeriod: time.Hour, OfflineThreshold: 0.6, + OfflineDQEnabled: true, }, AuditLambda: 0.95, @@ -456,15 +457,8 @@ func TestOfflineSuspend(t *testing.T) { require.NoError(t, err) require.NotNil(t, node.OfflineSuspended) require.NotNil(t, node.OfflineUnderReview) - require.Nil(t, node.Disqualified) + require.NotNil(t, node.Disqualified) require.EqualValues(t, 0.5, node.Reputation.OnlineScore) - // TODO uncomment and remove above 4 lines when dq is enabled - /* - require.NotNil(t, node.OfflineSuspended) - require.NotNil(t, node.OfflineUnderReview) - require.NotNil(t, node.Disqualified) - require.EqualValues(t, 0.5, node.Reputation.OnlineScore) - */ }) } diff --git a/satellite/satellitedb/overlaycache.go b/satellite/satellitedb/overlaycache.go index 3ccb1bb58..e4f5e2423 100644 --- a/satellite/satellitedb/overlaycache.go +++ b/satellite/satellitedb/overlaycache.go @@ -1278,6 +1278,7 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq * auditRep := auditAlpha / (auditAlpha + auditBeta) if auditRep <= updateReq.AuditDQ { cache.db.log.Info("Disqualified", zap.String("DQ type", "audit failure"), zap.String("Node ID", updateReq.NodeID.String())) + mon.Meter("bad_audit_dqs").Mark(1) //mon:locked updateFields.Disqualified = timeField{set: true, value: now} } @@ -1302,6 +1303,7 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq * time.Since(*dbNode.UnknownAuditSuspended) > updateReq.SuspensionGracePeriod && updateReq.SuspensionDQEnabled { cache.db.log.Info("Disqualified", zap.String("DQ type", "suspension grace period expired for unknown audits"), zap.String("Node ID", updateReq.NodeID.String())) + mon.Meter("unknown_suspension_dqs").Mark(1) //mon:locked updateFields.Disqualified = timeField{set: true, value: now} updateFields.UnknownAuditSuspended = timeField{set: true, isNil: true} } @@ -1351,16 +1353,14 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq * trackingPeriodPassed := now.After(trackingPeriodEnd) // after tracking period has elapsed, if score is good, clear under review - // otherwise, disqualify node - // TODO until disqualification is enabled, nodes will remain under review if their score is passed after the grace+tracking period + // otherwise, disqualify node (if OfflineDQEnabled feature flag is true) if trackingPeriodPassed { if penalizeOfflineNode { - // TODO enable disqualification - /* + if updateReq.AuditHistory.OfflineDQEnabled { cache.db.log.Info("Disqualified", zap.String("DQ type", "node offline"), zap.String("Node ID", updateReq.NodeID.String())) + mon.Meter("offline_dqs").Mark(1) //mon:locked updateFields.Disqualified = timeField{set: true, value: now} - // TODO metric - */ + } } else { updateFields.OfflineUnderReview = timeField{set: true, isNil: true} updateFields.OfflineSuspended = timeField{set: true, isNil: true} diff --git a/scripts/testdata/satellite-config.yaml.lock b/scripts/testdata/satellite-config.yaml.lock old mode 100644 new mode 100755 index f1f467bca..72706d8e7 --- a/scripts/testdata/satellite-config.yaml.lock +++ b/scripts/testdata/satellite-config.yaml.lock @@ -478,6 +478,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key # The length of time to give suspended SNOs to diagnose and fix issues causing downtime. Afterwards, they will have one tracking period to reach the minimum online score before disqualification # overlay.audit-history.grace-period: 168h0m0s +# whether nodes will be disqualified if they have low online score after a review period +# overlay.audit-history.offline-dq-enabled: false + # The point below which a node is punished for offline audits. Determined by calculating the ratio of online/total audits within each window and finding the average across windows within the tracking period. # overlay.audit-history.offline-threshold: 0.6