satellite/overlay: uncomment dq for offline and disable with feature flag

Change-Id: Ib39e2be32e880b822a94eddfb81af99a38843a27
This commit is contained in:
Moby von Briesen 2020-10-15 12:00:08 -04:00 committed by Maximillian von Briesen
parent 7c275830a1
commit 7c3afe164b
5 changed files with 16 additions and 15 deletions

View File

@ -96,9 +96,12 @@ storj.io/storj/satellite/repair/repairer."time_since_checker_queue" FloatVal
storj.io/storj/satellite/satellitedb."audit_online_score" FloatVal
storj.io/storj/satellite/satellitedb."audit_reputation_alpha" FloatVal
storj.io/storj/satellite/satellitedb."audit_reputation_beta" FloatVal
storj.io/storj/satellite/satellitedb."bad_audit_dqs" Meter
storj.io/storj/satellite/satellitedb."nodetallies.totalsum" IntVal
storj.io/storj/satellite/satellitedb."offline_dqs" Meter
storj.io/storj/satellite/satellitedb."unknown_audit_reputation_alpha" FloatVal
storj.io/storj/satellite/satellitedb."unknown_audit_reputation_beta" FloatVal
storj.io/storj/satellite/satellitedb."unknown_suspension_dqs" Meter
storj.io/storj/storage/filestore."open_file_in_trash" Meter
storj.io/storj/storagenode/contact."satellite_contact_request" Meter
storj.io/storj/storagenode/gracefulexit."satellite_gracefulexit_request" Meter

View File

@ -53,4 +53,5 @@ type AuditHistoryConfig struct {
TrackingPeriod time.Duration `help:"The length of time to track audit windows for node suspension and disqualification" releaseDefault:"720h" devDefault:"1h"`
GracePeriod time.Duration `help:"The length of time to give suspended SNOs to diagnose and fix issues causing downtime. Afterwards, they will have one tracking period to reach the minimum online score before disqualification" releaseDefault:"168h" devDefault:"1h"`
OfflineThreshold float64 `help:"The point below which a node is punished for offline audits. Determined by calculating the ratio of online/total audits within each window and finding the average across windows within the tracking period." default:"0.6"`
OfflineDQEnabled bool `help:"whether nodes will be disqualified if they have low online score after a review period" releaseDefault:"false" devDefault:"true"`
}

View File

@ -325,7 +325,7 @@ func TestAuditSuspendBatchUpdateStats(t *testing.T) {
// TestOfflineSuspend tests that a node enters offline suspension and "under review" when online score passes below threshold.
// The node should be able to enter and exit suspension while remaining under review.
// The node should be reinstated if it has a good online score after the review period.
// (TODO) The node should be disqualified if it has a bad online score after the review period.
// The node should be disqualified if it has a bad online score after the review period.
func TestOfflineSuspend(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
@ -349,6 +349,7 @@ func TestOfflineSuspend(t *testing.T) {
TrackingPeriod: 2 * time.Hour,
GracePeriod: time.Hour,
OfflineThreshold: 0.6,
OfflineDQEnabled: true,
},
AuditLambda: 0.95,
@ -456,15 +457,8 @@ func TestOfflineSuspend(t *testing.T) {
require.NoError(t, err)
require.NotNil(t, node.OfflineSuspended)
require.NotNil(t, node.OfflineUnderReview)
require.Nil(t, node.Disqualified)
require.NotNil(t, node.Disqualified)
require.EqualValues(t, 0.5, node.Reputation.OnlineScore)
// TODO uncomment and remove above 4 lines when dq is enabled
/*
require.NotNil(t, node.OfflineSuspended)
require.NotNil(t, node.OfflineUnderReview)
require.NotNil(t, node.Disqualified)
require.EqualValues(t, 0.5, node.Reputation.OnlineScore)
*/
})
}

View File

@ -1278,6 +1278,7 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq *
auditRep := auditAlpha / (auditAlpha + auditBeta)
if auditRep <= updateReq.AuditDQ {
cache.db.log.Info("Disqualified", zap.String("DQ type", "audit failure"), zap.String("Node ID", updateReq.NodeID.String()))
mon.Meter("bad_audit_dqs").Mark(1) //mon:locked
updateFields.Disqualified = timeField{set: true, value: now}
}
@ -1302,6 +1303,7 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq *
time.Since(*dbNode.UnknownAuditSuspended) > updateReq.SuspensionGracePeriod &&
updateReq.SuspensionDQEnabled {
cache.db.log.Info("Disqualified", zap.String("DQ type", "suspension grace period expired for unknown audits"), zap.String("Node ID", updateReq.NodeID.String()))
mon.Meter("unknown_suspension_dqs").Mark(1) //mon:locked
updateFields.Disqualified = timeField{set: true, value: now}
updateFields.UnknownAuditSuspended = timeField{set: true, isNil: true}
}
@ -1351,16 +1353,14 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq *
trackingPeriodPassed := now.After(trackingPeriodEnd)
// after tracking period has elapsed, if score is good, clear under review
// otherwise, disqualify node
// TODO until disqualification is enabled, nodes will remain under review if their score is passed after the grace+tracking period
// otherwise, disqualify node (if OfflineDQEnabled feature flag is true)
if trackingPeriodPassed {
if penalizeOfflineNode {
// TODO enable disqualification
/*
if updateReq.AuditHistory.OfflineDQEnabled {
cache.db.log.Info("Disqualified", zap.String("DQ type", "node offline"), zap.String("Node ID", updateReq.NodeID.String()))
mon.Meter("offline_dqs").Mark(1) //mon:locked
updateFields.Disqualified = timeField{set: true, value: now}
// TODO metric
*/
}
} else {
updateFields.OfflineUnderReview = timeField{set: true, isNil: true}
updateFields.OfflineSuspended = timeField{set: true, isNil: true}

3
scripts/testdata/satellite-config.yaml.lock vendored Normal file → Executable file
View File

@ -478,6 +478,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
# The length of time to give suspended SNOs to diagnose and fix issues causing downtime. Afterwards, they will have one tracking period to reach the minimum online score before disqualification
# overlay.audit-history.grace-period: 168h0m0s
# whether nodes will be disqualified if they have low online score after a review period
# overlay.audit-history.offline-dq-enabled: false
# The point below which a node is punished for offline audits. Determined by calculating the ratio of online/total audits within each window and finding the average across windows within the tracking period.
# overlay.audit-history.offline-threshold: 0.6