satellite/overlay: uncomment dq for offline and disable with feature flag
Change-Id: Ib39e2be32e880b822a94eddfb81af99a38843a27
This commit is contained in:
parent
7c275830a1
commit
7c3afe164b
@ -96,9 +96,12 @@ storj.io/storj/satellite/repair/repairer."time_since_checker_queue" FloatVal
|
||||
storj.io/storj/satellite/satellitedb."audit_online_score" FloatVal
|
||||
storj.io/storj/satellite/satellitedb."audit_reputation_alpha" FloatVal
|
||||
storj.io/storj/satellite/satellitedb."audit_reputation_beta" FloatVal
|
||||
storj.io/storj/satellite/satellitedb."bad_audit_dqs" Meter
|
||||
storj.io/storj/satellite/satellitedb."nodetallies.totalsum" IntVal
|
||||
storj.io/storj/satellite/satellitedb."offline_dqs" Meter
|
||||
storj.io/storj/satellite/satellitedb."unknown_audit_reputation_alpha" FloatVal
|
||||
storj.io/storj/satellite/satellitedb."unknown_audit_reputation_beta" FloatVal
|
||||
storj.io/storj/satellite/satellitedb."unknown_suspension_dqs" Meter
|
||||
storj.io/storj/storage/filestore."open_file_in_trash" Meter
|
||||
storj.io/storj/storagenode/contact."satellite_contact_request" Meter
|
||||
storj.io/storj/storagenode/gracefulexit."satellite_gracefulexit_request" Meter
|
||||
|
@ -53,4 +53,5 @@ type AuditHistoryConfig struct {
|
||||
TrackingPeriod time.Duration `help:"The length of time to track audit windows for node suspension and disqualification" releaseDefault:"720h" devDefault:"1h"`
|
||||
GracePeriod time.Duration `help:"The length of time to give suspended SNOs to diagnose and fix issues causing downtime. Afterwards, they will have one tracking period to reach the minimum online score before disqualification" releaseDefault:"168h" devDefault:"1h"`
|
||||
OfflineThreshold float64 `help:"The point below which a node is punished for offline audits. Determined by calculating the ratio of online/total audits within each window and finding the average across windows within the tracking period." default:"0.6"`
|
||||
OfflineDQEnabled bool `help:"whether nodes will be disqualified if they have low online score after a review period" releaseDefault:"false" devDefault:"true"`
|
||||
}
|
||||
|
@ -325,7 +325,7 @@ func TestAuditSuspendBatchUpdateStats(t *testing.T) {
|
||||
// TestOfflineSuspend tests that a node enters offline suspension and "under review" when online score passes below threshold.
|
||||
// The node should be able to enter and exit suspension while remaining under review.
|
||||
// The node should be reinstated if it has a good online score after the review period.
|
||||
// (TODO) The node should be disqualified if it has a bad online score after the review period.
|
||||
// The node should be disqualified if it has a bad online score after the review period.
|
||||
func TestOfflineSuspend(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
|
||||
@ -349,6 +349,7 @@ func TestOfflineSuspend(t *testing.T) {
|
||||
TrackingPeriod: 2 * time.Hour,
|
||||
GracePeriod: time.Hour,
|
||||
OfflineThreshold: 0.6,
|
||||
OfflineDQEnabled: true,
|
||||
},
|
||||
|
||||
AuditLambda: 0.95,
|
||||
@ -456,15 +457,8 @@ func TestOfflineSuspend(t *testing.T) {
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, node.OfflineSuspended)
|
||||
require.NotNil(t, node.OfflineUnderReview)
|
||||
require.Nil(t, node.Disqualified)
|
||||
require.NotNil(t, node.Disqualified)
|
||||
require.EqualValues(t, 0.5, node.Reputation.OnlineScore)
|
||||
// TODO uncomment and remove above 4 lines when dq is enabled
|
||||
/*
|
||||
require.NotNil(t, node.OfflineSuspended)
|
||||
require.NotNil(t, node.OfflineUnderReview)
|
||||
require.NotNil(t, node.Disqualified)
|
||||
require.EqualValues(t, 0.5, node.Reputation.OnlineScore)
|
||||
*/
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -1278,6 +1278,7 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq *
|
||||
auditRep := auditAlpha / (auditAlpha + auditBeta)
|
||||
if auditRep <= updateReq.AuditDQ {
|
||||
cache.db.log.Info("Disqualified", zap.String("DQ type", "audit failure"), zap.String("Node ID", updateReq.NodeID.String()))
|
||||
mon.Meter("bad_audit_dqs").Mark(1) //mon:locked
|
||||
updateFields.Disqualified = timeField{set: true, value: now}
|
||||
}
|
||||
|
||||
@ -1302,6 +1303,7 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq *
|
||||
time.Since(*dbNode.UnknownAuditSuspended) > updateReq.SuspensionGracePeriod &&
|
||||
updateReq.SuspensionDQEnabled {
|
||||
cache.db.log.Info("Disqualified", zap.String("DQ type", "suspension grace period expired for unknown audits"), zap.String("Node ID", updateReq.NodeID.String()))
|
||||
mon.Meter("unknown_suspension_dqs").Mark(1) //mon:locked
|
||||
updateFields.Disqualified = timeField{set: true, value: now}
|
||||
updateFields.UnknownAuditSuspended = timeField{set: true, isNil: true}
|
||||
}
|
||||
@ -1351,16 +1353,14 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq *
|
||||
trackingPeriodPassed := now.After(trackingPeriodEnd)
|
||||
|
||||
// after tracking period has elapsed, if score is good, clear under review
|
||||
// otherwise, disqualify node
|
||||
// TODO until disqualification is enabled, nodes will remain under review if their score is passed after the grace+tracking period
|
||||
// otherwise, disqualify node (if OfflineDQEnabled feature flag is true)
|
||||
if trackingPeriodPassed {
|
||||
if penalizeOfflineNode {
|
||||
// TODO enable disqualification
|
||||
/*
|
||||
if updateReq.AuditHistory.OfflineDQEnabled {
|
||||
cache.db.log.Info("Disqualified", zap.String("DQ type", "node offline"), zap.String("Node ID", updateReq.NodeID.String()))
|
||||
mon.Meter("offline_dqs").Mark(1) //mon:locked
|
||||
updateFields.Disqualified = timeField{set: true, value: now}
|
||||
// TODO metric
|
||||
*/
|
||||
}
|
||||
} else {
|
||||
updateFields.OfflineUnderReview = timeField{set: true, isNil: true}
|
||||
updateFields.OfflineSuspended = timeField{set: true, isNil: true}
|
||||
|
3
scripts/testdata/satellite-config.yaml.lock
vendored
Normal file → Executable file
3
scripts/testdata/satellite-config.yaml.lock
vendored
Normal file → Executable file
@ -478,6 +478,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
|
||||
# The length of time to give suspended SNOs to diagnose and fix issues causing downtime. Afterwards, they will have one tracking period to reach the minimum online score before disqualification
|
||||
# overlay.audit-history.grace-period: 168h0m0s
|
||||
|
||||
# whether nodes will be disqualified if they have low online score after a review period
|
||||
# overlay.audit-history.offline-dq-enabled: false
|
||||
|
||||
# The point below which a node is punished for offline audits. Determined by calculating the ratio of online/total audits within each window and finding the average across windows within the tracking period.
|
||||
# overlay.audit-history.offline-threshold: 0.6
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user