storj/satellite/reputation/suspension_test.go
paul cannon 0dcc0a9ee0 satellite/reputation: reconfigure lambda and alpha
This is in response to community feedback that our existing reputation
calculation is too likely to disqualify storage nodes unfairly with
extreme swings up and down.

For details and analysis, please see the data_loss_vs_dq_chance_sim.py
tool, the "tuning reputation further.ipynb" Jupyter notebook in the
storj/datascience repository, and the discussion at

    https://forum.storj.io/t/tuning-audit-scoring/14084

In brief: changing the lambda and initial-alpha parameters in this way
causes the swings in reputation to be smaller and less likely to put a
node past the disqualification threshold unfairly.

Note: this change will cause a one-time reset of all (non-disqualified)
node reputations, because the new initial alpha value of 1000 is
dramatically different, and the disqualification threshold is going to
be much higher.

Change-Id: Id6dc4ba8fde1be3db4255b72282207bab5491ca3
2022-08-17 18:52:53 +00:00

535 lines
21 KiB
Go

// Copyright (C) 2020 Storj Labs, Inc.
// See LICENSE for copying information.
package reputation_test
import (
"context"
"testing"
"time"
"github.com/stretchr/testify/require"
"go.uber.org/zap"
"storj.io/common/storj"
"storj.io/common/testcontext"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite"
"storj.io/storj/satellite/audit"
"storj.io/storj/satellite/overlay"
"storj.io/storj/satellite/reputation"
)
// TestAuditSuspendBasic ensures that a node can be placed into unknown-audit
// suspension with TestSuspendNodeUnknownAudit and released again with
// TestUnsuspendNodeUnknownAudit, and that the overlay DB reports each change.
func TestAuditSuspendBasic(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 1,
		UplinkCount:      0,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		sat := planet.Satellites[0]
		overlayDB := sat.Overlay.DB
		reputations := sat.Reputation.Service
		id := planet.StorageNodes[0].ID()

		// the node starts out without a suspension timestamp
		dossier, err := overlayDB.Get(ctx, id)
		require.NoError(t, err)
		require.Nil(t, dossier.UnknownAuditSuspended)

		// suspend at a second-truncated UTC instant and expect to read the same instant back
		suspendedAt := time.Now().UTC().Truncate(time.Second)
		require.NoError(t, reputations.TestSuspendNodeUnknownAudit(ctx, id, suspendedAt))

		dossier, err = overlayDB.Get(ctx, id)
		require.NoError(t, err)
		require.NotNil(t, dossier.UnknownAuditSuspended)
		require.True(t, dossier.UnknownAuditSuspended.Equal(suspendedAt))

		// unsuspending must clear the timestamp again
		require.NoError(t, reputations.TestUnsuspendNodeUnknownAudit(ctx, id))

		dossier, err = overlayDB.Get(ctx, id)
		require.NoError(t, err)
		require.Nil(t, dossier.UnknownAuditSuspended)
	})
}
// TestAuditSuspendWithUpdateStats ensures that a node goes into suspension
// after receiving an unknown audit, and is taken out of suspension again after
// receiving enough successful audits.
func TestAuditSuspendWithUpdateStats(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Reputation.AuditWeight = 1
				config.Reputation.AuditDQ = 0.6
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		nodeID := planet.StorageNodes[0].ID()
		// named sat (not satellite) so the imported satellite package is not shadowed
		sat := planet.Satellites[0]
		oc := sat.Overlay.Service
		repService := sat.Reputation.Service

		node, err := oc.Get(ctx, nodeID)
		require.NoError(t, err)
		require.Nil(t, node.UnknownAuditSuspended)

		testStartTime := time.Now()

		// give node one unknown audit - this is expected to suspend the node
		err = repService.ApplyAudit(ctx, nodeID, node.Reputation.Status, reputation.AuditUnknown)
		require.NoError(t, err)

		reputationInfo, err := repService.Get(ctx, nodeID)
		require.NoError(t, err)

		// expect unknown audit alpha/beta to change and suspended to be set
		require.Less(t, reputationInfo.UnknownAuditReputationAlpha, float64(1))
		require.Greater(t, reputationInfo.UnknownAuditReputationBeta, float64(0))
		require.NotNil(t, reputationInfo.UnknownAuditSuspended)
		require.True(t, reputationInfo.UnknownAuditSuspended.After(testStartTime))

		// expect normal audit alpha/beta remain unchanged (expected value first, actual second)
		require.EqualValues(t, sat.Config.Reputation.InitialAlpha, reputationInfo.AuditReputationAlpha)
		require.EqualValues(t, sat.Config.Reputation.InitialBeta, reputationInfo.AuditReputationBeta)

		// expect node is not disqualified
		node, err = oc.Get(ctx, nodeID)
		require.NoError(t, err)
		require.Nil(t, node.Disqualified)

		// give node two successful audits - this is expected to lift the suspension
		for i := 0; i < 2; i++ {
			err = repService.ApplyAudit(ctx, nodeID, node.Reputation.Status, reputation.AuditSuccess)
			require.NoError(t, err)
		}

		node, err = oc.Get(ctx, nodeID)
		require.NoError(t, err)
		require.Nil(t, node.UnknownAuditSuspended)
	})
}
// TestAuditSuspendFailedAudit ensures that a node is not suspended for a failed audit.
// A failed audit may disqualify the node, but it must leave the unknown-audit
// suspension field and the unknown-audit reputation untouched.
func TestAuditSuspendFailedAudit(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Reputation.InitialAlpha = 1.0
				config.Reputation.AuditLambda = 1.0
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		nodeID := planet.StorageNodes[0].ID()
		oc := planet.Satellites[0].Overlay.DB
		repService := planet.Satellites[0].Reputation.Service

		node, err := oc.Get(ctx, nodeID)
		require.NoError(t, err)
		require.Nil(t, node.Disqualified)
		require.Nil(t, node.UnknownAuditSuspended)

		// give node one failed audit - bringing audit rep to 0.5, and disqualifying node
		// expect that suspended field and unknown audit reputation remain unchanged
		err = repService.ApplyAudit(ctx, nodeID, node.Reputation.Status, reputation.AuditFailure)
		require.NoError(t, err)

		node, err = oc.Get(ctx, nodeID)
		require.NoError(t, err)
		require.NotNil(t, node.Disqualified)
		require.Nil(t, node.UnknownAuditSuspended)

		reputationInfo, err := repService.Get(ctx, nodeID)
		require.NoError(t, err)
		// Expected value goes first: EqualValues may convert the expected argument to
		// the actual's type, so passing the float as "expected" against an int literal
		// risks truncating it and hiding a changed alpha/beta.
		require.EqualValues(t, 1, reputationInfo.UnknownAuditReputationAlpha)
		require.EqualValues(t, 0, reputationInfo.UnknownAuditReputationBeta)
	})
}
// TestAuditSuspendExceedGracePeriod ensures that a node which has been
// suspended for longer than the grace period is disqualified when it then
// receives a failing or unknown audit, but not when it receives a successful
// or offline one.
func TestAuditSuspendExceedGracePeriod(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 4,
		UplinkCount:      0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Reputation.SuspensionGracePeriod = time.Hour
				config.Reputation.InitialAlpha = 1
				config.Reputation.AuditLambda = 0.95
				config.Reputation.AuditDQ = 0.6
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		sat := planet.Satellites[0]
		reputations := sat.Reputation.Service

		successID := planet.StorageNodes[0].ID()
		failID := planet.StorageNodes[1].ID()
		offlineID := planet.StorageNodes[2].ID()
		unknownID := planet.StorageNodes[3].ID()
		everyNode := storj.NodeIDList{successID, failID, offlineID, unknownID}

		// back-date each suspension by two hours, i.e. beyond the one hour grace period
		for _, id := range everyNode {
			err := reputations.TestSuspendNodeUnknownAudit(ctx, id, time.Now().Add(-2*time.Hour))
			require.NoError(t, err)
		}

		// nothing is disqualified yet; remember each node's status for the audit report
		statuses := make(map[storj.NodeID]overlay.ReputationStatus)
		for _, id := range everyNode {
			info, err := reputations.Get(ctx, id)
			require.NoError(t, err)
			require.Nil(t, info.Disqualified)

			statuses[id] = overlay.ReputationStatus{
				Disqualified:          info.Disqualified,
				UnknownAuditSuspended: info.UnknownAuditSuspended,
				OfflineSuspended:      info.OfflineSuspended,
				VettedAt:              info.VettedAt,
			}
		}

		// one audit outcome of each kind, one per node
		_, err := sat.Audit.Reporter.RecordAudits(ctx, audit.Report{
			Successes:       storj.NodeIDList{successID},
			Fails:           storj.NodeIDList{failID},
			Offlines:        storj.NodeIDList{offlineID},
			Unknown:         storj.NodeIDList{unknownID},
			NodesReputation: statuses,
		})
		require.NoError(t, err)

		// success and offline nodes survive; fail and unknown nodes get disqualified
		outcomes := []struct {
			id     storj.NodeID
			wantDQ bool
		}{
			{id: successID, wantDQ: false},
			{id: offlineID, wantDQ: false},
			{id: failID, wantDQ: true},
			{id: unknownID, wantDQ: true},
		}
		for _, outcome := range outcomes {
			info, err := reputations.Get(ctx, outcome.id)
			require.NoError(t, err)
			if outcome.wantDQ {
				require.NotNil(t, info.Disqualified)
				continue
			}
			require.Nil(t, info.Disqualified)
		}
	})
}
// TestAuditSuspendDQDisabled ensures that exceeding the suspension grace
// period does not disqualify a node while the suspension DQ enabled flag is
// false. A failed audit is still expected to disqualify.
func TestAuditSuspendDQDisabled(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 4,
		UplinkCount:      0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Reputation.SuspensionGracePeriod = time.Hour
				config.Reputation.SuspensionDQEnabled = false
				config.Reputation.InitialAlpha = 1
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		sat := planet.Satellites[0]
		overlayCache := sat.DB.OverlayCache()
		reputations := sat.Reputation.Service

		successID := planet.StorageNodes[0].ID()
		failID := planet.StorageNodes[1].ID()
		offlineID := planet.StorageNodes[2].ID()
		unknownID := planet.StorageNodes[3].ID()
		everyNode := storj.NodeIDList{successID, failID, offlineID, unknownID}

		// back-date each suspension by two hours, i.e. beyond the one hour grace period
		for _, id := range everyNode {
			err := reputations.TestSuspendNodeUnknownAudit(ctx, id, time.Now().Add(-2*time.Hour))
			require.NoError(t, err)
		}

		// nothing is disqualified yet; remember each node's status for the audit report
		statuses := make(map[storj.NodeID]overlay.ReputationStatus)
		for _, id := range everyNode {
			dossier, err := overlayCache.Get(ctx, id)
			require.NoError(t, err)
			require.Nil(t, dossier.Disqualified)

			statuses[id] = overlay.ReputationStatus{
				Disqualified:          dossier.Disqualified,
				UnknownAuditSuspended: dossier.UnknownAuditSuspended,
				OfflineSuspended:      dossier.OfflineSuspended,
				VettedAt:              dossier.Reputation.Status.VettedAt,
			}
		}

		// one audit outcome of each kind, one per node
		_, err := sat.Audit.Reporter.RecordAudits(ctx, audit.Report{
			Successes:       storj.NodeIDList{successID},
			Fails:           storj.NodeIDList{failID},
			Offlines:        storj.NodeIDList{offlineID},
			Unknown:         storj.NodeIDList{unknownID},
			NodesReputation: statuses,
		})
		require.NoError(t, err)

		outcomes := []struct {
			id            storj.NodeID
			wantSuspended bool
			wantDQ        bool
		}{
			// successful node should not be suspended or disqualified
			{id: successID, wantSuspended: false, wantDQ: false},
			// failed node should not be suspended but should be disqualified
			// (disqualified because of a failed audit, not because of exceeding suspension grace period)
			{id: failID, wantSuspended: false, wantDQ: true},
			// offline node should not be suspended or disqualified
			{id: offlineID, wantSuspended: false, wantDQ: false},
			// unknown node should still be suspended but not disqualified
			{id: unknownID, wantSuspended: true, wantDQ: false},
		}
		for _, outcome := range outcomes {
			dossier, err := overlayCache.Get(ctx, outcome.id)
			require.NoError(t, err)

			if outcome.wantSuspended {
				require.NotNil(t, dossier.UnknownAuditSuspended)
			} else {
				require.Nil(t, dossier.UnknownAuditSuspended)
			}

			if outcome.wantDQ {
				require.NotNil(t, dossier.Disqualified)
			} else {
				require.Nil(t, dossier.Disqualified)
			}
		}
	})
}
// TestOfflineAuditSuspensionDisabled ensures that offline audits do not
// suspend a node while the offline suspension enabled flag is false, that
// turning the flag on suspends it, and that turning it off again clears both
// the suspension and the under-review marker.
func TestOfflineAuditSuspensionDisabled(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 1,
		UplinkCount:      0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Reputation.AuditHistory.OfflineSuspensionEnabled = false
				config.Reputation.AuditHistory.WindowSize = time.Hour
				config.Reputation.AuditHistory.TrackingPeriod = 2 * time.Hour
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		sat := planet.Satellites[0]
		id := planet.StorageNodes[0].ID()
		repDB := sat.DB.Reputation()
		historyCfg := sat.Config.Reputation.AuditHistory

		// starting point: no suspension, no review, no disqualification
		dossier, err := sat.Overlay.DB.Get(ctx, id)
		require.NoError(t, err)
		require.Nil(t, dossier.OfflineSuspended)
		require.Nil(t, dossier.OfflineUnderReview)
		require.Nil(t, dossier.Disqualified)

		offlineReq := reputation.UpdateRequest{
			NodeID:       id,
			AuditOutcome: reputation.AuditOffline,
			Config: reputation.Config{
				AuditHistory: historyCfg,
			},
		}

		// check that unsuspended node does not get suspended:
		// one offline audit per window, for one window more than fits in the tracking period
		at := time.Now()
		windows := int(historyCfg.TrackingPeriod / historyCfg.WindowSize)
		for remaining := windows + 1; remaining > 0; remaining-- {
			_, err := repDB.Update(ctx, offlineReq, at)
			require.NoError(t, err)
			at = at.Add(historyCfg.WindowSize)
		}

		info, err := repDB.Get(ctx, id)
		require.NoError(t, err)
		require.Nil(t, info.OfflineSuspended)
		require.Nil(t, info.UnderReview)
		require.Less(t, info.OnlineScore, historyCfg.OfflineThreshold)

		// check that enabling flag suspends the node
		offlineReq.AuditHistory.OfflineSuspensionEnabled = true
		_, err = repDB.Update(ctx, offlineReq, at)
		require.NoError(t, err)

		info, err = repDB.Get(ctx, id)
		require.NoError(t, err)
		require.NotNil(t, info.OfflineSuspended)
		require.NotNil(t, info.UnderReview)
		require.Less(t, info.OnlineScore, historyCfg.OfflineThreshold)

		// check that disabling flag clears suspension and under review
		offlineReq.AuditHistory.OfflineSuspensionEnabled = false
		_, err = repDB.Update(ctx, offlineReq, at)
		require.NoError(t, err)

		info, err = repDB.Get(ctx, id)
		require.NoError(t, err)
		require.Less(t, info.OnlineScore, historyCfg.OfflineThreshold)
		require.Nil(t, info.OfflineSuspended)
		require.Nil(t, info.UnderReview)
	})
}
// TestOfflineSuspend tests that a node enters offline suspension and "under review" when online score passes below threshold.
// The node should be able to enter and exit suspension while remaining under review.
// The node should be reinstated if it has a good online score after the review period.
// The node should be disqualified if it has a bad online score after the review period.
//
// All state assertions are made against the reputation record re-read from the
// reputation DB after each step, never against a value fetched before the step.
func TestOfflineSuspend(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Reputation.AuditHistory.OfflineSuspensionEnabled = false
				config.Reputation.AuditHistory.WindowSize = time.Hour
				config.Reputation.AuditHistory.TrackingPeriod = 2 * time.Hour
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		nodeID := planet.StorageNodes[0].ID()
		reputationdb := planet.Satellites[0].DB.Reputation()
		oc := planet.Satellites[0].DB.OverlayCache()

		// initial overlay state only; later checks read from the reputation DB
		node, err := oc.Get(ctx, nodeID)
		require.NoError(t, err)
		require.Nil(t, node.OfflineSuspended)
		require.Nil(t, node.Disqualified)

		// the update request carries its own config, with offline suspension and DQ enabled
		updateReq := reputation.UpdateRequest{
			NodeID:       nodeID,
			AuditOutcome: reputation.AuditOffline,
			Config: reputation.Config{
				AuditHistory: reputation.AuditHistoryConfig{
					WindowSize:               time.Hour,
					TrackingPeriod:           2 * time.Hour,
					GracePeriod:              time.Hour,
					OfflineThreshold:         0.6,
					OfflineDQEnabled:         true,
					OfflineSuspensionEnabled: true,
				},
				AuditLambda:           0.95,
				AuditWeight:           1,
				AuditDQ:               0.6,
				InitialAlpha:          1000,
				InitialBeta:           0,
				UnknownAuditDQ:        0.6,
				UnknownAuditLambda:    0.95,
				SuspensionGracePeriod: time.Hour,
				SuspensionDQEnabled:   true,
				AuditCount:            0,
			},
		}

		// give node an offline audit
		// node's score is still 1 until its first window is complete
		nextWindowTime := time.Now()
		_, err = reputationdb.Update(ctx, updateReq, nextWindowTime)
		require.NoError(t, err)

		reputationInfo, err := reputationdb.Get(ctx, nodeID)
		require.NoError(t, err)
		require.Nil(t, reputationInfo.OfflineSuspended)
		require.Nil(t, reputationInfo.UnderReview)
		require.Nil(t, reputationInfo.Disqualified)
		require.EqualValues(t, 1, reputationInfo.OnlineScore)

		nextWindowTime = nextWindowTime.Add(updateReq.AuditHistory.WindowSize)

		// node now has one full window, so its score should be 0
		// should not be suspended or DQ since it only has 1 window out of 2 for tracking period
		_, err = reputationdb.Update(ctx, updateReq, nextWindowTime)
		require.NoError(t, err)

		reputationInfo, err = reputationdb.Get(ctx, nodeID)
		require.NoError(t, err)
		require.Nil(t, reputationInfo.OfflineSuspended)
		require.Nil(t, reputationInfo.UnderReview)
		require.Nil(t, reputationInfo.Disqualified)
		require.EqualValues(t, 0, reputationInfo.OnlineScore)

		nextWindowTime = nextWindowTime.Add(updateReq.AuditHistory.WindowSize)
		nextWindowTime, err = setOnlineScore(ctx, updateReq, 0.5, time.Hour, nextWindowTime, reputationdb)
		require.NoError(t, err)

		// node should be offline suspended and under review
		reputationInfo, err = reputationdb.Get(ctx, nodeID)
		require.NoError(t, err)
		require.NotNil(t, reputationInfo.OfflineSuspended)
		require.NotNil(t, reputationInfo.UnderReview)
		require.Nil(t, reputationInfo.Disqualified)
		require.EqualValues(t, 0.5, reputationInfo.OnlineScore)

		// set online score to be good, but use a long grace period so that node remains under review
		nextWindowTime, err = setOnlineScore(ctx, updateReq, 1, 100*time.Hour, nextWindowTime, reputationdb)
		require.NoError(t, err)

		reputationInfo, err = reputationdb.Get(ctx, nodeID)
		require.NoError(t, err)
		require.Nil(t, reputationInfo.OfflineSuspended)
		require.NotNil(t, reputationInfo.UnderReview)
		require.Nil(t, reputationInfo.Disqualified)
		oldUnderReview := reputationInfo.UnderReview
		require.EqualValues(t, 1, reputationInfo.OnlineScore)

		// suspend again, under review should be the same
		nextWindowTime, err = setOnlineScore(ctx, updateReq, 0.5, 100*time.Hour, nextWindowTime, reputationdb)
		require.NoError(t, err)

		reputationInfo, err = reputationdb.Get(ctx, nodeID)
		require.NoError(t, err)
		require.NotNil(t, reputationInfo.OfflineSuspended)
		require.NotNil(t, reputationInfo.UnderReview)
		require.Nil(t, reputationInfo.Disqualified)
		require.Equal(t, oldUnderReview, reputationInfo.UnderReview)
		require.EqualValues(t, 0.5, reputationInfo.OnlineScore)

		// node will exit review after grace period + 1 tracking window, so set grace period to be time since put under review
		// subtract one hour so that review window ends when setOnlineScore adds the last window
		gracePeriod := nextWindowTime.Sub(*reputationInfo.UnderReview) - time.Hour
		nextWindowTime, err = setOnlineScore(ctx, updateReq, 1, gracePeriod, nextWindowTime, reputationdb)
		require.NoError(t, err)

		reputationInfo, err = reputationdb.Get(ctx, nodeID)
		require.NoError(t, err)
		require.Nil(t, reputationInfo.OfflineSuspended)
		require.Nil(t, reputationInfo.UnderReview)
		require.Nil(t, reputationInfo.Disqualified)
		require.EqualValues(t, 1, reputationInfo.OnlineScore)

		// put into suspension and under review again
		nextWindowTime, err = setOnlineScore(ctx, updateReq, 0.5, 100*time.Hour, nextWindowTime, reputationdb)
		require.NoError(t, err)

		reputationInfo, err = reputationdb.Get(ctx, nodeID)
		require.NoError(t, err)
		require.NotNil(t, reputationInfo.OfflineSuspended)
		require.NotNil(t, reputationInfo.UnderReview)
		require.Nil(t, reputationInfo.Disqualified)
		require.EqualValues(t, 0.5, reputationInfo.OnlineScore)

		// if grace period + 1 tracking window passes and online score is still bad, expect node to be DQed
		_, err = setOnlineScore(ctx, updateReq, 0.5, 0, nextWindowTime, reputationdb)
		require.NoError(t, err)

		reputationInfo, err = reputationdb.Get(ctx, nodeID)
		require.NoError(t, err)
		require.NotNil(t, reputationInfo.OfflineSuspended)
		require.NotNil(t, reputationInfo.UnderReview)
		require.NotNil(t, reputationInfo.Disqualified)
		require.EqualValues(t, 0.5, reputationInfo.OnlineScore)
	})
}
// setOnlineScore submits audits for a run of consecutive windows so that each
// window has the given fraction of online audits.
//
// Starting at startTime it covers (windows per tracking period + 1) windows,
// where the windows-per-tracking-period count is derived from reqPtr's audit
// history config. Every window receives two audits: int(2*desiredScore) of
// them are successes and the remainder are offline. Each submitted request is
// a copy of reqPtr with its audit history grace period replaced by gracePeriod.
//
// It returns the start time of the window following the last one written, or,
// on the first failed update, the time of the window being written together
// with that error.
func setOnlineScore(ctx context.Context, reqPtr reputation.UpdateRequest, desiredScore float64, gracePeriod time.Duration, startTime time.Time, reputationdb reputation.DB) (nextWindowTime time.Time, err error) {
	// for our tests, we are only using values of 1 and 0.5, so two audits per window is sufficient
	const auditsPerWindow = 2
	successes := int(float64(auditsPerWindow) * desiredScore)

	history := reqPtr.AuditHistory
	windowCount := int(history.TrackingPeriod.Seconds()/history.WindowSize.Seconds()) + 1

	// reqPtr is a value, so this is a private copy; only the outcome varies per audit
	req := reqPtr
	req.AuditHistory.GracePeriod = gracePeriod

	nextWindowTime = startTime
	for w := 0; w < windowCount; w++ {
		for n := 0; n < auditsPerWindow; n++ {
			// the first `successes` audits of a window are online, the rest offline
			req.AuditOutcome = reputation.AuditOffline
			if n < successes {
				req.AuditOutcome = reputation.AuditSuccess
			}

			if _, err = reputationdb.Update(ctx, req, nextWindowTime); err != nil {
				return nextWindowTime, err
			}
		}

		// advance so the next iteration writes into a different window
		nextWindowTime = nextWindowTime.Add(history.WindowSize)
	}

	return nextWindowTime, nil
}