storj/satellite/audit/reporter_test.go
paul cannon 1854351da6 satellite/audit: teach Reporter about piecewise audits
The Reporter is responsible for processing results from auditing
operations, logging the results, disqualifying nodes that reached
the maximum reverification count, and passing the results on to
the reputation system.

In this commit, we extend the Reporter so that it knows how to process
the results of piecewise reverification audits.

We also change most reporter-related tests so that reverifications
happen as piecewise reverification audits, exercising the new code.

Note that piecewise reverification audits are not yet being done outside
of tests. In a later commit, we will switch from doing segmentwise
reverifications to piecewise reverifications, as part of the
audit-scaling effort.

Refs: https://github.com/storj/storj/issues/5230
Change-Id: I9438164ce1ea4d9a1790d18d0e1046a8eb04d8e9
2022-12-12 11:28:02 +00:00

252 lines
8.1 KiB
Go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package audit_test
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/zap"
"storj.io/common/storj"
"storj.io/common/testcontext"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite"
"storj.io/storj/satellite/audit"
"storj.io/storj/satellite/overlay"
)
func TestReportPendingAudits(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
nodeID := planet.StorageNodes[0].ID()
pending := audit.ReverificationJob{
Locator: audit.PieceLocator{
NodeID: nodeID,
},
}
report := audit.Report{PieceAudits: []*audit.ReverificationJob{&pending}}
containment := satellite.DB.NewContainment()
audits.Reporter.RecordAudits(ctx, report)
pa, err := containment.Get(ctx, nodeID)
require.NoError(t, err)
assert.Equal(t, pending.Locator, pa.Locator)
})
}
func TestRecordAuditsAtLeastOnce(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
nodeID := planet.StorageNodes[0].ID()
report := audit.Report{Successes: []storj.NodeID{nodeID}}
// expect RecordAudits to try recording at least once (maxRetries is set to 0)
audits.Reporter.RecordAudits(ctx, report)
service := satellite.Reputation.Service
node, err := service.Get(ctx, nodeID)
require.NoError(t, err)
require.EqualValues(t, 1, node.TotalAuditCount)
})
}
// TestRecordAuditsCorrectOutcome ensures that audit successes, failures, and unknown audits result in the correct disqualification/suspension state.
func TestRecordAuditsCorrectOutcome(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 0,
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
config.Reputation.InitialAlpha = 1
config.Reputation.AuditLambda = 0.95
config.Reputation.AuditDQ = 0.6
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
goodNode := planet.StorageNodes[0].ID()
dqNode := planet.StorageNodes[1].ID()
suspendedNode := planet.StorageNodes[2].ID()
pendingNode := planet.StorageNodes[3].ID()
offlineNode := planet.StorageNodes[4].ID()
report := audit.Report{
Successes: []storj.NodeID{goodNode},
Fails: []storj.NodeID{dqNode},
Unknown: []storj.NodeID{suspendedNode},
PieceAudits: []*audit.ReverificationJob{
{
Locator: audit.PieceLocator{NodeID: pendingNode},
ReverifyCount: 0,
},
},
Offlines: []storj.NodeID{offlineNode},
}
audits.Reporter.RecordAudits(ctx, report)
overlay := satellite.Overlay.Service
node, err := overlay.Get(ctx, goodNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.Nil(t, node.UnknownAuditSuspended)
node, err = overlay.Get(ctx, dqNode)
require.NoError(t, err)
require.NotNil(t, node.Disqualified)
require.Nil(t, node.UnknownAuditSuspended)
node, err = overlay.Get(ctx, suspendedNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.NotNil(t, node.UnknownAuditSuspended)
node, err = overlay.Get(ctx, pendingNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.Nil(t, node.UnknownAuditSuspended)
node, err = overlay.Get(ctx, offlineNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.Nil(t, node.UnknownAuditSuspended)
})
}
func TestSuspensionTimeNotResetBySuccessiveAudit(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
suspendedNode := planet.StorageNodes[0].ID()
audits.Reporter.RecordAudits(ctx, audit.Report{Unknown: []storj.NodeID{suspendedNode}})
overlay := satellite.Overlay.Service
node, err := overlay.Get(ctx, suspendedNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.NotNil(t, node.UnknownAuditSuspended)
suspendedAt := node.UnknownAuditSuspended
audits.Reporter.RecordAudits(ctx, audit.Report{Unknown: []storj.NodeID{suspendedNode}})
node, err = overlay.Get(ctx, suspendedNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.NotNil(t, node.UnknownAuditSuspended)
require.Equal(t, suspendedAt, node.UnknownAuditSuspended)
})
}
// TestGracefullyExitedNotUpdated verifies that a gracefully exited node's reputation, suspension,
// and disqualification flags are not updated when an audit is reported for that node.
func TestGracefullyExitedNotUpdated(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
cache := satellite.Overlay.DB
reputationDB := satellite.DB.Reputation()
successNode := planet.StorageNodes[0]
failedNode := planet.StorageNodes[1]
containedNode := planet.StorageNodes[2]
unknownNode := planet.StorageNodes[3]
offlineNode := planet.StorageNodes[4]
nodeList := []*testplanet.StorageNode{successNode, failedNode, containedNode, unknownNode, offlineNode}
report := audit.Report{
Successes: storj.NodeIDList{successNode.ID(), failedNode.ID(), containedNode.ID(), unknownNode.ID(), offlineNode.ID()},
}
audits.Reporter.RecordAudits(ctx, report)
// mark each node as having gracefully exited
for _, node := range nodeList {
req := &overlay.ExitStatusRequest{
NodeID: node.ID(),
ExitInitiatedAt: time.Now(),
ExitLoopCompletedAt: time.Now(),
ExitFinishedAt: time.Now(),
}
_, err := cache.UpdateExitStatus(ctx, req)
require.NoError(t, err)
}
pending := audit.ReverificationJob{
Locator: audit.PieceLocator{
NodeID: containedNode.ID(),
},
}
report = audit.Report{
Successes: storj.NodeIDList{successNode.ID()},
Fails: storj.NodeIDList{failedNode.ID()},
Offlines: storj.NodeIDList{offlineNode.ID()},
PieceAudits: []*audit.ReverificationJob{&pending},
Unknown: storj.NodeIDList{unknownNode.ID()},
}
audits.Reporter.RecordAudits(ctx, report)
// since every node has gracefully exit, reputation, dq, and suspension should remain at default values
for _, node := range nodeList {
nodeCacheInfo, err := reputationDB.Get(ctx, node.ID())
require.NoError(t, err)
require.Nil(t, nodeCacheInfo.UnknownAuditSuspended)
require.Nil(t, nodeCacheInfo.Disqualified)
}
})
}
func TestReportOfflineAudits(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
node := planet.StorageNodes[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
reputationService := satellite.Core.Reputation.Service
audits.Reporter.RecordAudits(ctx, audit.Report{Offlines: storj.NodeIDList{node.ID()}})
info, err := reputationService.Get(ctx, node.ID())
require.NoError(t, err)
require.Equal(t, int64(1), info.TotalAuditCount)
// check that other reputation stats were not incorrectly updated by offline audit
require.EqualValues(t, 0, info.AuditSuccessCount)
require.EqualValues(t, satellite.Config.Reputation.InitialAlpha, info.AuditReputationAlpha)
require.EqualValues(t, satellite.Config.Reputation.InitialBeta, info.AuditReputationBeta)
require.EqualValues(t, 1, info.UnknownAuditReputationAlpha)
require.EqualValues(t, 0, info.UnknownAuditReputationBeta)
})
}