storj/satellite/audit/reporter_test.go
paul cannon 355ea2133b satellite/audit: remove pieces when audits fail
When pieces fail an audit (hard fail, meaning the node acknowledged it
did not have the piece or the piece was corrupted), we will now remove
those pieces from the segment.

Previously, we did not do this, and some node operators were seeing the
same missing piece audited over and over again and losing reputation
every time.

This change will include both verification and reverification audits. It
will also apply to pieces found to be bad during repair, if
repair-to-reputation reporting is enabled.

Change-Id: I0ca7af7e3fecdc0aebbd34fee4be3a0eab53f4f7
2023-06-22 14:19:00 +00:00

316 lines
10 KiB
Go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package audit_test
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/zap"
"storj.io/common/memory"
"storj.io/common/storj"
"storj.io/common/testcontext"
"storj.io/common/testrand"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite"
"storj.io/storj/satellite/audit"
"storj.io/storj/satellite/metabase"
"storj.io/storj/satellite/overlay"
)
// TestReportPendingAudits checks that a pending (reverification) audit handed
// to the reporter can afterwards be read back from the containment DB for the
// same node, with an identical piece locator.
func TestReportPendingAudits(t *testing.T) {
	cfg := testplanet.Config{SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0}

	testplanet.Run(t, cfg, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		sat := planet.Satellites[0]
		auditSys := sat.Audit
		// Keep the background audit worker from running during the test.
		auditSys.Worker.Loop.Pause()

		nodeID := planet.StorageNodes[0].ID()
		job := &audit.ReverificationJob{
			Locator: audit.PieceLocator{NodeID: nodeID},
		}
		containmentDB := sat.DB.Containment()

		auditSys.Reporter.RecordAudits(ctx, audit.Report{
			PendingAudits: []*audit.ReverificationJob{job},
		})

		// The pending audit must now be retrievable for the node.
		stored, err := containmentDB.Get(ctx, nodeID)
		require.NoError(t, err)
		assert.Equal(t, job.Locator, stored.Locator)
	})
}
func TestRecordAuditsAtLeastOnce(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
// disable reputation write cache so changes are immediate
config.Reputation.FlushInterval = 0
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
nodeID := planet.StorageNodes[0].ID()
report := audit.Report{Successes: []storj.NodeID{nodeID}}
// expect RecordAudits to try recording at least once (maxRetries is set to 0)
audits.Reporter.RecordAudits(ctx, report)
service := satellite.Reputation.Service
node, err := service.Get(ctx, nodeID)
require.NoError(t, err)
require.EqualValues(t, 1, node.TotalAuditCount)
})
}
// TestRecordAuditsCorrectOutcome ensures that audit successes, failures, and unknown audits result in the correct disqualification/suspension state.
func TestRecordAuditsCorrectOutcome(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 0,
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
config.Reputation.InitialAlpha = 1
config.Reputation.AuditLambda = 0.95
config.Reputation.AuditDQ = 0.6
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
goodNode := planet.StorageNodes[0].ID()
dqNode := planet.StorageNodes[1].ID()
suspendedNode := planet.StorageNodes[2].ID()
pendingNode := planet.StorageNodes[3].ID()
offlineNode := planet.StorageNodes[4].ID()
report := audit.Report{
Successes: []storj.NodeID{goodNode},
Fails: metabase.Pieces{{StorageNode: dqNode}},
Unknown: []storj.NodeID{suspendedNode},
PendingAudits: []*audit.ReverificationJob{
{
Locator: audit.PieceLocator{NodeID: pendingNode},
ReverifyCount: 0,
},
},
Offlines: []storj.NodeID{offlineNode},
}
audits.Reporter.RecordAudits(ctx, report)
overlay := satellite.Overlay.Service
node, err := overlay.Get(ctx, goodNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.Nil(t, node.UnknownAuditSuspended)
node, err = overlay.Get(ctx, dqNode)
require.NoError(t, err)
require.NotNil(t, node.Disqualified)
require.Nil(t, node.UnknownAuditSuspended)
node, err = overlay.Get(ctx, suspendedNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.NotNil(t, node.UnknownAuditSuspended)
node, err = overlay.Get(ctx, pendingNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.Nil(t, node.UnknownAuditSuspended)
node, err = overlay.Get(ctx, offlineNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.Nil(t, node.UnknownAuditSuspended)
})
}
// TestSuspensionTimeNotResetBySuccessiveAudit reports an unknown audit for the
// same node twice and checks that the suspension timestamp recorded after the
// first report is unchanged after the second.
func TestSuspensionTimeNotResetBySuccessiveAudit(t *testing.T) {
	cfg := testplanet.Config{SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0}

	testplanet.Run(t, cfg, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		sat := planet.Satellites[0]
		auditSys := sat.Audit
		auditSys.Worker.Loop.Pause()

		nodeID := planet.StorageNodes[0].ID()

		// reportUnknown records a fresh report with one unknown audit for the node.
		reportUnknown := func() {
			auditSys.Reporter.RecordAudits(ctx, audit.Report{Unknown: []storj.NodeID{nodeID}})
		}

		reportUnknown()

		overlaySvc := sat.Overlay.Service

		first, err := overlaySvc.Get(ctx, nodeID)
		require.NoError(t, err)
		require.Nil(t, first.Disqualified)
		require.NotNil(t, first.UnknownAuditSuspended)

		// Remember when the node was first suspended.
		initialSuspension := first.UnknownAuditSuspended

		reportUnknown()

		second, err := overlaySvc.Get(ctx, nodeID)
		require.NoError(t, err)
		require.Nil(t, second.Disqualified)
		require.NotNil(t, second.UnknownAuditSuspended)
		// A repeated unknown audit must not move the suspension time.
		require.Equal(t, initialSuspension, second.UnknownAuditSuspended)
	})
}
// TestGracefullyExitedNotUpdated verifies that a gracefully exited node's
// suspension and disqualification flags are not updated when an audit is
// reported for that node.
//
// All nodes first receive a successful audit, are then marked as exited, and
// finally receive one audit outcome each (success, fail, offline, pending,
// unknown). None of them may end up suspended or disqualified.
func TestGracefullyExitedNotUpdated(t *testing.T) {
	cfg := testplanet.Config{SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 0}

	testplanet.Run(t, cfg, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		sat := planet.Satellites[0]
		auditSys := sat.Audit
		auditSys.Worker.Loop.Pause()

		overlayDB := sat.Overlay.DB
		reputations := sat.DB.Reputation()

		var (
			successNode   = planet.StorageNodes[0]
			failedNode    = planet.StorageNodes[1]
			containedNode = planet.StorageNodes[2]
			unknownNode   = planet.StorageNodes[3]
			offlineNode   = planet.StorageNodes[4]
		)
		allNodes := []*testplanet.StorageNode{successNode, failedNode, containedNode, unknownNode, offlineNode}

		// Initial report: every node is recorded as a successful audit.
		allIDs := make(storj.NodeIDList, 0, len(allNodes))
		for _, n := range allNodes {
			allIDs = append(allIDs, n.ID())
		}
		auditSys.Reporter.RecordAudits(ctx, audit.Report{Successes: allIDs})

		// Mark each node as having gracefully exited.
		for _, n := range allNodes {
			_, err := overlayDB.UpdateExitStatus(ctx, &overlay.ExitStatusRequest{
				NodeID:              n.ID(),
				ExitInitiatedAt:     time.Now(),
				ExitLoopCompletedAt: time.Now(),
				ExitFinishedAt:      time.Now(),
			})
			require.NoError(t, err)
		}

		// Second report: one node per audit outcome.
		containedJob := &audit.ReverificationJob{
			Locator: audit.PieceLocator{NodeID: containedNode.ID()},
		}
		auditSys.Reporter.RecordAudits(ctx, audit.Report{
			Successes:     storj.NodeIDList{successNode.ID()},
			Fails:         metabase.Pieces{{StorageNode: failedNode.ID()}},
			Offlines:      storj.NodeIDList{offlineNode.ID()},
			PendingAudits: []*audit.ReverificationJob{containedJob},
			Unknown:       storj.NodeIDList{unknownNode.ID()},
		})

		// Since every node has gracefully exited, the disqualification and
		// suspension flags should remain at their default values.
		for _, n := range allNodes {
			info, err := reputations.Get(ctx, n.ID())
			require.NoError(t, err)
			require.Nil(t, info.UnknownAuditSuspended)
			require.Nil(t, info.Disqualified)
		}
	})
}
func TestReportOfflineAudits(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
// disable reputation write cache so changes are immediate
config.Reputation.FlushInterval = 0
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
node := planet.StorageNodes[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
reputationService := satellite.Core.Reputation.Service
audits.Reporter.RecordAudits(ctx, audit.Report{Offlines: storj.NodeIDList{node.ID()}})
info, err := reputationService.Get(ctx, node.ID())
require.NoError(t, err)
require.Equal(t, int64(1), info.TotalAuditCount)
// check that other reputation stats were not incorrectly updated by offline audit
require.EqualValues(t, 0, info.AuditSuccessCount)
require.EqualValues(t, satellite.Config.Reputation.InitialAlpha, info.AuditReputationAlpha)
require.EqualValues(t, satellite.Config.Reputation.InitialBeta, info.AuditReputationBeta)
require.EqualValues(t, 1, info.UnknownAuditReputationAlpha)
require.EqualValues(t, 0, info.UnknownAuditReputationBeta)
})
}
// TestReportingAuditFailureResultsInRemovalOfPiece uploads an object, reports
// the first piece of its remote segment as a failed audit, and verifies that
// the piece is no longer part of the segment while the object can still be
// downloaded intact.
func TestReportingAuditFailureResultsInRemovalOfPiece(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 6, UplinkCount: 1,
		Reconfigure: testplanet.Reconfigure{
			Satellite: testplanet.Combine(
				func(log *zap.Logger, index int, config *satellite.Config) {
					// disable reputation write cache so changes are immediate
					config.Reputation.FlushInterval = 0
				},
				// NOTE(review): presumably min/repair/success/total
				// thresholds, leaving room to lose a piece — confirm.
				testplanet.ReconfigureRS(4, 5, 6, 6),
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		// NOTE(review): unlike the other tests in this file, the audit worker
		// loop is not paused here — confirm that is intentional.
		satellite := planet.Satellites[0]
		ul := planet.Uplinks[0]

		testData := testrand.Bytes(1 * memory.MiB)
		err := ul.Upload(ctx, satellite, "bucket-for-test", "path/of/testness", testData)
		require.NoError(t, err)

		segment, _ := getRemoteSegment(ctx, t, satellite, ul.Projects[0].ID, "bucket-for-test")
		// Fail with a clear assertion instead of an index-out-of-range panic
		// if the uploaded segment unexpectedly carries no pieces.
		require.NotEmpty(t, segment.Pieces)

		report := audit.Report{
			Segment: &segment,
			Fails: metabase.Pieces{
				metabase.Piece{
					Number:      segment.Pieces[0].Number,
					StorageNode: segment.Pieces[0].StorageNode,
				},
			},
		}

		satellite.Audit.Reporter.RecordAudits(ctx, report)

		// piece marked as failed is no longer in the segment
		afterSegment, _ := getRemoteSegment(ctx, t, satellite, ul.Projects[0].ID, "bucket-for-test")
		require.Len(t, afterSegment.Pieces, len(segment.Pieces)-1)
		for i, p := range afterSegment.Pieces {
			assert.NotEqual(t, segment.Pieces[0].Number, p.Number, i)
			assert.NotEqual(t, segment.Pieces[0].StorageNode, p.StorageNode, i)
		}

		// segment is still retrievable
		gotData, err := ul.Download(ctx, satellite, "bucket-for-test", "path/of/testness")
		require.NoError(t, err)
		require.Equal(t, testData, gotData)
	})
}