355ea2133b
When pieces fail an audit with a hard fail (meaning the node acknowledged it did not have the piece, or the piece was corrupted), we now remove those pieces from the segment. Previously we did not, so some node operators saw the same missing piece audited over and over and lost reputation every time. This change covers both verification and reverification audits, and it also applies to pieces found to be bad during repair, if repair-to-reputation reporting is enabled.

Change-Id: I0ca7af7e3fecdc0aebbd34fee4be3a0eab53f4f7
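A minimal sketch of how a hard-failed piece is handed to the reporter so it can be dropped from the segment; the `segment`, `satellite`, and `ctx` values are assumed to come from test setup like the one in TestReportingAuditFailureResultsInRemovalOfPiece below:

	// The segment the failed piece belongs to, plus the failed piece itself.
	report := audit.Report{
		Segment: &segment,
		Fails: metabase.Pieces{
			{Number: segment.Pieces[0].Number, StorageNode: segment.Pieces[0].StorageNode},
		},
	}
	// Recording the report removes the hard-failed piece from the segment,
	// so the same missing piece is not audited again.
	satellite.Audit.Reporter.RecordAudits(ctx, report)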
316 lines
10 KiB
Go
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package audit_test

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"

	"storj.io/common/memory"
	"storj.io/common/storj"
	"storj.io/common/testcontext"
	"storj.io/common/testrand"
	"storj.io/storj/private/testplanet"
	"storj.io/storj/satellite"
	"storj.io/storj/satellite/audit"
	"storj.io/storj/satellite/metabase"
	"storj.io/storj/satellite/overlay"
)

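// TestReportPendingAudits ensures that a pending reverification audit reported for a node
// is stored in the containment DB with its piece locator intact.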
func TestReportPendingAudits(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit
		audits.Worker.Loop.Pause()

		nodeID := planet.StorageNodes[0].ID()

		pending := audit.ReverificationJob{
			Locator: audit.PieceLocator{
				NodeID: nodeID,
			},
		}

		report := audit.Report{PendingAudits: []*audit.ReverificationJob{&pending}}
		containment := satellite.DB.Containment()

		audits.Reporter.RecordAudits(ctx, report)

		pa, err := containment.Get(ctx, nodeID)
		require.NoError(t, err)
		assert.Equal(t, pending.Locator, pa.Locator)
	})
}

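// TestRecordAuditsAtLeastOnce verifies that RecordAudits records a successful audit against
// the node's reputation at least once, even with maxRetries set to 0.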
func TestRecordAuditsAtLeastOnce(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				// disable reputation write cache so changes are immediate
				config.Reputation.FlushInterval = 0
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit
		audits.Worker.Loop.Pause()

		nodeID := planet.StorageNodes[0].ID()

		report := audit.Report{Successes: []storj.NodeID{nodeID}}

		// expect RecordAudits to try recording at least once (maxRetries is set to 0)
		audits.Reporter.RecordAudits(ctx, report)

		service := satellite.Reputation.Service
		node, err := service.Get(ctx, nodeID)
		require.NoError(t, err)
		require.EqualValues(t, 1, node.TotalAuditCount)
	})
}

// TestRecordAuditsCorrectOutcome ensures that audit successes, failures, and unknown audits
// result in the correct disqualification/suspension state.
func TestRecordAuditsCorrectOutcome(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Reputation.InitialAlpha = 1
				config.Reputation.AuditLambda = 0.95
				config.Reputation.AuditDQ = 0.6
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit
		audits.Worker.Loop.Pause()

		goodNode := planet.StorageNodes[0].ID()
		dqNode := planet.StorageNodes[1].ID()
		suspendedNode := planet.StorageNodes[2].ID()
		pendingNode := planet.StorageNodes[3].ID()
		offlineNode := planet.StorageNodes[4].ID()

		report := audit.Report{
			Successes: []storj.NodeID{goodNode},
			Fails:     metabase.Pieces{{StorageNode: dqNode}},
			Unknown:   []storj.NodeID{suspendedNode},
			PendingAudits: []*audit.ReverificationJob{
				{
					Locator:       audit.PieceLocator{NodeID: pendingNode},
					ReverifyCount: 0,
				},
			},
			Offlines: []storj.NodeID{offlineNode},
		}

		audits.Reporter.RecordAudits(ctx, report)

		overlay := satellite.Overlay.Service
		node, err := overlay.Get(ctx, goodNode)
		require.NoError(t, err)
		require.Nil(t, node.Disqualified)
		require.Nil(t, node.UnknownAuditSuspended)

		node, err = overlay.Get(ctx, dqNode)
		require.NoError(t, err)
		require.NotNil(t, node.Disqualified)
		require.Nil(t, node.UnknownAuditSuspended)

		node, err = overlay.Get(ctx, suspendedNode)
		require.NoError(t, err)
		require.Nil(t, node.Disqualified)
		require.NotNil(t, node.UnknownAuditSuspended)

		node, err = overlay.Get(ctx, pendingNode)
		require.NoError(t, err)
		require.Nil(t, node.Disqualified)
		require.Nil(t, node.UnknownAuditSuspended)

		node, err = overlay.Get(ctx, offlineNode)
		require.NoError(t, err)
		require.Nil(t, node.Disqualified)
		require.Nil(t, node.UnknownAuditSuspended)
	})
}

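// TestSuspensionTimeNotResetBySuccessiveAudit checks that a second unknown-audit report does not
// overwrite the timestamp at which a node was originally suspended.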
func TestSuspensionTimeNotResetBySuccessiveAudit(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit
		audits.Worker.Loop.Pause()

		suspendedNode := planet.StorageNodes[0].ID()

		audits.Reporter.RecordAudits(ctx, audit.Report{Unknown: []storj.NodeID{suspendedNode}})

		overlay := satellite.Overlay.Service

		node, err := overlay.Get(ctx, suspendedNode)
		require.NoError(t, err)
		require.Nil(t, node.Disqualified)
		require.NotNil(t, node.UnknownAuditSuspended)

		suspendedAt := node.UnknownAuditSuspended

		audits.Reporter.RecordAudits(ctx, audit.Report{Unknown: []storj.NodeID{suspendedNode}})

		node, err = overlay.Get(ctx, suspendedNode)
		require.NoError(t, err)
		require.Nil(t, node.Disqualified)
		require.NotNil(t, node.UnknownAuditSuspended)
		require.Equal(t, suspendedAt, node.UnknownAuditSuspended)
	})
}

// TestGracefullyExitedNotUpdated verifies that a gracefully exited node's reputation, suspension,
// and disqualification flags are not updated when an audit is reported for that node.
func TestGracefullyExitedNotUpdated(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 0,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit
		audits.Worker.Loop.Pause()
		cache := satellite.Overlay.DB
		reputationDB := satellite.DB.Reputation()

		successNode := planet.StorageNodes[0]
		failedNode := planet.StorageNodes[1]
		containedNode := planet.StorageNodes[2]
		unknownNode := planet.StorageNodes[3]
		offlineNode := planet.StorageNodes[4]
		nodeList := []*testplanet.StorageNode{successNode, failedNode, containedNode, unknownNode, offlineNode}

		report := audit.Report{
			Successes: storj.NodeIDList{successNode.ID(), failedNode.ID(), containedNode.ID(), unknownNode.ID(), offlineNode.ID()},
		}
		audits.Reporter.RecordAudits(ctx, report)

		// mark each node as having gracefully exited
		for _, node := range nodeList {
			req := &overlay.ExitStatusRequest{
				NodeID:              node.ID(),
				ExitInitiatedAt:     time.Now(),
				ExitLoopCompletedAt: time.Now(),
				ExitFinishedAt:      time.Now(),
			}
			_, err := cache.UpdateExitStatus(ctx, req)
			require.NoError(t, err)
		}

		pending := audit.ReverificationJob{
			Locator: audit.PieceLocator{
				NodeID: containedNode.ID(),
			},
		}
		report = audit.Report{
			Successes:     storj.NodeIDList{successNode.ID()},
			Fails:         metabase.Pieces{{StorageNode: failedNode.ID()}},
			Offlines:      storj.NodeIDList{offlineNode.ID()},
			PendingAudits: []*audit.ReverificationJob{&pending},
			Unknown:       storj.NodeIDList{unknownNode.ID()},
		}
		audits.Reporter.RecordAudits(ctx, report)

		// since every node has gracefully exited, reputation, dq, and suspension should remain at default values
		for _, node := range nodeList {
			nodeCacheInfo, err := reputationDB.Get(ctx, node.ID())
			require.NoError(t, err)

			require.Nil(t, nodeCacheInfo.UnknownAuditSuspended)
			require.Nil(t, nodeCacheInfo.Disqualified)
		}
	})
}

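// TestReportOfflineAudits verifies that an offline audit increments the node's total audit count
// without touching its success count or its audit/unknown-audit alpha and beta values.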
func TestReportOfflineAudits(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				// disable reputation write cache so changes are immediate
				config.Reputation.FlushInterval = 0
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		node := planet.StorageNodes[0]
		audits := satellite.Audit
		audits.Worker.Loop.Pause()
		reputationService := satellite.Core.Reputation.Service

		audits.Reporter.RecordAudits(ctx, audit.Report{Offlines: storj.NodeIDList{node.ID()}})

		info, err := reputationService.Get(ctx, node.ID())
		require.NoError(t, err)
		require.Equal(t, int64(1), info.TotalAuditCount)

		// check that other reputation stats were not incorrectly updated by offline audit
		require.EqualValues(t, 0, info.AuditSuccessCount)
		require.EqualValues(t, satellite.Config.Reputation.InitialAlpha, info.AuditReputationAlpha)
		require.EqualValues(t, satellite.Config.Reputation.InitialBeta, info.AuditReputationBeta)
		require.EqualValues(t, 1, info.UnknownAuditReputationAlpha)
		require.EqualValues(t, 0, info.UnknownAuditReputationBeta)
	})
}

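// TestReportingAuditFailureResultsInRemovalOfPiece verifies that a piece reported as a hard audit
// failure is removed from the segment while the segment remains downloadable.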
func TestReportingAuditFailureResultsInRemovalOfPiece(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 6, UplinkCount: 1,
		Reconfigure: testplanet.Reconfigure{
			Satellite: testplanet.Combine(
				func(log *zap.Logger, index int, config *satellite.Config) {
					// disable reputation write cache so changes are immediate
					config.Reputation.FlushInterval = 0
				},
				testplanet.ReconfigureRS(4, 5, 6, 6),
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		ul := planet.Uplinks[0]

		testData := testrand.Bytes(1 * memory.MiB)
		err := ul.Upload(ctx, satellite, "bucket-for-test", "path/of/testness", testData)
		require.NoError(t, err)

		segment, _ := getRemoteSegment(ctx, t, satellite, ul.Projects[0].ID, "bucket-for-test")

		report := audit.Report{
			Segment: &segment,
			Fails: metabase.Pieces{
				metabase.Piece{
					Number:      segment.Pieces[0].Number,
					StorageNode: segment.Pieces[0].StorageNode,
				},
			},
		}

		satellite.Audit.Reporter.RecordAudits(ctx, report)

		// piece marked as failed is no longer in the segment
		afterSegment, _ := getRemoteSegment(ctx, t, satellite, ul.Projects[0].ID, "bucket-for-test")
		require.Len(t, afterSegment.Pieces, len(segment.Pieces)-1)
		for i, p := range afterSegment.Pieces {
			assert.NotEqual(t, segment.Pieces[0].Number, p.Number, i)
			assert.NotEqual(t, segment.Pieces[0].StorageNode, p.StorageNode, i)
		}

		// segment is still retrievable
		gotData, err := ul.Download(ctx, satellite, "bucket-for-test", "path/of/testness")
		require.NoError(t, err)
		require.Equal(t, testData, gotData)
	})
}