storj/satellite/audit/reporter_test.go
Moby von Briesen de366537a8 satellite/satellitedb/overlaycache: fix behavior around gracefully exited nodes
Sometimes nodes who have gracefully exited will still be holding pieces
according to the satellite. This has some unintended side effects
currently, such as nodes getting disqualified after having successfully
exited.
* When the audit reporter attempts to update node stats, do not update
stats (alpha, beta, suspension, disqualification) if the node has
finished graceful exit (audit/reporter_test.go TestGracefullyExitedNotUpdated)
* Treat gracefully exited nodes as "not reputable" so that the repairer
and checker do not count them as healthy (overlay/statdb_test.go
TestKnownUnreliableOrOffline, repair/repair_test.go
TestRepairGracefullyExited)

Change-Id: I1920d60dd35de5b2385a9b06989397628a2f1272
2020-04-28 23:58:43 +00:00

245 lines
7.8 KiB
Go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package audit_test
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"storj.io/common/memory"
"storj.io/common/pkcrypto"
"storj.io/common/storj"
"storj.io/common/testcontext"
"storj.io/common/testrand"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite/audit"
"storj.io/storj/satellite/overlay"
)
func TestReportPendingAudits(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
nodeID := planet.StorageNodes[0].ID()
pending := audit.PendingAudit{
NodeID: nodeID,
PieceID: storj.NewPieceID(),
StripeIndex: 1,
ShareSize: 1 * memory.KiB.Int32(),
ExpectedShareHash: pkcrypto.SHA256Hash([]byte("test")),
}
report := audit.Report{PendingAudits: []*audit.PendingAudit{&pending}}
overlay := satellite.Overlay.Service
containment := satellite.DB.Containment()
failed, err := audits.Reporter.RecordAudits(ctx, report, "")
require.NoError(t, err)
assert.Zero(t, failed)
node, err := overlay.Get(ctx, nodeID)
require.NoError(t, err)
assert.True(t, node.Contained)
pa, err := containment.Get(ctx, nodeID)
require.NoError(t, err)
assert.Equal(t, pending, *pa)
})
}
func TestRecordAuditsAtLeastOnce(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
nodeID := planet.StorageNodes[0].ID()
report := audit.Report{Successes: []storj.NodeID{nodeID}}
// expect RecordAudits to try recording at least once (maxRetries is set to 0)
failed, err := audits.Reporter.RecordAudits(ctx, report, "")
require.NoError(t, err)
require.Zero(t, failed)
overlay := satellite.Overlay.Service
node, err := overlay.Get(ctx, nodeID)
require.NoError(t, err)
require.EqualValues(t, 1, node.Reputation.AuditCount)
})
}
// TestRecordAuditsCorrectOutcome ensures that audit successes, failures, and unknown audits result in the correct disqualification/suspension state.
func TestRecordAuditsCorrectOutcome(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
goodNode := planet.StorageNodes[0].ID()
dqNode := planet.StorageNodes[1].ID()
suspendedNode := planet.StorageNodes[2].ID()
pendingNode := planet.StorageNodes[3].ID()
offlineNode := planet.StorageNodes[4].ID()
report := audit.Report{
Successes: []storj.NodeID{goodNode},
Fails: []storj.NodeID{dqNode},
Unknown: []storj.NodeID{suspendedNode},
PendingAudits: []*audit.PendingAudit{
{
NodeID: pendingNode,
PieceID: testrand.PieceID(),
StripeIndex: 0,
ShareSize: 10,
ExpectedShareHash: []byte{},
ReverifyCount: 0,
Path: "",
},
},
Offlines: []storj.NodeID{offlineNode},
}
failed, err := audits.Reporter.RecordAudits(ctx, report, "")
require.NoError(t, err)
require.Zero(t, failed)
overlay := satellite.Overlay.Service
node, err := overlay.Get(ctx, goodNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.Nil(t, node.Suspended)
node, err = overlay.Get(ctx, dqNode)
require.NoError(t, err)
require.NotNil(t, node.Disqualified)
require.Nil(t, node.Suspended)
node, err = overlay.Get(ctx, suspendedNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.NotNil(t, node.Suspended)
node, err = overlay.Get(ctx, pendingNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.Nil(t, node.Suspended)
node, err = overlay.Get(ctx, offlineNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.Nil(t, node.Suspended)
})
}
func TestSuspensionTimeNotResetBySuccessiveAudit(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
suspendedNode := planet.StorageNodes[0].ID()
failed, err := audits.Reporter.RecordAudits(ctx, audit.Report{Unknown: []storj.NodeID{suspendedNode}}, "")
require.NoError(t, err)
require.Zero(t, failed)
overlay := satellite.Overlay.Service
node, err := overlay.Get(ctx, suspendedNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.NotNil(t, node.Suspended)
suspendedAt := node.Suspended
failed, err = audits.Reporter.RecordAudits(ctx, audit.Report{Unknown: []storj.NodeID{suspendedNode}}, "")
require.NoError(t, err)
require.Zero(t, failed)
node, err = overlay.Get(ctx, suspendedNode)
require.NoError(t, err)
require.Nil(t, node.Disqualified)
require.NotNil(t, node.Suspended)
require.Equal(t, suspendedAt, node.Suspended)
})
}
// TestGracefullyExitedNotUpdated verifies that a gracefully exited node's reputation, suspension,
// and disqualification flags are not updated when an audit is reported for that node.
func TestGracefullyExitedNotUpdated(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
cache := satellite.Overlay.DB
successNode := planet.StorageNodes[0]
failedNode := planet.StorageNodes[1]
containedNode := planet.StorageNodes[2]
unknownNode := planet.StorageNodes[3]
offlineNode := planet.StorageNodes[4]
nodeList := []*testplanet.StorageNode{successNode, failedNode, containedNode, unknownNode, offlineNode}
// mark each node as having gracefully exited
for _, node := range nodeList {
req := &overlay.ExitStatusRequest{
NodeID: node.ID(),
ExitInitiatedAt: time.Now(),
ExitLoopCompletedAt: time.Now(),
ExitFinishedAt: time.Now(),
}
_, err := cache.UpdateExitStatus(ctx, req)
require.NoError(t, err)
}
pending := audit.PendingAudit{
NodeID: containedNode.ID(),
PieceID: storj.NewPieceID(),
StripeIndex: 1,
ShareSize: 1 * memory.KiB.Int32(),
ExpectedShareHash: pkcrypto.SHA256Hash([]byte("test")),
}
report := audit.Report{
Successes: storj.NodeIDList{successNode.ID()},
Fails: storj.NodeIDList{failedNode.ID()},
Offlines: storj.NodeIDList{offlineNode.ID()},
PendingAudits: []*audit.PendingAudit{&pending},
Unknown: storj.NodeIDList{unknownNode.ID()},
}
failed, err := audits.Reporter.RecordAudits(ctx, report, "")
require.NoError(t, err)
assert.Zero(t, failed)
// since every node has gracefully exit, reputation, dq, and suspension should remain at default values
for _, node := range nodeList {
nodeCacheInfo, err := cache.Get(ctx, node.ID())
require.NoError(t, err)
require.EqualValues(t, 1, nodeCacheInfo.Reputation.AuditReputationAlpha)
require.EqualValues(t, 0, nodeCacheInfo.Reputation.AuditReputationBeta)
require.EqualValues(t, 1, nodeCacheInfo.Reputation.UnknownAuditReputationAlpha)
require.EqualValues(t, 0, nodeCacheInfo.Reputation.UnknownAuditReputationBeta)
require.Nil(t, nodeCacheInfo.Suspended)
require.Nil(t, nodeCacheInfo.Disqualified)
}
})
}