satellite/satellitedb/overlaycache: fix behavior around gracefully exited nodes
Sometimes nodes that have gracefully exited are still holding pieces according to the satellite. This currently has unintended side effects, such as nodes being disqualified after having successfully exited.

* When the audit reporter attempts to update node stats, do not update stats (alpha, beta, suspension, disqualification) if the node has finished graceful exit (audit/reporter_test.go TestGracefullyExitedNotUpdated)

* Treat gracefully exited nodes as "not reputable" so that the repairer and checker do not count them as healthy (overlay/statdb_test.go TestKnownUnreliableOrOffline, repair/repair_test.go TestRepairGracefullyExited)

Change-Id: I1920d60dd35de5b2385a9b06989397628a2f1272
parent baccfd36b1
commit de366537a8
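For reference, the change boils down to two guards: the overlay cache's stat-update paths skip any node whose exit_finished_at is set, and the reliability queries add an `AND exit_finished_at IS NULL` filter. The sketch below illustrates the skip logic only; nodeRecord and shouldUpdateReputation are hypothetical, simplified stand-ins for illustration, not the dbx-generated types that overlaycache.go actually uses.

package main

import (
	"fmt"
	"time"
)

// nodeRecord is a hypothetical, simplified stand-in for an overlay cache node
// row; only the fields relevant to this change are included.
type nodeRecord struct {
	ID             string
	Disqualified   *time.Time
	ExitFinishedAt *time.Time
}

// shouldUpdateReputation mirrors the guard added in this change: a node that
// is already disqualified, or that has finished graceful exit, is skipped
// when audit results are recorded.
func shouldUpdateReputation(n nodeRecord) bool {
	if n.Disqualified != nil {
		return false
	}
	// do not update reputation if node has gracefully exited
	if n.ExitFinishedAt != nil {
		return false
	}
	return true
}

func main() {
	now := time.Now()
	exited := nodeRecord{ID: "node-5", ExitFinishedAt: &now}
	active := nodeRecord{ID: "node-1"}
	fmt.Println(shouldUpdateReputation(exited)) // false: stats stay at default values
	fmt.Println(shouldUpdateReputation(active)) // true: audit outcome is applied
}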
@@ -5,6 +5,7 @@ package audit_test

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
@@ -16,6 +17,7 @@ import (
	"storj.io/common/testrand"
	"storj.io/storj/private/testplanet"
	"storj.io/storj/satellite/audit"
	"storj.io/storj/satellite/overlay"
)

func TestReportPendingAudits(t *testing.T) {
@@ -177,3 +179,66 @@ func TestSuspensionTimeNotResetBySuccessiveAudit(t *testing.T) {
		require.Equal(t, suspendedAt, node.Suspended)
	})
}

// TestGracefullyExitedNotUpdated verifies that a gracefully exited node's reputation, suspension,
// and disqualification flags are not updated when an audit is reported for that node.
func TestGracefullyExitedNotUpdated(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 0,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit
		audits.Worker.Loop.Pause()
		cache := satellite.Overlay.DB

		successNode := planet.StorageNodes[0]
		failedNode := planet.StorageNodes[1]
		containedNode := planet.StorageNodes[2]
		unknownNode := planet.StorageNodes[3]
		offlineNode := planet.StorageNodes[4]
		nodeList := []*testplanet.StorageNode{successNode, failedNode, containedNode, unknownNode, offlineNode}

		// mark each node as having gracefully exited
		for _, node := range nodeList {
			req := &overlay.ExitStatusRequest{
				NodeID:              node.ID(),
				ExitInitiatedAt:     time.Now(),
				ExitLoopCompletedAt: time.Now(),
				ExitFinishedAt:      time.Now(),
			}
			_, err := cache.UpdateExitStatus(ctx, req)
			require.NoError(t, err)
		}

		pending := audit.PendingAudit{
			NodeID:            containedNode.ID(),
			PieceID:           storj.NewPieceID(),
			StripeIndex:       1,
			ShareSize:         1 * memory.KiB.Int32(),
			ExpectedShareHash: pkcrypto.SHA256Hash([]byte("test")),
		}
		report := audit.Report{
			Successes:     storj.NodeIDList{successNode.ID()},
			Fails:         storj.NodeIDList{failedNode.ID()},
			Offlines:      storj.NodeIDList{offlineNode.ID()},
			PendingAudits: []*audit.PendingAudit{&pending},
			Unknown:       storj.NodeIDList{unknownNode.ID()},
		}
		failed, err := audits.Reporter.RecordAudits(ctx, report, "")
		require.NoError(t, err)
		assert.Zero(t, failed)

		// since every node has gracefully exited, reputation, dq, and suspension should remain at default values
		for _, node := range nodeList {
			nodeCacheInfo, err := cache.Get(ctx, node.ID())
			require.NoError(t, err)

			require.EqualValues(t, 1, nodeCacheInfo.Reputation.AuditReputationAlpha)
			require.EqualValues(t, 0, nodeCacheInfo.Reputation.AuditReputationBeta)
			require.EqualValues(t, 1, nodeCacheInfo.Reputation.UnknownAuditReputationAlpha)
			require.EqualValues(t, 0, nodeCacheInfo.Reputation.UnknownAuditReputationBeta)
			require.Nil(t, nodeCacheInfo.Suspended)
			require.Nil(t, nodeCacheInfo.Disqualified)
		}
	})
}
@@ -31,15 +31,17 @@ func TestStatDB(t *testing.T) {
func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
	{ // TestKnownUnreliableOrOffline
		for _, tt := range []struct {
			nodeID       storj.NodeID
			suspended    bool
			disqualified bool
			offline      bool
			nodeID           storj.NodeID
			suspended        bool
			disqualified     bool
			offline          bool
			gracefullyexited bool
		}{
			{storj.NodeID{1}, false, false, false}, // good
			{storj.NodeID{2}, false, true, false},  // disqualified
			{storj.NodeID{3}, true, false, false},  // suspended
			{storj.NodeID{4}, false, false, true},  // offline
			{storj.NodeID{1}, false, false, false, false}, // good
			{storj.NodeID{2}, false, true, false, false},  // disqualified
			{storj.NodeID{3}, true, false, false, false},  // suspended
			{storj.NodeID{4}, false, false, true, false},  // offline
			{storj.NodeID{5}, false, false, false, true},  // gracefully exited
		} {
			startingRep := overlay.NodeSelectionConfig{}
			n := pb.Node{Id: tt.nodeID}
@@ -61,12 +63,22 @@ func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
				err = cache.UpdateCheckIn(ctx, checkInInfo, time.Now().Add(-2*time.Hour), overlay.NodeSelectionConfig{})
				require.NoError(t, err)
			}
			if tt.gracefullyexited {
				req := &overlay.ExitStatusRequest{
					NodeID:              tt.nodeID,
					ExitInitiatedAt:     time.Now(),
					ExitLoopCompletedAt: time.Now(),
					ExitFinishedAt:      time.Now(),
				}
				_, err := cache.UpdateExitStatus(ctx, req)
				require.NoError(t, err)
			}
		}

		nodeIds := storj.NodeIDList{
			storj.NodeID{1}, storj.NodeID{2},
			storj.NodeID{3}, storj.NodeID{4},
			storj.NodeID{5},
			storj.NodeID{5}, storj.NodeID{6},
		}
		criteria := &overlay.NodeCriteria{OnlineWindow: time.Hour}

@@ -76,8 +88,9 @@ func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
		require.Contains(t, invalid, storj.NodeID{2}) // disqualified
		require.Contains(t, invalid, storj.NodeID{3}) // suspended
		require.Contains(t, invalid, storj.NodeID{4}) // offline
		require.Contains(t, invalid, storj.NodeID{5}) // not in db
		require.Len(t, invalid, 4)
		require.Contains(t, invalid, storj.NodeID{5}) // gracefully exited
		require.Contains(t, invalid, storj.NodeID{6}) // not in db
		require.Len(t, invalid, 5)
	}

	{ // TestUpdateOperator
@@ -1238,6 +1238,124 @@ func testDataRepairUploadLimit(t *testing.T, inMemoryRepair bool) {
	})
}

// TestRepairGracefullyExited does the following:
// - Uploads test data to 7 nodes
// - Sets 3 nodes as gracefully exited
// - Triggers data repair, which repairs the data from the remaining 4 nodes to 3 additional new nodes
// - Shuts down the 4 nodes from which the data was repaired
// - Now we have just the 3 new nodes to which the data was repaired
// - Downloads the data from these 3 nodes (succeeds because 3 nodes are enough for download)
// - Expects the newly repaired pointer not to contain the gracefully exited nodes
func TestRepairGracefullyExitedInMemory(t *testing.T) {
	testRepairGracefullyExited(t, true)
}
func TestRepairGracefullyExitedToDisk(t *testing.T) {
	testRepairGracefullyExited(t, false)
}

func testRepairGracefullyExited(t *testing.T, inMemoryRepair bool) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 12,
		UplinkCount:      1,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Repairer.InMemoryRepair = inMemoryRepair

				config.Metainfo.RS.MinThreshold = 3
				config.Metainfo.RS.RepairThreshold = 5
				config.Metainfo.RS.SuccessThreshold = 7
				config.Metainfo.RS.TotalThreshold = 7
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		// first, upload some remote data
		uplinkPeer := planet.Uplinks[0]
		satellite := planet.Satellites[0]

		satellite.Repair.Checker.Loop.Pause()
		satellite.Repair.Repairer.Loop.Pause()

		testData := testrand.Bytes(8 * memory.KiB)

		err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		// get a remote segment from metainfo
		metainfo := satellite.Metainfo.Service
		listResponse, _, err := metainfo.List(ctx, "", "", true, 0, 0)
		require.NoError(t, err)

		var path string
		var pointer *pb.Pointer
		for _, v := range listResponse {
			path = v.GetPath()
			pointer, err = metainfo.Get(ctx, path)
			require.NoError(t, err)
			if pointer.GetType() == pb.Pointer_REMOTE {
				break
			}
		}

		numStorageNodes := len(planet.StorageNodes)
		remotePieces := pointer.GetRemote().GetRemotePieces()
		numPieces := len(remotePieces)
		// sanity check
		require.EqualValues(t, numPieces, 7)
		toExit := 3
		// we should have enough storage nodes to repair on
		require.True(t, (numStorageNodes-toExit) >= numPieces)

		// gracefully exit nodes and track lost pieces
		nodesToExit := make(map[storj.NodeID]bool)
		nodesToKeepAlive := make(map[storj.NodeID]bool)

		// exit nodes
		for i := 0; i < toExit; i++ {
			nodesToExit[remotePieces[i].NodeId] = true
			req := &overlay.ExitStatusRequest{
				NodeID:              remotePieces[i].NodeId,
				ExitInitiatedAt:     time.Now(),
				ExitLoopCompletedAt: time.Now(),
				ExitFinishedAt:      time.Now(),
			}
			_, err := satellite.DB.OverlayCache().UpdateExitStatus(ctx, req)
			require.NoError(t, err)
		}
		for i := toExit; i < len(remotePieces); i++ {
			nodesToKeepAlive[remotePieces[i].NodeId] = true
		}

		err = satellite.Repair.Checker.RefreshReliabilityCache(ctx)
		require.NoError(t, err)

		satellite.Repair.Checker.Loop.TriggerWait()
		satellite.Repair.Repairer.Loop.TriggerWait()
		satellite.Repair.Repairer.WaitForPendingRepairs()

		// kill nodes kept alive to ensure repair worked
		for _, node := range planet.StorageNodes {
			if nodesToKeepAlive[node.ID()] {
				stopNodeByID(t, ctx, planet, node.ID())
			}
		}

		// we should be able to download data without any of the original nodes
		newData, err := uplinkPeer.Download(ctx, satellite, "testbucket", "test/path")
		require.NoError(t, err)
		require.Equal(t, newData, testData)

		// updated pointer should not contain any of the gracefully exited nodes
		pointer, err = metainfo.Get(ctx, path)
		require.NoError(t, err)

		remotePieces = pointer.GetRemote().GetRemotePieces()
		for _, piece := range remotePieces {
			require.False(t, nodesToExit[piece.NodeId])
		}
	})
}

// getRemoteSegment returns a remote pointer and its path from the satellite.
// nolint:golint
func getRemoteSegment(
@@ -229,6 +229,7 @@ func (cache *overlaycache) KnownUnreliableOrOffline(ctx context.Context, criteri
		WHERE id = any($1::bytea[])
			AND disqualified IS NULL
			AND suspended IS NULL
			AND exit_finished_at IS NULL
			AND last_contact_success > $2
	`), postgresNodeIDList(nodeIds), time.Now().Add(-criteria.OnlineWindow),
	)
@@ -269,6 +270,7 @@ func (cache *overlaycache) KnownReliable(ctx context.Context, onlineWindow time.
		WHERE id = any($1::bytea[])
			AND disqualified IS NULL
			AND suspended IS NULL
			AND exit_finished_at IS NULL
			AND last_contact_success > $2
	`), postgresNodeIDList(nodeIDs), time.Now().Add(-onlineWindow),
	)
@@ -299,6 +301,7 @@ func (cache *overlaycache) Reliable(ctx context.Context, criteria *overlay.NodeC
		SELECT id FROM nodes
		WHERE disqualified IS NULL
			AND suspended IS NULL
			AND exit_finished_at IS NULL
			AND last_contact_success > ?
	`), time.Now().Add(-criteria.OnlineWindow))
	if err != nil {
@@ -407,6 +410,10 @@ func (cache *overlaycache) BatchUpdateStats(ctx context.Context, updateRequests
			if dbNode.Disqualified != nil {
				continue
			}
			// do not update reputation if node has gracefully exited
			if dbNode.ExitFinishedAt != nil {
				continue
			}

			updateNodeStats := cache.populateUpdateNodeStats(dbNode, updateReq)

@@ -471,6 +478,10 @@ func (cache *overlaycache) UpdateStats(ctx context.Context, updateReq *overlay.U
		if dbNode.Disqualified != nil {
			return nil
		}
		// do not update reputation if node has gracefully exited
		if dbNode.ExitFinishedAt != nil {
			return nil
		}

		updateFields := cache.populateUpdateFields(dbNode, updateReq)

@@ -549,6 +560,10 @@ func (cache *overlaycache) UpdateUptime(ctx context.Context, nodeID storj.NodeID
		if dbNode.Disqualified != nil {
			return nil
		}
		// do not update reputation if node has gracefully exited
		if dbNode.ExitFinishedAt != nil {
			return nil
		}

		updateFields := dbx.Node_Update_Fields{}
		totalUptimeCount := dbNode.TotalUptimeCount