satellite/satellitedb/overlaycache: fix behavior around gracefully exited nodes

Sometimes nodes that have gracefully exited are still holding pieces
according to the satellite. This currently has unintended side effects,
such as nodes getting disqualified after having successfully exited.
This change addresses that in two ways (a brief sketch follows the list):
* When the audit reporter attempts to update node stats, do not update
stats (alpha, beta, suspension, disqualification) if the node has
finished graceful exit (audit/reporter_test.go TestGracefullyExitedNotUpdated)
* Treat gracefully exited nodes as "not reputable" so that the repairer
and checker do not count them as healthy (overlay/statdb_test.go
TestKnownUnreliableOrOffline, repair/repair_test.go
TestRepairGracefullyExited)
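
For orientation, here is a minimal sketch of both pieces. The names
shouldUpdateReputation and reliableNodesQuery are invented for this sketch
and do not exist in the codebase; the real guards and query predicates are
added to the *overlaycache methods shown in the diff below.

package overlaysketch

import "time"

// shouldUpdateReputation mirrors the guard that BatchUpdateStats, UpdateStats,
// and UpdateUptime gain: a node that is disqualified or has finished graceful
// exit keeps its alpha/beta, suspension, and disqualification values untouched.
func shouldUpdateReputation(disqualified, exitFinishedAt *time.Time) bool {
	if disqualified != nil {
		return false // pre-existing behavior: never update a disqualified node
	}
	if exitFinishedAt != nil {
		return false // new: skip reputation updates for gracefully exited nodes
	}
	return true
}

// reliableNodesQuery sketches the extra predicate that KnownUnreliableOrOffline,
// KnownReliable, and Reliable gain, so the checker and repairer stop counting
// gracefully exited nodes as healthy.
const reliableNodesQuery = `
	SELECT id FROM nodes
	WHERE disqualified IS NULL
	  AND suspended IS NULL
	  AND exit_finished_at IS NULL -- added by this change
	  AND last_contact_success > $1
`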

Change-Id: I1920d60dd35de5b2385a9b06989397628a2f1272
Moby von Briesen 2020-04-23 15:46:16 -04:00 committed by Maximillian von Briesen
parent baccfd36b1
commit de366537a8
4 changed files with 222 additions and 11 deletions


@@ -5,6 +5,7 @@ package audit_test
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -16,6 +17,7 @@ import (
"storj.io/common/testrand"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite/audit"
"storj.io/storj/satellite/overlay"
)
func TestReportPendingAudits(t *testing.T) {
@@ -177,3 +179,66 @@ func TestSuspensionTimeNotResetBySuccessiveAudit(t *testing.T) {
require.Equal(t, suspendedAt, node.Suspended)
})
}
// TestGracefullyExitedNotUpdated verifies that a gracefully exited node's reputation, suspension,
// and disqualification flags are not updated when an audit is reported for that node.
func TestGracefullyExitedNotUpdated(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
cache := satellite.Overlay.DB
successNode := planet.StorageNodes[0]
failedNode := planet.StorageNodes[1]
containedNode := planet.StorageNodes[2]
unknownNode := planet.StorageNodes[3]
offlineNode := planet.StorageNodes[4]
nodeList := []*testplanet.StorageNode{successNode, failedNode, containedNode, unknownNode, offlineNode}
// mark each node as having gracefully exited
for _, node := range nodeList {
req := &overlay.ExitStatusRequest{
NodeID: node.ID(),
ExitInitiatedAt: time.Now(),
ExitLoopCompletedAt: time.Now(),
ExitFinishedAt: time.Now(),
}
_, err := cache.UpdateExitStatus(ctx, req)
require.NoError(t, err)
}
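// create a pending (contained) audit for one node and build a report covering all five nodes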
pending := audit.PendingAudit{
NodeID: containedNode.ID(),
PieceID: storj.NewPieceID(),
StripeIndex: 1,
ShareSize: 1 * memory.KiB.Int32(),
ExpectedShareHash: pkcrypto.SHA256Hash([]byte("test")),
}
report := audit.Report{
Successes: storj.NodeIDList{successNode.ID()},
Fails: storj.NodeIDList{failedNode.ID()},
Offlines: storj.NodeIDList{offlineNode.ID()},
PendingAudits: []*audit.PendingAudit{&pending},
Unknown: storj.NodeIDList{unknownNode.ID()},
}
failed, err := audits.Reporter.RecordAudits(ctx, report, "")
require.NoError(t, err)
assert.Zero(t, failed)
// since every node has gracefully exited, reputation, disqualification, and suspension should remain at default values
for _, node := range nodeList {
nodeCacheInfo, err := cache.Get(ctx, node.ID())
require.NoError(t, err)
require.EqualValues(t, 1, nodeCacheInfo.Reputation.AuditReputationAlpha)
require.EqualValues(t, 0, nodeCacheInfo.Reputation.AuditReputationBeta)
require.EqualValues(t, 1, nodeCacheInfo.Reputation.UnknownAuditReputationAlpha)
require.EqualValues(t, 0, nodeCacheInfo.Reputation.UnknownAuditReputationBeta)
require.Nil(t, nodeCacheInfo.Suspended)
require.Nil(t, nodeCacheInfo.Disqualified)
}
})
}


@@ -31,15 +31,17 @@ func TestStatDB(t *testing.T) {
func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
{ // TestKnownUnreliableOrOffline
for _, tt := range []struct {
nodeID storj.NodeID
suspended bool
disqualified bool
offline bool
nodeID storj.NodeID
suspended bool
disqualified bool
offline bool
gracefullyexited bool
}{
{storj.NodeID{1}, false, false, false}, // good
{storj.NodeID{2}, false, true, false}, // disqualified
{storj.NodeID{3}, true, false, false}, // suspended
{storj.NodeID{4}, false, false, true}, // offline
{storj.NodeID{1}, false, false, false, false}, // good
{storj.NodeID{2}, false, true, false, false}, // disqualified
{storj.NodeID{3}, true, false, false, false}, // suspended
{storj.NodeID{4}, false, false, true, false}, // offline
{storj.NodeID{5}, false, false, false, true}, // gracefully exited
} {
startingRep := overlay.NodeSelectionConfig{}
n := pb.Node{Id: tt.nodeID}
@@ -61,12 +63,22 @@ func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
err = cache.UpdateCheckIn(ctx, checkInInfo, time.Now().Add(-2*time.Hour), overlay.NodeSelectionConfig{})
require.NoError(t, err)
}
if tt.gracefullyexited {
req := &overlay.ExitStatusRequest{
NodeID: tt.nodeID,
ExitInitiatedAt: time.Now(),
ExitLoopCompletedAt: time.Now(),
ExitFinishedAt: time.Now(),
}
_, err := cache.UpdateExitStatus(ctx, req)
require.NoError(t, err)
}
}
nodeIds := storj.NodeIDList{
storj.NodeID{1}, storj.NodeID{2},
storj.NodeID{3}, storj.NodeID{4},
storj.NodeID{5},
storj.NodeID{5}, storj.NodeID{6},
}
criteria := &overlay.NodeCriteria{OnlineWindow: time.Hour}
@@ -76,8 +88,9 @@ func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
require.Contains(t, invalid, storj.NodeID{2}) // disqualified
require.Contains(t, invalid, storj.NodeID{3}) // suspended
require.Contains(t, invalid, storj.NodeID{4}) // offline
require.Contains(t, invalid, storj.NodeID{5}) // not in db
require.Len(t, invalid, 4)
require.Contains(t, invalid, storj.NodeID{5}) // gracefully exited
require.Contains(t, invalid, storj.NodeID{6}) // not in db
require.Len(t, invalid, 5)
}
{ // TestUpdateOperator


@@ -1238,6 +1238,124 @@ func testDataRepairUploadLimit(t *testing.T, inMemoryRepair bool) {
})
}
// TestRepairGracefullyExited does the following:
// - Uploads test data to 7 nodes
// - Sets 3 of those nodes as gracefully exited
// - Triggers data repair, which repairs the data from the remaining 4 nodes to 3 additional new nodes
// - Shuts down the 4 nodes from which the data was repaired
// - Now only the 3 new nodes to which the data was repaired remain
// - Downloads the data from these 3 nodes (succeeds because 3 nodes are enough for download)
// - Expects the newly repaired pointer not to contain the gracefully exited nodes
func TestRepairGracefullyExitedInMemory(t *testing.T) {
testRepairGracefullyExited(t, true)
}
func TestRepairGracefullyExitedToDisk(t *testing.T) {
testRepairGracefullyExited(t, false)
}
func testRepairGracefullyExited(t *testing.T, inMemoryRepair bool) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1,
StorageNodeCount: 12,
UplinkCount: 1,
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.InMemoryRepair = inMemoryRepair
config.Metainfo.RS.MinThreshold = 3
config.Metainfo.RS.RepairThreshold = 5
config.Metainfo.RS.SuccessThreshold = 7
config.Metainfo.RS.TotalThreshold = 7
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
// first, upload some remote data
uplinkPeer := planet.Uplinks[0]
satellite := planet.Satellites[0]
satellite.Repair.Checker.Loop.Pause()
satellite.Repair.Repairer.Loop.Pause()
testData := testrand.Bytes(8 * memory.KiB)
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
require.NoError(t, err)
// get a remote segment from metainfo
metainfo := satellite.Metainfo.Service
listResponse, _, err := metainfo.List(ctx, "", "", true, 0, 0)
require.NoError(t, err)
var path string
var pointer *pb.Pointer
for _, v := range listResponse {
path = v.GetPath()
pointer, err = metainfo.Get(ctx, path)
require.NoError(t, err)
if pointer.GetType() == pb.Pointer_REMOTE {
break
}
}
numStorageNodes := len(planet.StorageNodes)
remotePieces := pointer.GetRemote().GetRemotePieces()
numPieces := len(remotePieces)
// sanity check
require.EqualValues(t, numPieces, 7)
toExit := 3
// we should have enough storage nodes to repair on
require.True(t, (numStorageNodes-toExit) >= numPieces)
// gracefully exit nodes and track lost pieces
nodesToExit := make(map[storj.NodeID]bool)
nodesToKeepAlive := make(map[storj.NodeID]bool)
// exit nodes
for i := 0; i < toExit; i++ {
nodesToExit[remotePieces[i].NodeId] = true
req := &overlay.ExitStatusRequest{
NodeID: remotePieces[i].NodeId,
ExitInitiatedAt: time.Now(),
ExitLoopCompletedAt: time.Now(),
ExitFinishedAt: time.Now(),
}
_, err := satellite.DB.OverlayCache().UpdateExitStatus(ctx, req)
require.NoError(t, err)
}
for i := toExit; i < len(remotePieces); i++ {
nodesToKeepAlive[remotePieces[i].NodeId] = true
}
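// trigger the checker and repairer; refresh the reliability cache first so the checker sees the new exit statuses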
err = satellite.Repair.Checker.RefreshReliabilityCache(ctx)
require.NoError(t, err)
satellite.Repair.Checker.Loop.TriggerWait()
satellite.Repair.Repairer.Loop.TriggerWait()
satellite.Repair.Repairer.WaitForPendingRepairs()
// kill nodes kept alive to ensure repair worked
for _, node := range planet.StorageNodes {
if nodesToKeepAlive[node.ID()] {
stopNodeByID(t, ctx, planet, node.ID())
}
}
// we should be able to download data without any of the original nodes
newData, err := uplinkPeer.Download(ctx, satellite, "testbucket", "test/path")
require.NoError(t, err)
require.Equal(t, newData, testData)
// updated pointer should not contain any of the gracefully exited nodes
pointer, err = metainfo.Get(ctx, path)
require.NoError(t, err)
remotePieces = pointer.GetRemote().GetRemotePieces()
for _, piece := range remotePieces {
require.False(t, nodesToExit[piece.NodeId])
}
})
}
// getRemoteSegment returns a remote pointer and its path from the satellite.
// nolint:golint
func getRemoteSegment(


@@ -229,6 +229,7 @@ func (cache *overlaycache) KnownUnreliableOrOffline(ctx context.Context, criteri
WHERE id = any($1::bytea[])
AND disqualified IS NULL
AND suspended IS NULL
AND exit_finished_at IS NULL
AND last_contact_success > $2
`), postgresNodeIDList(nodeIds), time.Now().Add(-criteria.OnlineWindow),
)
@@ -269,6 +270,7 @@ func (cache *overlaycache) KnownReliable(ctx context.Context, onlineWindow time.
WHERE id = any($1::bytea[])
AND disqualified IS NULL
AND suspended IS NULL
AND exit_finished_at IS NULL
AND last_contact_success > $2
`), postgresNodeIDList(nodeIDs), time.Now().Add(-onlineWindow),
)
@@ -299,6 +301,7 @@ func (cache *overlaycache) Reliable(ctx context.Context, criteria *overlay.NodeC
SELECT id FROM nodes
WHERE disqualified IS NULL
AND suspended IS NULL
AND exit_finished_at IS NULL
AND last_contact_success > ?
`), time.Now().Add(-criteria.OnlineWindow))
if err != nil {
@@ -407,6 +410,10 @@ func (cache *overlaycache) BatchUpdateStats(ctx context.Context, updateRequests
if dbNode.Disqualified != nil {
continue
}
// do not update reputation if node has gracefully exited
if dbNode.ExitFinishedAt != nil {
continue
}
updateNodeStats := cache.populateUpdateNodeStats(dbNode, updateReq)
@@ -471,6 +478,10 @@ func (cache *overlaycache) UpdateStats(ctx context.Context, updateReq *overlay.U
if dbNode.Disqualified != nil {
return nil
}
// do not update reputation if node has gracefully exited
if dbNode.ExitFinishedAt != nil {
return nil
}
updateFields := cache.populateUpdateFields(dbNode, updateReq)
@@ -549,6 +560,10 @@ func (cache *overlaycache) UpdateUptime(ctx context.Context, nodeID storj.NodeID
if dbNode.Disqualified != nil {
return nil
}
// do not update reputation if node has gracefully exited
if dbNode.ExitFinishedAt != nil {
return nil
}
updateFields := dbx.Node_Update_Fields{}
totalUptimeCount := dbNode.TotalUptimeCount