satellite/gracefulexit: add missing test cases

These test cases are the parts of the test plan for the Graceful Exit
Revamp which are automatable but not yet automated.

I'm not entirely sure why we have to reject graceful exit from nodes
that are suspended, but implementing that was probably easier than
convincing everybody that it's not necessary.

Refs: https://github.com/storj/storj/issues/6369
Change-Id: I0261b37f7e010d72d84332cde5dd8689f7c41580
Authored by paul cannon, 2023-10-03 00:12:09 -05:00; committed by Storj Robot
parent a2c162db9b
commit a06735c1b6
2 changed files with 113 additions and 0 deletions

Changed file 1 of 2:

@@ -154,6 +154,9 @@ func (endpoint *Endpoint) processTimeBased(ctx context.Context, stream pb.DRPCSa
 	if isDisqualified {
 		return rpcstatus.Error(rpcstatus.FailedPrecondition, "node is disqualified")
 	}
+	if endpoint.handleSuspendedNodeTimeBased(nodeInfo) {
+		return rpcstatus.Error(rpcstatus.FailedPrecondition, "node is suspended. Please get node unsuspended before initiating graceful exit")
+	}

 	msg, err := endpoint.checkExitStatusTimeBased(ctx, nodeInfo)
 	if err != nil {
@@ -738,6 +741,16 @@ func (endpoint *Endpoint) handleDisqualifiedNodeTimeBased(ctx context.Context, n
 	return false, nil
 }

+func (endpoint *Endpoint) handleSuspendedNodeTimeBased(nodeInfo *overlay.NodeDossier) (isSuspended bool) {
+	if nodeInfo.UnknownAuditSuspended != nil || nodeInfo.OfflineSuspended != nil {
+		// If the node already initiated graceful exit, we'll let it carry on until / unless it gets disqualified.
+		// Otherwise, the operator should make an effort to get the node un-suspended before initiating GE.
+		// (The all-wise Go linter won't let me write this in a clearer way.)
+		return nodeInfo.ExitStatus.ExitInitiatedAt == nil
+	}
+	return false
+}
+
 func (endpoint *Endpoint) handleFinished(ctx context.Context, stream pb.DRPCSatelliteGracefulExit_ProcessStream, exitStatusRequest *overlay.ExitStatusRequest, failedReason pb.ExitFailed_Reason) error {
 	finishedMsg, err := endpoint.getFinishedMessage(ctx, exitStatusRequest.NodeID, exitStatusRequest.ExitFinishedAt, exitStatusRequest.ExitSuccess, failedReason)
 	if err != nil {
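For reference, the new check only turns a node away when it is currently suspended (unknown-audit or offline) and has not yet initiated graceful exit; a suspended node that is already exiting is allowed to continue. Here is a standalone sketch of that truth table, using illustrative values rather than anything from this change:

package main

import (
	"fmt"
	"time"
)

// isSuspendedForGE mirrors the decision in handleSuspendedNodeTimeBased,
// reduced to the three fields it inspects.
func isSuspendedForGE(unknownAuditSuspended, offlineSuspended, exitInitiatedAt *time.Time) bool {
	if unknownAuditSuspended != nil || offlineSuspended != nil {
		// Suspended nodes that already started graceful exit may continue;
		// only nodes that have not yet initiated GE are turned away.
		return exitInitiatedAt == nil
	}
	return false
}

func main() {
	now := time.Now()
	fmt.Println(isSuspendedForGE(&now, nil, nil))  // true: suspended, GE not initiated -> rejected
	fmt.Println(isSuspendedForGE(&now, nil, &now)) // false: GE already in progress -> allowed
	fmt.Println(isSuspendedForGE(nil, nil, nil))   // false: not suspended -> allowed
}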

Changed file 2 of 2:

@@ -5,6 +5,7 @@ package gracefulexit_test
 import (
 	"context"
+	"fmt"
 	"io"
 	"strconv"
 	"testing"
@@ -1820,6 +1821,105 @@ func TestNodeAlreadyExited(t *testing.T) {
 	})
 }

+func TestNodeSuspended(t *testing.T) {
+	testplanet.Run(t, testplanet.Config{
+		SatelliteCount:   1,
+		StorageNodeCount: 4,
+		Reconfigure: testplanet.Reconfigure{
+			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
+				config.GracefulExit.TimeBased = true
+			},
+		},
+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
+		satellite := planet.Satellites[0]
+
+		// check that there are no exiting nodes.
+		exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)
+		require.NoError(t, err)
+		require.Len(t, exitingNodes, 0)
+
+		// mark a node as suspended
+		exitingNode := planet.StorageNodes[0]
+		err = satellite.Reputation.Service.TestSuspendNodeUnknownAudit(ctx, exitingNode.ID(), time.Now())
+		require.NoError(t, err)
+
+		// initiate GE
+		response, err := callProcess(ctx, exitingNode, satellite)
+		require.Error(t, err)
+		require.ErrorContains(t, err, "node is suspended")
+		require.Nil(t, response)
+	})
+}
+
+func TestManyNodesGracefullyExiting(t *testing.T) {
+	testplanet.Run(t, testplanet.Config{
+		SatelliteCount:   1,
+		StorageNodeCount: 8,
+		UplinkCount:      1,
+		Reconfigure: testplanet.Reconfigure{
+			Satellite: testplanet.Combine(
+				testplanet.ReconfigureRS(2, 3, 4, 4),
+				func(log *zap.Logger, index int, config *satellite.Config) {
+					config.GracefulExit.TimeBased = true
+				},
+			),
+		},
+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
+		satellite := planet.Satellites[0]
+		uplink := planet.Uplinks[0]
+
+		satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
+		satellite.Repair.Repairer.Loop.Pause()
+
+		// upload several objects; enough that we can reasonably expect every node to have several pieces
+		const numObjects = 32
+		objectData := make([][]byte, numObjects)
+		for i := 0; i < numObjects; i++ {
+			objectData[i] = testrand.Bytes(64 * memory.KiB)
+			err := uplink.Upload(ctx, satellite, "testbucket", fmt.Sprintf("test/path/obj%d", i), objectData[i])
+			require.NoError(t, err, i)
+		}
+
+		// Make half of the nodes initiate GE
+		for i := 0; i < len(planet.StorageNodes)/2; i++ {
+			response, err := callProcess(ctx, planet.StorageNodes[i], satellite)
+			require.NoError(t, err, i)
+			require.IsType(t, (*pb.SatelliteMessage_NotReady)(nil), response.GetMessage())
+		}
+
+		// run the satellite ranged loop to build the transfer queue.
+		_, err := satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
+		require.NoError(t, err)
+
+		// we expect ~78% of segments to be in the repair queue (the chance that a
+		// segment still has at least 3 pieces in not-exiting nodes). but since things
+		// will fluctuate, let's just expect half
+		count, err := satellite.DB.RepairQueue().Count(ctx)
+		require.NoError(t, err)
+		require.GreaterOrEqual(t, count, numObjects/2)
+
+		// perform the repairs, which should get every piece so that it will still be
+		// reconstructable without the exiting nodes.
+		satellite.Repair.Repairer.Loop.Restart()
+		satellite.Repair.Repairer.Loop.TriggerWait()
+		satellite.Repair.Repairer.Loop.Pause()
+		satellite.Repair.Repairer.WaitForPendingRepairs()
+
+		// turn off the exiting nodes entirely
+		for i := 0; i < len(planet.StorageNodes)/2; i++ {
+			err = planet.StopNodeAndUpdate(ctx, planet.StorageNodes[i])
+			require.NoError(t, err)
+		}
+
+		// expect that we can retrieve and verify all objects
+		for i, obj := range objectData {
+			gotData, err := uplink.Download(ctx, satellite, "testbucket", fmt.Sprintf("test/path/obj%d", i))
+			require.NoError(t, err, i)
+			require.Equal(t, string(obj), string(gotData))
+		}
+	})
+}

 func TestNodeFailingGracefulExitWithLowOnlineScore(t *testing.T) {
 	testplanet.Run(t, testplanet.Config{
 		SatelliteCount: 1,
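Both new tests initiate graceful exit through callProcess, a helper that already exists elsewhere in this test file and is not shown in this diff. A rough, hypothetical sketch of what such a helper does, assuming the generated DRPC graceful exit client in pb and the testplanet dial helpers (the real implementation may differ):

// Hypothetical sketch only; the actual callProcess helper lives elsewhere in this file.
func callProcessSketch(ctx *testcontext.Context, exitingNode *testplanet.StorageNode, satellite *testplanet.Satellite) (*pb.SatelliteMessage, error) {
	// Dial the satellite as the exiting node.
	conn, err := exitingNode.Dialer.DialNodeURL(ctx, satellite.NodeURL())
	if err != nil {
		return nil, err
	}
	defer func() { _ = conn.Close() }()

	// Open the graceful exit Process stream and return the satellite's first
	// message (for example NotReady, or an error such as "node is suspended").
	client := pb.NewDRPCSatelliteGracefulExitClient(conn)
	stream, err := client.Process(ctx)
	if err != nil {
		return nil, err
	}
	defer func() { _ = stream.Close() }()

	return stream.Recv()
}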
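The ~78% comment in TestManyNodesGracefullyExiting can be sanity-checked with a short calculation. Assuming each segment ends up with 4 pieces (the RS success/total threshold configured above) placed on 4 distinct nodes chosen uniformly from the 8, the number of pieces remaining on non-exiting nodes follows a hypergeometric distribution. A sketch that prints that distribution, without reproducing the exact repair-queue criterion:

package main

import "fmt"

// binom computes the binomial coefficient C(n, k).
func binom(n, k int) float64 {
	if k < 0 || k > n {
		return 0
	}
	result := 1.0
	for i := 1; i <= k; i++ {
		result *= float64(n-k+i) / float64(i)
	}
	return result
}

func main() {
	const totalNodes, exitingNodes, pieces = 8, 4, 4
	nonExiting := totalNodes - exitingNodes

	// P(k pieces remain on non-exiting nodes), hypergeometric over node choices.
	for k := 0; k <= pieces; k++ {
		p := binom(nonExiting, k) * binom(exitingNodes, pieces-k) / binom(totalNodes, pieces)
		fmt.Printf("P(%d pieces on non-exiting nodes) = %.3f\n", k, p)
	}
}

Under that assumption, roughly 24% of segments keep at least 3 pieces on non-exiting nodes and roughly 76% do not, which is in the neighborhood of the figure in the test's comment; the test itself only asserts the much looser bound that at least half the segments are queued.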