satellite/gracefulexit: add missing test cases
These test cases are the parts of the test plan for the Graceful Exit Revamp which are automatable but not yet automated. I'm not entirely sure why we have to reject graceful exit from nodes that are suspended; implementing that was probably easier than convincing everybody that it's not necessary. Refs: https://github.com/storj/storj/issues/6369 Change-Id: I0261b37f7e010d72d84332cde5dd8689f7c41580
This commit is contained in:
parent
a2c162db9b
commit
a06735c1b6
@ -154,6 +154,9 @@ func (endpoint *Endpoint) processTimeBased(ctx context.Context, stream pb.DRPCSa
|
||||
if isDisqualified {
|
||||
return rpcstatus.Error(rpcstatus.FailedPrecondition, "node is disqualified")
|
||||
}
|
||||
if endpoint.handleSuspendedNodeTimeBased(nodeInfo) {
|
||||
return rpcstatus.Error(rpcstatus.FailedPrecondition, "node is suspended. Please get node unsuspended before initiating graceful exit")
|
||||
}
|
||||
|
||||
msg, err := endpoint.checkExitStatusTimeBased(ctx, nodeInfo)
|
||||
if err != nil {
|
||||
@ -738,6 +741,16 @@ func (endpoint *Endpoint) handleDisqualifiedNodeTimeBased(ctx context.Context, n
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func (endpoint *Endpoint) handleSuspendedNodeTimeBased(nodeInfo *overlay.NodeDossier) (isSuspended bool) {
|
||||
if nodeInfo.UnknownAuditSuspended != nil || nodeInfo.OfflineSuspended != nil {
|
||||
// If the node already initiated graceful exit, we'll let it carry on until / unless it gets disqualified.
|
||||
// Otherwise, the operator should make an effort to get the node un-suspended before initiating GE.
|
||||
// (The all-wise Go linter won't let me write this in a clearer way.)
|
||||
return nodeInfo.ExitStatus.ExitInitiatedAt == nil
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (endpoint *Endpoint) handleFinished(ctx context.Context, stream pb.DRPCSatelliteGracefulExit_ProcessStream, exitStatusRequest *overlay.ExitStatusRequest, failedReason pb.ExitFailed_Reason) error {
|
||||
finishedMsg, err := endpoint.getFinishedMessage(ctx, exitStatusRequest.NodeID, exitStatusRequest.ExitFinishedAt, exitStatusRequest.ExitSuccess, failedReason)
|
||||
if err != nil {
|
||||
|
@ -5,6 +5,7 @@ package gracefulexit_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"testing"
|
||||
@ -1820,6 +1821,105 @@ func TestNodeAlreadyExited(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func TestNodeSuspended(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1,
|
||||
StorageNodeCount: 4,
|
||||
Reconfigure: testplanet.Reconfigure{
|
||||
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.GracefulExit.TimeBased = true
|
||||
},
|
||||
},
|
||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||
satellite := planet.Satellites[0]
|
||||
|
||||
// check that there are no exiting nodes.
|
||||
exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, exitingNodes, 0)
|
||||
|
||||
// mark a node as suspended
|
||||
exitingNode := planet.StorageNodes[0]
|
||||
err = satellite.Reputation.Service.TestSuspendNodeUnknownAudit(ctx, exitingNode.ID(), time.Now())
|
||||
require.NoError(t, err)
|
||||
|
||||
// initiate GE
|
||||
response, err := callProcess(ctx, exitingNode, satellite)
|
||||
require.Error(t, err)
|
||||
require.ErrorContains(t, err, "node is suspended")
|
||||
require.Nil(t, response)
|
||||
})
|
||||
}
|
||||
|
||||
// TestManyNodesGracefullyExiting uploads a set of objects, has half of the
// storage nodes initiate graceful exit, runs repair so pieces are moved off
// the exiting nodes, shuts those nodes down entirely, and then verifies that
// every object can still be downloaded intact.
func TestManyNodesGracefullyExiting(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 8,
		UplinkCount:      1,
		Reconfigure: testplanet.Reconfigure{
			Satellite: testplanet.Combine(
				// NOTE(review): presumably (2, 3, 4, 4) sets the RS scheme so a
				// segment is reconstructable from a subset of its 4 pieces —
				// confirm against testplanet.ReconfigureRS.
				testplanet.ReconfigureRS(2, 3, 4, 4),
				func(log *zap.Logger, index int, config *satellite.Config) {
					config.GracefulExit.TimeBased = true
				},
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		uplink := planet.Uplinks[0]

		// pause background loops so the test controls exactly when the
		// ranged loop and the repairer run.
		satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
		satellite.Repair.Repairer.Loop.Pause()

		// upload several objects; enough that we can reasonably expect every node to have several pieces
		const numObjects = 32
		objectData := make([][]byte, numObjects)
		for i := 0; i < numObjects; i++ {
			objectData[i] = testrand.Bytes(64 * memory.KiB)
			err := uplink.Upload(ctx, satellite, "testbucket", fmt.Sprintf("test/path/obj%d", i), objectData[i])
			require.NoError(t, err, i)
		}

		// Make half of the nodes initiate GE
		for i := 0; i < len(planet.StorageNodes)/2; i++ {
			response, err := callProcess(ctx, planet.StorageNodes[i], satellite)
			require.NoError(t, err, i)
			// time-based GE does not hand out transfer work; the node just
			// gets a NotReady message back.
			require.IsType(t, (*pb.SatelliteMessage_NotReady)(nil), response.GetMessage())
		}

		// run the satellite ranged loop to build the transfer queue.
		_, err := satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
		require.NoError(t, err)

		// we expect ~78% of segments to be in the repair queue (the chance that a
		// segment still has at least 3 pieces in not-exiting nodes). but since things
		// will fluctuate, let's just expect half
		count, err := satellite.DB.RepairQueue().Count(ctx)
		require.NoError(t, err)
		require.GreaterOrEqual(t, count, numObjects/2)

		// perform the repairs, which should get every piece so that it will still be
		// reconstructable without the exiting nodes.
		satellite.Repair.Repairer.Loop.Restart()
		satellite.Repair.Repairer.Loop.TriggerWait()
		satellite.Repair.Repairer.Loop.Pause()
		satellite.Repair.Repairer.WaitForPendingRepairs()

		// turn off the exiting nodes entirely
		for i := 0; i < len(planet.StorageNodes)/2; i++ {
			err = planet.StopNodeAndUpdate(ctx, planet.StorageNodes[i])
			require.NoError(t, err)
		}

		// expect that we can retrieve and verify all objects
		for i, obj := range objectData {
			gotData, err := uplink.Download(ctx, satellite, "testbucket", fmt.Sprintf("test/path/obj%d", i))
			require.NoError(t, err, i)
			require.Equal(t, string(obj), string(gotData))
		}
	})
}
|
||||
|
||||
func TestNodeFailingGracefulExitWithLowOnlineScore(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1,
|
||||
|
Loading…
Reference in New Issue
Block a user