From dc5ae6f4f6ec5fb8bb10f4d35428e14049bc65bb Mon Sep 17 00:00:00 2001 From: paul cannon Date: Thu, 12 Oct 2023 21:52:56 -0500 Subject: [PATCH] satellite/gracefulexit: add test for GE through upgrade This creates an automated test for the situation where a node initiates graceful exit while TimeBased is off, and then TimeBased is turned on before the node has completed graceful exit. The node should no longer try to transfer any more pieces, but should instead sit and wait until the graceful exit period has elapsed. Change-Id: Iaf636f9247bc878bc20041221e1a8014c77806ad --- satellite/gracefulexit/endpoint.go | 6 ++ satellite/gracefulexit/endpoint_test.go | 108 ++++++++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/satellite/gracefulexit/endpoint.go b/satellite/gracefulexit/endpoint.go index 2a53607dc..5c27dbc29 100644 --- a/satellite/gracefulexit/endpoint.go +++ b/satellite/gracefulexit/endpoint.go @@ -121,6 +121,12 @@ func (endpoint *Endpoint) SetNowFunc(timeFunc func() time.Time) { endpoint.nowFunc = timeFunc } +// TestSetTimeBased changes the setting of config.TimeBased at runtime. To be used for testing +// purposes only. +func (endpoint *Endpoint) TestSetTimeBased(enabled bool) { + endpoint.config.TimeBased = enabled +} + // Process is called by storage nodes to receive pieces to transfer to new nodes and get exit status. func (endpoint *Endpoint) Process(stream pb.DRPCSatelliteGracefulExit_ProcessStream) (err error) { ctx := stream.Context() diff --git a/satellite/gracefulexit/endpoint_test.go b/satellite/gracefulexit/endpoint_test.go index 3b0118757..4e80f875e 100644 --- a/satellite/gracefulexit/endpoint_test.go +++ b/satellite/gracefulexit/endpoint_test.go @@ -36,6 +36,7 @@ import ( "storj.io/storj/storagenode" "storj.io/storj/storagenode/blobstore/testblobs" "storj.io/storj/storagenode/gracefulexit" + "storj.io/storj/storagenode/satellites" "storj.io/uplink/private/piecestore" ) @@ -2036,6 +2037,113 @@ func TestSuspendedNodesFailGracefulExit(t *testing.T) { }) } +func TestNodeStartedGracefulExitBeforeUpgradeToTimeBased(t *testing.T) { + testplanet.Run(t, testplanet.Config{ + SatelliteCount: 1, + StorageNodeCount: 8, + UplinkCount: 1, + Reconfigure: testplanet.Reconfigure{ + Satellite: testplanet.Combine( + testplanet.ReconfigureRS(4, 5, 6, 6), + func(log *zap.Logger, index int, config *satellite.Config) { + config.Reputation.FlushInterval = 0 + config.GracefulExit.GracefulExitDurationInDays = 1 + config.GracefulExit.EndpointBatchSize = 1 + // starts off as false; we will recreate the API server halfway through with true + config.GracefulExit.TimeBased = false + }, + ), + }, + }, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) { + satellite := planet.Satellites[0] + exitingNode := planet.StorageNodes[0] + uplink := planet.Uplinks[0] + + simTime := time.Now() + satellite.GracefulExit.Endpoint.SetNowFunc(func() time.Time { return simTime }) + + // Get some data on the node. + for i := 0; i < 10; i++ { + path := fmt.Sprintf("test/path/%d", i) + err := uplink.Upload(ctx, satellite, "test-bucket", path, testrand.Bytes(24*memory.KiB)) + require.NoError(t, err, path) + } + _, piecesContentSize, err := exitingNode.Storage2.BlobsCache.SpaceUsedBySatellite(ctx, satellite.ID()) + require.NoError(t, err) + require.NotZero(t, piecesContentSize) + + // Pause the GE chore on the node so it doesn't start transferring pieces too quickly + // and finish before we even do the test. + exitingNode.GracefulExit.Chore.Loop.Pause() + + // Initiate GE on the node (this affects only the storagenode DB). + err = exitingNode.DB.Satellites().InitiateGracefulExit(ctx, satellite.ID(), time.Now(), piecesContentSize) + require.NoError(t, err) + + // Allow the node to start talking to the satellite about its graceful exit. This will make + // the Process() call to the API, but the transfer queue won't get built yet because the + // chore is stopped. The worker should exit almost immediately after determining that the + // server doesn't have any pieces for it to transfer yet. + exitingNode.GracefulExit.Chore.Loop.TriggerWait() + err = exitingNode.GracefulExit.Chore.TestWaitForNoWorkers(ctx) + require.NoError(t, err) + + // Run the ranged loop once to build the transfer queue. + _, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx) + require.NoError(t, err) + + // Ensure there are elements in the queue. + queue, err := satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 1, 0) + require.NoError(t, err) + require.Len(t, queue, 1) + + // Change the setting of TimeBased on the API server. It shouldn't be necessary + // to restart the subsystem because TimeBased does not affect how an API server + // object is initialized. It _does_ affect whether the gracefulexit ranged loop + // observer gets registered, but having the non-time-based ranged loop observer + // registered should not break anything. + satellite.GracefulExit.Endpoint.TestSetTimeBased(true) + + // Allow the node to proceed with graceful exit. Again, the worker should exit almost + // immediately after determining that the server doesn't have any pieces for it to + // transfer yet. + exitingNode.GracefulExit.Chore.Loop.TriggerWait() + err = exitingNode.GracefulExit.Chore.TestWaitForNoWorkers(ctx) + require.NoError(t, err) + + // It shouldn't be done yet, because graceful exit is time-based now + exits, err := exitingNode.DB.Satellites().ListGracefulExits(ctx) + require.NoError(t, err) + require.Len(t, exits, 1) + nodeInfo, err := satellite.Overlay.Service.Get(ctx, exitingNode.ID()) + require.NoError(t, err) + require.False(t, nodeInfo.ExitStatus.ExitSuccess) + require.NotNil(t, nodeInfo.ExitStatus.ExitInitiatedAt) + require.Nil(t, nodeInfo.ExitStatus.ExitFinishedAt) + + // Move time forward 2 days so that the node should be done + simTime = simTime.Add(48 * time.Hour) + + // Have the node check in to the graceful exit endpoint again. This time, the API server + // should respond that the node has completed GE. + exitingNode.GracefulExit.Chore.Loop.TriggerWait() + err = exitingNode.GracefulExit.Chore.TestWaitForNoWorkers(ctx) + require.NoError(t, err) + + // The node should know it's done + exits, err = exitingNode.DB.Satellites().ListGracefulExits(ctx) + require.NoError(t, err) + require.Len(t, exits, 1) + require.Equal(t, satellites.ExitSucceeded, exits[0].Status) + require.NotNil(t, exits[0].FinishedAt) + // and the satellite should know it's done. + nodeInfo, err = satellite.Overlay.Service.Get(ctx, exitingNode.ID()) + require.NoError(t, err) + require.True(t, nodeInfo.ExitStatus.ExitSuccess) + require.NotNil(t, nodeInfo.ExitStatus.ExitFinishedAt) + }) +} + func hasDuplicates(pieces metabase.Pieces) bool { nodePieceCounts := make(map[storj.NodeID]int) for _, piece := range pieces {