satellite/gracefulexit: suspended nodes fail graceful exit
QA discovered that if your node lost all its data but your audit score was still healthy, you could call graceful exit, respond with unknown audit errors when audited, and eventually get suspended, but still get your held amount back. We will therefore mark suspended nodes as having failed graceful exit.

Refs: https://github.com/storj/storj-private/issues/485
Change-Id: Id5af18786b574651587cc96bd6a7d0b47c0671a8
parent 084719cde1
commit a712ee94aa
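The core of the change, distilled as a minimal sketch before the diff. The ExitFailedReason type, the reputationInfo struct, and the decideExit helper below are simplified stand-ins invented for illustration; only the decision logic and the enum names mirror the actual satellite code shown in the diff.

package main

import "fmt"

// ExitFailedReason is an illustrative stand-in for pb.ExitFailed_Reason.
type ExitFailedReason int

const (
	InactiveTimeframeExceeded ExitFailedReason = iota
	OverallFailurePercentageExceeded
)

// reputationInfo is a simplified stand-in for the satellite's reputation record.
type reputationInfo struct {
	OnlineScore           float64
	UnknownAuditSuspended *struct{} // non-nil means the node is suspended for unknown audit errors
}

// decideExit mirrors the checks made at the end of the graceful exit period:
// a low online score or an active unknown-audit suspension turns the exit
// into a failure, and the matching failure reason is reported.
func decideExit(rep reputationInfo, minimumOnlineScore float64) (bool, ExitFailedReason) {
	success := true
	var reason ExitFailedReason
	if rep.OnlineScore < minimumOnlineScore {
		success = false
		reason = InactiveTimeframeExceeded
	}
	// Suspended nodes may have lost all of their data while keeping a healthy
	// audit score, so they must not get their held amount back.
	if rep.UnknownAuditSuspended != nil {
		success = false
		reason = OverallFailurePercentageExceeded
	}
	return success, reason
}

func main() {
	suspended := struct{}{}
	ok, reason := decideExit(reputationInfo{OnlineScore: 1.0, UnknownAuditSuspended: &suspended}, 0.6)
	fmt.Println(ok, reason) // prints: false 1 (OverallFailurePercentageExceeded)
}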
@@ -970,6 +970,7 @@ func (endpoint *Endpoint) checkExitStatusTimeBased(ctx context.Context, nodeInfo
 		ExitFinishedAt: endpoint.nowFunc(),
 		ExitSuccess: true,
 	}
+	var reason pb.ExitFailed_Reason

 	// We don't check the online score constantly over the course of the graceful exit,
 	// because we want to give the node a chance to get the score back up if it's
@@ -978,9 +979,19 @@ func (endpoint *Endpoint) checkExitStatusTimeBased(ctx context.Context, nodeInfo
 	// Instead, we check the overall score at the end of the GE period.
 	if reputationInfo.OnlineScore < endpoint.config.MinimumOnlineScore {
 		request.ExitSuccess = false
+		reason = pb.ExitFailed_INACTIVE_TIMEFRAME_EXCEEDED
 	}
+	// If a node has lost all of its data, it could still initiate graceful exit and return
+	// unknown errors to audits, getting suspended but not disqualified. Since such nodes
+	// should not receive their held amount back, any nodes that are suspended at the end
+	// of the graceful exit period will be treated as having failed graceful exit.
+	if reputationInfo.UnknownAuditSuspended != nil {
+		request.ExitSuccess = false
+		reason = pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED
+	}
 	endpoint.log.Info("node completed graceful exit",
 		zap.Float64("online score", reputationInfo.OnlineScore),
+		zap.Bool("suspended", reputationInfo.UnknownAuditSuspended != nil),
 		zap.Bool("success", request.ExitSuccess),
 		zap.Stringer("node ID", nodeInfo.Id))
 	updatedNode, err := endpoint.overlaydb.UpdateExitStatus(ctx, request)
@@ -992,7 +1003,7 @@ func (endpoint *Endpoint) checkExitStatusTimeBased(ctx context.Context, nodeInfo
 		return endpoint.getFinishedSuccessMessage(ctx, updatedNode.Id, *updatedNode.ExitStatus.ExitFinishedAt)
 	}
 	mon.Meter("graceful_exit_failure").Mark(1)
-	return endpoint.getFinishedFailureMessage(ctx, updatedNode.Id, *updatedNode.ExitStatus.ExitFinishedAt, pb.ExitFailed_INACTIVE_TIMEFRAME_EXCEEDED)
+	return endpoint.getFinishedFailureMessage(ctx, updatedNode.Id, *updatedNode.ExitStatus.ExitFinishedAt, reason)
 	}
 }

@@ -1997,6 +1997,45 @@ func TestNodeFailingGracefulExitWithLowOnlineScore(t *testing.T) {
 	})
 }

+func TestSuspendedNodesFailGracefulExit(t *testing.T) {
+	testplanet.Run(t, testplanet.Config{
+		SatelliteCount:   1,
+		StorageNodeCount: 4,
+		Reconfigure: testplanet.Reconfigure{
+			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
+				config.Reputation.FlushInterval = 0
+				config.GracefulExit.TimeBased = true
+			},
+		},
+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
+		satellite := planet.Satellites[0]
+		exitingNode := planet.StorageNodes[0]
+
+		simTime := time.Now()
+		satellite.GracefulExit.Endpoint.SetNowFunc(func() time.Time { return simTime })
+		doneTime := simTime.AddDate(0, 0, satellite.Config.GracefulExit.GracefulExitDurationInDays)
+
+		// initiate GE
+		response, err := callProcess(ctx, exitingNode, satellite)
+		require.NoError(t, err)
+		require.IsType(t, (*pb.SatelliteMessage_NotReady)(nil), response.GetMessage())
+
+		// suspend the node
+		err = satellite.Reputation.Service.TestSuspendNodeUnknownAudit(ctx, exitingNode.ID(), simTime)
+		require.NoError(t, err)
+
+		// expect failure when the time is up
+		simTime = doneTime.Add(time.Second)
+
+		response, err = callProcess(ctx, exitingNode, satellite)
+		require.NoError(t, err)
+		msg := response.GetMessage()
+		require.IsType(t, (*pb.SatelliteMessage_ExitFailed)(nil), msg)
+		failure := msg.(*pb.SatelliteMessage_ExitFailed)
+		require.Equal(t, pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED, failure.ExitFailed.Reason)
+	})
+}
+
 func hasDuplicates(pieces metabase.Pieces) bool {
 	nodePieceCounts := make(map[storj.NodeID]int)
 	for _, piece := range pieces {