satellite/gracefulexit: suspended nodes fail graceful exit

QA discovered that if your node lost all of its data while its audit
score was still healthy, you could initiate graceful exit, respond to
audits with unknown errors, eventually get suspended, and still get
your held amount back.

Therefore, nodes that are still suspended for unknown audit errors at
the end of the graceful exit period are now marked as having failed
graceful exit.

Refs: https://github.com/storj/storj-private/issues/485
Change-Id: Id5af18786b574651587cc96bd6a7d0b47c0671a8
paul cannon 2023-11-02 16:40:02 -05:00 committed by Storj Robot
parent 084719cde1
commit a712ee94aa
2 changed files with 51 additions and 1 deletion
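
In short, the end-of-exit evaluation now treats an active unknown-audit
suspension the same way it already treated a low online score. Below is a
minimal sketch of that decision, using the field and enum names from the
diff that follows; exitDecision is a hypothetical helper for illustration,
not part of the satellite code, and it assumes storj.io/common/pb and time
are imported.

	// exitDecision sketches the logic added to checkExitStatusTimeBased:
	// a low online score or an active unknown-audit suspension at the end
	// of the graceful exit period means the exit is treated as failed.
	func exitDecision(onlineScore, minimumOnlineScore float64, unknownAuditSuspended *time.Time) (success bool, reason pb.ExitFailed_Reason) {
		success = true
		if onlineScore < minimumOnlineScore {
			success = false
			reason = pb.ExitFailed_INACTIVE_TIMEFRAME_EXCEEDED
		}
		if unknownAuditSuspended != nil {
			// Suspended nodes should not receive their held amount back.
			success = false
			reason = pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED
		}
		return success, reason
	}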

View File

@@ -970,6 +970,7 @@ func (endpoint *Endpoint) checkExitStatusTimeBased(ctx context.Context, nodeInfo
			ExitFinishedAt: endpoint.nowFunc(),
			ExitSuccess:    true,
		}
		var reason pb.ExitFailed_Reason
		// We don't check the online score constantly over the course of the graceful exit,
		// because we want to give the node a chance to get the score back up if it's
@@ -978,9 +979,19 @@ func (endpoint *Endpoint) checkExitStatusTimeBased(ctx context.Context, nodeInfo
		// Instead, we check the overall score at the end of the GE period.
		if reputationInfo.OnlineScore < endpoint.config.MinimumOnlineScore {
			request.ExitSuccess = false
			reason = pb.ExitFailed_INACTIVE_TIMEFRAME_EXCEEDED
		}
		// If a node has lost all of its data, it could still initiate graceful exit and return
		// unknown errors to audits, getting suspended but not disqualified. Since such nodes
		// should not receive their held amount back, any nodes that are suspended at the end
		// of the graceful exit period will be treated as having failed graceful exit.
		if reputationInfo.UnknownAuditSuspended != nil {
			request.ExitSuccess = false
			reason = pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED
		}
		endpoint.log.Info("node completed graceful exit",
			zap.Float64("online score", reputationInfo.OnlineScore),
			zap.Bool("suspended", reputationInfo.UnknownAuditSuspended != nil),
			zap.Bool("success", request.ExitSuccess),
			zap.Stringer("node ID", nodeInfo.Id))
		updatedNode, err := endpoint.overlaydb.UpdateExitStatus(ctx, request)
@@ -992,7 +1003,7 @@ func (endpoint *Endpoint) checkExitStatusTimeBased(ctx context.Context, nodeInfo
			return endpoint.getFinishedSuccessMessage(ctx, updatedNode.Id, *updatedNode.ExitStatus.ExitFinishedAt)
		}
		mon.Meter("graceful_exit_failure").Mark(1)
		return endpoint.getFinishedFailureMessage(ctx, updatedNode.Id, *updatedNode.ExitStatus.ExitFinishedAt, pb.ExitFailed_INACTIVE_TIMEFRAME_EXCEEDED)
		return endpoint.getFinishedFailureMessage(ctx, updatedNode.Id, *updatedNode.ExitStatus.ExitFinishedAt, reason)
	}
}
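
On the storage node side, the result surfaces as an ExitFailed message
carrying the chosen reason; the new test below asserts exactly this shape.
A minimal sketch of inspecting that response (types and accessors as used
in the test; response is assumed to be a *pb.SatelliteMessage and fmt to
be imported):

	// Sketch: the node-side view of a failed graceful exit.
	if failed, ok := response.GetMessage().(*pb.SatelliteMessage_ExitFailed); ok {
		// For suspended nodes this is pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED.
		fmt.Println("graceful exit failed:", failed.ExitFailed.Reason)
	}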

View File

@@ -1997,6 +1997,45 @@ func TestNodeFailingGracefulExitWithLowOnlineScore(t *testing.T) {
	})
}

func TestSuspendedNodesFailGracefulExit(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 4,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Reputation.FlushInterval = 0
				config.GracefulExit.TimeBased = true
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		exitingNode := planet.StorageNodes[0]

		simTime := time.Now()
		satellite.GracefulExit.Endpoint.SetNowFunc(func() time.Time { return simTime })
		doneTime := simTime.AddDate(0, 0, satellite.Config.GracefulExit.GracefulExitDurationInDays)

		// initiate GE
		response, err := callProcess(ctx, exitingNode, satellite)
		require.NoError(t, err)
		require.IsType(t, (*pb.SatelliteMessage_NotReady)(nil), response.GetMessage())

		// suspend the node
		err = satellite.Reputation.Service.TestSuspendNodeUnknownAudit(ctx, exitingNode.ID(), simTime)
		require.NoError(t, err)

		// expect failure when the time is up
		simTime = doneTime.Add(time.Second)
		response, err = callProcess(ctx, exitingNode, satellite)
		require.NoError(t, err)
		msg := response.GetMessage()
		require.IsType(t, (*pb.SatelliteMessage_ExitFailed)(nil), msg)
		failure := msg.(*pb.SatelliteMessage_ExitFailed)
		require.Equal(t, pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED, failure.ExitFailed.Reason)
	})
}

func hasDuplicates(pieces metabase.Pieces) bool {
	nodePieceCounts := make(map[storj.NodeID]int)
	for _, piece := range pieces {