satellite/gracefulexit: suspended nodes fail graceful exit
QA discovered that if your node lost all its data but your audit score was still healthy, you could call graceful exit, respond with unknown audit errors when audited, and eventually get suspended, but still get your held amount back. We will therefore mark suspended nodes as having failed graceful exit.

Refs: https://github.com/storj/storj-private/issues/485
Change-Id: Id5af18786b574651587cc96bd6a7d0b47c0671a8
parent 084719cde1
commit a712ee94aa
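The core of the change, distilled as a minimal sketch before the diff. The ExitFailedReason type, the reputationInfo struct, and the decideExit helper below are simplified stand-ins invented for illustration; only the decision logic and the enum names mirror the actual satellite code shown in the diff.

package main

import "fmt"

// ExitFailedReason is an illustrative stand-in for pb.ExitFailed_Reason.
type ExitFailedReason int

const (
	InactiveTimeframeExceeded ExitFailedReason = iota
	OverallFailurePercentageExceeded
)

// reputationInfo is a simplified stand-in for the satellite's reputation record.
type reputationInfo struct {
	OnlineScore           float64
	UnknownAuditSuspended *struct{} // non-nil means the node is suspended for unknown audit errors
}

// decideExit mirrors the checks made at the end of the graceful exit period:
// a low online score or an active unknown-audit suspension turns the exit
// into a failure, and the matching failure reason is reported.
func decideExit(rep reputationInfo, minimumOnlineScore float64) (bool, ExitFailedReason) {
	success := true
	var reason ExitFailedReason
	if rep.OnlineScore < minimumOnlineScore {
		success = false
		reason = InactiveTimeframeExceeded
	}
	// Suspended nodes may have lost all of their data while keeping a healthy
	// audit score, so they must not get their held amount back.
	if rep.UnknownAuditSuspended != nil {
		success = false
		reason = OverallFailurePercentageExceeded
	}
	return success, reason
}

func main() {
	suspended := struct{}{}
	ok, reason := decideExit(reputationInfo{OnlineScore: 1.0, UnknownAuditSuspended: &suspended}, 0.6)
	fmt.Println(ok, reason) // prints: false 1 (OverallFailurePercentageExceeded)
}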
@@ -970,6 +970,7 @@ func (endpoint *Endpoint) checkExitStatusTimeBased(ctx context.Context, nodeInfo
 		ExitFinishedAt: endpoint.nowFunc(),
 		ExitSuccess: true,
 	}
+	var reason pb.ExitFailed_Reason

 	// We don't check the online score constantly over the course of the graceful exit,
 	// because we want to give the node a chance to get the score back up if it's
@@ -978,9 +979,19 @@ func (endpoint *Endpoint) checkExitStatusTimeBased(ctx context.Context, nodeInfo
 	// Instead, we check the overall score at the end of the GE period.
 	if reputationInfo.OnlineScore < endpoint.config.MinimumOnlineScore {
 		request.ExitSuccess = false
+		reason = pb.ExitFailed_INACTIVE_TIMEFRAME_EXCEEDED
 	}
+	// If a node has lost all of its data, it could still initiate graceful exit and return
+	// unknown errors to audits, getting suspended but not disqualified. Since such nodes
+	// should not receive their held amount back, any nodes that are suspended at the end
+	// of the graceful exit period will be treated as having failed graceful exit.
+	if reputationInfo.UnknownAuditSuspended != nil {
+		request.ExitSuccess = false
+		reason = pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED
+	}
 	endpoint.log.Info("node completed graceful exit",
 		zap.Float64("online score", reputationInfo.OnlineScore),
+		zap.Bool("suspended", reputationInfo.UnknownAuditSuspended != nil),
 		zap.Bool("success", request.ExitSuccess),
 		zap.Stringer("node ID", nodeInfo.Id))
 	updatedNode, err := endpoint.overlaydb.UpdateExitStatus(ctx, request)
@@ -992,7 +1003,7 @@ func (endpoint *Endpoint) checkExitStatusTimeBased(ctx context.Context, nodeInfo
 		return endpoint.getFinishedSuccessMessage(ctx, updatedNode.Id, *updatedNode.ExitStatus.ExitFinishedAt)
 	}
 	mon.Meter("graceful_exit_failure").Mark(1)
-	return endpoint.getFinishedFailureMessage(ctx, updatedNode.Id, *updatedNode.ExitStatus.ExitFinishedAt, pb.ExitFailed_INACTIVE_TIMEFRAME_EXCEEDED)
+	return endpoint.getFinishedFailureMessage(ctx, updatedNode.Id, *updatedNode.ExitStatus.ExitFinishedAt, reason)
 	}
 }

@@ -1997,6 +1997,45 @@ func TestNodeFailingGracefulExitWithLowOnlineScore(t *testing.T) {
 	})
 }

+func TestSuspendedNodesFailGracefulExit(t *testing.T) {
+	testplanet.Run(t, testplanet.Config{
+		SatelliteCount:   1,
+		StorageNodeCount: 4,
+		Reconfigure: testplanet.Reconfigure{
+			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
+				config.Reputation.FlushInterval = 0
+				config.GracefulExit.TimeBased = true
+			},
+		},
+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
+		satellite := planet.Satellites[0]
+		exitingNode := planet.StorageNodes[0]
+
+		simTime := time.Now()
+		satellite.GracefulExit.Endpoint.SetNowFunc(func() time.Time { return simTime })
+		doneTime := simTime.AddDate(0, 0, satellite.Config.GracefulExit.GracefulExitDurationInDays)
+
+		// initiate GE
+		response, err := callProcess(ctx, exitingNode, satellite)
+		require.NoError(t, err)
+		require.IsType(t, (*pb.SatelliteMessage_NotReady)(nil), response.GetMessage())
+
+		// suspend the node
+		err = satellite.Reputation.Service.TestSuspendNodeUnknownAudit(ctx, exitingNode.ID(), simTime)
+		require.NoError(t, err)
+
+		// expect failure when the time is up
+		simTime = doneTime.Add(time.Second)
+
+		response, err = callProcess(ctx, exitingNode, satellite)
+		require.NoError(t, err)
+		msg := response.GetMessage()
+		require.IsType(t, (*pb.SatelliteMessage_ExitFailed)(nil), msg)
+		failure := msg.(*pb.SatelliteMessage_ExitFailed)
+		require.Equal(t, pb.ExitFailed_OVERALL_FAILURE_PERCENTAGE_EXCEEDED, failure.ExitFailed.Reason)
+	})
+}
+
 func hasDuplicates(pieces metabase.Pieces) bool {
 	nodePieceCounts := make(map[storj.NodeID]int)
 	for _, piece := range pieces {