satellite/gracefulexit: don't mark GE done when it's not done

When an api server is processing a graceful exit (the node is connected and
receiving lists of pieces to transfer) and that api server is shut down, it
incorrectly marked all pending graceful exits as complete. The GE then either
passed or failed depending on the ratio of successfully transferred pieces to
unsuccessful ones. In at least one case, _no_ pieces were transferred at all
before the GE was marked a success.

Change-Id: I62cfab54a2296572c2e654eb460b62f772b7a60b
Author: paul cannon
Date:   2021-09-29 07:21:29 -05:00
Parent: 48fb3e947c
Commit: 3f3f028c88
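
For context, the guard the change relies on can be sketched outside the real
endpoint code. This is a minimal illustration only: geSuccess and
geSuccessMutex mirror the names in the diff below, while process and
pendingPieces are hypothetical stand-ins for the satellite's Process handler
and its query for incomplete transfers.

package main

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"time"
)

// process sketches the fix: a worker sets geSuccess only after it has seen an
// empty list of pending transfers. A shutdown that merely cancels the context
// leaves the flag false, so the exit is not finalized.
func process(ctx context.Context, pendingPieces func() int) error {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	var geSuccess bool
	var geSuccessMutex sync.Mutex

	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case <-time.After(10 * time.Millisecond):
			}
			if pendingPieces() == 0 {
				geSuccessMutex.Lock()
				geSuccess = true
				geSuccessMutex.Unlock()
				cancel() // nothing left to send; end the stream
				return
			}
		}
	}()

	<-ctx.Done() // the stream ends either because work ran out or because of shutdown

	geSuccessMutex.Lock()
	wasSuccessful := geSuccess
	geSuccessMutex.Unlock()
	if !wasSuccessful {
		// shutdown path: leave the exit pending so the node can reconnect and continue
		return errors.New("graceful exit processing interrupted (node should reconnect and continue)")
	}
	// completion path: only now is it safe to mark the exit done
	fmt.Println("graceful exit complete")
	return nil
}

func main() {
	remaining := 3
	err := process(context.Background(), func() int {
		if remaining > 0 {
			remaining-- // simulate pieces being transferred
		}
		return remaining
	})
	fmt.Println("err:", err)
}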


@@ -174,6 +174,9 @@ func (endpoint *Endpoint) Process(stream pb.DRPCSatelliteGracefulExit_ProcessStr
 	ctx, cancel := context.WithCancel(ctx)
 	defer cancel()
+	var geSuccess bool
+	var geSuccessMutex sync.Mutex
 	group.Go(func() error {
 		incompleteLoop := sync2.NewCycle(endpoint.interval)
@@ -195,6 +198,9 @@ func (endpoint *Endpoint) Process(stream pb.DRPCSatelliteGracefulExit_ProcessStr
 			if len(incomplete) == 0 {
 				endpoint.log.Debug("no more pieces to transfer for node", zap.Stringer("Node ID", nodeID))
+				geSuccessMutex.Lock()
+				geSuccess = true
+				geSuccessMutex.Unlock()
 				cancel()
 				return pending.DoneSending(nil)
 			}
@@ -222,6 +228,16 @@ func (endpoint *Endpoint) Process(stream pb.DRPCSatelliteGracefulExit_ProcessStr
 		// if there is no more work to receive send complete
 		if finished {
+			// This point is reached both when an exit is entirely successful
+			// and when the satellite is being shut down. geSuccess
+			// differentiates these cases.
+			geSuccessMutex.Lock()
+			wasSuccessful := geSuccess
+			geSuccessMutex.Unlock()
+			if !wasSuccessful {
+				return rpcstatus.Error(rpcstatus.Canceled, "graceful exit processing interrupted (node should reconnect and continue)")
+			}
 			isDisqualified, err := endpoint.handleDisqualifiedNode(ctx, nodeID)
 			if err != nil {
 				return rpcstatus.Error(rpcstatus.Internal, err.Error())