gracefulexit: reconnect added

parent 68b67c83a7
commit cff44fbd19
Change-Id: I236689af944effe3e79ef92e852ae264d3b372e5
@@ -16,6 +16,9 @@ var (
     // Error is the default error class for graceful exit package.
     Error = errs.Class("gracefulexit")
 
+    // ErrReconnect is the error class for connection/transport errors.
+    ErrReconnect = errs.Class("reconnect")
+
     mon = monkit.Package()
 )
 
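For readers unfamiliar with the zeebo/errs package, here is a minimal standalone sketch (an illustration, not part of this change) of how such an error class is used: failures are wrapped with the class where they occur, and callers test membership with Has, which is exactly how the worker below decides whether an error means "retry the connection".

package main

import (
	"fmt"

	"github.com/zeebo/errs"
)

// ErrReconnect mirrors the class declared above: it tags transport
// failures so callers can retry instead of aborting.
var ErrReconnect = errs.Class("reconnect")

func dial() error {
	// Wrap the underlying failure with the class at the failure site.
	return ErrReconnect.Wrap(fmt.Errorf("connection refused"))
}

func main() {
	err := dial()
	fmt.Println(ErrReconnect.Has(err)) // true: the caller may retry
}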
@@ -8,6 +8,7 @@ import (
     "context"
     "io"
     "os"
+    "sync/atomic"
     "time"
 
     "github.com/zeebo/errs"
@@ -40,6 +41,12 @@ type Worker struct {
     ecclient           ecclient.Client
     minBytesPerSecond  memory.Size
     minDownloadTimeout time.Duration
+
+    Connects      int64
+    NumSucceeded  int64
+    NumFailed     int64
+    Conn          *rpc.Conn
+    ProcessClient pb.DRPCSatelliteGracefulExit_ProcessClient
+    backoffTime   time.Duration
 }
 
 // NewWorker instantiates Worker.
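The three counters are plain int64 fields rather than a mutex-protected structure because they are incremented from concurrent transfer goroutines (see the limiter.Go hunk further down) and read once in the worker's final log line; every access goes through sync/atomic. A standalone sketch of that pattern, with hypothetical names:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// counters mimics the new Worker fields: bare int64s that are only
// touched through sync/atomic, so concurrent goroutines can update
// them without a mutex.
type counters struct {
	NumSucceeded int64
	NumFailed    int64
}

func main() {
	var c counters
	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			if i%2 == 0 {
				atomic.AddInt64(&c.NumSucceeded, 1)
			} else {
				atomic.AddInt64(&c.NumFailed, 1)
			}
		}(i)
	}
	wg.Wait()
	// Reads also go through atomic, as in the worker's summary log line.
	fmt.Println(atomic.LoadInt64(&c.NumSucceeded), atomic.LoadInt64(&c.NumFailed))
}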
@@ -58,34 +65,45 @@ func NewWorker(log *zap.Logger, store *pieces.Store, trust *trust.Pool, satellit
     }
 }
 
+const (
+    startBackOffTime = 500 * time.Millisecond
+    maxBackOffTime   = time.Hour
+)
+
 // Run calls the satellite endpoint, transfers pieces, validates, and responds with success or failure.
 // It also marks the satellite finished once all the pieces have been transferred.
 func (worker *Worker) Run(ctx context.Context, done func()) (err error) {
     defer mon.Task()(&ctx)(&err)
     defer done()
 
+    worker.backoffTime = startBackOffTime
+
     worker.log.Debug("running worker")
 
-    conn, err := worker.dialer.DialNodeURL(ctx, worker.satelliteURL)
-    if err != nil {
-        return errs.Wrap(err)
-    }
     defer func() {
-        err = errs.Combine(err, conn.Close())
+        worker.log.Info("numbers of successes, reconnects, failures", zap.Int64("successes", atomic.LoadInt64(&worker.NumSucceeded)), zap.Int64("connection attempts", atomic.LoadInt64(&worker.Connects)), zap.Int64("failures", atomic.LoadInt64(&worker.NumFailed)))
     }()
 
-    client := pb.NewDRPCSatelliteGracefulExitClient(conn)
-
-    c, err := client.Process(ctx)
-    if err != nil {
-        return errs.Wrap(err)
-    }
+    defer func() {
+        err = errs.Combine(err, worker.Conn.Close())
+    }()
 
     for {
-        response, err := c.Recv()
+        err = worker.CheckConnection(ctx)
+        if err != nil {
+            if ErrReconnect.Has(err) {
+                continue
+            }
+        }
+        worker.backoffTime = startBackOffTime
+
+        response, err := worker.ProcessClient.Recv()
         if errs.Is(err, io.EOF) {
-            // Done
-            return nil
+            err = worker.Conn.Close()
+            if err != nil {
+                worker.log.Error("unable to close connection", zap.Error(err))
+            }
+            continue
         }
         if errs2.IsRPC(err, rpcstatus.FailedPrecondition) {
             // delete the entry from satellite table and inform graceful exit has failed to start
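Taken together, the loop now follows a standard reconnect pattern: verify the stream before each receive, retry after backoff on reconnect-class errors, reset the backoff once the connection is healthy, and treat io.EOF as "stream ended, redial on the next pass". A standalone sketch of that control flow follows; check and recv are hypothetical stand-ins, not the real satellite API, and it is simplified in that non-reconnect check errors return immediately, whereas the committed code lets them fall through to Recv.

package main

import (
	"context"
	"errors"
	"fmt"
	"io"
)

var errReconnect = errors.New("reconnect")

// runLoop mirrors the new shape of Run: reconnect-class errors restart
// the loop (the real CheckConnection sleeps with backoff before
// redialing), io.EOF means the stream ended and the next pass redials,
// and other receive errors abort.
func runLoop(ctx context.Context, check func() error, recv func() (string, error)) error {
	for {
		if err := check(); err != nil {
			if errors.Is(err, errReconnect) {
				continue // check already slept with backoff before dialing
			}
			return err
		}
		msg, err := recv()
		if errors.Is(err, io.EOF) {
			continue // stream closed; redial on the next pass
		}
		if err != nil {
			return err
		}
		fmt.Println("received", msg)
		return nil
	}
}

func main() {
	attempts := 0
	check := func() error {
		attempts++
		if attempts < 3 {
			return errReconnect // simulate two failed dial attempts
		}
		return nil
	}
	recv := func() (string, error) { return "transfer piece", nil }
	fmt.Println(runLoop(context.Background(), check, recv)) // <nil>
}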
@@ -97,7 +115,7 @@ func (worker *Worker) Run(ctx context.Context, done func()) (err error) {
             return errs.Wrap(err)
         }
         if err != nil {
-            // TODO what happened
+            worker.log.Error("error while receiving message from satellite", zap.Error(err))
             return errs.Wrap(err)
         }
 
@@ -108,11 +126,14 @@ func (worker *Worker) Run(ctx context.Context, done func()) (err error) {
         case *pb.SatelliteMessage_TransferPiece:
             transferPieceMsg := msg.TransferPiece
             worker.limiter.Go(ctx, func() {
-                err = worker.transferPiece(ctx, transferPieceMsg, c)
+                err = worker.transferPiece(ctx, transferPieceMsg, worker.ProcessClient)
                 if err != nil {
                     worker.log.Error("failed to transfer piece.",
                         zap.Stringer("Satellite ID", worker.satelliteURL.ID),
                         zap.Error(errs.Wrap(err)))
+                    atomic.AddInt64(&worker.NumFailed, 1)
+                } else {
+                    atomic.AddInt64(&worker.NumSucceeded, 1)
                 }
             })
 
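The atomic increments matter here because each callback passed to limiter.Go runs in its own goroutine. The limiter appears to be the Limiter from storj.io/common/sync2, which bounds how many callbacks run at once; a hedged sketch of that API's shape, under that assumption:

package main

import (
	"context"
	"fmt"

	"storj.io/common/sync2"
)

func main() {
	// Allow at most 2 callbacks in flight; Go schedules, Wait drains.
	limiter := sync2.NewLimiter(2)
	ctx := context.Background()
	for i := 0; i < 5; i++ {
		i := i
		limiter.Go(ctx, func() { fmt.Println("transfer", i) })
	}
	limiter.Wait()
}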
@@ -162,8 +183,9 @@ func (worker *Worker) Run(ctx context.Context, done func()) (err error) {
         if err != nil {
             return errs.Wrap(err)
         }
         // delete everything left in blobs folder of specific satellites.
         err = worker.store.DeleteSatelliteBlobs(ctx, worker.satelliteURL.ID)
+
         return errs.Wrap(err)
     default:
         // TODO handle err
@@ -364,6 +386,48 @@ func (worker *Worker) deletePiece(ctx context.Context, pieceID storj.PieceID) er
     return err
 }
 
+// wait sleeps between reconnect attempts, doubling the interval each time until it reaches the maximum.
+func (worker *Worker) wait(ctx context.Context) error {
+    worker.backoffTime *= 2
+    if worker.backoffTime > maxBackOffTime {
+        worker.backoffTime = maxBackOffTime
+    }
+
+    if !sync2.Sleep(ctx, worker.backoffTime) {
+        return ctx.Err()
+    }
+
+    return nil
+}
+
+// CheckConnection checks, on each pass of Run's loop, whether the connection is nil or closed; if so, it reconnects to the satellite.
+func (worker *Worker) CheckConnection(ctx context.Context) error {
+    if worker.Conn != nil && !worker.Conn.Closed() {
+        return nil
+    }
+
+    err := worker.wait(ctx)
+    if err != nil {
+        return Error.Wrap(err)
+    }
+
+    atomic.AddInt64(&worker.Connects, 1)
+    worker.Conn, err = worker.dialer.DialNodeURL(ctx, worker.satelliteURL)
+    if err != nil {
+        worker.log.Error("couldn't create connection with satellite", zap.Error(err))
+        return ErrReconnect.Wrap(err)
+    }
+    client := pb.NewDRPCSatelliteGracefulExitClient(worker.Conn)
+
+    worker.ProcessClient, err = client.Process(ctx)
+    if err != nil {
+        worker.log.Error("storagenode couldn't call Process", zap.Error(err))
+        return errs.Wrap(err)
+    }
+
+    return nil
+}
+
 // deleteAllPieces deletes pieces stored for a satellite.
 func (worker *Worker) deleteAllPieces(ctx context.Context) error {
     var totalDeleted int64
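One subtlety in wait: the interval doubles before the sleep, so with a 500ms starting value the first reconnect attempt actually waits one second, then 2s, 4s, and so on up to the one-hour cap. A tiny self-contained sketch of that schedule, with the constants copied from the change and the sleeping replaced by printing:

package main

import (
	"fmt"
	"time"
)

const (
	startBackOffTime = 500 * time.Millisecond
	maxBackOffTime   = time.Hour
)

func main() {
	// Reproduce wait()'s arithmetic without actually sleeping.
	backoff := startBackOffTime
	for attempt := 1; attempt <= 5; attempt++ {
		backoff *= 2 // doubled before sleeping, exactly as in wait()
		if backoff > maxBackOffTime {
			backoff = maxBackOffTime
		}
		fmt.Printf("attempt %d: sleep %v\n", attempt, backoff)
	}
	// attempt 1: sleep 1s
	// attempt 2: sleep 2s
	// attempt 3: sleep 4s
	// attempt 4: sleep 8s
	// attempt 5: sleep 16s
}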
@@ -94,6 +94,89 @@ func TestWorkerSuccess(t *testing.T) {
     })
 }
 
+func TestWorkerCheckConnection(t *testing.T) {
+    const successThreshold = 4
+    testplanet.Run(t, testplanet.Config{
+        SatelliteCount:   1,
+        StorageNodeCount: successThreshold + 1,
+        UplinkCount:      1,
+        Reconfigure: testplanet.Reconfigure{
+            Satellite: testplanet.ReconfigureRS(2, 3, successThreshold, successThreshold),
+        },
+    }, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
+        satellite := planet.Satellites[0]
+        ul := planet.Uplinks[0]
+
+        satellite.GracefulExit.Chore.Loop.Pause()
+
+        err := ul.Upload(ctx, satellite, "testbucket", "test/path1", testrand.Bytes(5*memory.KiB))
+        require.NoError(t, err)
+
+        exitingNode, err := findNodeToExit(ctx, planet, 1)
+        require.NoError(t, err)
+        exitingNode.GracefulExit.Chore.Loop.Pause()
+
+        exitStatusReq := overlay.ExitStatusRequest{
+            NodeID:          exitingNode.ID(),
+            ExitInitiatedAt: time.Now(),
+        }
+        _, err = satellite.Overlay.DB.UpdateExitStatus(ctx, &exitStatusReq)
+        require.NoError(t, err)
+
+        // run the satellite chore to build the transfer queue.
+        satellite.GracefulExit.Chore.Loop.TriggerWait()
+        satellite.GracefulExit.Chore.Loop.Pause()
+
+        // check that the satellite knows the storage node is exiting.
+        exitingNodes, err := satellite.DB.OverlayCache().GetExitingNodes(ctx)
+        require.NoError(t, err)
+        require.Len(t, exitingNodes, 1)
+        require.Equal(t, exitingNode.ID(), exitingNodes[0].NodeID)
+
+        queueItems, err := satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 10, 0)
+        require.NoError(t, err)
+        require.Len(t, queueItems, 1)
+
+        // run the storage node worker to process the transfers.
+        worker := gracefulexit.NewWorker(zaptest.NewLogger(t), exitingNode.Storage2.Store, exitingNode.Peer.Storage2.Trust, exitingNode.DB.Satellites(), exitingNode.Dialer, satellite.NodeURL(),
+            gracefulexit.Config{
+                ChoreInterval:          0,
+                NumWorkers:             2,
+                NumConcurrentTransfers: 2,
+                MinBytesPerSecond:      128,
+                MinDownloadTimeout:     2 * time.Minute,
+            })
+        defer ctx.Check(worker.Close)
+        require.Nil(t, worker.Conn)
+        require.Nil(t, worker.ProcessClient)
+        require.Equal(t, worker.NumFailed, int64(0))
+        require.Equal(t, worker.NumSucceeded, int64(0))
+        require.Equal(t, worker.Connects, int64(0))
+
+        err = worker.CheckConnection(ctx)
+        require.NoError(t, err)
+        require.NotNil(t, worker.Conn)
+        require.NotNil(t, worker.ProcessClient)
+        require.Equal(t, worker.Connects, int64(1))
+
+        err = worker.Run(ctx, func() {})
+        require.NoError(t, err)
+
+        progress, err := satellite.DB.GracefulExit().GetProgress(ctx, exitingNode.ID())
+        require.NoError(t, err)
+        require.EqualValues(t, progress.PiecesFailed, 0)
+        require.EqualValues(t, progress.PiecesTransferred, 1)
+
+        exitStatus, err := satellite.DB.OverlayCache().GetExitStatus(ctx, exitingNode.ID())
+        require.NoError(t, err)
+        require.NotNil(t, exitStatus.ExitFinishedAt)
+        require.True(t, exitStatus.ExitSuccess)
+        require.Equal(t, worker.NumFailed, int64(0))
+        require.Equal(t, worker.NumSucceeded, int64(1))
+        require.Equal(t, worker.Connects, int64(1))
+    })
+}
+
 func TestWorkerTimeout(t *testing.T) {
     const successThreshold = 4
     testplanet.Run(t, testplanet.Config{
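To exercise just this test locally, the standard Go test invocation applies; the package path below is an assumption based on the storj repository layout, where the graceful exit worker and its tests live under the storage node tree:

go test -run TestWorkerCheckConnection ./storagenode/gracefulexit/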