storagenode/piecestore: start restore trash in the background

Starting restore trash in the background allows the satellite to
continue to the next storagenode without needing to wait until
completion.

Of course, this means the satellite doesn't get feedback on whether
the restore succeeded or not, so restore-trash needs to be executed
several times.

Change-Id: I62d43f6f2e4a07854f6d083a65badf897338083b
Egon Elbre 2022-12-15 19:52:05 +02:00
parent 73d5c6944a
commit ee71fbb41d
4 changed files with 115 additions and 27 deletions
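
For orientation before the diffs: the core of the change is that the RestoreTrash RPC no longer performs the restore itself. It marks the satellite as restoring, hands the work to a small worker service, and returns; the worker runs against the chore's long-lived context rather than the request context, so it keeps going after the RPC has returned, and the empty-trash cycle skips satellites that are mid-restore. The snippet below is a minimal, self-contained sketch of that hand-off pattern only; the names (backgroundWorkers and its fields) are illustrative stand-ins, while the actual change uses the workersService type together with sync2.Fence and sync2.Cycle, as shown in the diffs that follow.

package main

import (
    "context"
    "fmt"
    "sync"
    "time"
)

// backgroundWorkers is an illustrative stand-in for the workersService in the
// diff: work is submitted from a short-lived request context but executes
// against the service's long-lived root context.
type backgroundWorkers struct {
    root   context.Context
    ready  chan struct{} // closed once Run has set root
    active sync.WaitGroup

    mu     sync.Mutex
    closed bool
}

// Run binds the workers to ctx and blocks until ctx is canceled and all
// started work has finished.
func (w *backgroundWorkers) Run(ctx context.Context) {
    w.root = ctx
    close(w.ready)

    <-ctx.Done()

    // refuse new work, then wait for whatever is still running
    w.mu.Lock()
    w.closed = true
    w.mu.Unlock()
    w.active.Wait()
}

// Go starts work in the background; it reports false if the service is
// shutting down or was never started.
func (w *backgroundWorkers) Go(ctx context.Context, work func(context.Context)) bool {
    select {
    case <-w.ready:
    case <-ctx.Done():
        return false
    }

    w.mu.Lock()
    if w.closed {
        w.mu.Unlock()
        return false
    }
    w.active.Add(1)
    w.mu.Unlock()

    go func() {
        defer w.active.Done()
        work(w.root) // note: root context, not the caller's request context
    }()
    return true
}

func main() {
    rootCtx, stop := context.WithCancel(context.Background())
    workers := &backgroundWorkers{ready: make(chan struct{})}
    go workers.Run(rootCtx)

    // Simulate an RPC handler: its context ends as soon as the handler
    // returns, but the restore keeps running on rootCtx.
    rpcCtx, rpcDone := context.WithCancel(context.Background())
    workers.Go(rpcCtx, func(ctx context.Context) {
        fmt.Println("restore trash started in background")
        time.Sleep(100 * time.Millisecond) // stand-in for store.RestoreTrash
        fmt.Println("restore trash finished")
    })
    rpcDone() // the "RPC" returns immediately

    time.Sleep(200 * time.Millisecond)
    stop() // shut down; Run refuses new work and waits for active workers
}

The design point to notice, in both the sketch and the real workersService, is that the work function receives the root context instead of the caller's request context, and shutdown first refuses new work and then waits on the WaitGroup for anything still running.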

View File

@@ -542,6 +542,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, revocationDB exten
     peer.Storage2.RetainService,
     peer.Contact.PingStats,
     peer.Storage2.Store,
+    peer.Storage2.TrashChore,
     peer.Storage2.PieceDeleter,
     peer.OrdersStore,
     peer.DB.Bandwidth(),

View File

@@ -418,12 +418,14 @@ func TestTrashAndRestore(t *testing.T) {
     // Empty trash by running the chore once
     trashDur := 4 * 24 * time.Hour
+    chorectx, chorecancel := context.WithCancel(ctx)
     chore := pieces.NewTrashChore(zaptest.NewLogger(t), 24*time.Hour, trashDur, trust, store)
     ctx.Go(func() error {
-        return chore.Run(ctx)
+        return chore.Run(chorectx)
     })
-    chore.TriggerWait(ctx)
+    chore.Cycle.TriggerWait()
     require.NoError(t, chore.Close())
+    chorecancel()

     // Restore all pieces in the first satellite
     require.NoError(t, store.RestoreTrash(ctx, satellites[0].satelliteID))

View File

@@ -5,11 +5,13 @@ package pieces

 import (
     "context"
+    "sync"
     "time"

     "go.uber.org/zap"
     "golang.org/x/sync/errgroup"

+    "storj.io/common/storj"
     "storj.io/common/sync2"
     "storj.io/storj/storagenode/trust"
 )
@@ -17,12 +19,15 @@
 // TrashChore is the chore that periodically empties the trash.
 type TrashChore struct {
     log                 *zap.Logger
-    interval            time.Duration
     trashExpiryInterval time.Duration
     store               *Store
     trust               *trust.Pool

-    cycle   *sync2.Cycle
-    started sync2.Fence
+    Cycle   *sync2.Cycle
+    workers workersService
+
+    mu        sync.Mutex
+    restoring map[storj.NodeID]bool
 }

 // NewTrashChore instantiates a new TrashChore. choreInterval is how often this
@@ -31,10 +36,12 @@ type TrashChore struct {
 func NewTrashChore(log *zap.Logger, choreInterval, trashExpiryInterval time.Duration, trust *trust.Pool, store *Store) *TrashChore {
     return &TrashChore{
         log:                 log,
-        interval:            choreInterval,
         trashExpiryInterval: trashExpiryInterval,
         store:               store,
         trust:               trust,
+
+        Cycle:     sync2.NewCycle(choreInterval),
+        restoring: map[storj.NodeID]bool{},
     }
 }
@@ -42,11 +49,19 @@ func NewTrashChore(log *zap.Logger, choreInterval, trashExpiryInterval time.Dura
 func (chore *TrashChore) Run(ctx context.Context) (err error) {
     defer mon.Task()(&ctx)(&err)

-    chore.cycle = sync2.NewCycle(chore.interval)
-    chore.cycle.Start(ctx, &errgroup.Group{}, func(ctx context.Context) error {
+    var group errgroup.Group
+    chore.Cycle.Start(ctx, &group, func(ctx context.Context) error {
         chore.log.Debug("starting to empty trash")

         for _, satelliteID := range chore.trust.GetSatellites(ctx) {
+            // ignore satellites that are being restored
+            chore.mu.Lock()
+            isRestoring := chore.restoring[satelliteID]
+            chore.mu.Unlock()
+            if isRestoring {
+                continue
+            }
+
             trashedBefore := time.Now().Add(-chore.trashExpiryInterval)
             err := chore.store.EmptyTrash(ctx, satelliteID, trashedBefore)
             if err != nil {
@@ -56,22 +71,96 @@ func (chore *TrashChore) Run(ctx context.Context) {
         return nil
     })

-    chore.started.Release()
-    return err
+    group.Go(func() error {
+        chore.workers.Run(ctx)
+        return nil
+    })
+    return group.Wait()
 }

-// TriggerWait ensures that the cycle is done at least once and waits for
-// completion. If the cycle is currently running it waits for the previous to
-// complete and then runs.
-func (chore *TrashChore) TriggerWait(ctx context.Context) {
-    chore.started.Wait(ctx)
-    chore.cycle.TriggerWait()
+// StartRestore starts restoring trash for the specified satellite.
+func (chore *TrashChore) StartRestore(ctx context.Context, satellite storj.NodeID) {
+    chore.mu.Lock()
+    isRestoring := chore.restoring[satellite]
+    if isRestoring {
+        chore.mu.Unlock()
+        return
+    }
+    chore.restoring[satellite] = true
+    chore.mu.Unlock()
+
+    ok := chore.workers.Go(ctx, func(ctx context.Context) {
+        chore.log.Info("restore trash started", zap.Stringer("Satellite ID", satellite))
+        err := chore.store.RestoreTrash(ctx, satellite)
+        if err != nil {
+            chore.log.Error("restore trash failed", zap.Stringer("Satellite ID", satellite), zap.Error(err))
+        } else {
+            chore.log.Info("restore trash finished", zap.Stringer("Satellite ID", satellite))
+        }
+
+        chore.mu.Lock()
+        delete(chore.restoring, satellite)
+        chore.mu.Unlock()
+    })
+    if !ok {
+        chore.log.Info("failed to start restore trash", zap.Stringer("Satellite ID", satellite))
+    }
 }

 // Close the chore.
 func (chore *TrashChore) Close() error {
-    if chore.cycle != nil {
-        chore.cycle.Close()
-    }
+    chore.Cycle.Close()
     return nil
 }
+
+// workersService allows to start workers with a different context.
+type workersService struct {
+    started sync2.Fence
+    root    context.Context
+    active  sync.WaitGroup
+
+    mu     sync.Mutex
+    closed bool
+}
+
+// Run starts waiting for worker requests with the specified context.
+func (workers *workersService) Run(ctx context.Context) {
+    // setup root context that the workers are bound to
+    workers.root = ctx
+    workers.started.Release()
+
+    // wait until it's time to shut down:
+    <-workers.root.Done()
+
+    // ensure we don't allow starting workers after it's time to shut down
+    workers.mu.Lock()
+    workers.closed = true
+    workers.mu.Unlock()
+
+    // wait for any remaining workers
+    workers.active.Wait()
+}
+
+// Go tries to start a worker.
+func (workers *workersService) Go(ctx context.Context, work func(context.Context)) bool {
+    // Wait until we can use workers.root.
+    if !workers.started.Wait(ctx) {
+        return false
+    }
+
+    // check that we are still allowed to start new workers
+    workers.mu.Lock()
+    if workers.closed {
+        workers.mu.Unlock()
+        return false
+    }
+    workers.active.Add(1)
+    workers.mu.Unlock()
+
+    go func() {
+        defer workers.active.Done()
+        work(workers.root)
+    }()
+    return true
+}

View File

@@ -96,6 +96,7 @@ type Endpoint struct {
     pingStats pingStatsSource

     store       *pieces.Store
+    trashChore  *pieces.TrashChore
     ordersStore *orders.FileStore
     usage       bandwidth.DB
     usedSerials *usedserials.Table
@@ -105,7 +106,7 @@
 }

 // NewEndpoint creates a new piecestore endpoint.
-func NewEndpoint(log *zap.Logger, signer signing.Signer, trust *trust.Pool, monitor *monitor.Service, retain *retain.Service, pingStats pingStatsSource, store *pieces.Store, pieceDeleter *pieces.Deleter, ordersStore *orders.FileStore, usage bandwidth.DB, usedSerials *usedserials.Table, config Config) (*Endpoint, error) {
+func NewEndpoint(log *zap.Logger, signer signing.Signer, trust *trust.Pool, monitor *monitor.Service, retain *retain.Service, pingStats pingStatsSource, store *pieces.Store, trashChore *pieces.TrashChore, pieceDeleter *pieces.Deleter, ordersStore *orders.FileStore, usage bandwidth.DB, usedSerials *usedserials.Table, config Config) (*Endpoint, error) {
     return &Endpoint{
         log:    log,
         config: config,
@@ -117,6 +118,7 @@ func NewEndpoint(log *zap.Logger, signer signing.Signer, trust *trust.Pool, moni
         pingStats: pingStats,

         store:       store,
+        trashChore:  trashChore,
         ordersStore: ordersStore,
         usage:       usage,
         usedSerials: usedSerials,
@@ -750,13 +752,7 @@ func (endpoint *Endpoint) RestoreTrash(ctx context.Context, restoreTrashReq *pb.
         return nil, rpcstatus.Error(rpcstatus.PermissionDenied, "RestoreTrash called with untrusted ID")
     }

-    endpoint.log.Info("restore trash started", zap.Stringer("Satellite ID", peer.ID))
-    err = endpoint.store.RestoreTrash(ctx, peer.ID)
-    if err != nil {
-        endpoint.log.Error("restore trash failed", zap.Stringer("Satellite ID", peer.ID), zap.Error(err))
-        return nil, rpcstatus.Wrap(rpcstatus.Internal, err)
-    }
-    endpoint.log.Info("restore trash finished", zap.Stringer("Satellite ID", peer.ID))
+    endpoint.trashChore.StartRestore(ctx, peer.ID)

     return &pb.RestoreTrashResponse{}, nil
 }