storagenode/piecestore: start restore trash in the background

Starting restore trash in the background allows the satellite to
continue to the next storagenode without needing to wait until
completion.

Of course, this means the satellite doesn't get feedback on whether
the restore succeeded or not, so restore-trash needs to be executed
several times.

Change-Id: I62d43f6f2e4a07854f6d083a65badf897338083b
Egon Elbre 2022-12-15 19:52:05 +02:00
parent 73d5c6944a
commit ee71fbb41d
4 changed files with 115 additions and 27 deletions
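
For orientation before the diffs: the core of the change is that the RestoreTrash RPC no longer performs the restore itself. It marks the satellite as restoring, hands the work to a small worker service, and returns; the worker runs against the chore's long-lived context rather than the request context, so it keeps going after the RPC has returned, and the empty-trash cycle skips satellites that are mid-restore. The snippet below is a minimal, self-contained sketch of that hand-off pattern only; the names (backgroundWorkers and its fields) are illustrative stand-ins, while the actual change uses the workersService type together with sync2.Fence and sync2.Cycle, as shown in the diffs that follow.

package main

import (
    "context"
    "fmt"
    "sync"
    "time"
)

// backgroundWorkers is an illustrative stand-in for the workersService in the
// diff: work is submitted from a short-lived request context but executes
// against the service's long-lived root context.
type backgroundWorkers struct {
    root   context.Context
    ready  chan struct{} // closed once Run has set root
    active sync.WaitGroup

    mu     sync.Mutex
    closed bool
}

// Run binds the workers to ctx and blocks until ctx is canceled and all
// started work has finished.
func (w *backgroundWorkers) Run(ctx context.Context) {
    w.root = ctx
    close(w.ready)

    <-ctx.Done()

    // refuse new work, then wait for whatever is still running
    w.mu.Lock()
    w.closed = true
    w.mu.Unlock()
    w.active.Wait()
}

// Go starts work in the background; it reports false if the service is
// shutting down or was never started.
func (w *backgroundWorkers) Go(ctx context.Context, work func(context.Context)) bool {
    select {
    case <-w.ready:
    case <-ctx.Done():
        return false
    }

    w.mu.Lock()
    if w.closed {
        w.mu.Unlock()
        return false
    }
    w.active.Add(1)
    w.mu.Unlock()

    go func() {
        defer w.active.Done()
        work(w.root) // note: root context, not the caller's request context
    }()
    return true
}

func main() {
    rootCtx, stop := context.WithCancel(context.Background())
    workers := &backgroundWorkers{ready: make(chan struct{})}
    go workers.Run(rootCtx)

    // Simulate an RPC handler: its context ends as soon as the handler
    // returns, but the restore keeps running on rootCtx.
    rpcCtx, rpcDone := context.WithCancel(context.Background())
    workers.Go(rpcCtx, func(ctx context.Context) {
        fmt.Println("restore trash started in background")
        time.Sleep(100 * time.Millisecond) // stand-in for store.RestoreTrash
        fmt.Println("restore trash finished")
    })
    rpcDone() // the "RPC" returns immediately

    time.Sleep(200 * time.Millisecond)
    stop() // shut down; Run refuses new work and waits for active workers
}

The design point to notice, in both the sketch and the real workersService, is that the work function receives the root context instead of the caller's request context, and shutdown first refuses new work and then waits on the WaitGroup for anything still running.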

View File

@@ -542,6 +542,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, revocationDB exten
     peer.Storage2.RetainService,
     peer.Contact.PingStats,
     peer.Storage2.Store,
+    peer.Storage2.TrashChore,
     peer.Storage2.PieceDeleter,
     peer.OrdersStore,
     peer.DB.Bandwidth(),

View File

@@ -418,12 +418,14 @@ func TestTrashAndRestore(t *testing.T) {
     // Empty trash by running the chore once
     trashDur := 4 * 24 * time.Hour
+    chorectx, chorecancel := context.WithCancel(ctx)
     chore := pieces.NewTrashChore(zaptest.NewLogger(t), 24*time.Hour, trashDur, trust, store)
     ctx.Go(func() error {
-        return chore.Run(ctx)
+        return chore.Run(chorectx)
     })
-    chore.TriggerWait(ctx)
+    chore.Cycle.TriggerWait()
     require.NoError(t, chore.Close())
+    chorecancel()

     // Restore all pieces in the first satellite
     require.NoError(t, store.RestoreTrash(ctx, satellites[0].satelliteID))

View File

@@ -5,11 +5,13 @@ package pieces

 import (
     "context"
+    "sync"
     "time"

     "go.uber.org/zap"
     "golang.org/x/sync/errgroup"

+    "storj.io/common/storj"
     "storj.io/common/sync2"
     "storj.io/storj/storagenode/trust"
 )
@@ -17,12 +19,15 @@
 // TrashChore is the chore that periodically empties the trash.
 type TrashChore struct {
     log                 *zap.Logger
-    interval            time.Duration
     trashExpiryInterval time.Duration
     store               *Store
     trust               *trust.Pool

-    cycle   *sync2.Cycle
-    started sync2.Fence
+    Cycle   *sync2.Cycle
+    workers workersService
+
+    mu        sync.Mutex
+    restoring map[storj.NodeID]bool
 }

 // NewTrashChore instantiates a new TrashChore. choreInterval is how often this
@@ -31,10 +36,12 @@ type TrashChore struct {
 func NewTrashChore(log *zap.Logger, choreInterval, trashExpiryInterval time.Duration, trust *trust.Pool, store *Store) *TrashChore {
     return &TrashChore{
         log:                 log,
-        interval:            choreInterval,
         trashExpiryInterval: trashExpiryInterval,
         store:               store,
         trust:               trust,
+
+        Cycle:     sync2.NewCycle(choreInterval),
+        restoring: map[storj.NodeID]bool{},
     }
 }
@@ -42,11 +49,19 @@ func NewTrashChore(log *zap.Logger, choreInterval, trashExpiryInterval time.Dura
 func (chore *TrashChore) Run(ctx context.Context) (err error) {
     defer mon.Task()(&ctx)(&err)

-    chore.cycle = sync2.NewCycle(chore.interval)
-    chore.cycle.Start(ctx, &errgroup.Group{}, func(ctx context.Context) error {
+    var group errgroup.Group
+    chore.Cycle.Start(ctx, &group, func(ctx context.Context) error {
         chore.log.Debug("starting to empty trash")

         for _, satelliteID := range chore.trust.GetSatellites(ctx) {
+            // ignore satellites that are being restored
+            chore.mu.Lock()
+            isRestoring := chore.restoring[satelliteID]
+            chore.mu.Unlock()
+            if isRestoring {
+                continue
+            }
+
             trashedBefore := time.Now().Add(-chore.trashExpiryInterval)
             err := chore.store.EmptyTrash(ctx, satelliteID, trashedBefore)
             if err != nil {
@@ -56,22 +71,96 @@ func (chore *TrashChore) Run(ctx context.Context) {
         return nil
     })

-    chore.started.Release()
-    return err
+    group.Go(func() error {
+        chore.workers.Run(ctx)
+        return nil
+    })
+    return group.Wait()
 }

-// TriggerWait ensures that the cycle is done at least once and waits for
-// completion. If the cycle is currently running it waits for the previous to
-// complete and then runs.
-func (chore *TrashChore) TriggerWait(ctx context.Context) {
-    chore.started.Wait(ctx)
-    chore.cycle.TriggerWait()
+// StartRestore starts restoring trash for the specified satellite.
+func (chore *TrashChore) StartRestore(ctx context.Context, satellite storj.NodeID) {
+    chore.mu.Lock()
+    isRestoring := chore.restoring[satellite]
+    if isRestoring {
+        chore.mu.Unlock()
+        return
+    }
+    chore.restoring[satellite] = true
+    chore.mu.Unlock()
+
+    ok := chore.workers.Go(ctx, func(ctx context.Context) {
+        chore.log.Info("restore trash started", zap.Stringer("Satellite ID", satellite))
+        err := chore.store.RestoreTrash(ctx, satellite)
+        if err != nil {
+            chore.log.Error("restore trash failed", zap.Stringer("Satellite ID", satellite), zap.Error(err))
+        } else {
+            chore.log.Info("restore trash finished", zap.Stringer("Satellite ID", satellite))
+        }
+
+        chore.mu.Lock()
+        delete(chore.restoring, satellite)
+        chore.mu.Unlock()
+    })
+    if !ok {
+        chore.log.Info("failed to start restore trash", zap.Stringer("Satellite ID", satellite))
+    }
 }

 // Close the chore.
 func (chore *TrashChore) Close() error {
-    if chore.cycle != nil {
-        chore.cycle.Close()
-    }
+    chore.Cycle.Close()
     return nil
 }
+
+// workersService allows to start workers with a different context.
+type workersService struct {
+    started sync2.Fence
+    root    context.Context
+    active  sync.WaitGroup
+
+    mu     sync.Mutex
+    closed bool
+}
+
+// Run starts waiting for worker requests with the specified context.
+func (workers *workersService) Run(ctx context.Context) {
+    // setup root context that the workers are bound to
+    workers.root = ctx
+    workers.started.Release()
+
+    // wait until it's time to shut down:
+    <-workers.root.Done()
+
+    // ensure we don't allow starting workers after it's time to shut down
+    workers.mu.Lock()
+    workers.closed = true
+    workers.mu.Unlock()
+
+    // wait for any remaining workers
+    workers.active.Wait()
+}
+
+// Go tries to start a worker.
+func (workers *workersService) Go(ctx context.Context, work func(context.Context)) bool {
+    // Wait until we can use workers.root.
+    if !workers.started.Wait(ctx) {
+        return false
+    }
+
+    // check that we are still allowed to start new workers
+    workers.mu.Lock()
+    if workers.closed {
+        workers.mu.Unlock()
+        return false
+    }
+    workers.active.Add(1)
+    workers.mu.Unlock()
+
+    go func() {
+        defer workers.active.Done()
+        work(workers.root)
+    }()
+    return true
+}

View File

@@ -96,6 +96,7 @@ type Endpoint struct {
     pingStats pingStatsSource

     store       *pieces.Store
+    trashChore  *pieces.TrashChore
     ordersStore *orders.FileStore
     usage       bandwidth.DB
     usedSerials *usedserials.Table
@@ -105,7 +106,7 @@
 }

 // NewEndpoint creates a new piecestore endpoint.
-func NewEndpoint(log *zap.Logger, signer signing.Signer, trust *trust.Pool, monitor *monitor.Service, retain *retain.Service, pingStats pingStatsSource, store *pieces.Store, pieceDeleter *pieces.Deleter, ordersStore *orders.FileStore, usage bandwidth.DB, usedSerials *usedserials.Table, config Config) (*Endpoint, error) {
+func NewEndpoint(log *zap.Logger, signer signing.Signer, trust *trust.Pool, monitor *monitor.Service, retain *retain.Service, pingStats pingStatsSource, store *pieces.Store, trashChore *pieces.TrashChore, pieceDeleter *pieces.Deleter, ordersStore *orders.FileStore, usage bandwidth.DB, usedSerials *usedserials.Table, config Config) (*Endpoint, error) {
     return &Endpoint{
         log:    log,
         config: config,
@@ -117,6 +118,7 @@ func NewEndpoint(log *zap.Logger, signer signing.Signer, trust *trust.Pool, moni
         pingStats: pingStats,

         store:       store,
+        trashChore:  trashChore,
         ordersStore: ordersStore,
         usage:       usage,
         usedSerials: usedSerials,
@@ -750,13 +752,7 @@ func (endpoint *Endpoint) RestoreTrash(ctx context.Context, restoreTrashReq *pb.
         return nil, rpcstatus.Error(rpcstatus.PermissionDenied, "RestoreTrash called with untrusted ID")
     }

-    endpoint.log.Info("restore trash started", zap.Stringer("Satellite ID", peer.ID))
-    err = endpoint.store.RestoreTrash(ctx, peer.ID)
-    if err != nil {
-        endpoint.log.Error("restore trash failed", zap.Stringer("Satellite ID", peer.ID), zap.Error(err))
-        return nil, rpcstatus.Wrap(rpcstatus.Internal, err)
-    }
-    endpoint.log.Info("restore trash finished", zap.Stringer("Satellite ID", peer.ID))
+    endpoint.trashChore.StartRestore(ctx, peer.ID)

     return &pb.RestoreTrashResponse{}, nil
 }