storagenode/piecestore: start restore trash in the background
Starting restore trash in the background allows the satellite to continue to the next storagenode without waiting for completion. Of course, this means the satellite doesn't get feedback on whether the restore succeeded, so restore-trash needs to be executed several times.

Change-Id: I62d43f6f2e4a07854f6d083a65badf897338083b
parent 73d5c6944a
commit ee71fbb41d
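A minimal, self-contained sketch of the fire-and-forget pattern described in the commit message (hypothetical names, not the storagenode API): the request only schedules the restore, duplicate requests are deduplicated while one is in flight, and the caller gets no completion signal, which is why it may have to repeat the request.

package main

import (
	"fmt"
	"sync"
	"time"
)

type restorer struct {
	mu        sync.Mutex
	restoring map[string]bool
	wg        sync.WaitGroup
}

// StartRestore schedules a restore for the given satellite unless one is
// already running; it never blocks on the restore itself.
func (r *restorer) StartRestore(satellite string) {
	r.mu.Lock()
	if r.restoring[satellite] {
		r.mu.Unlock()
		return
	}
	r.restoring[satellite] = true
	r.mu.Unlock()

	r.wg.Add(1)
	go func() {
		defer r.wg.Done()
		time.Sleep(100 * time.Millisecond) // stand-in for the actual restore work

		r.mu.Lock()
		delete(r.restoring, satellite)
		r.mu.Unlock()
	}()
}

func main() {
	r := &restorer{restoring: map[string]bool{}}
	r.StartRestore("sat-1") // returns immediately
	r.StartRestore("sat-1") // no-op while the first restore is still running
	fmt.Println("request acknowledged; restore still running in the background")
	r.wg.Wait()
}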
@@ -542,6 +542,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, revocationDB exten
 		peer.Storage2.RetainService,
 		peer.Contact.PingStats,
 		peer.Storage2.Store,
+		peer.Storage2.TrashChore,
 		peer.Storage2.PieceDeleter,
 		peer.OrdersStore,
 		peer.DB.Bandwidth(),
@@ -418,12 +418,14 @@ func TestTrashAndRestore(t *testing.T) {
 
 	// Empty trash by running the chore once
 	trashDur := 4 * 24 * time.Hour
+	chorectx, chorecancel := context.WithCancel(ctx)
 	chore := pieces.NewTrashChore(zaptest.NewLogger(t), 24*time.Hour, trashDur, trust, store)
 	ctx.Go(func() error {
-		return chore.Run(ctx)
+		return chore.Run(chorectx)
	})
-	chore.TriggerWait(ctx)
+	chore.Cycle.TriggerWait()
 	require.NoError(t, chore.Close())
+	chorecancel()
 
 	// Restore all pieces in the first satellite
 	require.NoError(t, store.RestoreTrash(ctx, satellites[0].satelliteID))
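The test now derives chorectx and cancels it explicitly, presumably because Run no longer returns once the cycle is started but blocks until its context ends (it waits on group.Wait()). A small sketch of that shutdown pattern, with hypothetical names:

package main

import (
	"context"
	"fmt"
)

// run blocks until ctx is canceled, like the reworked TrashChore.Run.
func run(ctx context.Context) error {
	<-ctx.Done()
	return nil
}

func main() {
	// Derive a cancellable context so the caller (here: a test) can stop the
	// chore explicitly instead of waiting for the parent context to end.
	chorectx, chorecancel := context.WithCancel(context.Background())

	done := make(chan error, 1)
	go func() { done <- run(chorectx) }()

	// ... exercise the chore here ...

	chorecancel() // unblocks run
	fmt.Println("chore stopped:", <-done)
}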
@@ -5,11 +5,13 @@ package pieces
 
 import (
 	"context"
+	"sync"
 	"time"
 
 	"go.uber.org/zap"
 	"golang.org/x/sync/errgroup"
 
+	"storj.io/common/storj"
 	"storj.io/common/sync2"
 	"storj.io/storj/storagenode/trust"
 )
@@ -17,12 +19,15 @@ import (
 // TrashChore is the chore that periodically empties the trash.
 type TrashChore struct {
 	log                 *zap.Logger
-	interval            time.Duration
 	trashExpiryInterval time.Duration
 	store               *Store
 	trust               *trust.Pool
-	cycle   *sync2.Cycle
-	started sync2.Fence
+	Cycle *sync2.Cycle
+
+	workers workersService
+
+	mu        sync.Mutex
+	restoring map[storj.NodeID]bool
 }
 
 // NewTrashChore instantiates a new TrashChore. choreInterval is how often this
@@ -31,10 +36,12 @@ type TrashChore struct {
 func NewTrashChore(log *zap.Logger, choreInterval, trashExpiryInterval time.Duration, trust *trust.Pool, store *Store) *TrashChore {
 	return &TrashChore{
 		log:                 log,
-		interval:            choreInterval,
 		trashExpiryInterval: trashExpiryInterval,
 		store:               store,
 		trust:               trust,
+
+		Cycle:     sync2.NewCycle(choreInterval),
+		restoring: map[storj.NodeID]bool{},
 	}
 }
 
@@ -42,11 +49,19 @@ func NewTrashChore(log *zap.Logger, choreInterval, trashExpiryInterval time.Dura
 func (chore *TrashChore) Run(ctx context.Context) (err error) {
 	defer mon.Task()(&ctx)(&err)
 
-	chore.cycle = sync2.NewCycle(chore.interval)
-	chore.cycle.Start(ctx, &errgroup.Group{}, func(ctx context.Context) error {
+	var group errgroup.Group
+	chore.Cycle.Start(ctx, &group, func(ctx context.Context) error {
 		chore.log.Debug("starting to empty trash")
 
 		for _, satelliteID := range chore.trust.GetSatellites(ctx) {
+			// ignore satellites that are being restored
+			chore.mu.Lock()
+			isRestoring := chore.restoring[satelliteID]
+			chore.mu.Unlock()
+			if isRestoring {
+				continue
+			}
+
 			trashedBefore := time.Now().Add(-chore.trashExpiryInterval)
 			err := chore.store.EmptyTrash(ctx, satelliteID, trashedBefore)
 			if err != nil {
@@ -56,22 +71,96 @@ func (chore *TrashChore) Run(ctx context.Context) (err error) {
 
 		return nil
 	})
-	chore.started.Release()
-	return err
+	group.Go(func() error {
+		chore.workers.Run(ctx)
+		return nil
+	})
+	return group.Wait()
 }
 
-// TriggerWait ensures that the cycle is done at least once and waits for
-// completion. If the cycle is currently running it waits for the previous to
-// complete and then runs.
-func (chore *TrashChore) TriggerWait(ctx context.Context) {
-	chore.started.Wait(ctx)
-	chore.cycle.TriggerWait()
+// StartRestore starts restoring trash for the specified satellite.
+func (chore *TrashChore) StartRestore(ctx context.Context, satellite storj.NodeID) {
+	chore.mu.Lock()
+	isRestoring := chore.restoring[satellite]
+	if isRestoring {
+		chore.mu.Unlock()
+		return
+	}
+	chore.restoring[satellite] = true
+	chore.mu.Unlock()
+
+	ok := chore.workers.Go(ctx, func(ctx context.Context) {
+		chore.log.Info("restore trash started", zap.Stringer("Satellite ID", satellite))
+		err := chore.store.RestoreTrash(ctx, satellite)
+		if err != nil {
+			chore.log.Error("restore trash failed", zap.Stringer("Satellite ID", satellite), zap.Error(err))
+		} else {
+			chore.log.Info("restore trash finished", zap.Stringer("Satellite ID", satellite))
+		}
+
+		chore.mu.Lock()
+		delete(chore.restoring, satellite)
+		chore.mu.Unlock()
+	})
+	if !ok {
+		chore.log.Info("failed to start restore trash", zap.Stringer("Satellite ID", satellite))
+	}
 }
 
 // Close the chore.
 func (chore *TrashChore) Close() error {
-	if chore.cycle != nil {
-		chore.cycle.Close()
-	}
+	chore.Cycle.Close()
 	return nil
 }
+
+// workersService allows to start workers with a different context.
+type workersService struct {
+	started sync2.Fence
+	root    context.Context
+	active  sync.WaitGroup
+
+	mu     sync.Mutex
+	closed bool
+}
+
+// Run starts waiting for worker requests with the specified context.
+func (workers *workersService) Run(ctx context.Context) {
+	// setup root context that the workers are bound to
+	workers.root = ctx
+	workers.started.Release()
+
+	// wait until it's time to shut down:
+	<-workers.root.Done()
+
+	// ensure we don't allow starting workers after it's time to shut down
+	workers.mu.Lock()
+	workers.closed = true
+	workers.mu.Unlock()
+
+	// wait for any remaining workers
+	workers.active.Wait()
+}
+
+// Go tries to start a worker.
+func (workers *workersService) Go(ctx context.Context, work func(context.Context)) bool {
+	// Wait until we can use workers.root.
+	if !workers.started.Wait(ctx) {
+		return false
+	}
+
+	// check that we are still allowed to start new workers
+	workers.mu.Lock()
+	if workers.closed {
+		workers.mu.Unlock()
+		return false
+	}
+	workers.active.Add(1)
+	workers.mu.Unlock()
+
+	go func() {
+		defer workers.active.Done()
+		work(workers.root)
+	}()
+
+	return true
+}
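The workersService added above exists so that a short-lived request context can start work that is bound to the chore's long-lived context instead. A standalone sketch of the same idea (simplified, hypothetical names; not the exact type above):

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// workers is a simplified stand-in for the workersService pattern.
type workers struct {
	ready  chan struct{}   // closed once the root context is known
	root   context.Context // long-lived context the work is bound to
	active sync.WaitGroup

	mu     sync.Mutex
	closed bool
}

// Run binds the workers to the long-lived ctx and waits for them on shutdown.
func (w *workers) Run(ctx context.Context) {
	w.root = ctx
	close(w.ready)

	<-ctx.Done()

	w.mu.Lock()
	w.closed = true
	w.mu.Unlock()

	w.active.Wait()
}

// Go starts work on the root context; callCtx only bounds waiting for Run to start.
func (w *workers) Go(callCtx context.Context, work func(context.Context)) bool {
	select {
	case <-w.ready:
	case <-callCtx.Done():
		return false
	}

	w.mu.Lock()
	if w.closed {
		w.mu.Unlock()
		return false
	}
	w.active.Add(1)
	w.mu.Unlock()

	go func() {
		defer w.active.Done()
		work(w.root)
	}()
	return true
}

func main() {
	root, stop := context.WithCancel(context.Background())
	w := &workers{ready: make(chan struct{})}
	go w.Run(root)

	// The "request" context is short-lived, but the worker keeps running on root.
	reqCtx, reqCancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
	defer reqCancel()
	ok := w.Go(reqCtx, func(ctx context.Context) {
		<-ctx.Done()
		fmt.Println("worker stopped with the root context")
	})
	fmt.Println("started:", ok)

	time.Sleep(50 * time.Millisecond)
	stop() // Run marks the service closed and waits for the worker to finish
	time.Sleep(50 * time.Millisecond)
}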
@@ -96,6 +96,7 @@ type Endpoint struct {
 	pingStats pingStatsSource
 
 	store       *pieces.Store
+	trashChore  *pieces.TrashChore
 	ordersStore *orders.FileStore
 	usage       bandwidth.DB
 	usedSerials *usedserials.Table
@@ -105,7 +106,7 @@ type Endpoint struct {
 }
 
 // NewEndpoint creates a new piecestore endpoint.
-func NewEndpoint(log *zap.Logger, signer signing.Signer, trust *trust.Pool, monitor *monitor.Service, retain *retain.Service, pingStats pingStatsSource, store *pieces.Store, pieceDeleter *pieces.Deleter, ordersStore *orders.FileStore, usage bandwidth.DB, usedSerials *usedserials.Table, config Config) (*Endpoint, error) {
+func NewEndpoint(log *zap.Logger, signer signing.Signer, trust *trust.Pool, monitor *monitor.Service, retain *retain.Service, pingStats pingStatsSource, store *pieces.Store, trashChore *pieces.TrashChore, pieceDeleter *pieces.Deleter, ordersStore *orders.FileStore, usage bandwidth.DB, usedSerials *usedserials.Table, config Config) (*Endpoint, error) {
 	return &Endpoint{
 		log:    log,
 		config: config,
@@ -117,6 +118,7 @@ func NewEndpoint(log *zap.Logger, signer signing.Signer, trust *trust.Pool, moni
 		pingStats: pingStats,
 
 		store:       store,
+		trashChore:  trashChore,
 		ordersStore: ordersStore,
 		usage:       usage,
 		usedSerials: usedSerials,
@@ -750,13 +752,7 @@ func (endpoint *Endpoint) RestoreTrash(ctx context.Context, restoreTrashReq *pb.
 		return nil, rpcstatus.Error(rpcstatus.PermissionDenied, "RestoreTrash called with untrusted ID")
 	}
 
-	endpoint.log.Info("restore trash started", zap.Stringer("Satellite ID", peer.ID))
-	err = endpoint.store.RestoreTrash(ctx, peer.ID)
-	if err != nil {
-		endpoint.log.Error("restore trash failed", zap.Stringer("Satellite ID", peer.ID), zap.Error(err))
-		return nil, rpcstatus.Wrap(rpcstatus.Internal, err)
-	}
-	endpoint.log.Info("restore trash finished", zap.Stringer("Satellite ID", peer.ID))
+	endpoint.trashChore.StartRestore(ctx, peer.ID)
 
 	return &pb.RestoreTrashResponse{}, nil
 }
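With this hunk the RPC acknowledges the request before the restore has run, and any restore error is only logged inside the chore, never returned. As the commit message notes, a caller that needs the restore to actually happen has to rely on repetition rather than on the RPC result; a purely illustrative sketch of such a retry loop (hypothetical names, not satellite code):

package main

import (
	"context"
	"fmt"
	"time"
)

// restoreTrash stands in for the RestoreTrash RPC: success only means
// "request accepted", not "restore finished".
func restoreTrash(ctx context.Context, node string) error {
	fmt.Println("scheduled restore on", node)
	return nil
}

func main() {
	ctx := context.Background()

	// Because the RPC carries no completion signal, the caller re-issues it
	// on a schedule; duplicate requests are deduplicated on the node side.
	for attempt := 0; attempt < 3; attempt++ {
		if err := restoreTrash(ctx, "node-1"); err != nil {
			fmt.Println("request failed, will retry:", err)
		}
		time.Sleep(10 * time.Millisecond) // placeholder for a real interval
	}
}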