private/testplanet, storagenode/{monitor,pieces}: write storage dir verification file on run and verify on loop

On run, write the storage directory verification file.

Every time the node runs, it writes the file even if it already exists.
We do this because, if the verification file is missing, the storage node
cannot tell whether it is pointed at the wrong directory or simply has not
written the file yet, and we want nodes to keep running without operator
intervention.

Once this change has been part of the minimum version for several releases,
we will move the file creation from the run command to the setup command;
run will then only verify that the file exists.
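
To make this concrete, here is a minimal sketch of the idea behind
pieces.Store.CreateVerificationFile and VerifyStorageDir as used in this
change. It is not the actual implementation; it assumes the file simply
stores the node ID, and the file name and helper names are illustrative.

package example

import (
	"bytes"
	"errors"
	"os"
	"path/filepath"
)

// verificationFileName is an illustrative name, not the constant used by
// storagenode/pieces.
const verificationFileName = "storage-dir-verification"

// writeVerificationFile records the node ID in the storage directory.
// Rewriting it on every run is harmless and also covers nodes that upgraded
// before this file existed.
func writeVerificationFile(storageDir string, nodeID []byte) error {
	return os.WriteFile(filepath.Join(storageDir, verificationFileName), nodeID, 0o644)
}

// verifyStorageDir checks that the directory contains a verification file
// matching this node's ID, i.e. that the node is looking at its own data.
func verifyStorageDir(storageDir string, nodeID []byte) error {
	content, err := os.ReadFile(filepath.Join(storageDir, verificationFileName))
	if err != nil {
		return err
	}
	if !bytes.Equal(content, nodeID) {
		return errors.New("storage directory verification failed: node ID mismatch")
	}
	return nil
}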

Change-Id: Ib7d20e78e711c63817db0ab3036a50af0e8f49cb
Cameron Ayer 2020-07-10 16:01:27 -04:00 committed by Cameron
parent 586e6f2f13
commit 0155c21b44
4 changed files with 22 additions and 2 deletions


@@ -149,6 +149,7 @@ func (planet *Planet) newStorageNodes(count int, whitelistedSatellites storj.Nod
			Monitor: monitor.Config{
				MinimumDiskSpace:      100 * memory.MB,
				NotifyLowDiskCooldown: defaultInterval,
				VerifyDirInterval:     defaultInterval,
			},
			Trust: trust.Config{
				Sources: sources,


@@ -30,6 +30,7 @@ var (
// Config defines parameters for storage node disk and bandwidth usage monitoring.
type Config struct {
	Interval              time.Duration `help:"how frequently Kademlia bucket should be refreshed with node stats" default:"1h0m0s"`
	VerifyDirInterval     time.Duration `help:"how frequently to verify access to the storage directory" releaseDefault:"1m" devDefault:"30s"`
	MinimumDiskSpace      memory.Size   `help:"how much disk space a node at minimum has to advertise" default:"500GB"`
	MinimumBandwidth      memory.Size   `help:"how much bandwidth a node at minimum has to advertise (deprecated)" default:"0TB"`
	NotifyLowDiskCooldown time.Duration `help:"minimum length of time between capacity reports" default:"10m" hidden:"true"`
@@ -46,6 +47,7 @@ type Service struct {
	allocatedDiskSpace int64
	cooldown           *sync2.Cooldown
	Loop               *sync2.Cycle
	VerifyDirLoop      *sync2.Cycle
	Config             Config
}
@@ -59,6 +61,7 @@ func NewService(log *zap.Logger, store *pieces.Store, contact *contact.Service,
		allocatedDiskSpace: allocatedDiskSpace,
		cooldown:           sync2.NewCooldown(config.NotifyLowDiskCooldown),
		Loop:               sync2.NewCycle(interval),
		VerifyDirLoop:      sync2.NewCycle(config.VerifyDirInterval),
		Config:             config,
	}
}
@@ -108,7 +111,21 @@ func (service *Service) Run(ctx context.Context) (err error) {
		return Error.New("disk space requirement not met")
	}
	var group errgroup.Group
	// Create file to identify the storage directory.
	if err := service.store.CreateVerificationFile(service.contact.Local().ID); err != nil {
		return Error.New("failed to create storage directory verification: %v", err)
	}
	group, ctx := errgroup.WithContext(ctx)
	group.Go(func() error {
		return service.VerifyDirLoop.Run(ctx, func(ctx context.Context) error {
			err := service.store.VerifyStorageDir(service.contact.Local().ID)
			if err != nil {
				return Error.New("error verifying storage directory: %v", err)
			}
			return nil
		})
	})
	group.Go(func() error {
		return service.Loop.Run(ctx, func(ctx context.Context) error {
			err := service.updateNodeInformation(ctx)
@@ -118,7 +135,7 @@ func (service *Service) Run(ctx context.Context) (err error) {
			return nil
		})
	})
	service.cooldown.Start(ctx, &group, func(ctx context.Context) error {
	service.cooldown.Start(ctx, group, func(ctx context.Context) error {
		err := service.updateNodeInformation(ctx)
		if err != nil {
			service.log.Error("error during updating node information: ", zap.Error(err))
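
For reference, the loop wiring in the hunk above follows the usual sync2.Cycle
pattern: each Cycle runs its callback on a fixed interval inside an errgroup,
and other components can call TriggerWait to force an immediate pass and block
until it finishes (as the test and the Download endpoint below do). A rough,
self-contained sketch, assuming the sync2 package from storj.io/common and
golang.org/x/sync/errgroup; runLoops and verifyDir are illustrative names:

package example

import (
	"context"
	"time"

	"golang.org/x/sync/errgroup"

	"storj.io/common/sync2"
)

// runLoops starts a verification cycle that calls verifyDir once per minute
// until ctx is canceled.
func runLoops(ctx context.Context, verifyDir func(context.Context) error) error {
	verifyLoop := sync2.NewCycle(time.Minute)

	group, ctx := errgroup.WithContext(ctx)
	group.Go(func() error {
		return verifyLoop.Run(ctx, verifyDir)
	})

	// Elsewhere, e.g. after an unexpected "file not found", a caller can run
	// one verification pass immediately and wait for it:
	//
	//	verifyLoop.TriggerWait()

	return group.Wait()
}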


@@ -32,6 +32,7 @@ func TestMonitor(t *testing.T) {
		nodeAssertions := 0
		for _, storageNode := range planet.StorageNodes {
			storageNode.Storage2.Monitor.Loop.TriggerWait()
			storageNode.Storage2.Monitor.VerifyDirLoop.TriggerWait()
			stats, err := storageNode.Storage2.Inspector.Stats(ctx, &pb.StatsRequest{})
			require.NoError(t, err)
			if stats.UsedSpace > 0 {


@@ -509,6 +509,7 @@ func (endpoint *Endpoint) Download(stream pb.DRPCPiecestore_DownloadStream) (err
	pieceReader, err = endpoint.store.Reader(ctx, limit.SatelliteId, limit.PieceId)
	if err != nil {
		if os.IsNotExist(err) {
			endpoint.monitor.VerifyDirLoop.TriggerWait()
			return rpcstatus.Wrap(rpcstatus.NotFound, err)
		}
		return rpcstatus.Wrap(rpcstatus.Internal, err)
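
The TriggerWait call in the hunk above appears intended to handle the case
where a missing piece file may actually mean the whole storage directory is
wrong or unmounted, so the node re-verifies the directory immediately instead
of waiting for the next scheduled pass. A hypothetical, self-contained sketch
of that pattern; dirVerifier and openPiece are illustrative names, not part of
the storagenode code:

package example

import (
	"errors"
	"fmt"
	"io/fs"
	"os"
)

// dirVerifier abstracts whatever can confirm the storage directory is the
// expected one (in the real code, the monitor's VerifyDirLoop plays this role).
type dirVerifier interface {
	// TriggerWait runs one verification pass and blocks until it finishes.
	TriggerWait()
}

// openPiece mirrors the shape of the Download path: if the piece file is
// missing, poke the verifier once before reporting "not found".
func openPiece(verifier dirVerifier, path string) (*os.File, error) {
	f, err := os.Open(path)
	if err != nil {
		if errors.Is(err, fs.ErrNotExist) {
			verifier.TriggerWait()
			return nil, fmt.Errorf("piece not found: %w", err)
		}
		return nil, fmt.Errorf("internal error opening piece: %w", err)
	}
	return f, nil
}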