storagenode/monitor: add option to only log an error (instead of shutting down) when verification check fails

This is not recommended for most nodes; leaving your node running when
it can't handle requests fast enough is a good way to fail audits and
get disqualified, which may happen before you even know about the
problem.

But some Windows users are finding that the storage directory
verification check is failing regularly on their nodes, and that it
apparently causes the whole system to lock up occasionally. We are
adding this option as a way to mitigate that problem until we can
collect more information.

Change-Id: I7a652b0f9f970bbb9ed9f2cb3ad1cb89d90db8d7
This commit is contained in:
paul cannon 2023-04-03 11:35:41 -05:00 committed by Storj Robot
parent 4c05293d8b
commit 556250911c

View File

@ -46,6 +46,7 @@ type Config struct {
VerifyDirWritableInterval time.Duration `help:"how frequently to verify writability of storage directory" releaseDefault:"5m" devDefault:"30s"`
VerifyDirReadableTimeout time.Duration `help:"how long to wait for a storage directory readability verification to complete" releaseDefault:"1m" devDefault:"10s"`
VerifyDirWritableTimeout time.Duration `help:"how long to wait for a storage directory writability verification to complete" releaseDefault:"1m" devDefault:"10s"`
VerifyDirWarnOnly bool `help:"if the storage directory verification check fails, log a warning instead of killing the node" default:"false"`
MinimumDiskSpace memory.Size `help:"how much disk space a node at minimum has to advertise" default:"500GB"`
MinimumBandwidth memory.Size `help:"how much bandwidth a node at minimum has to advertise (deprecated)" default:"0TB"`
NotifyLowDiskCooldown time.Duration `help:"minimum length of time between capacity reports" default:"10m" hidden:"true"`
@ -134,8 +135,16 @@ func (service *Service) Run(ctx context.Context) (err error) {
err := service.store.VerifyStorageDirWithTimeout(ctx, service.contact.Local().ID, timeout)
if err != nil {
if errs.Is(err, context.DeadlineExceeded) {
if service.Config.VerifyDirWarnOnly {
service.log.Error("timed out while verifying readability of storage directory", zap.Duration("timeout", timeout))
return nil
}
return Error.New("timed out after %v while verifying readability of storage directory", timeout)
}
if service.Config.VerifyDirWarnOnly {
service.log.Error("error verifying location and/or readability of storage directory", zap.Error(err))
return nil
}
return Error.New("error verifying location and/or readability of storage directory: %v", err)
}
return nil
@ -147,8 +156,16 @@ func (service *Service) Run(ctx context.Context) (err error) {
err := service.store.CheckWritabilityWithTimeout(ctx, timeout)
if err != nil {
if errs.Is(err, context.DeadlineExceeded) {
if service.Config.VerifyDirWarnOnly {
service.log.Error("timed out while verifying writability of storage directory", zap.Duration("timeout", timeout))
return nil
}
return Error.New("timed out after %v while verifying writability of storage directory", timeout)
}
if service.Config.VerifyDirWarnOnly {
service.log.Error("error verifying writability of storage directory", zap.Error(err))
return nil
}
return Error.New("error verifying writability of storage directory: %v", err)
}
return nil