From 556250911c6641b31c81c72a67caa28589e2461b Mon Sep 17 00:00:00 2001 From: paul cannon Date: Mon, 3 Apr 2023 11:35:41 -0500 Subject: [PATCH] storagenode/monitor: add option to log only when verification check fails This is not recommended for most nodes; leaving your node running when it can't handle requests fast enough is a good way to fail audits and get disqualified, which may happen before you even know about the problem. But some Windows users are finding that this verification failure is being triggered regularly on their nodes, and that it apparently causes the whole system to lock up occasionally. We are adding this option as a way to mitigate that problem until we can collect more information. Change-Id: I7a652b0f9f970bbb9ed9f2cb3ad1cb89d90db8d7 --- storagenode/monitor/monitor.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/storagenode/monitor/monitor.go b/storagenode/monitor/monitor.go index f6b58c76a..989750b7e 100644 --- a/storagenode/monitor/monitor.go +++ b/storagenode/monitor/monitor.go @@ -46,6 +46,7 @@ type Config struct { VerifyDirWritableInterval time.Duration `help:"how frequently to verify writability of storage directory" releaseDefault:"5m" devDefault:"30s"` VerifyDirReadableTimeout time.Duration `help:"how long to wait for a storage directory readability verification to complete" releaseDefault:"1m" devDefault:"10s"` VerifyDirWritableTimeout time.Duration `help:"how long to wait for a storage directory writability verification to complete" releaseDefault:"1m" devDefault:"10s"` + VerifyDirWarnOnly bool `help:"if the storage directory verification check fails, log a warning instead of killing the node" default:"false"` MinimumDiskSpace memory.Size `help:"how much disk space a node at minimum has to advertise" default:"500GB"` MinimumBandwidth memory.Size `help:"how much bandwidth a node at minimum has to advertise (deprecated)" default:"0TB"` NotifyLowDiskCooldown time.Duration `help:"minimum length of time between capacity reports" 
default:"10m" hidden:"true"` @@ -134,8 +135,16 @@ func (service *Service) Run(ctx context.Context) (err error) { err := service.store.VerifyStorageDirWithTimeout(ctx, service.contact.Local().ID, timeout) if err != nil { if errs.Is(err, context.DeadlineExceeded) { + if service.Config.VerifyDirWarnOnly { + service.log.Error("timed out while verifying readability of storage directory", zap.Duration("timeout", timeout)) + return nil + } return Error.New("timed out after %v while verifying readability of storage directory", timeout) } + if service.Config.VerifyDirWarnOnly { + service.log.Error("error verifying location and/or readability of storage directory", zap.Error(err)) + return nil + } return Error.New("error verifying location and/or readability of storage directory: %v", err) } return nil @@ -147,8 +156,16 @@ func (service *Service) Run(ctx context.Context) (err error) { err := service.store.CheckWritabilityWithTimeout(ctx, timeout) if err != nil { if errs.Is(err, context.DeadlineExceeded) { + if service.Config.VerifyDirWarnOnly { + service.log.Error("timed out while verifying writability of storage directory", zap.Duration("timeout", timeout)) + return nil + } return Error.New("timed out after %v while verifying writability of storage directory", timeout) } + if service.Config.VerifyDirWarnOnly { + service.log.Error("error verifying writability of storage directory", zap.Error(err)) + return nil + } return Error.New("error verifying writability of storage directory: %v", err) } return nil