7ac2031cac
The keywords `free` and `available` are often confused. In most places, `free` space means the amount of unused space on the whole disk, not just within the allocation, while `available` space means the unused space left within the allocated disk space. What the user/SNO wants to see is the available space, not the free space; to the SNO, "free space" means what is left in the allocation. Because of this confusion, the multinode dashboard displays the `free` disk space instead of the space left within the allocation (https://github.com/storj/storj/issues/5248), while the storagenode dashboard already shows the correct value. This change fixes the incorrect free disk space shown by the multinode dashboard. I also added a few comments to distinguish the `free` and `available` fields in the `DiskSpace*` structs. Change-Id: I11b372ca53a5ac05dc3f79834c18f85ebec11855
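To illustrate the distinction in isolation (a minimal standalone sketch, not code from this change; availableSpace is a hypothetical helper), the value the dashboard should report is the remaining allocation, capped by the free space the OS reports for the whole disk:

package main

import "fmt"

// availableSpace is a hypothetical helper (not part of this change) that
// illustrates the terminology: diskFree is what the OS reports as unused on
// the whole disk, while the returned value is what remains of the operator's
// allocation, capped by diskFree so the node never advertises more space
// than the disk can actually hold.
func availableSpace(allocated, used, diskFree int64) int64 {
	available := allocated - used
	if diskFree < available {
		available = diskFree
	}
	return available
}

func main() {
	// 1 TB allocated, 400 GB used, but only 300 GB free on the whole disk:
	// the dashboard should show 300 GB available, not 600 GB.
	fmt.Println(availableSpace(1e12, 4e11, 3e11))
}

This mirrors the capping logic in Service.AvailableSpace and Service.DiskSpace in the file below.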
295 lines
11 KiB
Go
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

// Package monitor is responsible for monitoring that the disk is well-behaved.
// It checks whether there's sufficient space and whether directories are writable.
package monitor

import (
	"context"
	"time"

	"github.com/spacemonkeygo/monkit/v3"
	"github.com/zeebo/errs"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"

	"storj.io/common/memory"
	"storj.io/common/pb"
	"storj.io/common/sync2"
	"storj.io/storj/storagenode/bandwidth"
	"storj.io/storj/storagenode/contact"
	"storj.io/storj/storagenode/pieces"
)

var (
	mon = monkit.Package()

	// Error is the default error class for piecestore monitor errors.
	Error = errs.Class("piecestore monitor")
)

// DiskSpace consolidates monitored disk space statistics.
type DiskSpace struct {
	// Allocated is the amount of disk space allocated to the storage node, in bytes.
	Allocated int64
	// UsedForPieces is the amount of disk space used for pieces, in bytes.
	UsedForPieces int64
	// UsedForTrash is the amount of disk space used for trash, in bytes.
	UsedForTrash int64
	// Free is the actual amount of free space on the whole disk, not just the allocated disk space, in bytes.
	Free int64
	// Available is the amount of free space left within the allocated disk space, in bytes.
	Available int64
	// Overused is the amount of disk space overused by the storage node, in bytes.
	Overused int64
}

// Config defines parameters for storage node disk and bandwidth usage monitoring.
type Config struct {
	Interval                  time.Duration `help:"how frequently Kademlia bucket should be refreshed with node stats" default:"1h0m0s"`
	VerifyDirReadableInterval time.Duration `help:"how frequently to verify the location and readability of the storage directory" releaseDefault:"1m" devDefault:"30s"`
	VerifyDirWritableInterval time.Duration `help:"how frequently to verify writability of storage directory" releaseDefault:"5m" devDefault:"30s"`
	VerifyDirReadableTimeout  time.Duration `help:"how long to wait for a storage directory readability verification to complete" releaseDefault:"1m" devDefault:"10s"`
	VerifyDirWritableTimeout  time.Duration `help:"how long to wait for a storage directory writability verification to complete" releaseDefault:"1m" devDefault:"10s"`
	VerifyDirWarnOnly         bool          `help:"if the storage directory verification check fails, log a warning instead of killing the node" default:"false"`
	MinimumDiskSpace          memory.Size   `help:"how much disk space a node at minimum has to advertise" default:"500GB"`
	MinimumBandwidth          memory.Size   `help:"how much bandwidth a node at minimum has to advertise (deprecated)" default:"0TB"`
	NotifyLowDiskCooldown     time.Duration `help:"minimum length of time between capacity reports" default:"10m" hidden:"true"`
}

// Service which monitors disk usage.
//
// architecture: Service
type Service struct {
	log                   *zap.Logger
	store                 *pieces.Store
	contact               *contact.Service
	usageDB               bandwidth.DB
	allocatedDiskSpace    int64
	cooldown              *sync2.Cooldown
	Loop                  *sync2.Cycle
	VerifyDirReadableLoop *sync2.Cycle
	VerifyDirWritableLoop *sync2.Cycle
	Config                Config
}

// NewService creates a new storage node monitoring service.
func NewService(log *zap.Logger, store *pieces.Store, contact *contact.Service, usageDB bandwidth.DB, allocatedDiskSpace int64, interval time.Duration, reportCapacity func(context.Context), config Config) *Service {
	return &Service{
		log:                   log,
		store:                 store,
		contact:               contact,
		usageDB:               usageDB,
		allocatedDiskSpace:    allocatedDiskSpace,
		cooldown:              sync2.NewCooldown(config.NotifyLowDiskCooldown),
		Loop:                  sync2.NewCycle(interval),
		VerifyDirReadableLoop: sync2.NewCycle(config.VerifyDirReadableInterval),
		VerifyDirWritableLoop: sync2.NewCycle(config.VerifyDirWritableInterval),
		Config:                config,
	}
}

// Run runs the monitor service.
func (service *Service) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	// get the disk space details
	// The returned path ends in a slash only if it represents a root directory, such as "/" on Unix or `C:\` on Windows.
	storageStatus, err := service.store.StorageStatus(ctx)
	if err != nil {
		return Error.Wrap(err)
	}
	freeDiskSpace := storageStatus.DiskFree

	totalUsed, err := service.store.SpaceUsedForPiecesAndTrash(ctx)
	if err != nil {
		return Error.Wrap(err)
	}

	// Check that the hard drive is big enough:
	// first-time setup as a storage node server.
	if totalUsed == 0 && freeDiskSpace < service.allocatedDiskSpace {
		service.allocatedDiskSpace = freeDiskSpace
		service.log.Warn("Disk space is less than requested. Allocated space is", zap.Int64("bytes", service.allocatedDiskSpace))
	}

	// On restart of a node that has already been storing data, the used space may
	// exceed the allocated space, e.g. because the operator lowered the allocation
	// setting before restarting.
	if totalUsed >= service.allocatedDiskSpace {
		service.log.Warn("Used more space than allocated. Allocated space is", zap.Int64("bytes", service.allocatedDiskSpace))
	}

	// The free disk space is less than the remaining allocated space,
	// due to a change of the setting before restarting.
	if freeDiskSpace < service.allocatedDiskSpace-totalUsed {
		service.allocatedDiskSpace = freeDiskSpace + totalUsed
		service.log.Warn("Disk space is less than requested. Allocated space is", zap.Int64("bytes", service.allocatedDiskSpace))
	}

	// Ensure the allocation meets the configured minimum (500GB by default), which is currently required to be an operator.
	if service.allocatedDiskSpace < service.Config.MinimumDiskSpace.Int64() {
		service.log.Error("Total disk space is less than required minimum", zap.Int64("bytes", service.Config.MinimumDiskSpace.Int64()))
		return Error.New("disk space requirement not met")
	}

	group, ctx := errgroup.WithContext(ctx)
	group.Go(func() error {
		timeout := service.Config.VerifyDirReadableTimeout
		return service.VerifyDirReadableLoop.Run(ctx, func(ctx context.Context) error {
			err := service.store.VerifyStorageDirWithTimeout(ctx, service.contact.Local().ID, timeout)
			if err != nil {
				if errs.Is(err, context.DeadlineExceeded) {
					if service.Config.VerifyDirWarnOnly {
						service.log.Error("timed out while verifying readability of storage directory", zap.Duration("timeout", timeout))
						return nil
					}
					return Error.New("timed out after %v while verifying readability of storage directory", timeout)
				}
				if service.Config.VerifyDirWarnOnly {
					service.log.Error("error verifying location and/or readability of storage directory", zap.Error(err))
					return nil
				}
				return Error.New("error verifying location and/or readability of storage directory: %v", err)
			}
			return nil
		})
	})
	group.Go(func() error {
		timeout := service.Config.VerifyDirWritableTimeout
		return service.VerifyDirWritableLoop.Run(ctx, func(ctx context.Context) error {
			err := service.store.CheckWritabilityWithTimeout(ctx, timeout)
			if err != nil {
				if errs.Is(err, context.DeadlineExceeded) {
					if service.Config.VerifyDirWarnOnly {
						service.log.Error("timed out while verifying writability of storage directory", zap.Duration("timeout", timeout))
						return nil
					}
					return Error.New("timed out after %v while verifying writability of storage directory", timeout)
				}
				if service.Config.VerifyDirWarnOnly {
					service.log.Error("error verifying writability of storage directory", zap.Error(err))
					return nil
				}
				return Error.New("error verifying writability of storage directory: %v", err)
			}
			return nil
		})
	})
	group.Go(func() error {
		return service.Loop.Run(ctx, func(ctx context.Context) error {
			err := service.updateNodeInformation(ctx)
			if err != nil {
				service.log.Error("error during updating node information: ", zap.Error(err))
			}
			return nil
		})
	})
	service.cooldown.Start(ctx, group, func(ctx context.Context) error {
		err := service.updateNodeInformation(ctx)
		if err != nil {
			service.log.Error("error during updating node information: ", zap.Error(err))
			return nil
		}

		err = service.contact.PingSatellites(ctx, service.Config.NotifyLowDiskCooldown)
		if err != nil {
			service.log.Error("error notifying satellites: ", zap.Error(err))
		}
		return nil
	})

	return group.Wait()
}

// NotifyLowDisk reports disk space to satellites if cooldown timer has expired.
func (service *Service) NotifyLowDisk() {
	service.cooldown.Trigger()
}

// Close stops the monitor service.
func (service *Service) Close() (err error) {
	service.Loop.Close()
	service.cooldown.Close()
	return nil
}

// updateNodeInformation refreshes the capacity that the node reports to the network.
func (service *Service) updateNodeInformation(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	freeSpace, err := service.AvailableSpace(ctx)
	if err != nil {
		return err
	}
	service.contact.UpdateSelf(&pb.NodeCapacity{
		FreeDisk: freeSpace,
	})

	return nil
}

// AvailableSpace returns available disk space for upload.
func (service *Service) AvailableSpace(ctx context.Context) (_ int64, err error) {
	defer mon.Task()(&ctx)(&err)

	usedSpace, err := service.store.SpaceUsedForPiecesAndTrash(ctx)
	if err != nil {
		return 0, err
	}

	freeSpaceForStorj := service.allocatedDiskSpace - usedSpace

	diskStatus, err := service.store.StorageStatus(ctx)
	if err != nil {
		return 0, Error.Wrap(err)
	}
	if diskStatus.DiskFree < freeSpaceForStorj {
		freeSpaceForStorj = diskStatus.DiskFree
	}

	mon.IntVal("allocated_space").Observe(service.allocatedDiskSpace)
	mon.IntVal("used_space").Observe(usedSpace)
	mon.IntVal("available_space").Observe(freeSpaceForStorj)

	return freeSpaceForStorj, nil
}

// DiskSpace returns consolidated disk space state info.
func (service *Service) DiskSpace(ctx context.Context) (_ DiskSpace, err error) {
	defer mon.Task()(&ctx)(&err)

	usedForPieces, _, err := service.store.SpaceUsedForPieces(ctx)
	if err != nil {
		return DiskSpace{}, Error.Wrap(err)
	}
	usedForTrash, err := service.store.SpaceUsedForTrash(ctx)
	if err != nil {
		return DiskSpace{}, Error.Wrap(err)
	}

	storageStatus, err := service.store.StorageStatus(ctx)
	if err != nil {
		return DiskSpace{}, Error.Wrap(err)
	}

	overused := int64(0)

	available := service.allocatedDiskSpace - (usedForPieces + usedForTrash)
	if available < 0 {
		overused = -available
	}
	if storageStatus.DiskFree < available {
		available = storageStatus.DiskFree
	}

	return DiskSpace{
		Allocated:     service.allocatedDiskSpace,
		UsedForPieces: usedForPieces,
		UsedForTrash:  usedForTrash,
		Free:          storageStatus.DiskFree,
		Available:     available,
		Overused:      overused,
	}, nil
}