2019-03-18 10:55:06 +00:00
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
2021-09-27 13:35:48 +01:00
// Package monitor is responsible for monitoring that the disk is well-behaved.
// It checks whether there's sufficient space and whether directories are writable.
2019-03-18 10:55:06 +00:00
package monitor
import (
"context"
"time"
2019-11-08 20:40:39 +00:00
"github.com/spacemonkeygo/monkit/v3"
2019-03-18 10:55:06 +00:00
"github.com/zeebo/errs"
"go.uber.org/zap"
2020-02-26 02:39:44 +00:00
"golang.org/x/sync/errgroup"
2019-03-18 10:55:06 +00:00
2019-12-27 11:48:47 +00:00
"storj.io/common/memory"
"storj.io/common/pb"
"storj.io/common/sync2"
2019-03-18 10:55:06 +00:00
"storj.io/storj/storagenode/bandwidth"
2019-09-19 20:56:34 +01:00
"storj.io/storj/storagenode/contact"
2019-03-18 10:55:06 +00:00
"storj.io/storj/storagenode/pieces"
)
var (
mon = monkit . Package ( )
2020-08-11 15:50:01 +01:00
// Error is the default error class for piecestore monitor errors.
2019-03-18 10:55:06 +00:00
Error = errs . Class ( "piecestore monitor" )
)
2020-12-26 01:16:43 +00:00
// DiskSpace consolidates monitored disk space statistics.
//
// The values are filled in by Service.DiskSpace; the allocation they are
// derived from is logged in bytes elsewhere in this package.
type DiskSpace struct {
	// Allocated is the disk space allocated to the node.
	Allocated int64
	// UsedForPieces is the space the piece store reports as used for pieces.
	UsedForPieces int64
	// UsedForTrash is the space the piece store reports as used for trash.
	UsedForTrash int64
	// Free is the free space reported by the store's storage status.
	Free int64
	// Available is the smaller of the unused allocation (Allocated minus
	// pieces and trash) and Free. It is negative when usage exceeds the
	// allocation.
	Available int64
	// Overused is the amount by which pieces and trash exceed Allocated,
	// or zero when usage is within the allocation.
	Overused int64
}
2019-03-18 10:55:06 +00:00
// Config defines parameters for storage node disk and bandwidth usage monitoring.
type Config struct {
2020-08-07 17:19:37 +01:00
Interval time . Duration ` help:"how frequently Kademlia bucket should be refreshed with node stats" default:"1h0m0s" `
VerifyDirReadableInterval time . Duration ` help:"how frequently to verify the location and readability of the storage directory" releaseDefault:"1m" devDefault:"30s" `
VerifyDirWritableInterval time . Duration ` help:"how frequently to verify writability of storage directory" releaseDefault:"5m" devDefault:"30s" `
MinimumDiskSpace memory . Size ` help:"how much disk space a node at minimum has to advertise" default:"500GB" `
MinimumBandwidth memory . Size ` help:"how much bandwidth a node at minimum has to advertise (deprecated)" default:"0TB" `
NotifyLowDiskCooldown time . Duration ` help:"minimum length of time between capacity reports" default:"10m" hidden:"true" `
2019-03-18 10:55:06 +00:00
}
2020-12-05 16:01:42 +00:00
// Service which monitors disk usage.
2019-09-10 14:24:16 +01:00
//
// architecture: Service
2019-03-18 10:55:06 +00:00
type Service struct {
2020-08-07 17:19:37 +01:00
log * zap . Logger
store * pieces . Store
contact * contact . Service
usageDB bandwidth . DB
allocatedDiskSpace int64
cooldown * sync2 . Cooldown
Loop * sync2 . Cycle
VerifyDirReadableLoop * sync2 . Cycle
VerifyDirWritableLoop * sync2 . Cycle
Config Config
2019-03-18 10:55:06 +00:00
}
// NewService creates a new storage node monitoring service.
2020-02-12 21:19:42 +00:00
func NewService ( log * zap . Logger , store * pieces . Store , contact * contact . Service , usageDB bandwidth . DB , allocatedDiskSpace int64 , interval time . Duration , reportCapacity func ( context . Context ) , config Config ) * Service {
2019-03-18 10:55:06 +00:00
return & Service {
2020-08-07 17:19:37 +01:00
log : log ,
store : store ,
contact : contact ,
usageDB : usageDB ,
allocatedDiskSpace : allocatedDiskSpace ,
cooldown : sync2 . NewCooldown ( config . NotifyLowDiskCooldown ) ,
Loop : sync2 . NewCycle ( interval ) ,
VerifyDirReadableLoop : sync2 . NewCycle ( config . VerifyDirReadableInterval ) ,
VerifyDirWritableLoop : sync2 . NewCycle ( config . VerifyDirWritableInterval ) ,
Config : config ,
2019-03-18 10:55:06 +00:00
}
}
2020-07-16 15:18:02 +01:00
// Run runs monitor service.
2019-03-18 10:55:06 +00:00
func ( service * Service ) Run ( ctx context . Context ) ( err error ) {
defer mon . Task ( ) ( & ctx ) ( & err )
// get the disk space details
// The returned path ends in a slash only if it represents a root directory, such as "/" on Unix or `C:\` on Windows.
2019-06-04 13:31:39 +01:00
storageStatus , err := service . store . StorageStatus ( ctx )
2019-03-18 10:55:06 +00:00
if err != nil {
return Error . Wrap ( err )
}
2019-04-15 11:12:22 +01:00
freeDiskSpace := storageStatus . DiskFree
2019-03-18 10:55:06 +00:00
2020-12-26 01:16:43 +00:00
totalUsed , err := service . store . SpaceUsedForPiecesAndTrash ( ctx )
2019-03-18 10:55:06 +00:00
if err != nil {
2020-12-26 01:16:43 +00:00
return Error . Wrap ( err )
2019-03-18 10:55:06 +00:00
}
// check your hard drive is big enough
// first time setup as a piece node server
if totalUsed == 0 && freeDiskSpace < service . allocatedDiskSpace {
service . allocatedDiskSpace = freeDiskSpace
2020-08-29 15:36:37 +01:00
service . log . Warn ( "Disk space is less than requested. Allocated space is" , zap . Int64 ( "bytes" , service . allocatedDiskSpace ) )
2019-03-18 10:55:06 +00:00
}
// on restarting the Piece node server, assuming already been working as a node
// used above the alloacated space, user changed the allocation space setting
// before restarting
if totalUsed >= service . allocatedDiskSpace {
2020-08-29 15:36:37 +01:00
service . log . Warn ( "Used more space than allocated. Allocated space is" , zap . Int64 ( "bytes" , service . allocatedDiskSpace ) )
2019-03-18 10:55:06 +00:00
}
2019-04-15 11:12:22 +01:00
// the available disk space is less than remaining allocated space,
2019-03-18 10:55:06 +00:00
// due to change of setting before restarting
if freeDiskSpace < service . allocatedDiskSpace - totalUsed {
2019-05-06 19:59:30 +01:00
service . allocatedDiskSpace = freeDiskSpace + totalUsed
2020-08-29 15:36:37 +01:00
service . log . Warn ( "Disk space is less than requested. Allocated space is" , zap . Int64 ( "bytes" , service . allocatedDiskSpace ) )
2019-03-18 10:55:06 +00:00
}
2019-06-10 11:14:50 +01:00
// Ensure the disk is at least 500GB in size, which is our current minimum required to be an operator
if service . allocatedDiskSpace < service . Config . MinimumDiskSpace . Int64 ( ) {
2020-08-29 15:36:37 +01:00
service . log . Error ( "Total disk space is less than required minimum" , zap . Int64 ( "bytes" , service . Config . MinimumDiskSpace . Int64 ( ) ) )
2019-06-10 11:14:50 +01:00
return Error . New ( "disk space requirement not met" )
}
2020-07-10 20:36:39 +01:00
2020-07-10 21:01:27 +01:00
group , ctx := errgroup . WithContext ( ctx )
group . Go ( func ( ) error {
2020-08-07 17:19:37 +01:00
return service . VerifyDirReadableLoop . Run ( ctx , func ( ctx context . Context ) error {
2021-09-10 14:05:29 +01:00
err := service . store . VerifyStorageDir ( ctx , service . contact . Local ( ) . ID )
2020-07-10 21:01:27 +01:00
if err != nil {
2020-08-07 17:19:37 +01:00
return Error . New ( "error verifying location and/or readability of storage directory: %v" , err )
}
return nil
} )
} )
group . Go ( func ( ) error {
return service . VerifyDirWritableLoop . Run ( ctx , func ( ctx context . Context ) error {
2021-09-10 14:05:29 +01:00
err := service . store . CheckWritability ( ctx )
2020-08-07 17:19:37 +01:00
if err != nil {
return Error . New ( "error verifying writability of storage directory: %v" , err )
2020-07-10 21:01:27 +01:00
}
return nil
} )
} )
2020-02-26 02:39:44 +00:00
group . Go ( func ( ) error {
return service . Loop . Run ( ctx , func ( ctx context . Context ) error {
err := service . updateNodeInformation ( ctx )
if err != nil {
service . log . Error ( "error during updating node information: " , zap . Error ( err ) )
}
return nil
} )
} )
2020-07-10 21:01:27 +01:00
service . cooldown . Start ( ctx , group , func ( ctx context . Context ) error {
2019-03-18 10:55:06 +00:00
err := service . updateNodeInformation ( ctx )
if err != nil {
service . log . Error ( "error during updating node information: " , zap . Error ( err ) )
2020-02-26 02:39:44 +00:00
return nil
2019-03-18 10:55:06 +00:00
}
2020-02-26 02:39:44 +00:00
err = service . contact . PingSatellites ( ctx , service . Config . NotifyLowDiskCooldown )
if err != nil {
service . log . Error ( "error notifying satellites: " , zap . Error ( err ) )
}
return nil
2019-03-18 10:55:06 +00:00
} )
2020-02-26 02:39:44 +00:00
return group . Wait ( )
}
2020-07-16 15:18:02 +01:00
// NotifyLowDisk reports disk space to satellites if cooldown timer has expired.
2020-02-26 02:39:44 +00:00
func ( service * Service ) NotifyLowDisk ( ) {
service . cooldown . Trigger ( )
2019-03-18 10:55:06 +00:00
}
2019-05-08 12:11:59 +01:00
// Close stops the monitor service.
func ( service * Service ) Close ( ) ( err error ) {
service . Loop . Close ( )
2020-02-26 02:39:44 +00:00
service . cooldown . Close ( )
2019-05-08 12:11:59 +01:00
return nil
}
2019-06-04 13:31:39 +01:00
func ( service * Service ) updateNodeInformation ( ctx context . Context ) ( err error ) {
defer mon . Task ( ) ( & ctx ) ( & err )
2020-07-21 19:39:21 +01:00
freeSpace , err := service . AvailableSpace ( ctx )
2019-03-18 10:55:06 +00:00
if err != nil {
2020-07-21 19:39:21 +01:00
return err
2019-03-18 10:55:06 +00:00
}
2019-09-19 20:56:34 +01:00
service . contact . UpdateSelf ( & pb . NodeCapacity {
2020-07-21 19:39:21 +01:00
FreeDisk : freeSpace ,
2019-04-22 10:07:50 +01:00
} )
2019-03-18 10:55:06 +00:00
return nil
}
2020-07-16 15:18:02 +01:00
// AvailableSpace returns available disk space for upload.
2019-06-04 13:31:39 +01:00
func ( service * Service ) AvailableSpace ( ctx context . Context ) ( _ int64 , err error ) {
defer mon . Task ( ) ( & ctx ) ( & err )
2020-12-26 01:16:43 +00:00
usedSpace , err := service . store . SpaceUsedForPiecesAndTrash ( ctx )
2019-04-15 11:12:22 +01:00
if err != nil {
2020-12-26 01:16:43 +00:00
return 0 , err
2019-04-15 11:12:22 +01:00
}
2020-02-09 00:04:23 +00:00
2020-07-21 19:39:21 +01:00
freeSpaceForStorj := service . allocatedDiskSpace - usedSpace
diskStatus , err := service . store . StorageStatus ( ctx )
if err != nil {
return 0 , Error . Wrap ( err )
}
if diskStatus . DiskFree < freeSpaceForStorj {
freeSpaceForStorj = diskStatus . DiskFree
}
mon . IntVal ( "allocated_space" ) . Observe ( service . allocatedDiskSpace )
2020-02-09 00:04:23 +00:00
mon . IntVal ( "used_space" ) . Observe ( usedSpace )
2020-07-21 19:39:21 +01:00
mon . IntVal ( "available_space" ) . Observe ( freeSpaceForStorj )
2020-02-09 00:04:23 +00:00
2020-07-21 19:39:21 +01:00
return freeSpaceForStorj , nil
2019-04-15 11:12:22 +01:00
}
2020-12-26 01:16:43 +00:00
// DiskSpace returns consolidated disk space state info.
//
// It combines the allocation with the store's usage for pieces and trash and
// the disk's free space. Available is the unused allocation capped at the free
// space; Overused is how far usage exceeds the allocation, or zero. Any store
// error is returned wrapped in the package error class with a zero DiskSpace.
func (service *Service) DiskSpace(ctx context.Context) (_ DiskSpace, err error) {
	defer mon.Task()(&ctx)(&err)

	piecesBytes, _, err := service.store.SpaceUsedForPieces(ctx)
	if err != nil {
		return DiskSpace{}, Error.Wrap(err)
	}
	trashBytes, err := service.store.SpaceUsedForTrash(ctx)
	if err != nil {
		return DiskSpace{}, Error.Wrap(err)
	}
	status, err := service.store.StorageStatus(ctx)
	if err != nil {
		return DiskSpace{}, Error.Wrap(err)
	}

	space := DiskSpace{
		Allocated:     service.allocatedDiskSpace,
		UsedForPieces: piecesBytes,
		UsedForTrash:  trashBytes,
		Free:          status.DiskFree,
	}

	// Unused allocation; a negative value means the node stores more than allocated.
	space.Available = space.Allocated - (piecesBytes + trashBytes)
	if space.Available < 0 {
		space.Overused = -space.Available
	}
	// The disk itself may have less room than the allocation leaves.
	if space.Free < space.Available {
		space.Available = space.Free
	}

	return space, nil
}