2019-03-18 10:55:06 +00:00
|
|
|
// Copyright (C) 2019 Storj Labs, Inc.
|
|
|
|
// See LICENSE for copying information.
|
|
|
|
|
|
|
|
package monitor
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"time"
|
|
|
|
|
2019-11-08 20:40:39 +00:00
|
|
|
"github.com/spacemonkeygo/monkit/v3"
|
2019-03-18 10:55:06 +00:00
|
|
|
"github.com/zeebo/errs"
|
|
|
|
"go.uber.org/zap"
|
2020-02-26 02:39:44 +00:00
|
|
|
"golang.org/x/sync/errgroup"
|
2019-03-18 10:55:06 +00:00
|
|
|
|
2019-12-27 11:48:47 +00:00
|
|
|
"storj.io/common/memory"
|
|
|
|
"storj.io/common/pb"
|
|
|
|
"storj.io/common/sync2"
|
2019-03-18 10:55:06 +00:00
|
|
|
"storj.io/storj/storagenode/bandwidth"
|
2019-09-19 20:56:34 +01:00
|
|
|
"storj.io/storj/storagenode/contact"
|
2019-03-18 10:55:06 +00:00
|
|
|
"storj.io/storj/storagenode/pieces"
|
|
|
|
)
|
|
|
|
|
|
|
|
var (
|
|
|
|
mon = monkit.Package()
|
|
|
|
|
|
|
|
// Error is the default error class for piecestore monitor errors
|
|
|
|
Error = errs.Class("piecestore monitor")
|
|
|
|
)
|
|
|
|
|
|
|
|
// Config defines parameters for storage node disk and bandwidth usage monitoring.
|
|
|
|
type Config struct {
|
2020-02-26 02:39:44 +00:00
|
|
|
Interval time.Duration `help:"how frequently Kademlia bucket should be refreshed with node stats" default:"1h0m0s"`
|
|
|
|
MinimumDiskSpace memory.Size `help:"how much disk space a node at minimum has to advertise" default:"500GB"`
|
|
|
|
MinimumBandwidth memory.Size `help:"how much bandwidth a node at minimum has to advertise" default:"500GB"`
|
|
|
|
NotifyLowDiskCooldown time.Duration `help:"minimum length of time between capacity reports" default:"10m" hidden:"true"`
|
2019-03-18 10:55:06 +00:00
|
|
|
}
|
|
|
|
|
2019-10-04 21:48:41 +01:00
|
|
|
// Service which monitors disk usage
|
2019-09-10 14:24:16 +01:00
|
|
|
//
|
|
|
|
// architecture: Service
|
2019-03-18 10:55:06 +00:00
|
|
|
type Service struct {
|
|
|
|
log *zap.Logger
|
|
|
|
store *pieces.Store
|
2019-09-19 20:56:34 +01:00
|
|
|
contact *contact.Service
|
2019-03-18 10:55:06 +00:00
|
|
|
usageDB bandwidth.DB
|
|
|
|
allocatedDiskSpace int64
|
|
|
|
allocatedBandwidth int64
|
2020-02-26 02:39:44 +00:00
|
|
|
cooldown *sync2.Cooldown
|
2020-01-29 15:37:50 +00:00
|
|
|
Loop *sync2.Cycle
|
2019-06-10 11:14:50 +01:00
|
|
|
Config Config
|
2019-03-18 10:55:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: should it be responsible for monitoring actual bandwidth as well?
|
|
|
|
|
|
|
|
// NewService creates a new storage node monitoring service.
|
2020-02-26 02:39:44 +00:00
|
|
|
func NewService(log *zap.Logger, store *pieces.Store, contact *contact.Service, usageDB bandwidth.DB, allocatedDiskSpace, allocatedBandwidth int64, interval time.Duration, reportCapacity func(context.Context), config Config) *Service {
|
2019-03-18 10:55:06 +00:00
|
|
|
return &Service{
|
|
|
|
log: log,
|
|
|
|
store: store,
|
2019-09-19 20:56:34 +01:00
|
|
|
contact: contact,
|
2019-03-18 10:55:06 +00:00
|
|
|
usageDB: usageDB,
|
|
|
|
allocatedDiskSpace: allocatedDiskSpace,
|
|
|
|
allocatedBandwidth: allocatedBandwidth,
|
2020-02-26 02:39:44 +00:00
|
|
|
cooldown: sync2.NewCooldown(config.NotifyLowDiskCooldown),
|
2020-01-29 15:37:50 +00:00
|
|
|
Loop: sync2.NewCycle(interval),
|
2019-06-10 11:14:50 +01:00
|
|
|
Config: config,
|
2019-03-18 10:55:06 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Run runs monitor service
|
|
|
|
func (service *Service) Run(ctx context.Context) (err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
|
|
|
// get the disk space details
|
|
|
|
// The returned path ends in a slash only if it represents a root directory, such as "/" on Unix or `C:\` on Windows.
|
2019-06-04 13:31:39 +01:00
|
|
|
storageStatus, err := service.store.StorageStatus(ctx)
|
2019-03-18 10:55:06 +00:00
|
|
|
if err != nil {
|
|
|
|
return Error.Wrap(err)
|
|
|
|
}
|
2019-04-15 11:12:22 +01:00
|
|
|
freeDiskSpace := storageStatus.DiskFree
|
2019-03-18 10:55:06 +00:00
|
|
|
|
|
|
|
totalUsed, err := service.usedSpace(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
usedBandwidth, err := service.usedBandwidth(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
if usedBandwidth > service.allocatedBandwidth {
|
|
|
|
service.log.Warn("Exceed the allowed Bandwidth setting")
|
|
|
|
} else {
|
|
|
|
service.log.Info("Remaining Bandwidth", zap.Int64("bytes", service.allocatedBandwidth-usedBandwidth))
|
|
|
|
}
|
|
|
|
|
|
|
|
// check your hard drive is big enough
|
|
|
|
// first time setup as a piece node server
|
|
|
|
if totalUsed == 0 && freeDiskSpace < service.allocatedDiskSpace {
|
|
|
|
service.allocatedDiskSpace = freeDiskSpace
|
|
|
|
service.log.Warn("Disk space is less than requested. Allocating space", zap.Int64("bytes", service.allocatedDiskSpace))
|
|
|
|
}
|
|
|
|
|
|
|
|
// on restarting the Piece node server, assuming already been working as a node
|
|
|
|
// used above the alloacated space, user changed the allocation space setting
|
|
|
|
// before restarting
|
|
|
|
if totalUsed >= service.allocatedDiskSpace {
|
|
|
|
service.log.Warn("Used more space than allocated. Allocating space", zap.Int64("bytes", service.allocatedDiskSpace))
|
|
|
|
}
|
|
|
|
|
2019-04-15 11:12:22 +01:00
|
|
|
// the available disk space is less than remaining allocated space,
|
2019-03-18 10:55:06 +00:00
|
|
|
// due to change of setting before restarting
|
|
|
|
if freeDiskSpace < service.allocatedDiskSpace-totalUsed {
|
2019-05-06 19:59:30 +01:00
|
|
|
service.allocatedDiskSpace = freeDiskSpace + totalUsed
|
2019-03-18 10:55:06 +00:00
|
|
|
service.log.Warn("Disk space is less than requested. Allocating space", zap.Int64("bytes", service.allocatedDiskSpace))
|
|
|
|
}
|
|
|
|
|
2019-06-10 11:14:50 +01:00
|
|
|
// Ensure the disk is at least 500GB in size, which is our current minimum required to be an operator
|
|
|
|
if service.allocatedDiskSpace < service.Config.MinimumDiskSpace.Int64() {
|
|
|
|
service.log.Error("Total disk space less than required minimum", zap.Int64("bytes", service.Config.MinimumDiskSpace.Int64()))
|
|
|
|
return Error.New("disk space requirement not met")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ensure the bandwidth is at least 500GB
|
|
|
|
if service.allocatedBandwidth < service.Config.MinimumBandwidth.Int64() {
|
|
|
|
service.log.Error("Total Bandwidth available less than required minimum", zap.Int64("bytes", service.Config.MinimumBandwidth.Int64()))
|
|
|
|
return Error.New("bandwidth requirement not met")
|
|
|
|
}
|
|
|
|
|
2020-02-26 02:39:44 +00:00
|
|
|
var group errgroup.Group
|
|
|
|
group.Go(func() error {
|
|
|
|
return service.Loop.Run(ctx, func(ctx context.Context) error {
|
|
|
|
err := service.updateNodeInformation(ctx)
|
|
|
|
if err != nil {
|
|
|
|
service.log.Error("error during updating node information: ", zap.Error(err))
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
})
|
|
|
|
})
|
|
|
|
service.cooldown.Start(ctx, &group, func(ctx context.Context) error {
|
2019-03-18 10:55:06 +00:00
|
|
|
err := service.updateNodeInformation(ctx)
|
|
|
|
if err != nil {
|
|
|
|
service.log.Error("error during updating node information: ", zap.Error(err))
|
2020-02-26 02:39:44 +00:00
|
|
|
return nil
|
2019-03-18 10:55:06 +00:00
|
|
|
}
|
2020-02-26 02:39:44 +00:00
|
|
|
|
|
|
|
err = service.contact.PingSatellites(ctx, service.Config.NotifyLowDiskCooldown)
|
|
|
|
if err != nil {
|
|
|
|
service.log.Error("error notifying satellites: ", zap.Error(err))
|
|
|
|
}
|
|
|
|
return nil
|
2019-03-18 10:55:06 +00:00
|
|
|
})
|
2020-02-26 02:39:44 +00:00
|
|
|
|
|
|
|
return group.Wait()
|
|
|
|
}
|
|
|
|
|
|
|
|
// NotifyLowDisk reports disk space to satellites if cooldown timer has expired
|
|
|
|
func (service *Service) NotifyLowDisk() {
|
|
|
|
service.cooldown.Trigger()
|
2019-03-18 10:55:06 +00:00
|
|
|
}
|
|
|
|
|
2019-05-08 12:11:59 +01:00
|
|
|
// Close stops the monitor service.
|
|
|
|
func (service *Service) Close() (err error) {
|
|
|
|
service.Loop.Close()
|
2020-02-26 02:39:44 +00:00
|
|
|
service.cooldown.Close()
|
2019-05-08 12:11:59 +01:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-06-04 13:31:39 +01:00
|
|
|
func (service *Service) updateNodeInformation(ctx context.Context) (err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
2019-03-18 10:55:06 +00:00
|
|
|
usedSpace, err := service.usedSpace(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return Error.Wrap(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
usedBandwidth, err := service.usedBandwidth(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return Error.Wrap(err)
|
|
|
|
}
|
|
|
|
|
2019-09-19 20:56:34 +01:00
|
|
|
service.contact.UpdateSelf(&pb.NodeCapacity{
|
2019-03-18 10:55:06 +00:00
|
|
|
FreeBandwidth: service.allocatedBandwidth - usedBandwidth,
|
|
|
|
FreeDisk: service.allocatedDiskSpace - usedSpace,
|
2019-04-22 10:07:50 +01:00
|
|
|
})
|
2019-03-18 10:55:06 +00:00
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-06-04 13:31:39 +01:00
|
|
|
func (service *Service) usedSpace(ctx context.Context) (_ int64, err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-12-21 13:11:24 +00:00
|
|
|
usedSpace, err := service.store.SpaceUsedForPiecesAndTrash(ctx)
|
2019-03-18 10:55:06 +00:00
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
return usedSpace, nil
|
|
|
|
}
|
|
|
|
|
2019-06-04 13:31:39 +01:00
|
|
|
func (service *Service) usedBandwidth(ctx context.Context) (_ int64, err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
2020-01-31 23:25:52 +00:00
|
|
|
usage, err := service.usageDB.MonthSummary(ctx, time.Now())
|
2019-03-18 10:55:06 +00:00
|
|
|
if err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
2019-07-09 01:33:50 +01:00
|
|
|
return usage, nil
|
2019-03-18 10:55:06 +00:00
|
|
|
}
|
|
|
|
|
2019-04-15 11:12:22 +01:00
|
|
|
// AvailableSpace returns available disk space for upload
|
2019-06-04 13:31:39 +01:00
|
|
|
func (service *Service) AvailableSpace(ctx context.Context) (_ int64, err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-12-21 13:11:24 +00:00
|
|
|
usedSpace, err := service.usedSpace(ctx)
|
2019-04-15 11:12:22 +01:00
|
|
|
if err != nil {
|
|
|
|
return 0, Error.Wrap(err)
|
|
|
|
}
|
|
|
|
allocatedSpace := service.allocatedDiskSpace
|
2020-02-09 00:04:23 +00:00
|
|
|
|
|
|
|
mon.IntVal("allocated_space").Observe(allocatedSpace)
|
|
|
|
mon.IntVal("used_space").Observe(usedSpace)
|
|
|
|
mon.IntVal("available_space").Observe(allocatedSpace - usedSpace)
|
|
|
|
|
2019-04-15 11:12:22 +01:00
|
|
|
return allocatedSpace - usedSpace, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// AvailableBandwidth returns available bandwidth for upload/download
|
2019-06-04 13:31:39 +01:00
|
|
|
func (service *Service) AvailableBandwidth(ctx context.Context) (_ int64, err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
2020-01-31 23:25:52 +00:00
|
|
|
usage, err := service.usageDB.MonthSummary(ctx, time.Now())
|
2019-04-15 11:12:22 +01:00
|
|
|
if err != nil {
|
|
|
|
return 0, Error.Wrap(err)
|
|
|
|
}
|
|
|
|
allocatedBandwidth := service.allocatedBandwidth
|
2019-11-21 19:51:40 +00:00
|
|
|
|
|
|
|
mon.IntVal("allocated_bandwidth").Observe(allocatedBandwidth) //locked
|
|
|
|
mon.IntVal("used_bandwidth").Observe(usage) //locked
|
2020-02-09 00:04:23 +00:00
|
|
|
mon.IntVal("available_bandwidth").Observe(allocatedBandwidth - usage)
|
2019-11-21 19:51:40 +00:00
|
|
|
|
2019-07-09 01:33:50 +01:00
|
|
|
return allocatedBandwidth - usage, nil
|
2019-03-18 10:55:06 +00:00
|
|
|
}
|