86 lines
2.1 KiB
Go
86 lines
2.1 KiB
Go
|
// Copyright (C) 2019 Storj Labs, Inc.
|
||
|
// See LICENSE for copying information.
|
||
|
|
||
|
package downtime
|
||
|
|
||
|
import (
|
||
|
"context"
|
||
|
"time"
|
||
|
|
||
|
"go.uber.org/zap"
|
||
|
|
||
|
"storj.io/common/sync2"
|
||
|
|
||
|
"storj.io/storj/satellite/overlay"
|
||
|
)
|
||
|
|
||
|
// EstimationChore estimates how long nodes have been offline.
|
||
|
//
|
||
|
// architecture: Chore
|
||
|
type EstimationChore struct {
|
||
|
log *zap.Logger
|
||
|
Loop sync2.Cycle
|
||
|
config Config
|
||
|
overlay *overlay.Service
|
||
|
service *Service
|
||
|
db DB
|
||
|
}
|
||
|
|
||
|
// NewEstimationChore instantiates EstimationChore.
|
||
|
func NewEstimationChore(log *zap.Logger, config Config, overlay *overlay.Service, service *Service, db DB) *EstimationChore {
|
||
|
return &EstimationChore{
|
||
|
log: log,
|
||
|
Loop: *sync2.NewCycle(config.EstimationInterval),
|
||
|
config: config,
|
||
|
overlay: overlay,
|
||
|
service: service,
|
||
|
db: db,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Run starts the chore.
|
||
|
func (chore *EstimationChore) Run(ctx context.Context) (err error) {
|
||
|
defer mon.Task()(&ctx)(&err)
|
||
|
return chore.Loop.Run(ctx, func(ctx context.Context) (err error) {
|
||
|
defer mon.Task()(&ctx)(&err)
|
||
|
|
||
|
chore.log.Debug("checking uptime of failed nodes",
|
||
|
zap.Stringer("interval", chore.config.EstimationInterval))
|
||
|
|
||
|
offlineNodes, err := chore.overlay.GetOfflineNodesLimited(ctx, chore.config.EstimationBatchSize)
|
||
|
if err != nil {
|
||
|
chore.log.Error("error getting offline nodes", zap.Error(err))
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
for _, node := range offlineNodes {
|
||
|
success, err := chore.service.CheckAndUpdateNodeAvailability(ctx, node.ID, node.Address)
|
||
|
if err != nil {
|
||
|
chore.log.Error("error during downtime estimation ping back",
|
||
|
zap.Bool("success", success),
|
||
|
zap.Error(err))
|
||
|
continue
|
||
|
}
|
||
|
if !success {
|
||
|
now := time.Now().UTC()
|
||
|
duration := now.Sub(node.LastContactFailure)
|
||
|
|
||
|
err = chore.db.Add(ctx, node.ID, now, duration)
|
||
|
if err != nil {
|
||
|
chore.log.Error("error adding node seconds offline information.",
|
||
|
zap.Stringer("node ID", node.ID),
|
||
|
zap.Stringer("duration", duration),
|
||
|
zap.Error(err))
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return nil
|
||
|
})
|
||
|
}
|
||
|
|
||
|
// Close closes chore.
|
||
|
func (chore *EstimationChore) Close() error {
|
||
|
chore.Loop.Close()
|
||
|
return nil
|
||
|
}
|