storj/satellite/downtime/estimation_chore.go
Natalie Ventura Villasana 6b1829f3c3
satellite/downtime: new chore estimates downtime
Adds EstimationChore to the downtime package, which is an
independent chore that finds offline nodes given a configurable
limit, then uptime checks those nodes, and sets a last contact
success or failure given a response. For failed nodes, the chore
updates the amount of downtime the node has been offline in the
DowntimeTracking table.

Design doc section: https://github.com/storj/storj/blob/master/docs/blueprints/storage-node-downtime-tracking.md#estimating-offline-time
Jira: https://storjlabs.atlassian.net/browse/V3-2545

Change-Id: I60af95803930bf9b33232b248bb20cca6f0e0b5f
2020-01-09 15:05:13 -05:00

86 lines
2.1 KiB
Go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package downtime
import (
"context"
"time"
"go.uber.org/zap"
"storj.io/common/sync2"
"storj.io/storj/satellite/overlay"
)
// EstimationChore estimates how long nodes have been offline.
//
// It periodically fetches a batch of offline nodes, checks each of them, and
// records the elapsed offline time for nodes whose check does not succeed.
//
// architecture: Chore
type EstimationChore struct {
	// log receives the debug and error messages emitted while the chore runs.
	log *zap.Logger
	// Loop drives Run; it is created from config.EstimationInterval.
	Loop sync2.Cycle
	// config supplies EstimationInterval and EstimationBatchSize.
	config Config
	// overlay is queried for the offline nodes to check.
	overlay *overlay.Service
	// service performs the availability check for each offline node.
	service *Service
	// db stores the measured offline duration per node.
	db DB
}
// NewEstimationChore builds an EstimationChore from its dependencies.
//
// The chore's Loop is created from config.EstimationInterval; every other
// argument is stored as given for later use by Run.
func NewEstimationChore(log *zap.Logger, config Config, overlay *overlay.Service, service *Service, db DB) *EstimationChore {
	chore := &EstimationChore{
		// Collaborators used by Run.
		log:     log,
		overlay: overlay,
		service: service,
		db:      db,

		// Scheduling: the interval comes from the same config that is kept
		// on the chore.
		config: config,
		Loop:   *sync2.NewCycle(config.EstimationInterval),
	}

	return chore
}
// Run starts the chore and returns whatever Loop.Run returns.
//
// Each cycle requests up to config.EstimationBatchSize offline nodes from the
// overlay and checks every one of them through the downtime service. When a
// check completes without error but reports no success, the time elapsed
// since the node's LastContactFailure is recorded through the downtime DB.
//
// Errors inside a cycle are logged rather than returned, so the callback
// always hands nil back to the loop and a single bad round does not end it.
func (chore *EstimationChore) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	return chore.Loop.Run(ctx, func(ctx context.Context) (err error) {
		defer mon.Task()(&ctx)(&err)

		chore.log.Debug("checking uptime of failed nodes",
			zap.Stringer("interval", chore.config.EstimationInterval))

		offlineNodes, err := chore.overlay.GetOfflineNodesLimited(ctx, chore.config.EstimationBatchSize)
		if err != nil {
			// Best effort: report the failure and try again on the next cycle.
			chore.log.Error("error getting offline nodes", zap.Error(err))
			return nil
		}

		for _, node := range offlineNodes {
			// Once the context is done, the remaining checks are expected to
			// fail; stop here instead of attempting (and logging an error
			// for) every node left in the batch. Returning nil leaves the
			// decision about how Run ends to the loop, as before.
			if ctx.Err() != nil {
				return nil
			}

			success, err := chore.service.CheckAndUpdateNodeAvailability(ctx, node.ID, node.Address)
			if err != nil {
				// Include the node ID so the failure can be attributed,
				// matching the error log for the DB write below.
				chore.log.Error("error during downtime estimation ping back",
					zap.Stringer("node ID", node.ID),
					zap.Bool("success", success),
					zap.Error(err))
				continue
			}
			if success {
				// The node responded; there is no downtime to record.
				continue
			}

			// node was loaded before the check above, so LastContactFailure
			// is the failure time known when the batch was fetched.
			now := time.Now().UTC()
			duration := now.Sub(node.LastContactFailure)

			err = chore.db.Add(ctx, node.ID, now, duration)
			if err != nil {
				chore.log.Error("error adding node seconds offline information.",
					zap.Stringer("node ID", node.ID),
					zap.Stringer("duration", duration),
					zap.Error(err))
			}
		}

		return nil
	})
}
// Close closes the chore's Loop. It always returns nil.
func (chore *EstimationChore) Close() error {
	chore.Loop.Close()
	return nil
}