storj/satellite/downtime/estimation_chore.go
Moby von Briesen c4a9a5d48b satellite/downtime: update detection and estimation downtime chores for
more trustworthy downtime tracking

Detection chore: Do not update downtime at all from the detection chore.
We only want to include downtime between two explicitly failed ping attempts
(the duration between last contact success and the first failed ping is no longer
included in downtime calculation)

Estimation chore: If the satellite started after the last failed ping for a node,
do not include offline time since the last failed ping time - only
estimate based on two failed pings with no satellite downtime in
between.
This protects us from including satellite downtime in our storagenode downtime calculations.

Change-Id: I1fddc9f7255a7023e02474255d70c64faae75b8a
2020-02-10 22:37:01 +00:00

87 lines
2.2 KiB
Go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package downtime
import (
"context"
"time"
"go.uber.org/zap"
"storj.io/common/sync2"
"storj.io/storj/satellite/overlay"
)
// EstimationChore estimates how long nodes have been offline.
//
// On each cycle of Loop it asks the overlay for a batch of offline nodes,
// pings each of them back through the downtime service, and records the time
// elapsed since the node's last contact failure for nodes that are still
// unreachable.
//
// architecture: Chore
type EstimationChore struct {
// log receives the chore's debug and error messages.
log *zap.Logger
// Loop drives the periodic estimation pass; it is created from
// config.EstimationInterval in NewEstimationChore and is exported.
Loop *sync2.Cycle
// config supplies EstimationInterval and EstimationBatchSize.
config Config
// startTime is the UTC time at which the chore was constructed. Run only
// records downtime for nodes whose last contact failure is after it.
startTime time.Time
// overlay is queried for the batch of offline nodes to re-check.
overlay *overlay.Service
// service performs the ping back via CheckAndUpdateNodeAvailability.
service *Service
// db is where the estimated offline durations are added.
db DB
}
// NewEstimationChore instantiates EstimationChore.
//
// The chore's Loop is set up to cycle every config.EstimationInterval, and the
// current UTC time is captured as the chore's start time.
func NewEstimationChore(log *zap.Logger, config Config, overlay *overlay.Service, service *Service, db DB) *EstimationChore {
	chore := new(EstimationChore)

	chore.log = log
	chore.Loop = sync2.NewCycle(config.EstimationInterval)
	chore.config = config
	chore.startTime = time.Now().UTC()

	// Collaborators used by Run on every cycle.
	chore.overlay = overlay
	chore.service = service
	chore.db = db

	return chore
}
// Run starts the chore and returns whatever chore.Loop.Run returns.
//
// On every cycle it fetches up to config.EstimationBatchSize offline nodes from
// the overlay and pings each one back. For a node that is still unreachable and
// whose previous contact failure happened after this chore was created, the
// time between that previous failure and now is added to the downtime DB.
// Failures that predate the chore's start time are skipped, so only the span
// between two failed pings observed by this running chore is counted.
//
// Errors inside a cycle are logged, not returned: the cycle callback always
// returns nil.
func (chore *EstimationChore) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)
	return chore.Loop.Run(ctx, func(ctx context.Context) (err error) {
		defer mon.Task()(&ctx)(&err)

		chore.log.Debug("checking uptime of failed nodes",
			zap.Stringer("interval", chore.config.EstimationInterval))

		offlineNodes, err := chore.overlay.GetOfflineNodesLimited(ctx, chore.config.EstimationBatchSize)
		if err != nil {
			chore.log.Error("error getting offline nodes", zap.Error(err))
			return nil
		}

		for _, node := range offlineNodes {
			// Once the context is done there is no point in attempting the
			// rest of the batch. Return nil, as every other path of this
			// callback does, and leave reporting the cancellation to the loop.
			if ctx.Err() != nil {
				return nil
			}

			success, err := chore.service.CheckAndUpdateNodeAvailability(ctx, node.ID, node.Address)
			if err != nil {
				// Include the node ID so the failure can be attributed, as the
				// DB error log below already does.
				chore.log.Error("error during downtime estimation ping back",
					zap.Stringer("node ID", node.ID),
					zap.Bool("success", success),
					zap.Error(err))
				continue
			}

			// Only count downtime between two failed pings that both happened
			// while this chore was alive.
			if success || !node.LastContactFailure.After(chore.startTime) {
				continue
			}

			now := time.Now().UTC()
			duration := now.Sub(node.LastContactFailure)

			if err := chore.db.Add(ctx, node.ID, now, duration); err != nil {
				chore.log.Error("error adding node seconds offline information.",
					zap.Stringer("node ID", node.ID),
					zap.Stringer("duration", duration),
					zap.Error(err))
			}
		}
		return nil
	})
}
// Close closes the chore by closing its Loop. It always returns nil.
func (chore *EstimationChore) Close() error {
chore.Loop.Close()
return nil
}