storj/satellite/downtime/estimation_chore.go

96 lines
2.5 KiB
Go
Raw Normal View History

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package downtime
import (
"context"
"time"
"go.uber.org/zap"
"storj.io/common/sync2"
"storj.io/storj/satellite/overlay"
)
// EstimationChore estimates how long nodes have been offline.
//
// architecture: Chore
type EstimationChore struct {
log *zap.Logger
Loop *sync2.Cycle
limiter *sync2.Limiter
config Config
startTime time.Time
overlay *overlay.Service
service *Service
db DB
}
// NewEstimationChore instantiates EstimationChore.
func NewEstimationChore(log *zap.Logger, config Config, overlay *overlay.Service, service *Service, db DB) *EstimationChore {
if config.EstimationConcurrencyLimit <= 0 {
config.EstimationConcurrencyLimit = 1
}
return &EstimationChore{
log: log,
Loop: sync2.NewCycle(config.EstimationInterval),
limiter: sync2.NewLimiter(config.EstimationConcurrencyLimit),
config: config,
startTime: time.Now().UTC(),
overlay: overlay,
service: service,
db: db,
}
}
// Run starts the chore.
func (chore *EstimationChore) Run(ctx context.Context) (err error) {
defer mon.Task()(&ctx)(&err)
return chore.Loop.Run(ctx, func(ctx context.Context) (err error) {
defer mon.Task()(&ctx)(&err)
chore.log.Debug("checking uptime of failed nodes",
zap.Stringer("interval", chore.config.EstimationInterval))
offlineNodes, err := chore.overlay.GetOfflineNodesLimited(ctx, chore.config.EstimationBatchSize)
if err != nil {
chore.log.Error("error getting offline nodes", zap.Error(err))
return nil
}
for _, node := range offlineNodes {
node := node
chore.limiter.Go(ctx, func() {
success, err := chore.service.CheckAndUpdateNodeAvailability(ctx, node.URL)
if err != nil {
chore.log.Error("error during downtime estimation ping back",
zap.Bool("success", success),
zap.Error(err))
return
}
if !success && node.LastContactFailure.After(chore.startTime) {
now := time.Now().UTC()
duration := now.Sub(node.LastContactFailure)
err = chore.db.Add(ctx, node.URL.ID, now, duration)
if err != nil {
chore.log.Error("error adding node seconds offline information.",
zap.Stringer("node ID", node.URL.ID),
zap.Stringer("duration", duration),
zap.Error(err))
}
}
})
}
chore.limiter.Wait()
return nil
})
}
// Close closes chore.
func (chore *EstimationChore) Close() error {
chore.Loop.Close()
return nil
}