2020-01-07 21:34:48 +00:00
|
|
|
// Copyright (C) 2019 Storj Labs, Inc.
|
|
|
|
// See LICENSE for copying information.
|
|
|
|
|
|
|
|
package downtime
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"go.uber.org/zap"
|
|
|
|
|
|
|
|
"storj.io/common/sync2"
|
|
|
|
"storj.io/storj/satellite/overlay"
|
|
|
|
)
|
|
|
|
|
|
|
|
// EstimationChore estimates how long nodes have been offline.
|
|
|
|
//
|
|
|
|
// architecture: Chore
|
|
|
|
type EstimationChore struct {
|
2020-02-10 20:55:35 +00:00
|
|
|
log *zap.Logger
|
|
|
|
Loop *sync2.Cycle
|
2020-04-13 19:08:04 +01:00
|
|
|
limiter *sync2.Limiter
|
2020-02-10 20:55:35 +00:00
|
|
|
config Config
|
|
|
|
startTime time.Time
|
|
|
|
overlay *overlay.Service
|
|
|
|
service *Service
|
|
|
|
db DB
|
2020-01-07 21:34:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// NewEstimationChore instantiates EstimationChore.
|
|
|
|
func NewEstimationChore(log *zap.Logger, config Config, overlay *overlay.Service, service *Service, db DB) *EstimationChore {
|
2020-04-13 19:08:04 +01:00
|
|
|
if config.EstimationConcurrencyLimit <= 0 {
|
|
|
|
config.EstimationConcurrencyLimit = 1
|
|
|
|
}
|
2020-01-07 21:34:48 +00:00
|
|
|
return &EstimationChore{
|
2020-02-10 20:55:35 +00:00
|
|
|
log: log,
|
|
|
|
Loop: sync2.NewCycle(config.EstimationInterval),
|
2020-04-13 19:08:04 +01:00
|
|
|
limiter: sync2.NewLimiter(config.EstimationConcurrencyLimit),
|
2020-02-10 20:55:35 +00:00
|
|
|
config: config,
|
|
|
|
startTime: time.Now().UTC(),
|
|
|
|
overlay: overlay,
|
|
|
|
service: service,
|
|
|
|
db: db,
|
2020-01-07 21:34:48 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Run starts the chore.
|
|
|
|
func (chore *EstimationChore) Run(ctx context.Context) (err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
return chore.Loop.Run(ctx, func(ctx context.Context) (err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
|
|
|
chore.log.Debug("checking uptime of failed nodes",
|
|
|
|
zap.Stringer("interval", chore.config.EstimationInterval))
|
|
|
|
|
|
|
|
offlineNodes, err := chore.overlay.GetOfflineNodesLimited(ctx, chore.config.EstimationBatchSize)
|
|
|
|
if err != nil {
|
|
|
|
chore.log.Error("error getting offline nodes", zap.Error(err))
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, node := range offlineNodes {
|
2020-04-13 19:08:04 +01:00
|
|
|
node := node
|
|
|
|
chore.limiter.Go(ctx, func() {
|
|
|
|
success, err := chore.service.CheckAndUpdateNodeAvailability(ctx, node.ID, node.Address)
|
2020-01-07 21:34:48 +00:00
|
|
|
if err != nil {
|
2020-04-13 19:08:04 +01:00
|
|
|
chore.log.Error("error during downtime estimation ping back",
|
|
|
|
zap.Bool("success", success),
|
2020-01-07 21:34:48 +00:00
|
|
|
zap.Error(err))
|
2020-04-13 19:08:04 +01:00
|
|
|
return
|
|
|
|
}
|
|
|
|
if !success && node.LastContactFailure.After(chore.startTime) {
|
|
|
|
now := time.Now().UTC()
|
|
|
|
duration := now.Sub(node.LastContactFailure)
|
|
|
|
|
|
|
|
err = chore.db.Add(ctx, node.ID, now, duration)
|
|
|
|
if err != nil {
|
|
|
|
chore.log.Error("error adding node seconds offline information.",
|
|
|
|
zap.Stringer("node ID", node.ID),
|
|
|
|
zap.Stringer("duration", duration),
|
|
|
|
zap.Error(err))
|
|
|
|
}
|
2020-01-07 21:34:48 +00:00
|
|
|
}
|
2020-04-13 19:08:04 +01:00
|
|
|
})
|
2020-01-07 21:34:48 +00:00
|
|
|
}
|
2020-04-13 19:08:04 +01:00
|
|
|
chore.limiter.Wait()
|
2020-01-07 21:34:48 +00:00
|
|
|
return nil
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// Close closes chore.
|
|
|
|
func (chore *EstimationChore) Close() error {
|
|
|
|
chore.Loop.Close()
|
|
|
|
return nil
|
|
|
|
}
|