90 lines
2.5 KiB
Go
90 lines
2.5 KiB
Go
|
// Copyright (C) 2019 Storj Labs, Inc.
|
||
|
// See LICENSE for copying information.
|
||
|
|
||
|
package downtime
|
||
|
|
||
|
import (
|
||
|
"context"
|
||
|
"time"
|
||
|
|
||
|
"go.uber.org/zap"
|
||
|
|
||
|
"storj.io/common/sync2"
|
||
|
"storj.io/storj/satellite/overlay"
|
||
|
)
|
||
|
|
||
|
// DetectionChore looks for nodes that have not checked in and tries to contact them.
|
||
|
//
|
||
|
// architecture: Chore
|
||
|
type DetectionChore struct {
|
||
|
log *zap.Logger
|
||
|
Loop sync2.Cycle
|
||
|
config Config
|
||
|
overlay *overlay.Service
|
||
|
service *Service
|
||
|
db DB
|
||
|
}
|
||
|
|
||
|
// NewDetectionChore instantiates DetectionChore.
|
||
|
func NewDetectionChore(log *zap.Logger, config Config, overlay *overlay.Service, service *Service, db DB) *DetectionChore {
|
||
|
return &DetectionChore{
|
||
|
log: log,
|
||
|
Loop: *sync2.NewCycle(config.DetectionInterval),
|
||
|
config: config,
|
||
|
overlay: overlay,
|
||
|
service: service,
|
||
|
db: db,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Run starts the chore.
|
||
|
func (chore *DetectionChore) Run(ctx context.Context) (err error) {
|
||
|
defer mon.Task()(&ctx)(&err)
|
||
|
return chore.Loop.Run(ctx, func(ctx context.Context) (err error) {
|
||
|
defer mon.Task()(&ctx)(&err)
|
||
|
|
||
|
chore.log.Debug("checking for nodes that have not had a successful check-in within the interval.",
|
||
|
zap.Stringer("interval", chore.config.DetectionInterval))
|
||
|
|
||
|
nodeLastContacts, err := chore.overlay.GetSuccesfulNodesNotCheckedInSince(ctx, chore.config.DetectionInterval)
|
||
|
if err != nil {
|
||
|
chore.log.Error("error retrieving node addresses for downtime detection.", zap.Error(err))
|
||
|
return nil
|
||
|
}
|
||
|
chore.log.Debug("nodes that have had not had a successful check-in with the interval.",
|
||
|
zap.Stringer("interval", chore.config.DetectionInterval),
|
||
|
zap.Int("count", len(nodeLastContacts)))
|
||
|
|
||
|
for _, nodeLastContact := range nodeLastContacts {
|
||
|
success, err := chore.service.CheckAndUpdateNodeAvailability(ctx, nodeLastContact.ID, nodeLastContact.Address)
|
||
|
if err != nil {
|
||
|
chore.log.Error("error during downtime detection ping back.",
|
||
|
zap.Bool("success", success),
|
||
|
zap.Error(err))
|
||
|
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
if !success {
|
||
|
now := time.Now().UTC()
|
||
|
duration := now.Sub(nodeLastContact.LastContactSuccess) - chore.config.DetectionInterval
|
||
|
|
||
|
err = chore.db.Add(ctx, nodeLastContact.ID, now, duration)
|
||
|
if err != nil {
|
||
|
chore.log.Error("error adding node seconds offline information.",
|
||
|
zap.Stringer("node ID", nodeLastContact.ID),
|
||
|
zap.Stringer("duration", duration),
|
||
|
zap.Error(err))
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return nil
|
||
|
})
|
||
|
}
|
||
|
|
||
|
// Close closes chore.
|
||
|
func (chore *DetectionChore) Close() error {
|
||
|
chore.Loop.Close()
|
||
|
return nil
|
||
|
}
|