storj/satellite/downtime/detection_chore.go
Egon Elbre 8dea4f52db satellite: add control panel
Change-Id: Id48246e9bcd4c6ec643277fe740937b2e42ad85b
2020-01-30 08:06:43 -05:00

90 lines
2.5 KiB
Go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package downtime
import (
"context"
"time"
"go.uber.org/zap"
"storj.io/common/sync2"
"storj.io/storj/satellite/overlay"
)
// DetectionChore periodically looks for nodes that have not checked in and
// tries to contact them.
//
// architecture: Chore
type DetectionChore struct {
	// log receives the chore's debug and error messages.
	log *zap.Logger
	// Loop is the cycle that Run executes and Close closes; it is created
	// from config.DetectionInterval.
	Loop *sync2.Cycle
	// config supplies DetectionInterval, used both for the cycle and as the
	// check-in threshold.
	config Config
	// overlay is queried for the nodes without a recent successful check-in.
	overlay *overlay.Service
	// service checks (and updates) the availability of each such node.
	service *Service
	// db stores the offline duration computed for nodes that fail the check.
	db DB
}
// NewDetectionChore builds a DetectionChore from its dependencies. The
// chore's Loop is a new cycle created from config.DetectionInterval.
func NewDetectionChore(log *zap.Logger, config Config, overlay *overlay.Service, service *Service, db DB) *DetectionChore {
	chore := new(DetectionChore)

	chore.log = log
	chore.config = config
	chore.overlay = overlay
	chore.service = service
	chore.db = db
	chore.Loop = sync2.NewCycle(config.DetectionInterval)

	return chore
}
// Run starts the chore and returns whatever the underlying Loop returns.
//
// On every cycle it asks the overlay for nodes that have had no successful
// check-in within config.DetectionInterval, checks each node's availability
// through the service, and stores an offline duration in db for every node
// whose check reports no success. Failures inside a cycle are logged instead
// of returned, so the cycle callback itself always returns nil.
func (chore *DetectionChore) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	return chore.Loop.Run(ctx, func(ctx context.Context) (err error) {
		defer mon.Task()(&ctx)(&err)

		chore.log.Debug("checking for nodes that have not had a successful check-in within the interval.",
			zap.Stringer("interval", chore.config.DetectionInterval))

		nodeLastContacts, err := chore.overlay.GetSuccesfulNodesNotCheckedInSince(ctx, chore.config.DetectionInterval)
		if err != nil {
			// Log and swallow the error so the next cycle can retry.
			chore.log.Error("error retrieving node addresses for downtime detection.", zap.Error(err))
			return nil
		}

		chore.log.Debug("nodes that have not had a successful check-in within the interval.",
			zap.Stringer("interval", chore.config.DetectionInterval),
			zap.Int("count", len(nodeLastContacts)))

		for _, nodeLastContact := range nodeLastContacts {
			// Stop as soon as the context is done rather than attempting a
			// check for every remaining node (each of which would presumably
			// fail and be logged as an error). Returning nil keeps this
			// callback's contract of never reporting an error; the
			// surrounding Loop is expected to observe the cancellation.
			if ctx.Err() != nil {
				return nil
			}

			success, err := chore.service.CheckAndUpdateNodeAvailability(ctx, nodeLastContact.ID, nodeLastContact.Address)
			if err != nil {
				chore.log.Error("error during downtime detection ping back.",
					zap.Bool("success", success),
					zap.Error(err))
				continue
			}
			if success {
				continue
			}

			// Offline time is the time elapsed since the last successful
			// contact, minus the detection interval.
			now := time.Now().UTC()
			duration := now.Sub(nodeLastContact.LastContactSuccess) - chore.config.DetectionInterval

			err = chore.db.Add(ctx, nodeLastContact.ID, now, duration)
			if err != nil {
				chore.log.Error("error adding node seconds offline information.",
					zap.Stringer("node ID", nodeLastContact.ID),
					zap.Stringer("duration", duration),
					zap.Error(err))
			}
		}

		return nil
	})
}
// Close closes the chore by closing its Loop. It always returns nil.
func (chore *DetectionChore) Close() error {
	chore.Loop.Close()

	return nil
}