storj/satellite/downtime/detection_chore.go

74 lines
2.0 KiB
Go
Raw Normal View History

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package downtime
import (
"context"
"go.uber.org/zap"
"storj.io/common/sync2"
"storj.io/storj/satellite/overlay"
)
// DetectionChore looks for nodes that have not checked in and tries to contact them.
//
// On each cycle of Loop, Run asks the overlay for nodes without a recent
// successful check-in and hands every returned node URL to the downtime
// service for an availability check.
//
// architecture: Chore
type DetectionChore struct {
// log receives the debug and error messages emitted by Run.
log *zap.Logger
// Loop is the cycle that drives Run. NewDetectionChore creates it from
// config.DetectionInterval and Close closes it.
Loop *sync2.Cycle
// config supplies DetectionInterval, which is used both to create Loop and
// as the argument of the overlay query in Run.
config Config
// overlay is queried in Run for nodes that have not had a successful
// check-in (GetSuccesfulNodesNotCheckedInSince).
overlay *overlay.Service
// service performs the per-node check in Run (CheckAndUpdateNodeAvailability).
service *Service
}
// NewDetectionChore builds a DetectionChore from the given logger, config,
// overlay service and downtime service. The chore's Loop is a new cycle
// created from config.DetectionInterval.
func NewDetectionChore(log *zap.Logger, config Config, overlay *overlay.Service, service *Service) *DetectionChore {
	chore := new(DetectionChore)
	chore.log = log
	chore.config = config
	chore.overlay = overlay
	chore.service = service
	chore.Loop = sync2.NewCycle(config.DetectionInterval)
	return chore
}
// Run starts the chore and returns whatever chore.Loop.Run returns.
//
// On every cycle the callback asks the overlay for nodes that have not had a
// successful check-in within config.DetectionInterval and calls
// service.CheckAndUpdateNodeAvailability with the URL of each returned node.
//
// Failures of the overlay query or of an individual availability check are
// logged and not returned from the callback; the callback always returns nil.
func (chore *DetectionChore) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	return chore.Loop.Run(ctx, func(ctx context.Context) (err error) {
		defer mon.Task()(&ctx)(&err)

		chore.log.Debug("checking for nodes that have not had a successful check-in within the interval.",
			zap.Stringer("interval", chore.config.DetectionInterval))

		nodeLastContacts, err := chore.overlay.GetSuccesfulNodesNotCheckedInSince(ctx, chore.config.DetectionInterval)
		if err != nil {
			// Deliberately swallowed after logging: the callback reports
			// success so the query is attempted again on the next cycle.
			chore.log.Error("error retrieving node addresses for downtime detection.", zap.Error(err))
			return nil
		}

		chore.log.Debug("nodes that have not had a successful check-in within the interval.",
			zap.Stringer("interval", chore.config.DetectionInterval),
			zap.Int("count", len(nodeLastContacts)))

		for _, nodeLastContact := range nodeLastContacts {
			// Stop contacting the remaining nodes once the context is
			// cancelled; otherwise every remaining check would be attempted
			// and logged as a failure. nil is returned, consistent with the
			// other paths of this callback.
			// NOTE(review): assumes chore.Loop.Run observes the cancelled
			// context itself and stops cycling - confirm against sync2.Cycle.
			if ctx.Err() != nil {
				return nil
			}

			success, err := chore.service.CheckAndUpdateNodeAvailability(ctx, nodeLastContact.URL)
			if err != nil {
				// A single failing node must not prevent the rest of the
				// batch from being checked.
				chore.log.Error("error during downtime detection ping back.",
					zap.Bool("success", success),
					zap.Error(err))
			}
		}
		return nil
	})
}
// Close closes the chore's Loop. It always returns nil.
func (chore *DetectionChore) Close() error {
	loop := chore.Loop
	loop.Close()
	return nil
}