// Copyright (C) 2022 Storj Labs, Inc.
// See LICENSE for copying information.

package gracefulexit

import (
	"context"
	"database/sql"
	"time"

	"github.com/zeebo/errs"
	"go.uber.org/zap"

	"storj.io/common/storj"
	"storj.io/storj/satellite/metabase/rangedloop"
	"storj.io/storj/satellite/overlay"
)

// Observer populates the transfer queue for exiting nodes. It also updates the
// timed out status and removes transfer queue items for inactive exiting
// nodes.
type Observer struct {
	log     *zap.Logger
	db      DB
	overlay overlay.DB
	config  Config

	// The following variables are reset on each loop cycle.
	exitingNodes    storj.NodeIDList
	bytesToTransfer map[storj.NodeID]int64
}
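// The ranged loop is expected to drive the Observer through its lifecycle:
// Start once per cycle, one Fork/Join pair per segment range, and Finish
// after all ranges have been processed.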
var _ rangedloop.Observer = (*Observer)(nil)

// NewObserver returns a new ranged loop observer.
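//
// A minimal wiring sketch (hypothetical names; the ranged loop service
// constructor and its arguments may differ between versions):
//
//	observer := gracefulexit.NewObserver(log, db, overlayDB, config)
//	service := rangedloop.NewService(log, loopConfig, provider, []rangedloop.Observer{observer})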
func NewObserver(log *zap.Logger, db DB, overlay overlay.DB, config Config) *Observer {
	return &Observer{
		log:     log,
		db:      db,
		overlay: overlay,
		config:  config,
	}
}

// Start updates the status and clears the transfer queue for inactive exiting
// nodes. It then prepares to populate the transfer queue for newly exiting
// nodes during the ranged loop cycle.
func (obs *Observer) Start(ctx context.Context, startTime time.Time) (err error) {
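	// mon is the package-level monkit scope, declared elsewhere in this
	// package; Task records timing and error telemetry for this method.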
	defer mon.Task()(&ctx)(&err)

	// Determine which exiting nodes have yet to complete a segment loop that
	// queues up related pieces for transfer.
	exitingNodes, err := obs.overlay.GetExitingNodes(ctx)
	if err != nil {
		return err
	}

	nodeCount := len(exitingNodes)
	if nodeCount == 0 {
		return nil
	}

	obs.log.Debug("found exiting nodes", zap.Int("exitingNodes", nodeCount))

	obs.checkForInactiveNodes(ctx, exitingNodes)

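	// Collect only nodes that have not yet completed an exit loop pass; their
	// segments still need to be queued for transfer.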
	obs.exitingNodes = nil
	obs.bytesToTransfer = make(map[storj.NodeID]int64)
	for _, node := range exitingNodes {
		if node.ExitLoopCompletedAt == nil {
			obs.exitingNodes = append(obs.exitingNodes, node.NodeID)
		}
	}
	return nil
}

// Fork returns a path collector that will populate the transfer queue for
// segments belonging to newly exiting nodes for its range.
func (obs *Observer) Fork(ctx context.Context) (_ rangedloop.Partial, err error) {
	defer mon.Task()(&ctx)(&err)

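	// Each range gets its own collector, presumably so that ranges can be
	// processed independently; per-node byte counts are merged back in Join.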
	// TODO: trim out/refactor segmentloop.Observer bits from path collector
	// once segmentloop.Observer is removed.
	return NewPathCollector(obs.log, obs.db, obs.exitingNodes, obs.config.ChoreBatchSize), nil
}

// Join flushes the forked path collector and aggregates collected metrics.
func (obs *Observer) Join(ctx context.Context, partial rangedloop.Partial) (err error) {
	defer mon.Task()(&ctx)(&err)

	pathCollector, ok := partial.(*PathCollector)
	if !ok {
		return Error.New("expected partial type %T but got %T", pathCollector, partial)
	}

	if err := pathCollector.Flush(ctx); err != nil {
		return err
	}

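	// Join is assumed to be called serially by the ranged loop, so the
	// bytesToTransfer map can be updated without extra locking.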
	for nodeID, bytesToTransfer := range pathCollector.nodeIDStorage {
		obs.bytesToTransfer[nodeID] += bytesToTransfer
	}
	return nil
}

// Finish marks that the exit loop has been completed for newly exiting nodes
// that were processed in this loop cycle.
func (obs *Observer) Finish(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	// Record that the exit loop was completed for each node.
	now := time.Now().UTC()
	for nodeID, bytesToTransfer := range obs.bytesToTransfer {
		exitStatus := overlay.ExitStatusRequest{
			NodeID:              nodeID,
			ExitLoopCompletedAt: now,
		}
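		// Log and continue on failure so that one node's failed update does
		// not block recording completion for the rest.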
		if _, err := obs.overlay.UpdateExitStatus(ctx, &exitStatus); err != nil {
			obs.log.Error("error updating exit status", zap.Error(err))
		}
		mon.IntVal("graceful_exit_init_bytes_stored").Observe(bytesToTransfer)
	}
	return nil
}
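// checkForInactiveNodes fails the graceful exit of nodes that completed the
// exit loop but have shown no transfer activity within the configured
// inactivity window, and clears their transfer queues.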
func (obs *Observer) checkForInactiveNodes(ctx context.Context, exitingNodes []*overlay.ExitStatus) {
	for _, node := range exitingNodes {
		if node.ExitLoopCompletedAt == nil {
			// Node has not yet had all of its pieces added to the transfer queue.
			continue
		}

		progress, err := obs.db.GetProgress(ctx, node.NodeID)
		if err != nil && !errs.Is(err, sql.ErrNoRows) {
			obs.log.Error("error retrieving progress for node", zap.Stringer("Node ID", node.NodeID), zap.Error(err))
			continue
		}

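		// Prefer the node's transfer progress timestamp when available;
		// otherwise fall back to when its exit loop completed.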
		lastActivityTime := *node.ExitLoopCompletedAt
		if progress != nil {
			lastActivityTime = progress.UpdatedAt
		}

		// Check the inactivity timeframe; nodes idle for longer than the
		// configured maximum fail their graceful exit.
		if lastActivityTime.Add(obs.config.MaxInactiveTimeFrame).Before(time.Now().UTC()) {
			exitStatusRequest := &overlay.ExitStatusRequest{
				NodeID:         node.NodeID,
				ExitSuccess:    false,
				ExitFinishedAt: time.Now().UTC(),
			}
			mon.Meter("graceful_exit_fail_inactive").Mark(1)
			_, err = obs.overlay.UpdateExitStatus(ctx, exitStatusRequest)
			if err != nil {
				obs.log.Error("error updating exit status", zap.Error(err))
				continue
			}

			// Remove all items from the transfer queue.
			if err := obs.db.DeleteTransferQueueItems(ctx, node.NodeID); err != nil {
				obs.log.Error("error deleting node from transfer queue", zap.Error(err))
			}
		}
	}
}