storj/satellite/repair/checker/online.go
paul cannon 1b8bd6c082 satellite/repair: unify repair logic
The repair checker and repair worker both need to determine which pieces
are healthy, which are retrievable, and which should be replaced, but
they have been doing it in different ways in different code, which has
been the cause of bugs. The same term could have very similar but subtly
different meanings between the two, causing much confusion.

With this change, the piece- and node-classification logic is
consolidated into one place within the satellite/repair package, so that
both subsystems can use it. This ought to make decision-making code more
concise and more readable.

The consolidated classification logic has been expanded to create more
sets, so that the decision-making code does not need to do as much
precalculation. It should now be clearer in comments and code that a
piece can belong to multiple sets arbitrarily (except where the
definition of the sets makes this logically impossible), and what the
precise meaning of each set is. These sets include Missing, Suspended,
Clumped, OutOfPlacement, InExcludedCountry, ForcingRepair,
UnhealthyRetrievable, Unhealthy, Retrievable, and Healthy.
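
For orientation, a minimal sketch of how such a classification result
might be shaped in Go (the set names are the ones above; representing
each set as a map of piece numbers is an assumption, not necessarily
the actual layout in satellite/repair/classification.go):

    // PiecesCheckResult (sketch): each field is a set of piece numbers.
    // The same piece number may appear in several sets at once; for
    // example, a piece can be both OutOfPlacement and Retrievable.
    type PiecesCheckResult struct {
        Missing              map[uint16]struct{}
        Suspended            map[uint16]struct{}
        Clumped              map[uint16]struct{}
        OutOfPlacement       map[uint16]struct{}
        InExcludedCountry    map[uint16]struct{}
        ForcingRepair        map[uint16]struct{}
        UnhealthyRetrievable map[uint16]struct{}
        Unhealthy            map[uint16]struct{}
        Retrievable          map[uint16]struct{}
        Healthy              map[uint16]struct{}
    }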

Some other side effects of this change:

* CreatePutRepairOrderLimits no longer needs to special-case excluded
  countries; it can just create as many order limits as requested (by
  way of len(newNodes)).
* The repair checker will now queue a segment for repair when there are
  any pieces out of placement. The code calls this "forcing a repair".
* The checker.ReliabilityCache is now accessed by way of a GetNodes()
  function similar to the one on the overlay (a rough sketch of the new
  call pattern follows this list). The classification methods like
  MissingPieces(), OutOfPlacementPieces(), and
  PiecesNodesLastNetsInOrder() are removed in favor of the
  classification logic in satellite/repair/classification.go. This
  means the reliability cache no longer needs access to the placement
  rules or excluded countries list.
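
The sketch referenced in the last item, roughly (the helper names
classifyPieces, queueForRepair, and repairThreshold are hypothetical;
only the GetNodes signature matches the code below):

    // One node ID per piece, in piece order.
    nodes, err := reliabilityCache.GetNodes(ctx, segment.CreatedAt, pieceNodeIDs)
    if err != nil {
        return err
    }
    // Unknown or disqualified nodes come back as zero-valued entries, so
    // the shared classification logic can treat their pieces as unhealthy.
    result := classifyPieces(segment.Pieces, nodes)
    if len(result.ForcingRepair) > 0 || len(result.Healthy) < repairThreshold {
        queueForRepair(ctx, segment)
    }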

Change-Id: I105109fb94ee126952f07d747c6e11131164fadb
2023-09-25 09:42:08 -05:00

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package checker

import (
	"context"
	"sync"
	"sync/atomic"
	"time"

	"storj.io/common/storj"
	"storj.io/storj/satellite/nodeselection"
	"storj.io/storj/satellite/overlay"
)

// ReliabilityCache caches known nodes for the specified staleness duration
// and updates automatically from overlay.
//
// architecture: Service
type ReliabilityCache struct {
	overlay   *overlay.Service
	staleness time.Duration
	mu        sync.Mutex
	state     atomic.Value // contains immutable *reliabilityState
}

// reliabilityState is the immutable payload held in ReliabilityCache.state:
// the known nodes keyed by node ID, and the time at which they were fetched.
type reliabilityState struct {
	nodeByID map[storj.NodeID]nodeselection.SelectedNode
	created  time.Time
}

// NewReliabilityCache creates a new reliability checking cache.
func NewReliabilityCache(overlay *overlay.Service, staleness time.Duration) *ReliabilityCache {
	return &ReliabilityCache{
		overlay:   overlay,
		staleness: staleness,
	}
}

// LastUpdate returns when the cache was last updated, or the zero value (time.Time{}) if it
// has never yet been updated. LastUpdate() does not trigger an update itself.
func (cache *ReliabilityCache) LastUpdate() time.Time {
	if state, ok := cache.state.Load().(*reliabilityState); ok {
		return state.created
	}
	return time.Time{}
}

// NumNodes returns the number of online active nodes (as determined by the reliability cache).
// This number is not guaranteed to be consistent with either the nodes database or the
// reliability cache after returning; it is just a best-effort count and should be treated as an
// estimate.
func (cache *ReliabilityCache) NumNodes(ctx context.Context) (numNodes int, err error) {
	state, err := cache.loadFast(ctx, time.Time{})
	if err != nil {
		return 0, err
	}
	return len(state.nodeByID), nil
}

// GetNodes gets the cached SelectedNode records (valid as of the given time) for each of
// the requested node IDs, and returns them in order. If a node is not in the reliability
// cache (that is, it is unknown or disqualified), an empty SelectedNode will be returned
// for the index corresponding to that node ID.
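//
// For example, a hypothetical caller holding one node ID per piece, in piece
// order, might call:
//
//	nodes, err := cache.GetNodes(ctx, segment.CreatedAt, pieceNodeIDs)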
func (cache *ReliabilityCache) GetNodes(ctx context.Context, validUpTo time.Time, nodeIDs []storj.NodeID) ([]nodeselection.SelectedNode, error) {
	state, err := cache.loadFast(ctx, validUpTo)
	if err != nil {
		return nil, err
	}
	nodes := make([]nodeselection.SelectedNode, len(nodeIDs))
	for i, nodeID := range nodeIDs {
		nodes[i] = state.nodeByID[nodeID]
	}
	return nodes, nil
}

func (cache *ReliabilityCache) loadFast(ctx context.Context, validUpTo time.Time) (_ *reliabilityState, err error) {
	// This code is designed to be very fast in the case where a refresh is not needed: just an
	// atomic load from a rarely written-to bit of shared memory. The general strategy is to first
	// read the state and check whether it suffices to answer the query. If not (due to it not
	// existing, being too stale, etc.), then we acquire the mutex to block other requests that
	// may be stale and ensure we only issue one refresh at a time. After acquiring the mutex, we
	// have to double-check that the state is still stale, because some other call may have
	// beaten us to the acquisition. Only then do we refresh and proceed with answering the query.
	state, ok := cache.state.Load().(*reliabilityState)
	if !ok || validUpTo.After(state.created) || time.Since(state.created) > cache.staleness {
		cache.mu.Lock()
		state, ok = cache.state.Load().(*reliabilityState)
		if !ok || validUpTo.After(state.created) || time.Since(state.created) > cache.staleness {
			state, err = cache.refreshLocked(ctx)
		}
		cache.mu.Unlock()
		if err != nil {
			return nil, err
		}
	}
	return state, nil
}

// Refresh refreshes the cache.
func (cache *ReliabilityCache) Refresh(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	cache.mu.Lock()
	defer cache.mu.Unlock()

	_, err = cache.refreshLocked(ctx)
	return err
}

// refreshLocked performs the actual refresh, assuming the write mutex is held.
func (cache *ReliabilityCache) refreshLocked(ctx context.Context) (_ *reliabilityState, err error) {
	defer mon.Task()(&ctx)(&err)

	selectedNodes, err := cache.overlay.GetParticipatingNodes(ctx)
	if err != nil {
		return nil, Error.Wrap(err)
	}

	state := &reliabilityState{
		created:  time.Now(),
		nodeByID: make(map[storj.NodeID]nodeselection.SelectedNode, len(selectedNodes)),
	}
	for _, node := range selectedNodes {
		state.nodeByID[node.ID] = node
	}
	cache.state.Store(state)
	return state, nil
}