storj/satellite/repair/checker/online.go
Michal Niewrzal f2cd7b0928 satellite/overlay: refactor Reliable to be used with repair checker
Currently we are using Reliable to get missing pieces for repair
checker. The issue is that now checker is looking at more things than
just missing pieces (clumped/off, placement pieces) and using only node
ID is not enough. We have issue where we are skipping offline nodes from
clumped and off placement pieces check.

Reliable was refactored to get data (e.g. country, lastNet) about all
reliable nodes. List is split into online and offline. This data will be
cached for quick use by repair checker. It will be also possible to
check nodes metadata like country code or lastNet.

We are also slowly moving `RepairExcludedCountryCodes` config from
overlay to repair which makes more sens for it.

This this first part of changes.

https://github.com/storj/storj/issues/5998

Change-Id: If534342488c0e440affc2894a8fbda6507b8959d
2023-07-05 10:56:31 +02:00

145 lines
4.7 KiB
Go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package checker
import (
"context"
"sync"
"sync/atomic"
"time"
"storj.io/common/storj"
"storj.io/common/storj/location"
"storj.io/storj/satellite/metabase"
"storj.io/storj/satellite/overlay"
)
// ReliabilityCache caches the reliable nodes for the specified staleness duration
// and updates automatically from overlay.
//
// architecture: Service
type ReliabilityCache struct {
overlay *overlay.Service
staleness time.Duration
excludedCountryCodes map[location.CountryCode]struct{}
mu sync.Mutex
state atomic.Value // contains immutable *reliabilityState
}
// reliabilityState.
type reliabilityState struct {
reliable map[storj.NodeID]struct{}
created time.Time
}
// NewReliabilityCache creates a new reliability checking cache.
func NewReliabilityCache(overlay *overlay.Service, staleness time.Duration, excludedCountries []string) *ReliabilityCache {
excludedCountryCodes := make(map[location.CountryCode]struct{})
for _, countryCode := range excludedCountries {
if cc := location.ToCountryCode(countryCode); cc != location.None {
excludedCountryCodes[location.ToCountryCode(countryCode)] = struct{}{}
}
}
return &ReliabilityCache{
overlay: overlay,
staleness: staleness,
excludedCountryCodes: excludedCountryCodes,
}
}
// LastUpdate returns when the cache was last updated, or the zero value (time.Time{}) if it
// has never yet been updated. LastUpdate() does not trigger an update itself.
func (cache *ReliabilityCache) LastUpdate() time.Time {
if state, ok := cache.state.Load().(*reliabilityState); ok {
return state.created
}
return time.Time{}
}
// NumNodes returns the number of online active nodes (as determined by the reliability cache).
// This number is not guaranteed to be consistent with either the nodes database or the
// reliability cache after returning; it is just a best-effort count and should be treated as an
// estimate.
func (cache *ReliabilityCache) NumNodes(ctx context.Context) (numNodes int, err error) {
state, err := cache.loadFast(ctx, time.Time{})
if err != nil {
return 0, err
}
return len(state.reliable), nil
}
// MissingPieces returns piece indices that are unreliable with the given staleness period.
func (cache *ReliabilityCache) MissingPieces(ctx context.Context, created time.Time, pieces metabase.Pieces) (_ []metabase.Piece, err error) {
state, err := cache.loadFast(ctx, created)
if err != nil {
return nil, err
}
var unreliable []metabase.Piece
for _, p := range pieces {
if _, ok := state.reliable[p.StorageNode]; !ok {
unreliable = append(unreliable, p)
}
}
return unreliable, nil
}
func (cache *ReliabilityCache) loadFast(ctx context.Context, validUpTo time.Time) (_ *reliabilityState, err error) {
// This code is designed to be very fast in the case where a refresh is not needed: just an
// atomic load from rarely written to bit of shared memory. The general strategy is to first
// read if the state suffices to answer the query. If not (due to it not existing, being
// too stale, etc.), then we acquire the mutex to block other requests that may be stale
// and ensure we only issue one refresh at a time. After acquiring the mutex, we have to
// double check that the state is still stale because some other call may have beat us to
// the acquisition. Only then do we refresh and can then proceed answering the query.
state, ok := cache.state.Load().(*reliabilityState)
if !ok || validUpTo.After(state.created) || time.Since(state.created) > cache.staleness {
cache.mu.Lock()
state, ok = cache.state.Load().(*reliabilityState)
if !ok || validUpTo.After(state.created) || time.Since(state.created) > cache.staleness {
state, err = cache.refreshLocked(ctx)
}
cache.mu.Unlock()
if err != nil {
return nil, err
}
}
return state, nil
}
// Refresh refreshes the cache.
func (cache *ReliabilityCache) Refresh(ctx context.Context) (err error) {
defer mon.Task()(&ctx)(&err)
cache.mu.Lock()
defer cache.mu.Unlock()
_, err = cache.refreshLocked(ctx)
return err
}
// refreshLocked does the refreshes assuming the write mutex is held.
func (cache *ReliabilityCache) refreshLocked(ctx context.Context) (_ *reliabilityState, err error) {
defer mon.Task()(&ctx)(&err)
online, _, err := cache.overlay.Reliable(ctx)
if err != nil {
return nil, Error.Wrap(err)
}
state := &reliabilityState{
created: time.Now(),
reliable: make(map[storj.NodeID]struct{}, len(online)),
}
for _, node := range online {
if _, ok := cache.excludedCountryCodes[node.CountryCode]; !ok {
state.reliable[node.ID] = struct{}{}
}
}
cache.state.Store(state)
return state, nil
}