// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package checker

import (
"context"
|
2019-07-15 20:58:39 +01:00
|
|
|
"sync"
|
|
|
|
"sync/atomic"
|
2019-07-08 23:04:35 +01:00
|
|
|
"time"
|
|
|
|
|
2019-12-27 11:48:47 +00:00
|
|
|
"storj.io/common/storj"
|
2023-06-29 09:38:47 +01:00
|
|
|
"storj.io/common/storj/location"
|
2021-04-21 13:42:57 +01:00
|
|
|
"storj.io/storj/satellite/metabase"
|
2023-07-06 13:35:26 +01:00
|
|
|
"storj.io/storj/satellite/nodeselection"
|
2019-07-28 06:55:36 +01:00
|
|
|
"storj.io/storj/satellite/overlay"
|
2019-07-08 23:04:35 +01:00
|
|
|
)

// ReliabilityCache caches the reliable nodes for the specified staleness duration
// and updates automatically from overlay.
//
// architecture: Service
type ReliabilityCache struct {
	overlay   *overlay.Service
	staleness time.Duration
	// excludedCountryCodes defines the countries whose nodes should be treated as unreliable.
	excludedCountryCodes map[location.CountryCode]struct{}
	mu                   sync.Mutex
	state                atomic.Value // contains immutable *reliabilityState
	placementRules       overlay.PlacementRules
}

// reliabilityState is an immutable snapshot of the reliable nodes known to the overlay
// at the time it was created.
type reliabilityState struct {
	reliableOnline map[storj.NodeID]nodeselection.SelectedNode
	reliableAll    map[storj.NodeID]nodeselection.SelectedNode
	created        time.Time
}

// NewReliabilityCache creates a new reliability checking cache.
func NewReliabilityCache(overlay *overlay.Service, staleness time.Duration, placementRules overlay.PlacementRules, excludedCountries []string) *ReliabilityCache {
	excludedCountryCodes := make(map[location.CountryCode]struct{})
	for _, countryCode := range excludedCountries {
		if cc := location.ToCountryCode(countryCode); cc != location.None {
			excludedCountryCodes[cc] = struct{}{}
		}
	}

	return &ReliabilityCache{
		overlay:              overlay,
		staleness:            staleness,
		placementRules:       placementRules,
		excludedCountryCodes: excludedCountryCodes,
	}
}
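
// A minimal construction sketch (illustrative only; overlayService, rules, and the
// country list below are hypothetical values, not part of this package):
//
//	cache := NewReliabilityCache(overlayService, 5*time.Minute, rules, []string{"XX"})
//	if err := cache.Refresh(ctx); err != nil {
//		// handle the refresh error
//	}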

// LastUpdate returns when the cache was last updated, or the zero value (time.Time{}) if it
// has never yet been updated. LastUpdate() does not trigger an update itself.
func (cache *ReliabilityCache) LastUpdate() time.Time {
	if state, ok := cache.state.Load().(*reliabilityState); ok {
		return state.created
	}
	return time.Time{}
}

// NumNodes returns the number of online active nodes (as determined by the reliability cache).
// This number is not guaranteed to be consistent with either the nodes database or the
// reliability cache after returning; it is just a best-effort count and should be treated as an
// estimate.
func (cache *ReliabilityCache) NumNodes(ctx context.Context) (numNodes int, err error) {
	state, err := cache.loadFast(ctx, time.Time{})
	if err != nil {
		return 0, err
	}

	return len(state.reliableOnline), nil
}

// MissingPieces returns the pieces that are stored on unreliable nodes, based on cache state
// within the configured staleness period.
func (cache *ReliabilityCache) MissingPieces(ctx context.Context, created time.Time, pieces metabase.Pieces) (_ metabase.Pieces, err error) {
	state, err := cache.loadFast(ctx, created)
	if err != nil {
		return nil, err
	}

	var unreliable metabase.Pieces
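	// A piece counts as unreliable if its node is not in the online-and-reliable set,
	// or if the node's country is in the excluded set.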
	for _, p := range pieces {
		node, ok := state.reliableOnline[p.StorageNode]
		if !ok {
			unreliable = append(unreliable, p)
		} else if _, excluded := cache.excludedCountryCodes[node.CountryCode]; excluded {
			unreliable = append(unreliable, p)
		}
	}
	return unreliable, nil
}

// OutOfPlacementPieces checks which pieces are out of the segment's placement. A piece's
// placement is determined by the location of the node that stores it.
func (cache *ReliabilityCache) OutOfPlacementPieces(ctx context.Context, created time.Time, pieces metabase.Pieces, placement storj.PlacementConstraint) (_ metabase.Pieces, err error) {
	defer mon.Task()(&ctx)(nil)

	if len(pieces) == 0 {
		return metabase.Pieces{}, nil
	}

	state, err := cache.loadFast(ctx, created)
	if err != nil {
		return nil, err
	}

	var outOfPlacementPieces metabase.Pieces
	nodeFilters := cache.placementRules(placement)
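	// Pieces stored on nodes that the cache does not know about are not reported here;
	// they are instead picked up as unreliable by MissingPieces.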
	for _, p := range pieces {
		if node, ok := state.reliableAll[p.StorageNode]; ok && !nodeFilters.Match(&node) {
			outOfPlacementPieces = append(outOfPlacementPieces, p)
		}
	}

	return outOfPlacementPieces, nil
}

// PiecesNodesLastNetsInOrder returns the /24 subnet for each piece's storage node, in order. If a
// requested node is not in the database or is unreliable, an empty string is returned for
// that node's last_net.
func (cache *ReliabilityCache) PiecesNodesLastNetsInOrder(ctx context.Context, created time.Time, pieces metabase.Pieces) (lastNets []string, err error) {
	defer mon.Task()(&ctx)(nil)

	if len(pieces) == 0 {
		return []string{}, nil
	}

	state, err := cache.loadFast(ctx, created)
	if err != nil {
		return nil, err
	}

	lastNets = make([]string, len(pieces))
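	// Entries for nodes that are not present in the cache keep the zero value "" as last_net.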
	for i, piece := range pieces {
		if node, ok := state.reliableAll[piece.StorageNode]; ok {
			lastNets[i] = node.LastNet
		}
	}
	return lastNets, nil
}

func (cache *ReliabilityCache) loadFast(ctx context.Context, validUpTo time.Time) (_ *reliabilityState, err error) {
	// This code is designed to be very fast in the case where a refresh is not needed: just an
	// atomic load from a rarely-written-to bit of shared memory. The general strategy is to first
	// check whether the state suffices to answer the query. If not (due to it not existing, being
	// too stale, etc.), then we acquire the mutex to block other requests that may be stale
	// and ensure we only issue one refresh at a time. After acquiring the mutex, we have to
	// double-check that the state is still stale because some other call may have beaten us to
	// the acquisition. Only then do we refresh and can then proceed answering the query.

	state, ok := cache.state.Load().(*reliabilityState)
	if !ok || validUpTo.After(state.created) || time.Since(state.created) > cache.staleness {
		cache.mu.Lock()
		state, ok = cache.state.Load().(*reliabilityState)
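		// Re-check under the mutex: another call may have refreshed the state while we
		// were waiting for the lock.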
		if !ok || validUpTo.After(state.created) || time.Since(state.created) > cache.staleness {
			state, err = cache.refreshLocked(ctx)
		}
		cache.mu.Unlock()
		if err != nil {
			return nil, err
		}
	}
	return state, nil
}

// Refresh refreshes the cache.
func (cache *ReliabilityCache) Refresh(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	cache.mu.Lock()
	defer cache.mu.Unlock()

	_, err = cache.refreshLocked(ctx)
	return err
}

// refreshLocked does the refreshes assuming the write mutex is held.
func (cache *ReliabilityCache) refreshLocked(ctx context.Context) (_ *reliabilityState, err error) {
	defer mon.Task()(&ctx)(&err)
	online, offline, err := cache.overlay.Reliable(ctx)
	if err != nil {
		return nil, Error.Wrap(err)
	}

	state := &reliabilityState{
		created:        time.Now(),
		reliableOnline: make(map[storj.NodeID]nodeselection.SelectedNode, len(online)),
		reliableAll:    make(map[storj.NodeID]nodeselection.SelectedNode, len(online)+len(offline)),
	}
	for _, node := range online {
		state.reliableOnline[node.ID] = node
		state.reliableAll[node.ID] = node
	}
	for _, node := range offline {
		state.reliableAll[node.ID] = node
	}

	cache.state.Store(state)
	return state, nil
}