// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package checker

import (
	"context"
	"sync"
	"sync/atomic"
	"time"

	"github.com/zeebo/errs"

	"storj.io/common/storj"
	"storj.io/storj/satellite/nodeselection"
	"storj.io/storj/satellite/overlay"
)

// ReliabilityCache caches known nodes for the specified staleness duration
// and updates automatically from overlay.
//
// architecture: Service
type ReliabilityCache struct {
	overlay   *overlay.Service
	staleness time.Duration

	mu    sync.Mutex
	state atomic.Value // contains immutable *reliabilityState
}
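
// Concurrency note: readers load state through the atomic.Value without
// taking mu; mu only serializes writers, so at most one overlay refresh is
// in flight at a time (see loadFast below).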

// reliabilityState is an immutable snapshot of the participating nodes known
// to the overlay, keyed by node ID, together with its creation time.
type reliabilityState struct {
	nodeByID map[storj.NodeID]nodeselection.SelectedNode
	created  time.Time
}

// NewReliabilityCache creates a new reliability checking cache.
func NewReliabilityCache(overlay *overlay.Service, staleness time.Duration) *ReliabilityCache {
	return &ReliabilityCache{
		overlay:   overlay,
		staleness: staleness,
	}
}
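
// A minimal usage sketch (overlayService and the five-minute staleness are
// assumed example values, not prescribed ones):
//
//	cache := NewReliabilityCache(overlayService, 5*time.Minute)
//	if err := cache.Refresh(ctx); err != nil {
//		// optional warm-up; GetNodes and NumNodes refresh lazily anyway
//		return err
//	}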

// LastUpdate returns when the cache was last updated, or the zero value (time.Time{}) if it
// has never yet been updated. LastUpdate() does not trigger an update itself.
func (cache *ReliabilityCache) LastUpdate() time.Time {
	if state, ok := cache.state.Load().(*reliabilityState); ok {
		return state.created
	}
	return time.Time{}
}

// NumNodes returns the number of online active nodes (as determined by the reliability cache).
// This number is not guaranteed to be consistent with either the nodes database or the
// reliability cache after returning; it is just a best-effort count and should be treated as an
// estimate. Among other things, the repair checker uses this estimate of the total number of
// active nodes in its survivability-based segment health calculation.
func (cache *ReliabilityCache) NumNodes(ctx context.Context) (numNodes int, err error) {
	state, err := cache.loadFast(ctx, time.Time{})
	if err != nil {
		return 0, err
	}
	return len(state.nodeByID), nil
}

// GetNodes gets the cached SelectedNode records (valid as of the given time) for each of
// the requested node IDs, and returns them in order. If a node is not in the reliability
// cache (that is, it is unknown or disqualified), an empty SelectedNode will be returned
// for the index corresponding to that node ID.
//
// The selectedNodes slice will be filled with the result nodes and returned. Its length
// must be equal to the length of the nodeIDs slice.
func (cache *ReliabilityCache) GetNodes(ctx context.Context, validUpTo time.Time, nodeIDs []storj.NodeID, selectedNodes []nodeselection.SelectedNode) ([]nodeselection.SelectedNode, error) {
	state, err := cache.loadFast(ctx, validUpTo)
	if err != nil {
		return nil, err
	}

	if len(nodeIDs) != len(selectedNodes) {
		return nil, errs.New("nodeIDs length must be equal to selectedNodes: want %d have %d", len(nodeIDs), len(selectedNodes))
	}

	for i, nodeID := range nodeIDs {
		selectedNodes[i] = state.nodeByID[nodeID]
	}
	return selectedNodes, nil
}
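
// A hedged sketch of a typical call (pieceNodeIDs is a hypothetical slice of
// node IDs gathered from a segment's pieces; time.Time{} accepts any cached
// state, while a later validUpTo forces fresher data):
//
//	nodes, err := cache.GetNodes(ctx, time.Time{}, pieceNodeIDs,
//		make([]nodeselection.SelectedNode, len(pieceNodeIDs)))
//	if err != nil {
//		return err
//	}
//	// nodes[i] is the zero SelectedNode when pieceNodeIDs[i] was unknown
//	// or disqualified; callers can test nodes[i].ID == (storj.NodeID{}).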

func (cache *ReliabilityCache) loadFast(ctx context.Context, validUpTo time.Time) (_ *reliabilityState, err error) {
	// This code is designed to be very fast in the case where a refresh is not needed: just an
	// atomic load from a rarely written bit of shared memory. The general strategy is
	// double-checked locking: first check whether the cached state suffices to answer the
	// query. If not (because it does not exist yet, is too stale, etc.), acquire the mutex to
	// block other stale requests and to ensure only one refresh is issued at a time. After
	// acquiring the mutex, check again that the state is still stale, because some other call
	// may have beaten us to the acquisition. Only then refresh, and proceed to answer the
	// query.

	state, ok := cache.state.Load().(*reliabilityState)
	if !ok || validUpTo.After(state.created) || time.Since(state.created) > cache.staleness {
		cache.mu.Lock()
		state, ok = cache.state.Load().(*reliabilityState)
		if !ok || validUpTo.After(state.created) || time.Since(state.created) > cache.staleness {
			state, err = cache.refreshLocked(ctx)
		}
		cache.mu.Unlock()
		if err != nil {
			return nil, err
		}
	}
	return state, nil
}

// Refresh refreshes the cache.
func (cache *ReliabilityCache) Refresh(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	cache.mu.Lock()
	defer cache.mu.Unlock()

	_, err = cache.refreshLocked(ctx)
	return err
}

// refreshLocked performs a refresh, assuming the write mutex is held.
func (cache *ReliabilityCache) refreshLocked(ctx context.Context) (_ *reliabilityState, err error) {
	defer mon.Task()(&ctx)(&err)

	selectedNodes, err := cache.overlay.GetParticipatingNodes(ctx)
	if err != nil {
		return nil, Error.Wrap(err)
	}

	state := &reliabilityState{
		created:  time.Now(),
		nodeByID: make(map[storj.NodeID]nodeselection.SelectedNode, len(selectedNodes)),
	}
	for _, node := range selectedNodes {
		state.nodeByID[node.ID] = node
	}

	cache.state.Store(state)
	return state, nil
}