satellite/repair: add a repair health function
This will be used to rank segments in need of repair for attention by the repair workers. Change-Id: I5b70650cec933696b4c6d73bb7efb97e3efdf24a
This commit is contained in:
parent
31533ed1a1
commit
3e56403599
146
satellite/repair/priority.go
Normal file
146
satellite/repair/priority.go
Normal file
@ -0,0 +1,146 @@
|
|||||||
|
// Copyright (C) 2020 Storj Labs, Inc.
|
||||||
|
// See LICENSE for copying information.
|
||||||
|
|
||||||
|
package repair
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SegmentHealth returns a value corresponding to the health of a segment
|
||||||
|
// in the repair queue. Lower health segments should be repaired first.
|
||||||
|
func SegmentHealth(numHealthy, minPieces int, failureRate float64) float64 {
|
||||||
|
return 1.0 / SegmentDanger(numHealthy, minPieces, failureRate)
|
||||||
|
}
|
||||||
|
|
||||||
|
// SegmentDanger returns the chance of a segment with the given minPieces
|
||||||
|
// and the given number of healthy pieces of being lost in the next time
|
||||||
|
// period.
|
||||||
|
//
|
||||||
|
// It assumes:
|
||||||
|
//
|
||||||
|
// * Nodes fail at the given failureRate (i.e., each node has a failureRate
|
||||||
|
// chance of going offline within the next time period).
|
||||||
|
// * Node failures are entirely independent. Obviously this is not the case,
|
||||||
|
// because many nodes may be operated by a single entity or share network
|
||||||
|
// infrastructure, in which case their failures would be correlated. But we
|
||||||
|
// can't easily model that, so our best hope is to try to avoid putting
|
||||||
|
// pieces for the same segment on related nodes to maximize failure
|
||||||
|
// independence.
|
||||||
|
//
|
||||||
|
// (The "time period" we are talking about here could be anything. The returned
|
||||||
|
// danger value will be given in terms of whatever time period was used to
|
||||||
|
// determine failureRate. If it simplifies things, you can think of the time
|
||||||
|
// period as "one repair worker iteration".)
|
||||||
|
//
|
||||||
|
// If those things are true, then the number of nodes holding this segment
|
||||||
|
// that will go offline follows the Binomial distribution:
|
||||||
|
//
|
||||||
|
// X ~ Binom(numHealthy, failureRate)
|
||||||
|
//
|
||||||
|
// A segment is lost if the number of nodes that go offline is higher than
|
||||||
|
// (numHealthy - minPieces). So we want to find
|
||||||
|
//
|
||||||
|
// Pr[X > (numHealthy - minPieces)]
|
||||||
|
//
|
||||||
|
// If we invert the logic here, we can use the standard CDF for the binomial
|
||||||
|
// distribution.
|
||||||
|
//
|
||||||
|
// Pr[X > (numHealthy - minPieces)] = 1 - Pr[X <= (numHealthy - minPieces)]
|
||||||
|
//
|
||||||
|
// And that gives us the danger value.
|
||||||
|
func SegmentDanger(numHealthy, minPieces int, failureRate float64) float64 {
|
||||||
|
return 1.0 - binomialCDF(float64(numHealthy-minPieces), float64(numHealthy), failureRate)
|
||||||
|
}
|
||||||
|
|
||||||
|
// math.Lgamma without the returned sign parameter; it's unneeded here.
|
||||||
|
func lnGamma(x float64) float64 {
|
||||||
|
lg, _ := math.Lgamma(x)
|
||||||
|
return lg
|
||||||
|
}
|
||||||
|
|
||||||
|
// The following functions are based on code from
|
||||||
|
// Numerical Recipes in C, Second Edition, Section 6.4 (pp. 227-228).
|
||||||
|
|
||||||
|
// betaI calculates the incomplete beta function I_x(a, b).
|
||||||
|
func betaI(a, b, x float64) float64 {
|
||||||
|
if x < 0.0 || x > 1.0 {
|
||||||
|
return math.NaN()
|
||||||
|
}
|
||||||
|
bt := 0.0
|
||||||
|
if x > 0.0 && x < 1.0 {
|
||||||
|
// factors in front of the continued function
|
||||||
|
bt = math.Exp(lnGamma(a+b) - lnGamma(a) - lnGamma(b) + a*math.Log(x) + b*math.Log(1.0-x))
|
||||||
|
}
|
||||||
|
if x < (a+1.0)/(a+b+2.0) {
|
||||||
|
// use continued fraction directly
|
||||||
|
return bt * betaCF(a, b, x) / a
|
||||||
|
}
|
||||||
|
// use continued fraction after making the symmetry transformation
|
||||||
|
return 1.0 - bt*betaCF(b, a, 1.0-x)/b
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
// unlikely to go this far, as betaCF is expected to converge quickly for
|
||||||
|
// typical values.
|
||||||
|
maxIter = 100
|
||||||
|
|
||||||
|
// betaI outputs will be accurate to within this amount.
|
||||||
|
epsilon = 1.0e-14
|
||||||
|
)
|
||||||
|
|
||||||
|
// betaCF evaluates the continued fraction for the incomplete beta function
|
||||||
|
// by a modified Lentz's method.
|
||||||
|
func betaCF(a, b, x float64) float64 {
|
||||||
|
avoidZero := func(f float64) float64 {
|
||||||
|
if math.Abs(f) < math.SmallestNonzeroFloat64 {
|
||||||
|
return math.SmallestNonzeroFloat64
|
||||||
|
}
|
||||||
|
return f
|
||||||
|
}
|
||||||
|
|
||||||
|
qab := a + b
|
||||||
|
qap := a + 1.0
|
||||||
|
qam := a - 1.0
|
||||||
|
c := 1.0
|
||||||
|
d := 1.0 / avoidZero(1.0-qab*x/qap)
|
||||||
|
h := d
|
||||||
|
|
||||||
|
for m := 1; m <= maxIter; m++ {
|
||||||
|
m := float64(m)
|
||||||
|
m2 := 2.0 * m
|
||||||
|
aa := m * (b - m) * x / ((qam + m2) * (a + m2))
|
||||||
|
// one step (the even one) of the recurrence
|
||||||
|
d = 1.0 / avoidZero(1.0+aa*d)
|
||||||
|
c = avoidZero(1.0 + aa/c)
|
||||||
|
h *= d * c
|
||||||
|
aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2))
|
||||||
|
// next step of the recurrence (the odd one)
|
||||||
|
d = 1.0 / avoidZero(1.0+aa*d)
|
||||||
|
c = avoidZero(1.0 + aa/c)
|
||||||
|
del := d * c
|
||||||
|
h *= del
|
||||||
|
if math.Abs(del-1.0) < epsilon {
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// a or b too big, or maxIter too small
|
||||||
|
return math.NaN()
|
||||||
|
}
|
||||||
|
|
||||||
|
// binomialCDF evaluates the CDF of the binomial distribution Binom(n, p) at k.
|
||||||
|
// This is done using (1-p)**(n-k) when k is 0, or with the incomplete beta
|
||||||
|
// function otherwise.
|
||||||
|
func binomialCDF(k, n, p float64) float64 {
|
||||||
|
k = math.Floor(k)
|
||||||
|
if k < 0.0 || n < k {
|
||||||
|
return math.NaN()
|
||||||
|
}
|
||||||
|
if k == n {
|
||||||
|
return 1.0
|
||||||
|
}
|
||||||
|
if k == 0 {
|
||||||
|
return math.Pow(1.0-p, n-k)
|
||||||
|
}
|
||||||
|
return betaI(n-k, k+1.0, 1.0-p)
|
||||||
|
}
|
65
satellite/repair/priority_test.go
Normal file
65
satellite/repair/priority_test.go
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
// Copyright (C) 2020 Storj Labs, Inc.
|
||||||
|
// See LICENSE for copying information.
|
||||||
|
|
||||||
|
package repair
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestBetaI(t *testing.T) {
|
||||||
|
// check a few places where betaI has some easily representable values
|
||||||
|
assert.Equal(t, 0.0, betaI(0.5, 5, 0))
|
||||||
|
assert.Equal(t, 0.0, betaI(1, 3, 0))
|
||||||
|
assert.Equal(t, 0.0, betaI(8, 10, 0))
|
||||||
|
assert.Equal(t, 0.0, betaI(8, 10, 0))
|
||||||
|
assert.InDelta(t, 0.5, betaI(0.5, 0.5, 0.5), epsilon)
|
||||||
|
assert.InDelta(t, 1.0/3.0, betaI(0.5, 0.5, 0.25), epsilon)
|
||||||
|
assert.InDelta(t, 0.488, betaI(1, 3, 0.2), epsilon)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkBetaI(b *testing.B) {
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
assert.InDelta(b, 1.0/3.0, betaI(0.5, 0.5, 0.25), epsilon)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSegmentDanger(t *testing.T) {
|
||||||
|
const failureRate = 0.01
|
||||||
|
assert.Greater(t,
|
||||||
|
SegmentDanger(11, 10, failureRate),
|
||||||
|
SegmentDanger(10, 5, failureRate))
|
||||||
|
assert.Greater(t,
|
||||||
|
SegmentDanger(11, 10, failureRate),
|
||||||
|
SegmentDanger(10, 9, failureRate))
|
||||||
|
assert.Greater(t,
|
||||||
|
SegmentDanger(10, 10, failureRate),
|
||||||
|
SegmentDanger(9, 9, failureRate))
|
||||||
|
assert.Less(t,
|
||||||
|
SegmentDanger(11, 10, failureRate),
|
||||||
|
SegmentDanger(12, 11, failureRate))
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSegmentHealth(t *testing.T) {
|
||||||
|
const failureRate = 0.01
|
||||||
|
assert.Less(t,
|
||||||
|
SegmentHealth(11, 10, failureRate),
|
||||||
|
SegmentHealth(10, 5, failureRate))
|
||||||
|
assert.Less(t,
|
||||||
|
SegmentHealth(11, 10, failureRate),
|
||||||
|
SegmentHealth(10, 9, failureRate))
|
||||||
|
assert.Less(t,
|
||||||
|
SegmentHealth(10, 10, failureRate),
|
||||||
|
SegmentHealth(9, 9, failureRate))
|
||||||
|
assert.Greater(t,
|
||||||
|
SegmentHealth(11, 10, failureRate),
|
||||||
|
SegmentHealth(12, 11, failureRate))
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSegmentHealthForDecayedSegment(t *testing.T) {
|
||||||
|
const failureRate = 0.01
|
||||||
|
assert.True(t, math.IsNaN(SegmentHealth(9, 10, failureRate)))
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user