satellite/repair: add a repair health function

This will be used to rank segments in need of repair for attention by
the repair workers.

Change-Id: I5b70650cec933696b4c6d73bb7efb97e3efdf24a
This commit is contained in:
paul cannon 2020-10-28 13:35:47 -05:00 committed by Jennifer Li Johnson
parent 31533ed1a1
commit 3e56403599
2 changed files with 211 additions and 0 deletions

View File

@ -0,0 +1,146 @@
// Copyright (C) 2020 Storj Labs, Inc.
// See LICENSE for copying information.
package repair
import (
"math"
)
// SegmentHealth returns a value corresponding to the health of a segment
// in the repair queue. Lower health segments should be repaired first.
func SegmentHealth(numHealthy, minPieces int, failureRate float64) float64 {
return 1.0 / SegmentDanger(numHealthy, minPieces, failureRate)
}
// SegmentDanger returns the chance of a segment with the given minPieces
// and the given number of healthy pieces of being lost in the next time
// period.
//
// It assumes:
//
// * Nodes fail at the given failureRate (i.e., each node has a failureRate
// chance of going offline within the next time period).
// * Node failures are entirely independent. Obviously this is not the case,
// because many nodes may be operated by a single entity or share network
// infrastructure, in which case their failures would be correlated. But we
// can't easily model that, so our best hope is to try to avoid putting
// pieces for the same segment on related nodes to maximize failure
// independence.
//
// (The "time period" we are talking about here could be anything. The returned
// danger value will be given in terms of whatever time period was used to
// determine failureRate. If it simplifies things, you can think of the time
// period as "one repair worker iteration".)
//
// If those things are true, then the number of nodes holding this segment
// that will go offline follows the Binomial distribution:
//
// X ~ Binom(numHealthy, failureRate)
//
// A segment is lost if the number of nodes that go offline is higher than
// (numHealthy - minPieces). So we want to find
//
// Pr[X > (numHealthy - minPieces)]
//
// If we invert the logic here, we can use the standard CDF for the binomial
// distribution.
//
// Pr[X > (numHealthy - minPieces)] = 1 - Pr[X <= (numHealthy - minPieces)]
//
// And that gives us the danger value.
func SegmentDanger(numHealthy, minPieces int, failureRate float64) float64 {
return 1.0 - binomialCDF(float64(numHealthy-minPieces), float64(numHealthy), failureRate)
}
// math.Lgamma without the returned sign parameter; it's unneeded here.
func lnGamma(x float64) float64 {
lg, _ := math.Lgamma(x)
return lg
}
// The following functions are based on code from
// Numerical Recipes in C, Second Edition, Section 6.4 (pp. 227-228).
// betaI calculates the incomplete beta function I_x(a, b).
func betaI(a, b, x float64) float64 {
if x < 0.0 || x > 1.0 {
return math.NaN()
}
bt := 0.0
if x > 0.0 && x < 1.0 {
// factors in front of the continued function
bt = math.Exp(lnGamma(a+b) - lnGamma(a) - lnGamma(b) + a*math.Log(x) + b*math.Log(1.0-x))
}
if x < (a+1.0)/(a+b+2.0) {
// use continued fraction directly
return bt * betaCF(a, b, x) / a
}
// use continued fraction after making the symmetry transformation
return 1.0 - bt*betaCF(b, a, 1.0-x)/b
}
const (
// unlikely to go this far, as betaCF is expected to converge quickly for
// typical values.
maxIter = 100
// betaI outputs will be accurate to within this amount.
epsilon = 1.0e-14
)
// betaCF evaluates the continued fraction for the incomplete beta function
// by a modified Lentz's method.
func betaCF(a, b, x float64) float64 {
avoidZero := func(f float64) float64 {
if math.Abs(f) < math.SmallestNonzeroFloat64 {
return math.SmallestNonzeroFloat64
}
return f
}
qab := a + b
qap := a + 1.0
qam := a - 1.0
c := 1.0
d := 1.0 / avoidZero(1.0-qab*x/qap)
h := d
for m := 1; m <= maxIter; m++ {
m := float64(m)
m2 := 2.0 * m
aa := m * (b - m) * x / ((qam + m2) * (a + m2))
// one step (the even one) of the recurrence
d = 1.0 / avoidZero(1.0+aa*d)
c = avoidZero(1.0 + aa/c)
h *= d * c
aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2))
// next step of the recurrence (the odd one)
d = 1.0 / avoidZero(1.0+aa*d)
c = avoidZero(1.0 + aa/c)
del := d * c
h *= del
if math.Abs(del-1.0) < epsilon {
return h
}
}
// a or b too big, or maxIter too small
return math.NaN()
}
// binomialCDF evaluates the CDF of the binomial distribution Binom(n, p) at k.
// This is done using (1-p)**(n-k) when k is 0, or with the incomplete beta
// function otherwise.
func binomialCDF(k, n, p float64) float64 {
k = math.Floor(k)
if k < 0.0 || n < k {
return math.NaN()
}
if k == n {
return 1.0
}
if k == 0 {
return math.Pow(1.0-p, n-k)
}
return betaI(n-k, k+1.0, 1.0-p)
}

View File

@ -0,0 +1,65 @@
// Copyright (C) 2020 Storj Labs, Inc.
// See LICENSE for copying information.
package repair
import (
"math"
"testing"
"github.com/stretchr/testify/assert"
)
func TestBetaI(t *testing.T) {
// check a few places where betaI has some easily representable values
assert.Equal(t, 0.0, betaI(0.5, 5, 0))
assert.Equal(t, 0.0, betaI(1, 3, 0))
assert.Equal(t, 0.0, betaI(8, 10, 0))
assert.Equal(t, 0.0, betaI(8, 10, 0))
assert.InDelta(t, 0.5, betaI(0.5, 0.5, 0.5), epsilon)
assert.InDelta(t, 1.0/3.0, betaI(0.5, 0.5, 0.25), epsilon)
assert.InDelta(t, 0.488, betaI(1, 3, 0.2), epsilon)
}
func BenchmarkBetaI(b *testing.B) {
for i := 0; i < b.N; i++ {
assert.InDelta(b, 1.0/3.0, betaI(0.5, 0.5, 0.25), epsilon)
}
}
func TestSegmentDanger(t *testing.T) {
const failureRate = 0.01
assert.Greater(t,
SegmentDanger(11, 10, failureRate),
SegmentDanger(10, 5, failureRate))
assert.Greater(t,
SegmentDanger(11, 10, failureRate),
SegmentDanger(10, 9, failureRate))
assert.Greater(t,
SegmentDanger(10, 10, failureRate),
SegmentDanger(9, 9, failureRate))
assert.Less(t,
SegmentDanger(11, 10, failureRate),
SegmentDanger(12, 11, failureRate))
}
func TestSegmentHealth(t *testing.T) {
const failureRate = 0.01
assert.Less(t,
SegmentHealth(11, 10, failureRate),
SegmentHealth(10, 5, failureRate))
assert.Less(t,
SegmentHealth(11, 10, failureRate),
SegmentHealth(10, 9, failureRate))
assert.Less(t,
SegmentHealth(10, 10, failureRate),
SegmentHealth(9, 9, failureRate))
assert.Greater(t,
SegmentHealth(11, 10, failureRate),
SegmentHealth(12, 11, failureRate))
}
func TestSegmentHealthForDecayedSegment(t *testing.T) {
const failureRate = 0.01
assert.True(t, math.IsNaN(SegmentHealth(9, 10, failureRate)))
}