satellite/repair: add a repair health function

This will be used to rank segments in need of repair for attention by the repair workers.

Change-Id: I5b70650cec933696b4c6d73bb7efb97e3efdf24a
2020-10-28 13:35:47 -05:00 · 2020-10-28 13:35:47 -05:00 · 3e56403599
commit 3e56403599
parent 31533ed1a1
2 changed files with 211 additions and 0 deletions
--- a/satellite/repair/priority.go
+++ b/satellite/repair/priority.go
@ -0,0 +1,146 @@
 // Copyright (C) 2020 Storj Labs, Inc.
 // See LICENSE for copying information.
 package repair
 import (
 	"math"
 )
 // SegmentHealth scores the health of a segment in the repair queue. The score
 // is the reciprocal of SegmentDanger for the same arguments, so the segments
 // with the lowest scores are the ones that should be repaired first.
 func SegmentHealth(numHealthy, minPieces int, failureRate float64) float64 {
 	danger := SegmentDanger(numHealthy, minPieces, failureRate)
 	return 1.0 / danger
 }
 // SegmentDanger estimates the probability that a segment is lost during the
 // next time period, given how many healthy pieces it currently has
 // (numHealthy) and how many pieces are required to reconstruct it
 // (minPieces).
 //
 // The model rests on two assumptions:
 //
 // * Every node has a failureRate chance of going offline within the next
 //   time period.
 // * Failures of different nodes are independent of each other. This is not
 //   really true (one entity may operate many nodes, and nodes may share
 //   network infrastructure, which correlates their failures), but that is
 //   hard to model. The practical mitigation is to avoid placing pieces of
 //   the same segment on related nodes.
 //
 // The length of the "time period" is arbitrary: the result is expressed in
 // terms of whichever period failureRate was determined for. Thinking of it as
 // "one repair worker iteration" is a reasonable simplification.
 //
 // Under those assumptions the number X of piece-holding nodes that go offline
 // follows the Binomial distribution:
 //
 //     X ~ Binom(numHealthy, failureRate)
 //
 // The segment is lost once more than (numHealthy - minPieces) of those nodes
 // go offline, so the quantity of interest is the upper tail of X, which is
 // the complement of the standard binomial CDF:
 //
 //     Pr[X > (numHealthy - minPieces)] = 1 - Pr[X <= (numHealthy - minPieces)]
 func SegmentDanger(numHealthy, minPieces int, failureRate float64) float64 {
 	// spare is the number of pieces that can be lost while the segment
 	// remains recoverable.
 	spare := float64(numHealthy - minPieces)
 	survival := binomialCDF(spare, float64(numHealthy), failureRate)
 	return 1.0 - survival
 }
 // lnGamma returns the first result of math.Lgamma (the natural logarithm of
 // Gamma(x)) and discards the sign result, which is not needed here.
 func lnGamma(x float64) (lg float64) {
 	lg, _ = math.Lgamma(x)
 	return lg
 }
 // The functions below are adapted from code in
 // Numerical Recipes in C, Second Edition, Section 6.4 (pp. 227-228).
 // betaI evaluates the incomplete beta function I_x(a, b). It returns NaN
 // when x is less than 0 or greater than 1.
 func betaI(a, b, x float64) float64 {
 	if x < 0.0 || x > 1.0 {
 		return math.NaN()
 	}
 	// prefactor multiplying the continued fraction; it is left at zero when
 	// x is not strictly between 0 and 1
 	var prefactor float64
 	if x > 0.0 && x < 1.0 {
 		logPrefactor := lnGamma(a+b) - lnGamma(a) - lnGamma(b) + a*math.Log(x) + b*math.Log(1.0-x)
 		prefactor = math.Exp(logPrefactor)
 	}
 	crossover := (a + 1.0) / (a + b + 2.0)
 	if x < crossover {
 		// below the crossover the continued fraction is evaluated directly
 		return prefactor * betaCF(a, b, x) / a
 	}
 	// otherwise apply the symmetry transformation (swap a and b, use 1-x)
 	// before evaluating the continued fraction
 	return 1.0 - prefactor*betaCF(b, a, 1.0-x)/b
 }
 const (
 	// maxIter bounds the number of iterations betaCF runs before giving up.
 	// It is unlikely to be reached, as betaCF is expected to converge quickly
 	// for typical values.
 	maxIter = 100
 	// epsilon is the convergence threshold used by betaCF: iteration stops
 	// once a full step multiplies the running result by a factor within
 	// epsilon of 1. betaI outputs are expected to be accurate to within this
 	// amount.
 	epsilon = 1.0e-14
 )
 // betaCF evaluates the continued fraction for the incomplete beta function
 // by a modified Lentz's method (after Numerical Recipes in C, Second Edition,
 // Section 6.4). It returns NaN if the fraction has not converged after
 // maxIter iterations.
 func betaCF(a, b, x float64) float64 {
 	// tiny stands in for a denominator that has (nearly) vanished. It must be
 	// small, but its reciprocal must still be finite: substituting
 	// math.SmallestNonzeroFloat64 makes 1/denominator overflow to +Inf, which
 	// turns the whole result into NaN instead of letting the recurrence
 	// recover on the next step.
 	const tiny = 1.0e-30
 	avoidZero := func(f float64) float64 {
 		if math.Abs(f) < tiny {
 			return tiny
 		}
 		return f
 	}
 	qab := a + b
 	qap := a + 1.0
 	qam := a - 1.0
 	c := 1.0
 	d := 1.0 / avoidZero(1.0-qab*x/qap)
 	h := d
 	for m := 1; m <= maxIter; m++ {
 		fm := float64(m)
 		m2 := 2.0 * fm
 		aa := fm * (b - fm) * x / ((qam + m2) * (a + m2))
 		// one step (the even one) of the recurrence
 		d = 1.0 / avoidZero(1.0+aa*d)
 		c = avoidZero(1.0 + aa/c)
 		h *= d * c
 		aa = -(a + fm) * (qab + fm) * x / ((a + m2) * (qap + m2))
 		// next step of the recurrence (the odd one)
 		d = 1.0 / avoidZero(1.0+aa*d)
 		c = avoidZero(1.0 + aa/c)
 		del := d * c
 		h *= del
 		// converged once the latest step no longer changes the result
 		if math.Abs(del-1.0) < epsilon {
 			return h
 		}
 	}
 	// a or b too big, or maxIter too small
 	return math.NaN()
 }
 // binomialCDF evaluates the CDF of the binomial distribution Binom(n, p) at
 // k, after rounding k down to an integer. It returns NaN when k is negative
 // or greater than n, and 1 when k equals n. For k == 0 it uses the closed
 // form (1-p)**(n-k); every other case goes through the incomplete beta
 // function.
 func binomialCDF(k, n, p float64) float64 {
 	k = math.Floor(k)
 	switch {
 	case k < 0.0 || n < k:
 		return math.NaN()
 	case k == n:
 		return 1.0
 	case k == 0:
 		return math.Pow(1.0-p, n-k)
 	default:
 		return betaI(n-k, k+1.0, 1.0-p)
 	}
 }
--- a/satellite/repair/priority_test.go
+++ b/satellite/repair/priority_test.go
@ -0,0 +1,65 @@
 // Copyright (C) 2020 Storj Labs, Inc.
 // See LICENSE for copying information.
 package repair
 import (
 	"math"
 	"testing"
 	"github.com/stretchr/testify/assert"
 )
 // TestBetaI checks betaI at a few places where it has easily representable
 // values.
 func TestBetaI(t *testing.T) {
 	// x == 0 must give exactly zero
 	assert.Equal(t, 0.0, betaI(0.5, 5, 0))
 	assert.Equal(t, 0.0, betaI(1, 3, 0))
 	assert.Equal(t, 0.0, betaI(8, 10, 0))
 	// x == 1 must give exactly one (this replaces a duplicated x == 0 check)
 	assert.Equal(t, 1.0, betaI(8, 10, 1))
 	// interior points with known closed-form values
 	assert.InDelta(t, 0.5, betaI(0.5, 0.5, 0.5), epsilon)
 	assert.InDelta(t, 1.0/3.0, betaI(0.5, 0.5, 0.25), epsilon)
 	assert.InDelta(t, 0.488, betaI(1, 3, 0.2), epsilon)
 }
 func BenchmarkBetaI(b *testing.B) {
 	for i := 0; i < b.N; i++ {
 		assert.InDelta(b, 1.0/3.0, betaI(0.5, 0.5, 0.25), epsilon)
 	}
 }
 func TestSegmentDanger(t *testing.T) {
 	const failureRate = 0.01
 	assert.Greater(t,
 		SegmentDanger(11, 10, failureRate),
 		SegmentDanger(10, 5, failureRate))
 	assert.Greater(t,
 		SegmentDanger(11, 10, failureRate),
 		SegmentDanger(10, 9, failureRate))
 	assert.Greater(t,
 		SegmentDanger(10, 10, failureRate),
 		SegmentDanger(9, 9, failureRate))
 	assert.Less(t,
 		SegmentDanger(11, 10, failureRate),
 		SegmentDanger(12, 11, failureRate))
 }
 func TestSegmentHealth(t *testing.T) {
 	const failureRate = 0.01
 	assert.Less(t,
 		SegmentHealth(11, 10, failureRate),
 		SegmentHealth(10, 5, failureRate))
 	assert.Less(t,
 		SegmentHealth(11, 10, failureRate),
 		SegmentHealth(10, 9, failureRate))
 	assert.Less(t,
 		SegmentHealth(10, 10, failureRate),
 		SegmentHealth(9, 9, failureRate))
 	assert.Greater(t,
 		SegmentHealth(11, 10, failureRate),
 		SegmentHealth(12, 11, failureRate))
 }
 func TestSegmentHealthForDecayedSegment(t *testing.T) {
 	const failureRate = 0.01
 	assert.True(t, math.IsNaN(SegmentHealth(9, 10, failureRate)))
 }