satellite/repair: add a repair health function

This will be used to rank segments in need of repair for attention by the repair workers. Change-Id: I5b70650cec933696b4c6d73bb7efb97e3efdf24a
2020-10-28 13:35:47 -05:00 · 2020-10-28 13:35:47 -05:00 · 3e56403599
commit 3e56403599
parent 31533ed1a1
2 changed files with 211 additions and 0 deletions
--- a/satellite/repair/priority.go
+++ b/satellite/repair/priority.go
@ -0,0 +1,146 @@
+// Copyright (C) 2020 Storj Labs, Inc.
+// See LICENSE for copying information.
+
+package repair
+
+import (
+	"math"
+)
+
+// SegmentHealth scores a segment in the repair queue: the score is the
+// reciprocal of SegmentDanger for the same arguments, so the segments with
+// the lowest score are the ones that should be repaired first.
+func SegmentHealth(numHealthy, minPieces int, failureRate float64) float64 {
+	danger := SegmentDanger(numHealthy, minPieces, failureRate)
+	return 1.0 / danger
+}
+
+// SegmentDanger returns the probability that a segment with the given
+// minPieces and the given number of healthy pieces is lost in the next time
+// period.
+//
+// It assumes:
+//
+// * Nodes fail at the given failureRate (i.e., each node has a failureRate
+//   chance of going offline within the next time period).
+// * Node failures are entirely independent. Obviously this is not the case,
+//   because many nodes may be operated by a single entity or share network
+//   infrastructure, in which case their failures would be correlated. But we
+//   can't easily model that, so our best hope is to try to avoid putting
+//   pieces for the same segment on related nodes to maximize failure
+//   independence.
+//
+// (The "time period" we are talking about here could be anything. The returned
+// danger value will be given in terms of whatever time period was used to
+// determine failureRate. If it simplifies things, you can think of the time
+// period as "one repair worker iteration".)
+//
+// If those things are true, then the number of nodes holding this segment
+// that will go offline follows the Binomial distribution:
+//
+//     X ~ Binom(numHealthy, failureRate)
+//
+// A segment is lost if the number of nodes that go offline is higher than
+// (numHealthy - minPieces). So we want to find the upper tail
+//
+//     Pr[X > (numHealthy - minPieces)]
+//
+// Computing this as 1 - Pr[X <= (numHealthy - minPieces)] loses all
+// precision for reasonably healthy segments: the CDF rounds to 1, the
+// difference becomes exactly 0, and every such segment ends up with the same
+// (infinite) health. The tail is therefore evaluated directly through the
+// regularized incomplete beta function, using the identity
+//
+//     Pr[X > k] = I_p(k+1, n-k)    for 0 <= k < n
+//
+// Special cases:
+//
+// * numHealthy < minPieces (the segment has already decayed below the
+//   minimum) or minPieces < 0: the result is NaN.
+// * minPieces == 0: the result is 0, because X can never exceed numHealthy.
+// * failureRate outside [0, 1]: betaI reports NaN for any other input.
+func SegmentDanger(numHealthy, minPieces int, failureRate float64) float64 {
+	n := float64(numHealthy)
+	// k is the number of node failures the segment can absorb.
+	k := float64(numHealthy - minPieces)
+
+	switch {
+	case k < 0.0 || k > n:
+		return math.NaN()
+	case k == n:
+		return 0.0
+	}
+	return betaI(k+1.0, n-k, failureRate)
+}
+
+// lnGamma wraps math.Lgamma and returns only the log-gamma value; the sign
+// result is discarded because nothing here needs it.
+func lnGamma(x float64) float64 {
+	value, _ := math.Lgamma(x)
+	return value
+}
+
+// The following functions are based on code from
+// Numerical Recipes in C, Second Edition, Section 6.4 (pp. 227-228).
+
+// betaI calculates the incomplete beta function I_x(a, b).
+func betaI(a, b, x float64) float64 {
+	if x < 0.0 || x > 1.0 {
+		return math.NaN()
+	}
+	bt := 0.0
+	if x > 0.0 && x < 1.0 {
+		// factors in front of the continued function
+		bt = math.Exp(lnGamma(a+b) - lnGamma(a) - lnGamma(b) + a*math.Log(x) + b*math.Log(1.0-x))
+	}
+	if x < (a+1.0)/(a+b+2.0) {
+		// use continued fraction directly
+		return bt * betaCF(a, b, x) / a
+	}
+	// use continued fraction after making the symmetry transformation
+	return 1.0 - bt*betaCF(b, a, 1.0-x)/b
+}
+
+const (
+	// unlikely to go this far, as betaCF is expected to converge quickly for
+	// typical values.
+	maxIter = 100
+
+	// betaI outputs will be accurate to within this amount.
+	epsilon = 1.0e-14
+)
+
+// betaCF evaluates the continued fraction for the incomplete beta function
+// by a modified Lentz's method.
+//
+// a and b are the parameters of the incomplete beta function and x is the
+// point at which the fraction is evaluated. The iteration stops as soon as
+// one full step changes the result by a relative amount smaller than
+// epsilon. If that does not happen within maxIter steps, NaN is returned.
+func betaCF(a, b, x float64) float64 {
+	// tiny replaces denominators that would otherwise be zero. It has to be
+	// small, but its reciprocal must still be finite: using
+	// math.SmallestNonzeroFloat64 here would make 1/tiny overflow to +Inf
+	// and turn the running product into NaN on the next step.
+	const tiny = 1.0e-30
+
+	avoidZero := func(f float64) float64 {
+		if math.Abs(f) < tiny {
+			return tiny
+		}
+		return f
+	}
+
+	qab := a + b
+	qap := a + 1.0
+	qam := a - 1.0
+	c := 1.0
+	d := 1.0 / avoidZero(1.0-qab*x/qap)
+	h := d
+
+	for m := 1; m <= maxIter; m++ {
+		m := float64(m)
+		m2 := 2.0 * m
+		aa := m * (b - m) * x / ((qam + m2) * (a + m2))
+		// one step (the even one) of the recurrence
+		d = 1.0 / avoidZero(1.0+aa*d)
+		c = avoidZero(1.0 + aa/c)
+		h *= d * c
+		aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2))
+		// next step of the recurrence (the odd one)
+		d = 1.0 / avoidZero(1.0+aa*d)
+		c = avoidZero(1.0 + aa/c)
+		del := d * c
+		h *= del
+		// converged once the last correction factor is indistinguishable
+		// from 1
+		if math.Abs(del-1.0) < epsilon {
+			return h
+		}
+	}
+	// a or b too big, or maxIter too small
+	return math.NaN()
+}
+
+// binomialCDF evaluates the CDF of the binomial distribution Binom(n, p) at k,
+// after rounding k down to a whole number.
+//
+// The result is NaN when k is negative or larger than n, and 1 when k equals
+// n. For k equal to 0 it is (1-p)**(n-k); every other case goes through the
+// incomplete beta function.
+func binomialCDF(k, n, p float64) float64 {
+	k = math.Floor(k)
+
+	switch {
+	case k < 0.0 || n < k:
+		return math.NaN()
+	case k == n:
+		return 1.0
+	case k == 0:
+		return math.Pow(1.0-p, n-k)
+	default:
+		return betaI(n-k, k+1.0, 1.0-p)
+	}
+}
--- a/satellite/repair/priority_test.go
+++ b/satellite/repair/priority_test.go
@ -0,0 +1,65 @@
+// Copyright (C) 2020 Storj Labs, Inc.
+// See LICENSE for copying information.
+
+package repair
+
+import (
+	"math"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestBetaI checks betaI at points where the incomplete beta function has
+// easily representable values, plus its handling of out-of-range x.
+func TestBetaI(t *testing.T) {
+	// I_0(a, b) is exactly 0
+	assert.Equal(t, 0.0, betaI(0.5, 5, 0))
+	assert.Equal(t, 0.0, betaI(1, 3, 0))
+	assert.Equal(t, 0.0, betaI(8, 10, 0))
+
+	// I_1(a, b) is exactly 1
+	assert.Equal(t, 1.0, betaI(0.5, 5, 1))
+	assert.Equal(t, 1.0, betaI(8, 10, 1))
+
+	// interior points with known closed forms
+	assert.InDelta(t, 0.5, betaI(0.5, 0.5, 0.5), epsilon)
+	assert.InDelta(t, 1.0/3.0, betaI(0.5, 0.5, 0.25), epsilon)
+	assert.InDelta(t, 0.488, betaI(1, 3, 0.2), epsilon)
+
+	// x outside [0, 1] is rejected with NaN
+	assert.True(t, math.IsNaN(betaI(1, 3, -0.1)))
+	assert.True(t, math.IsNaN(betaI(1, 3, 1.1)))
+}
+
+func BenchmarkBetaI(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		assert.InDelta(b, 1.0/3.0, betaI(0.5, 0.5, 0.25), epsilon)
+	}
+}
+
+// TestSegmentDanger compares SegmentDanger between pairs of
+// (numHealthy, minPieces) configurations at a fixed failure rate.
+func TestSegmentDanger(t *testing.T) {
+	const failureRate = 0.01
+
+	// danger fixes the failure rate so each comparison fits on one line.
+	danger := func(numHealthy, minPieces int) float64 {
+		return SegmentDanger(numHealthy, minPieces, failureRate)
+	}
+
+	assert.Greater(t, danger(11, 10), danger(10, 5))
+	assert.Greater(t, danger(11, 10), danger(10, 9))
+	assert.Greater(t, danger(10, 10), danger(9, 9))
+	assert.Less(t, danger(11, 10), danger(12, 11))
+}
+
+// TestSegmentHealth compares SegmentHealth between pairs of
+// (numHealthy, minPieces) configurations at a fixed failure rate.
+func TestSegmentHealth(t *testing.T) {
+	const failureRate = 0.01
+
+	// health fixes the failure rate so each comparison fits on one line.
+	health := func(numHealthy, minPieces int) float64 {
+		return SegmentHealth(numHealthy, minPieces, failureRate)
+	}
+
+	assert.Less(t, health(11, 10), health(10, 5))
+	assert.Less(t, health(11, 10), health(10, 9))
+	assert.Less(t, health(10, 10), health(9, 9))
+	assert.Greater(t, health(11, 10), health(12, 11))
+}
+
+// TestSegmentHealthForDecayedSegment checks that a segment with fewer healthy
+// pieces than minPieces gets a NaN health value.
+func TestSegmentHealthForDecayedSegment(t *testing.T) {
+	const failureRate = 0.01
+
+	health := SegmentHealth(9, 10, failureRate)
+	assert.True(t, math.IsNaN(health))
+}