satellite/audit: account for piece size during audit reservoir sampling

Treat the piece size as a weight, and perform weighted reservoir sampling as given in Algorithm A-Chao (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_A-Chao)

Change-Id: I299d0026d9e02d03b3d2130b0f32192928e6e326
This commit is contained in:
dlamarmorgan 2021-11-29 09:29:02 -08:00 committed by Damein Morgan
parent 82fb0fce04
commit b3cea3d1b6
2 changed files with 87 additions and 10 deletions

View File

@ -19,6 +19,7 @@ type Reservoir struct {
Segments [maxReservoirSize]Segment // sampled segments; Sample only writes indices < size
size int8 // configured reservoir capacity (at most maxReservoirSize)
index int64 // number of segments observed so far via Sample
wSum int64 // running sum of EncryptedSize over observed segments (A-Chao weight total)
}
// NewReservoir instantiates a Reservoir.
@ -35,14 +36,22 @@ func NewReservoir(size int) *Reservoir {
}
// Sample makes sure that for every segment in metainfo from index i=size..n-1,
// pick a random number r = rand(0..i), and if r < size, replace reservoir.Segments[r] with segment.
// compute the relative weight based on segment size, and pick a random floating
// point number r = rand(0..1), and if r < the relative weight of the segment,
// select uniformly a random segment reservoir.Segments[rand(0..i)] to replace with
// segment. See https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_A-Chao
// for the algorithm used.
func (reservoir *Reservoir) Sample(r *rand.Rand, segment Segment) {
if reservoir.index < int64(reservoir.size) {
reservoir.Segments[reservoir.index] = segment
reservoir.wSum += int64(segment.EncryptedSize)
} else {
random := r.Int63n(reservoir.index + 1)
if random < int64(reservoir.size) {
reservoir.Segments[random] = segment
reservoir.wSum += int64(segment.EncryptedSize)
p := float64(segment.EncryptedSize) / float64(reservoir.wSum)
random := r.Float64()
if random < p {
index := r.Int31n(int32(reservoir.size))
reservoir.Segments[index] = segment
}
}
reservoir.index++
@ -50,17 +59,19 @@ func (reservoir *Reservoir) Sample(r *rand.Rand, segment Segment) {
// Segment is a segment to audit.
type Segment struct {
	StreamID      uuid.UUID
	Position      metabase.SegmentPosition
	ExpiresAt     *time.Time
	EncryptedSize int32 // size of the whole segment (not a piece)
}
// NewSegment creates a new segment to audit from a metainfo loop segment,
// carrying over the encrypted size so it can serve as the sampling weight.
func NewSegment(loopSegment *segmentloop.Segment) Segment {
	return Segment{
		StreamID:      loopSegment.StreamID,
		Position:      loopSegment.Position,
		ExpiresAt:     loopSegment.ExpiresAt,
		EncryptedSize: loopSegment.EncryptedSize,
	}
}

View File

@ -6,10 +6,13 @@ package audit
import (
"math/rand"
"testing"
"time"
"github.com/stretchr/testify/require"
"storj.io/common/testrand"
"storj.io/common/uuid"
"storj.io/storj/satellite/metabase"
)
func TestReservoir(t *testing.T) {
@ -25,3 +28,66 @@ func TestReservoir(t *testing.T) {
require.Equal(t, r.Segments[:], []Segment{seg(1), seg(2), seg(3)})
}
// TestReservoirBias verifies that weighted reservoir sampling favors larger
// segments: across many trials, streams with a bigger EncryptedSize must end
// up in the reservoir more often than lighter ones.
func TestReservoirBias(t *testing.T) {
	heaviest := testrand.UUID()
	heavy := testrand.UUID()
	light := testrand.UUID()
	lightest := testrand.UUID()

	// tally of how often each stream was retained in the reservoir
	hits := map[uuid.UUID]int{
		heaviest: 0,
		heavy:    0,
		light:    0,
		lightest: 0,
	}

	segments := []Segment{
		{
			StreamID:      heaviest,
			Position:      metabase.SegmentPosition{},
			ExpiresAt:     nil,
			EncryptedSize: 10,
		},
		{
			StreamID:      heavy,
			Position:      metabase.SegmentPosition{},
			ExpiresAt:     nil,
			EncryptedSize: 5,
		},
		{
			StreamID:      light,
			Position:      metabase.SegmentPosition{},
			ExpiresAt:     nil,
			EncryptedSize: 2,
		},
		{
			StreamID:      lightest,
			Position:      metabase.SegmentPosition{},
			ExpiresAt:     nil,
			EncryptedSize: 1,
		},
	}

	// repeat sampling many times so the weighting bias becomes statistically visible
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	for trial := 1; trial < 100000; trial++ {
		reservoir := NewReservoir(3)
		for _, segment := range segments {
			reservoir.Sample(rng, segment)
		}
		for _, kept := range reservoir.Segments {
			hits[kept.StreamID]++
		}
		// reshuffle the input order so arrival position doesn't bias the outcome
		rng.Shuffle(len(segments), func(i, j int) {
			segments[i], segments[j] = segments[j], segments[i]
		})
	}

	require.Greater(t, hits[heaviest], hits[heavy])
	require.Greater(t, hits[heavy], hits[light])
	require.Greater(t, hits[light], hits[lightest])
}