satellite/audit: account for piece size during audit reservoir sampling
Treat the piece size as a weight, and perform weighted reservoir sampling as given in Algorithm A-Chao (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_A-Chao).
Change-Id: I299d0026d9e02d03b3d2130b0f32192928e6e326
This commit is contained in:
parent
82fb0fce04
commit
b3cea3d1b6
@ -19,6 +19,7 @@ type Reservoir struct {
|
||||
Segments [maxReservoirSize]Segment
|
||||
size int8
|
||||
index int64
|
||||
wSum int64
|
||||
}
|
||||
|
||||
// NewReservoir instantiates a Reservoir.
|
||||
@ -35,14 +36,22 @@ func NewReservoir(size int) *Reservoir {
|
||||
}
|
||||
|
||||
// Sample makes sure that for every segment in metainfo from index i=size..n-1,
|
||||
// pick a random number r = rand(0..i), and if r < size, replace reservoir.Segments[r] with segment.
|
||||
// compute the relative weight based on segment size, and pick a random floating
|
||||
// point number r = rand(0..1), and if r < the relative weight of the segment,
|
||||
// select uniformly a random segment reservoir.Segments[rand(0..size-1)] to replace with
|
||||
// segment. See https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_A-Chao
|
||||
// for the algorithm used.
|
||||
func (reservoir *Reservoir) Sample(r *rand.Rand, segment Segment) {
|
||||
if reservoir.index < int64(reservoir.size) {
|
||||
reservoir.Segments[reservoir.index] = segment
|
||||
reservoir.wSum += int64(segment.EncryptedSize)
|
||||
} else {
|
||||
random := r.Int63n(reservoir.index + 1)
|
||||
if random < int64(reservoir.size) {
|
||||
reservoir.Segments[random] = segment
|
||||
reservoir.wSum += int64(segment.EncryptedSize)
|
||||
p := float64(segment.EncryptedSize) / float64(reservoir.wSum)
|
||||
random := r.Float64()
|
||||
if random < p {
|
||||
index := r.Int31n(int32(reservoir.size))
|
||||
reservoir.Segments[index] = segment
|
||||
}
|
||||
}
|
||||
reservoir.index++
|
||||
@ -50,17 +59,19 @@ func (reservoir *Reservoir) Sample(r *rand.Rand, segment Segment) {
|
||||
|
||||
// Segment is a segment to audit.
|
||||
type Segment struct {
|
||||
StreamID uuid.UUID
|
||||
Position metabase.SegmentPosition
|
||||
ExpiresAt *time.Time
|
||||
StreamID uuid.UUID
|
||||
Position metabase.SegmentPosition
|
||||
ExpiresAt *time.Time
|
||||
EncryptedSize int32 // size of the whole segment (not a piece)
|
||||
}
|
||||
|
||||
// NewSegment creates a new segment to audit from a metainfo loop segment.
|
||||
func NewSegment(loopSegment *segmentloop.Segment) Segment {
|
||||
return Segment{
|
||||
StreamID: loopSegment.StreamID,
|
||||
Position: loopSegment.Position,
|
||||
ExpiresAt: loopSegment.ExpiresAt,
|
||||
StreamID: loopSegment.StreamID,
|
||||
Position: loopSegment.Position,
|
||||
ExpiresAt: loopSegment.ExpiresAt,
|
||||
EncryptedSize: loopSegment.EncryptedSize,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6,10 +6,13 @@ package audit
|
||||
import (
|
||||
"math/rand"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"storj.io/common/testrand"
|
||||
"storj.io/common/uuid"
|
||||
"storj.io/storj/satellite/metabase"
|
||||
)
|
||||
|
||||
func TestReservoir(t *testing.T) {
|
||||
@ -25,3 +28,66 @@ func TestReservoir(t *testing.T) {
|
||||
|
||||
require.Equal(t, r.Segments[:], []Segment{seg(1), seg(2), seg(3)})
|
||||
}
|
||||
|
||||
func TestReservoirBias(t *testing.T) {
|
||||
var weight10StreamID = testrand.UUID()
|
||||
var weight5StreamID = testrand.UUID()
|
||||
var weight2StreamID = testrand.UUID()
|
||||
var weight1StreamID = testrand.UUID()
|
||||
streamIDCountsMap := map[uuid.UUID]int{
|
||||
weight10StreamID: 0,
|
||||
weight5StreamID: 0,
|
||||
weight2StreamID: 0,
|
||||
weight1StreamID: 0,
|
||||
}
|
||||
|
||||
segments := []Segment{
|
||||
{
|
||||
StreamID: weight10StreamID,
|
||||
Position: metabase.SegmentPosition{},
|
||||
ExpiresAt: nil,
|
||||
EncryptedSize: 10,
|
||||
},
|
||||
{
|
||||
StreamID: weight5StreamID,
|
||||
Position: metabase.SegmentPosition{},
|
||||
ExpiresAt: nil,
|
||||
EncryptedSize: 5,
|
||||
},
|
||||
{
|
||||
StreamID: weight2StreamID,
|
||||
Position: metabase.SegmentPosition{},
|
||||
ExpiresAt: nil,
|
||||
EncryptedSize: 2,
|
||||
},
|
||||
{
|
||||
StreamID: weight1StreamID,
|
||||
Position: metabase.SegmentPosition{},
|
||||
ExpiresAt: nil,
|
||||
EncryptedSize: 1,
|
||||
},
|
||||
}
|
||||
|
||||
// run a large number of times in a loop for the bias to show up
|
||||
rng := rand.New(rand.NewSource(time.Now().UnixNano()))
|
||||
for i := 1; i < 100000; i++ {
|
||||
r := NewReservoir(3)
|
||||
|
||||
for _, segment := range segments {
|
||||
r.Sample(rng, segment)
|
||||
}
|
||||
|
||||
for _, segment := range r.Segments {
|
||||
streamIDCountsMap[segment.StreamID]++
|
||||
}
|
||||
|
||||
// shuffle the segment order after each run
|
||||
rng.Shuffle(len(segments),
|
||||
func(i, j int) {
|
||||
segments[i], segments[j] = segments[j], segments[i]
|
||||
})
|
||||
}
|
||||
require.Greater(t, streamIDCountsMap[weight10StreamID], streamIDCountsMap[weight5StreamID])
|
||||
require.Greater(t, streamIDCountsMap[weight5StreamID], streamIDCountsMap[weight2StreamID])
|
||||
require.Greater(t, streamIDCountsMap[weight2StreamID], streamIDCountsMap[weight1StreamID])
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user