storj/satellite/audit/reservoir.go
paul cannon 231c783698 satellite/audit: fix reservoir sampling bias
While researching logs from a large set of audits, I noticed that nearly
all of them had streamIDs starting with 0 or 1. This seemed very odd,
because streamIDs are supposed to be pretty much entirely random, and
every hex digit from 0-f should have been represented with roughly equal
frequency.

It turned out that our A-Chao implementation of reservoir sampling is
flawed. As far as we can tell, so is the Wikipedia implementation. No
one has yet reviewed the original 1982 paper by Dr. Chao in enough
detail to know where the error originated, but we do know that we have
been auditing segments near the beginning of the segment loop (low
streamIDs) far more often than segments near the end of the segment loop
(high streamIDs).

This change uses an algorithm Wikipedia calls "A-Res" instead, and adds
a test to check for that sort of bias creeping back in somehow. A-Res
will be slightly slower than A-Chao, because of a few extra steps that
need to be done, but it does appear to be selecting items uniformly.

Change-Id: I45eba4c522bafc729cebe2aab6f3fe65cd6336be
2022-12-09 18:16:58 -06:00

88 lines
2.4 KiB
Go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package audit
import (
"math"
"math/rand"
"time"
"storj.io/common/uuid"
"storj.io/storj/satellite/metabase"
"storj.io/storj/satellite/metabase/segmentloop"
)
// maxReservoirSize is the largest number of segments a single Reservoir can
// hold. It fixes the length of the Reservoir's backing arrays and is the upper
// bound that NewReservoir clamps the requested size to.
const maxReservoirSize = 3
// Reservoir holds a certain number of segments to reflect a random sample.
//
// Segments and Keys are parallel arrays: Keys[i] is the random key that Sample
// drew for Segments[i]. Sample only ever writes to the first `size` slots, and
// slots that have not been filled yet keep their zero values.
type Reservoir struct {
// Segments are the segments currently selected by Sample.
Segments [maxReservoirSize]segmentloop.Segment
// Keys holds, for each slot, the key of the segment stored in that slot.
// Once the reservoir is full, Sample replaces the slot with the largest key
// whenever a new segment draws a smaller key.
Keys [maxReservoirSize]float64
// size is the number of slots in use; NewReservoir clamps it to
// [1, maxReservoirSize].
size int8
// index counts the segments passed to Sample so far (it is incremented on
// every call, whether or not the segment was kept).
index int64
}
// NewReservoir returns an empty Reservoir that keeps at most size segments.
// The requested size is clamped to the range [1, maxReservoirSize], so a
// non-positive or oversized request still yields a usable reservoir.
func NewReservoir(size int) *Reservoir {
	capacity := size
	switch {
	case capacity > maxReservoirSize:
		capacity = maxReservoirSize
	case capacity < 1:
		capacity = 1
	}

	// A freshly allocated Reservoir already has index == 0 and empty slots;
	// only the capacity needs to be recorded.
	sampler := new(Reservoir)
	sampler.size = int8(capacity)
	return sampler
}
// Sample offers one segment to the reservoir, giving every segment passed in a
// chance (proportional to its size) of being in the reservoir once sampling is
// complete.
//
// The total number of segments is not known in advance, which is what makes
// _Reservoir Sampling_ necessary. The variant used here is the one Wikipedia
// calls A-Res:
// https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_A-Res
//
// Each segment draws a random key; the reservoir keeps the segments with the
// smallest keys seen so far. r supplies the randomness, and segment is copied
// into the reservoir when it is selected.
func (reservoir *Reservoir) Sample(r *rand.Rand, segment *segmentloop.Segment) {
	// Dividing by the encrypted size makes larger segments tend to draw
	// smaller keys, which is what weights the selection by size.
	key := -math.Log(r.Float64()) / float64(segment.EncryptedSize)

	capacity := int(reservoir.size)
	switch seen := reservoir.index; {
	case seen < int64(capacity):
		// Still filling: the first `capacity` segments are always accepted.
		reservoir.Segments[seen] = *segment
		reservoir.Keys[seen] = key
	default:
		// Full: locate the slot holding the largest key (the first such slot
		// on ties) and evict it only if the newcomer's key is smaller.
		worst := 0
		for slot := 1; slot < capacity; slot++ {
			if reservoir.Keys[slot] > reservoir.Keys[worst] {
				worst = slot
			}
		}
		if key < reservoir.Keys[worst] {
			reservoir.Segments[worst] = *segment
			reservoir.Keys[worst] = key
		}
	}

	// Count every offered segment, kept or not.
	reservoir.index++
}
// Segment is a segment to audit.
//
// NewSegment fills every field by copying the field of the same name from a
// segmentloop.Segment.
type Segment struct {
// StreamID and Position are copied verbatim from the loop segment.
// NOTE(review): presumably they identify the stream and the segment's
// position within it — confirm against the metabase package.
StreamID uuid.UUID
Position metabase.SegmentPosition
// ExpiresAt is the optional expiration time; when it is nil, Expired
// always reports false.
ExpiresAt *time.Time
EncryptedSize int32 // size of the whole segment (not a piece)
}
// NewSegment builds an audit Segment from a segment loop segment by copying
// over its stream ID, position, expiration time and encrypted size.
func NewSegment(loopSegment segmentloop.Segment) Segment {
	var auditSegment Segment
	auditSegment.StreamID = loopSegment.StreamID
	auditSegment.Position = loopSegment.Position
	// The expiration pointer is shared with the loop segment, not deep-copied.
	auditSegment.ExpiresAt = loopSegment.ExpiresAt
	auditSegment.EncryptedSize = loopSegment.EncryptedSize
	return auditSegment
}
// Expired reports whether the segment's expiration time lies strictly before
// now. A segment with no expiration time set never expires.
func (segment *Segment) Expired(now time.Time) bool {
	expiry := segment.ExpiresAt
	if expiry == nil {
		return false
	}
	return expiry.Before(now)
}