satellite/reputation: offset write times by random, not by satelliteID

In an effort to distribute load on the reputation database, the
reputation write cache scheduled nodes to be written at a time offset by
the local nodeID. The idea was that no two repair workers would have the
same nodeID, so they would not tend to write to the same row at the same
time.

Instead, since all satellite processes share the same satellite ID
(duh), this caused _all_ workers to try to write to the same row at the
same time _always_. This was not ideal.

This change uses a random number instead of the satellite ID. The random
number is sourced from the number of nanoseconds since the Unix epoch.
As long as workers are not started at the exact same nanosecond, they
ought to get well-distributed offsets.

Change-Id: I149bdaa6ca1ee6043cfedcf1489dd9d3e3c7a163
This commit is contained in:
paul cannon 2022-07-27 18:43:02 -05:00
parent 92be1d878f
commit 799b159bba
7 changed files with 28 additions and 21 deletions

View File

@ -288,7 +288,7 @@ func NewAPI(log *zap.Logger, full *identity.FullIdentity, db DB,
{ // setup reputation
reputationDB := peer.DB.Reputation()
if config.Reputation.FlushInterval > 0 {
cachingDB := reputation.NewCachingDB(log.Named("reputation:writecache"), peer.Identity.ID, reputationDB, config.Reputation)
cachingDB := reputation.NewCachingDB(log.Named("reputation:writecache"), reputationDB, config.Reputation)
peer.Services.Add(lifecycle.Item{
Name: "reputation:writecache",
Run: cachingDB.Manage,

View File

@ -342,7 +342,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB,
{ // setup reputation
reputationDB := peer.DB.Reputation()
if config.Reputation.FlushInterval > 0 {
cachingDB := reputation.NewCachingDB(log.Named("reputation:writecache"), peer.Identity.ID, reputationDB, config.Reputation)
cachingDB := reputation.NewCachingDB(log.Named("reputation:writecache"), reputationDB, config.Reputation)
peer.Services.Add(lifecycle.Item{
Name: "reputation:writecache",
Run: cachingDB.Manage,

View File

@ -158,7 +158,7 @@ func NewRepairer(log *zap.Logger, full *identity.FullIdentity,
{ // setup reputation
if config.Reputation.FlushInterval > 0 {
cachingDB := reputation.NewCachingDB(log.Named("reputation:writecache"), peer.Identity.ID, reputationdb, config.Reputation)
cachingDB := reputation.NewCachingDB(log.Named("reputation:writecache"), reputationdb, config.Reputation)
peer.Services.Add(lifecycle.Item{
Name: "reputation:writecache",
Run: cachingDB.Manage,

View File

@ -223,7 +223,7 @@ func TestApplyUpdatesEquivalentToMultipleUpdatesCached(t *testing.T) {
}
satellitedbtest.Run(t, func(ctx *testcontext.Context, t *testing.T, db satellite.DB) {
cachingDB := reputation.NewCachingDB(zaptest.NewLogger(t), testrand.NodeID(), db.Reputation(), config)
cachingDB := reputation.NewCachingDB(zaptest.NewLogger(t), db.Reputation(), config)
cancelCtx, cancel := context.WithCancel(ctx)
defer cancel()
ctx.Go(func() error {

View File

@ -8,6 +8,7 @@ import (
"context"
"encoding/binary"
"errors"
"math/rand"
"sync"
"time"
@ -22,10 +23,11 @@ import (
var _ DB = (*CachingDB)(nil)
// NewCachingDB creates a new CachingDB instance.
func NewCachingDB(log *zap.Logger, peerID storj.NodeID, backingStore DB, reputationConfig Config) *CachingDB {
func NewCachingDB(log *zap.Logger, backingStore DB, reputationConfig Config) *CachingDB {
randSource := rand.New(rand.NewSource(time.Now().UnixNano()))
return &CachingDB{
log: log,
peerID: peerID,
instanceOffset: randSource.Uint64(),
backingStore: backingStore,
nowFunc: time.Now,
reputationConfig: reputationConfig,
@ -43,7 +45,7 @@ type CachingDB struct {
// These fields must be populated before the cache starts being used.
// They are not expected to change.
log *zap.Logger
peerID storj.NodeID
instanceOffset uint64
backingStore DB
nowFunc func() time.Time
reputationConfig Config
@ -558,7 +560,7 @@ func (cdb *CachingDB) getEntriesToSync(now time.Time, f func(entryToSync *cached
}
func (cdb *CachingDB) nextTimeForSync(nodeID storj.NodeID, now time.Time) time.Time {
return nextTimeForSync(cdb.peerID, nodeID, now, cdb.syncInterval)
return nextTimeForSync(cdb.instanceOffset, nodeID, now, cdb.syncInterval)
}
// nextTimeForSync decides the next time at which the given nodeID should next
@ -566,15 +568,14 @@ func (cdb *CachingDB) nextTimeForSync(nodeID storj.NodeID, now time.Time) time.T
//
// We make an effort to distribute the nodes in time, so that the service
// is not usually trying to retrieve or update many rows at the same time. We
// also make an effort to offset this sync schedule by the instance ID of this
// process so that in most cases, instances will not be trying to update the
// same row at the same time, minimizing contention.
func nextTimeForSync(peerID, nodeID storj.NodeID, now time.Time, syncInterval time.Duration) time.Time {
// also make an effort to offset this sync schedule by a random value unique
// to this process so that in most cases, instances will not be trying to
// update the same row at the same time, minimizing contention.
func nextTimeForSync(instanceOffset uint64, nodeID storj.NodeID, now time.Time, syncInterval time.Duration) time.Time {
// calculate the fraction into the FlushInterval at which this node
// should always be synchronized.
initialPosition := binary.BigEndian.Uint64(nodeID[:8])
offset := binary.BigEndian.Uint64(peerID[:8])
finalPosition := initialPosition + offset
finalPosition := initialPosition + instanceOffset
positionAsFraction := float64(finalPosition) / (1 << 64)
// and apply that fraction to the actual interval
periodStart := now.Truncate(syncInterval)

View File

@ -68,7 +68,7 @@ func TestHeavyLockContention(t *testing.T) {
ErrorRetryInterval: 0,
}
reputationDB := db.Reputation()
writecacheDB := reputation.NewCachingDB(zaptest.NewLogger(t), testrand.NodeID(), reputationDB, config)
writecacheDB := reputation.NewCachingDB(zaptest.NewLogger(t), reputationDB, config)
var group errgroup.Group
// Make room for results ahead of time, so we don't need to use any
@ -148,7 +148,7 @@ func TestFetchingInfoWhileEntryIsSyncing(t *testing.T) {
}
logger := zaptest.NewLogger(t)
reputationDB := db.Reputation()
writecache := reputation.NewCachingDB(logger.Named("writecache"), testrand.NodeID(), reputationDB, config)
writecache := reputation.NewCachingDB(logger.Named("writecache"), reputationDB, config)
const positiveAudits = 123
for i := 0; i < numRounds; i++ {

View File

@ -23,22 +23,28 @@ func TestNextTimeForSync(t *testing.T) {
var quarterwayID storj.NodeID
binary.BigEndian.PutUint64(quarterwayID[:8], 1<<62)
const (
zeroOffset = 0
halfwayOffset = 1 << 63
quarterwayOffset = 1 << 62
)
startOfHour := time.Now().Truncate(time.Hour)
now := startOfHour.Add(15 * time.Minute)
nextTime := nextTimeForSync(zeroesID, halfwayID, now, time.Hour)
nextTime := nextTimeForSync(zeroOffset, halfwayID, now, time.Hour)
requireInDeltaTime(t, startOfHour.Add(30*time.Minute), nextTime, time.Second)
nextTime = nextTimeForSync(halfwayID, zeroesID, now, time.Hour)
nextTime = nextTimeForSync(halfwayOffset, zeroesID, now, time.Hour)
requireInDeltaTime(t, startOfHour.Add(30*time.Minute), nextTime, time.Second)
nextTime = nextTimeForSync(zeroesID, zeroesID, now, time.Hour)
nextTime = nextTimeForSync(zeroOffset, zeroesID, now, time.Hour)
requireInDeltaTime(t, startOfHour.Add(time.Hour), nextTime, time.Second)
nextTime = nextTimeForSync(halfwayID, halfwayID, now, time.Hour)
nextTime = nextTimeForSync(halfwayOffset, halfwayID, now, time.Hour)
requireInDeltaTime(t, startOfHour.Add(time.Hour), nextTime, time.Second)
nextTime = nextTimeForSync(quarterwayID, halfwayID, now, time.Hour)
nextTime = nextTimeForSync(quarterwayOffset, halfwayID, now, time.Hour)
requireInDeltaTime(t, startOfHour.Add(45*time.Minute), nextTime, time.Second)
}