satellite/reputation: offset write times by random, not by satelliteID
In an effort to distribute load on the reputation database, the reputation write cache scheduled nodes to be written at a time offset by the local nodeID. The idea was that no two repair workers would have the same nodeID, so they would not tend to write to the same row at the same time. Instead, since all satellite processes share the same satellite ID (duh), this caused _all_ workers to try to write to the same row at the same time _always_. This was not ideal.

This change uses a random number instead of the satellite ID. The random number is sourced from the number of nanoseconds since the Unix epoch. As long as workers are not started at the exact same nanosecond, they ought to get well-distributed offsets.

Change-Id: I149bdaa6ca1ee6043cfedcf1489dd9d3e3c7a163
parent 92be1d878f
commit 799b159bba
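For context, here is a minimal, self-contained sketch of the scheduling idea the diff below implements. It mirrors the new nextTimeForSync signature, but substitutes a plain [32]byte for storj.NodeID, and the tail of the function (pushing a slot that has already passed into the next interval) is an assumption inferred from the TestNextTimeForSync expectations further down, since that part of the body is not shown in the diff.

package main

import (
	"encoding/binary"
	"fmt"
	"math/rand"
	"time"
)

// nextTimeForSync mirrors the updated helper in the diff: the node's position
// within the flush interval comes from the first 8 bytes of its ID, shifted by
// a per-process random offset. The uint64 addition wraps, so the shifted
// positions stay uniformly spread over the interval.
func nextTimeForSync(instanceOffset uint64, nodeID [32]byte, now time.Time, syncInterval time.Duration) time.Time {
	initialPosition := binary.BigEndian.Uint64(nodeID[:8])
	finalPosition := initialPosition + instanceOffset
	positionAsFraction := float64(finalPosition) / (1 << 64)

	// Apply that fraction to the current interval. If the slot has already
	// passed, schedule it for the next interval (assumed behavior, based on
	// the test expectations; not shown in the diff).
	periodStart := now.Truncate(syncInterval)
	next := periodStart.Add(time.Duration(positionAsFraction * float64(syncInterval)))
	if next.Before(now) {
		next = next.Add(syncInterval)
	}
	return next
}

func main() {
	// Each process picks its offset once at startup, seeded from the
	// nanosecond clock, so workers started at different instants get
	// different, well-distributed offsets.
	randSource := rand.New(rand.NewSource(time.Now().UnixNano()))
	instanceOffset := randSource.Uint64()

	var nodeID [32]byte
	binary.BigEndian.PutUint64(nodeID[:8], 1<<62) // a node a quarter of the way through the ID space

	fmt.Println("next sync:", nextTimeForSync(instanceOffset, nodeID, time.Now(), time.Hour))
}

Because the offset shifts every node's slot by the same amount within one process, the relative spacing of nodes is preserved; different processes simply hit the same node's row at different times.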
@@ -288,7 +288,7 @@ func NewAPI(log *zap.Logger, full *identity.FullIdentity, db DB,
 	{ // setup reputation
 		reputationDB := peer.DB.Reputation()
 		if config.Reputation.FlushInterval > 0 {
-			cachingDB := reputation.NewCachingDB(log.Named("reputation:writecache"), peer.Identity.ID, reputationDB, config.Reputation)
+			cachingDB := reputation.NewCachingDB(log.Named("reputation:writecache"), reputationDB, config.Reputation)
 			peer.Services.Add(lifecycle.Item{
 				Name: "reputation:writecache",
 				Run:  cachingDB.Manage,
@@ -342,7 +342,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB,
 	{ // setup reputation
 		reputationDB := peer.DB.Reputation()
 		if config.Reputation.FlushInterval > 0 {
-			cachingDB := reputation.NewCachingDB(log.Named("reputation:writecache"), peer.Identity.ID, reputationDB, config.Reputation)
+			cachingDB := reputation.NewCachingDB(log.Named("reputation:writecache"), reputationDB, config.Reputation)
 			peer.Services.Add(lifecycle.Item{
 				Name: "reputation:writecache",
 				Run:  cachingDB.Manage,
@@ -158,7 +158,7 @@ func NewRepairer(log *zap.Logger, full *identity.FullIdentity,

 	{ // setup reputation
 		if config.Reputation.FlushInterval > 0 {
-			cachingDB := reputation.NewCachingDB(log.Named("reputation:writecache"), peer.Identity.ID, reputationdb, config.Reputation)
+			cachingDB := reputation.NewCachingDB(log.Named("reputation:writecache"), reputationdb, config.Reputation)
 			peer.Services.Add(lifecycle.Item{
 				Name: "reputation:writecache",
 				Run:  cachingDB.Manage,
@@ -223,7 +223,7 @@ func TestApplyUpdatesEquivalentToMultipleUpdatesCached(t *testing.T) {
 	}

 	satellitedbtest.Run(t, func(ctx *testcontext.Context, t *testing.T, db satellite.DB) {
-		cachingDB := reputation.NewCachingDB(zaptest.NewLogger(t), testrand.NodeID(), db.Reputation(), config)
+		cachingDB := reputation.NewCachingDB(zaptest.NewLogger(t), db.Reputation(), config)
 		cancelCtx, cancel := context.WithCancel(ctx)
 		defer cancel()
 		ctx.Go(func() error {
@@ -8,6 +8,7 @@ import (
 	"context"
 	"encoding/binary"
 	"errors"
+	"math/rand"
 	"sync"
 	"time"

@@ -22,10 +23,11 @@ import (
 var _ DB = (*CachingDB)(nil)

 // NewCachingDB creates a new CachingDB instance.
-func NewCachingDB(log *zap.Logger, peerID storj.NodeID, backingStore DB, reputationConfig Config) *CachingDB {
+func NewCachingDB(log *zap.Logger, backingStore DB, reputationConfig Config) *CachingDB {
+	randSource := rand.New(rand.NewSource(time.Now().UnixNano()))
 	return &CachingDB{
 		log:              log,
-		peerID:           peerID,
+		instanceOffset:   randSource.Uint64(),
 		backingStore:     backingStore,
 		nowFunc:          time.Now,
 		reputationConfig: reputationConfig,
@@ -43,7 +45,7 @@ type CachingDB struct {
 	// These fields must be populated before the cache starts being used.
 	// They are not expected to change.
 	log              *zap.Logger
-	peerID           storj.NodeID
+	instanceOffset   uint64
 	backingStore     DB
 	nowFunc          func() time.Time
 	reputationConfig Config
@@ -558,7 +560,7 @@ func (cdb *CachingDB) getEntriesToSync(now time.Time, f func(entryToSync *cached
 }

 func (cdb *CachingDB) nextTimeForSync(nodeID storj.NodeID, now time.Time) time.Time {
-	return nextTimeForSync(cdb.peerID, nodeID, now, cdb.syncInterval)
+	return nextTimeForSync(cdb.instanceOffset, nodeID, now, cdb.syncInterval)
 }

 // nextTimeForSync decides the next time at which the given nodeID should next
@@ -566,15 +568,14 @@ func (cdb *CachingDB) nextTimeForSync(nodeID storj.NodeID, now time.Time) time.T
 //
 // We make an effort to distribute the nodes in time, so that the service
 // is not usually trying to retrieve or update many rows at the same time. We
-// also make an effort to offset this sync schedule by the instance ID of this
-// process so that in most cases, instances will not be trying to update the
-// same row at the same time, minimizing contention.
-func nextTimeForSync(peerID, nodeID storj.NodeID, now time.Time, syncInterval time.Duration) time.Time {
+// also make an effort to offset this sync schedule by a random value unique
+// to this process so that in most cases, instances will not be trying to
+// update the same row at the same time, minimizing contention.
+func nextTimeForSync(instanceOffset uint64, nodeID storj.NodeID, now time.Time, syncInterval time.Duration) time.Time {
 	// calculate the fraction into the FlushInterval at which this node
 	// should always be synchronized.
 	initialPosition := binary.BigEndian.Uint64(nodeID[:8])
-	offset := binary.BigEndian.Uint64(peerID[:8])
-	finalPosition := initialPosition + offset
+	finalPosition := initialPosition + instanceOffset
 	positionAsFraction := float64(finalPosition) / (1 << 64)
 	// and apply that fraction to the actual interval
 	periodStart := now.Truncate(syncInterval)
@@ -68,7 +68,7 @@ func TestHeavyLockContention(t *testing.T) {
 		ErrorRetryInterval: 0,
 	}
 	reputationDB := db.Reputation()
-	writecacheDB := reputation.NewCachingDB(zaptest.NewLogger(t), testrand.NodeID(), reputationDB, config)
+	writecacheDB := reputation.NewCachingDB(zaptest.NewLogger(t), reputationDB, config)
 	var group errgroup.Group

 	// Make room for results ahead of time, so we don't need to use any
@@ -148,7 +148,7 @@ func TestFetchingInfoWhileEntryIsSyncing(t *testing.T) {
 	}
 	logger := zaptest.NewLogger(t)
 	reputationDB := db.Reputation()
-	writecache := reputation.NewCachingDB(logger.Named("writecache"), testrand.NodeID(), reputationDB, config)
+	writecache := reputation.NewCachingDB(logger.Named("writecache"), reputationDB, config)
 	const positiveAudits = 123

 	for i := 0; i < numRounds; i++ {
@@ -23,22 +23,28 @@ func TestNextTimeForSync(t *testing.T) {
 	var quarterwayID storj.NodeID
 	binary.BigEndian.PutUint64(quarterwayID[:8], 1<<62)

+	const (
+		zeroOffset       = 0
+		halfwayOffset    = 1 << 63
+		quarterwayOffset = 1 << 62
+	)
+
 	startOfHour := time.Now().Truncate(time.Hour)
 	now := startOfHour.Add(15 * time.Minute)

-	nextTime := nextTimeForSync(zeroesID, halfwayID, now, time.Hour)
+	nextTime := nextTimeForSync(zeroOffset, halfwayID, now, time.Hour)
 	requireInDeltaTime(t, startOfHour.Add(30*time.Minute), nextTime, time.Second)

-	nextTime = nextTimeForSync(halfwayID, zeroesID, now, time.Hour)
+	nextTime = nextTimeForSync(halfwayOffset, zeroesID, now, time.Hour)
 	requireInDeltaTime(t, startOfHour.Add(30*time.Minute), nextTime, time.Second)

-	nextTime = nextTimeForSync(zeroesID, zeroesID, now, time.Hour)
+	nextTime = nextTimeForSync(zeroOffset, zeroesID, now, time.Hour)
 	requireInDeltaTime(t, startOfHour.Add(time.Hour), nextTime, time.Second)

-	nextTime = nextTimeForSync(halfwayID, halfwayID, now, time.Hour)
+	nextTime = nextTimeForSync(halfwayOffset, halfwayID, now, time.Hour)
 	requireInDeltaTime(t, startOfHour.Add(time.Hour), nextTime, time.Second)

-	nextTime = nextTimeForSync(quarterwayID, halfwayID, now, time.Hour)
+	nextTime = nextTimeForSync(quarterwayOffset, halfwayID, now, time.Hour)
 	requireInDeltaTime(t, startOfHour.Add(45*time.Minute), nextTime, time.Second)
 }