satellite/orders: bound RollupsWriteCache flushes

When flushes cannot keep up with the rate of incoming writes, the
RollupsWriteCache will take every connection in the database pool and
hold them forever. Instead of doing that and taking down satellite
availability, bound the number of concurrent flush operations it
performs and drop incoming writes earlier to keep memory usage
constant.

Adds monitoring events for any flushes or updates that are lost.

Change-Id: I81b169b73501ee9b999f4b03d1e79645fc56f167
Jeff Wendling 2021-09-15 14:32:27 -04:00 committed by Yingrong Zhao
parent 86845351c3
commit b160ec4c1b
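
Conceptually, the change described above turns the cache into a drop-on-overflow buffer with a single in-flight flush: once the pending batch is full, writes for brand-new keys are discarded instead of growing the map, and a new flush only starts if one is not already running. A minimal, self-contained sketch of that pattern follows; the boundedCache type and its names are hypothetical, not the storj implementation.

package main

import (
	"fmt"
	"sync"
	"time"
)

// boundedCache is a hypothetical, stripped-down illustration of the pattern
// in this change: at most one background flush at a time, and writes for
// brand-new keys are dropped once the batch is full so memory stays bounded.
type boundedCache struct {
	mu       sync.Mutex
	pending  map[string]int64
	maxKeys  int
	flushing bool
}

func (c *boundedCache) add(key string, v int64) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Drop writes for unseen keys once the batch is full; existing keys
	// still accumulate, so the map never grows past maxKeys entries.
	if _, ok := c.pending[key]; !ok && len(c.pending) >= c.maxKeys {
		fmt.Println("dropped write for", key) // stands in for a monitoring event
		return
	}
	c.pending[key] += v

	// Start at most one background flush; while it runs, no further
	// batches are queued, which is what bounds database pool usage.
	if len(c.pending) >= c.maxKeys && !c.flushing {
		c.flushing = true
		batch := c.pending
		c.pending = make(map[string]int64, c.maxKeys)
		go c.flush(batch)
	}
}

func (c *boundedCache) flush(batch map[string]int64) {
	time.Sleep(time.Second) // stand-in for a slow database write of batch

	c.mu.Lock()
	c.flushing = false
	c.mu.Unlock()
}

func main() {
	c := &boundedCache{pending: make(map[string]int64), maxKeys: 2}
	c.add("a", 1)
	c.add("b", 1) // batch is full: one background flush starts, map resets
	c.add("c", 1)
	c.add("d", 1) // full again, but the first flush is still running: no new flush
	c.add("e", 1) // brand-new key while full: dropped
	time.Sleep(2 * time.Second)
}

The real cache does the same bookkeeping with RollupData, a batchSize threshold, and monkit events, as the diff below shows.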


@@ -42,9 +42,9 @@ type RollupsWriteCache struct {
 	mu                  sync.Mutex
 	pendingRollups      RollupData
-	currentSize         int
 	latestTime          time.Time
 	stopped             bool
+	flushing            bool
 
 	nextFlushCompletion *sync2.Fence
 }
@@ -78,23 +78,42 @@ func (cache *RollupsWriteCache) UpdateBucketBandwidthSettle(ctx context.Context,
 // resetCache should only be called after you have acquired the cache lock. It
 // will reset the various cache values and return the pendingRollups,
 // latestTime, and currentSize.
-func (cache *RollupsWriteCache) resetCache() (RollupData, time.Time, int) {
+func (cache *RollupsWriteCache) resetCache() (RollupData, time.Time) {
 	pendingRollups := cache.pendingRollups
 	cache.pendingRollups = make(RollupData)
 
-	oldSize := cache.currentSize
-	cache.currentSize = 0
-
 	latestTime := cache.latestTime
 	cache.latestTime = time.Time{}
 
-	return pendingRollups, latestTime, oldSize
+	return pendingRollups, latestTime
 }
 
 // Flush resets cache then flushes the everything in the rollups write cache to the database.
 func (cache *RollupsWriteCache) Flush(ctx context.Context) {
 	defer mon.Task()(&ctx)(nil)
 
 	cache.mu.Lock()
-	pendingRollups, latestTime, oldSize := cache.resetCache()
-	cache.mu.Unlock()
 
-	cache.flush(ctx, pendingRollups, latestTime, oldSize)
+	// while we're already flushing, wait for it to complete.
+	for cache.flushing {
+		done := cache.nextFlushCompletion.Done()
+		cache.mu.Unlock()
+
+		select {
+		case <-done:
+		case <-ctx.Done():
+			return
+		}
+
+		cache.mu.Lock()
+	}
+	cache.flushing = true
+
+	pendingRollups, latestTime := cache.resetCache()
+	cache.mu.Unlock()
+
+	cache.flush(ctx, pendingRollups, latestTime)
 }
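
The wait loop added to Flush leans on sync2.Fence: Done returns a channel that is closed when Release is called, so every waiter wakes at once and re-checks cache.flushing under the lock. A small hedged sketch of just that hand-off, assuming only the Done/Release behavior visible in this diff and the storj.io/common/sync2 import path this codebase normally uses.

package main

import (
	"fmt"
	"time"

	"storj.io/common/sync2"
)

func main() {
	// The zero value is usable, matching new(sync2.Fence) in the diff.
	fence := new(sync2.Fence)

	// A waiter blocks on the channel returned by Done, exactly like the
	// `for cache.flushing` loop in Flush above.
	go func() {
		<-fence.Done()
		fmt.Println("flush finished, waiter released")
	}()

	time.Sleep(100 * time.Millisecond) // stand-in for the flush work
	fence.Release()                    // closes Done, waking every waiter
	time.Sleep(100 * time.Millisecond) // give the waiter time to print
}

Selecting on ctx.Done() as well, as Flush does, keeps a caller from blocking forever if its context is canceled while a flush is still running.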
@@ -102,6 +121,7 @@ func (cache *RollupsWriteCache) CloseAndFlush(ctx context.Context) error {
 // CloseAndFlush flushes anything in the cache and marks the cache as stopped.
 func (cache *RollupsWriteCache) CloseAndFlush(ctx context.Context) error {
 	cache.mu.Lock()
 	cache.stopped = true
 	cache.mu.Unlock()
 
 	cache.wg.Wait()
 
 	cache.Flush(ctx)
@@ -109,10 +129,11 @@ func (cache *RollupsWriteCache) CloseAndFlush(ctx context.Context) error {
 }
 
 // flush flushes the everything in the rollups write cache to the database.
-func (cache *RollupsWriteCache) flush(ctx context.Context, pendingRollups RollupData, latestTime time.Time, oldSize int) {
+func (cache *RollupsWriteCache) flush(ctx context.Context, pendingRollups RollupData, latestTime time.Time) {
 	defer mon.Task()(&ctx)(nil)
 
-	rollups := make([]BucketBandwidthRollup, 0, oldSize)
-	for cacheKey, cacheData := range pendingRollups {
-		rollups = append(rollups, BucketBandwidthRollup{
-			ProjectID:  cacheKey.ProjectID,
+	if len(pendingRollups) > 0 {
+		rollups := make([]BucketBandwidthRollup, 0, len(pendingRollups))
+		for cacheKey, cacheData := range pendingRollups {
+			rollups = append(rollups, BucketBandwidthRollup{
+				ProjectID:  cacheKey.ProjectID,
@@ -126,14 +147,17 @@ func (cache *RollupsWriteCache) flush(ctx context.Context, pendingRollups Rollup
-	err := cache.DB.UpdateBucketBandwidthBatch(ctx, latestTime, rollups)
-	if err != nil {
-		cache.log.Error("MONEY LOST! Bucket bandwidth rollup batch flush failed.", zap.Error(err))
+		err := cache.DB.UpdateBucketBandwidthBatch(ctx, latestTime, rollups)
+		if err != nil {
+			mon.Event("rollups_write_cache_flush_lost")
+			cache.log.Error("MONEY LOST! Bucket bandwidth rollup batch flush failed.", zap.Error(err))
+		}
 	}
 
-	var completion *sync2.Fence
 	cache.mu.Lock()
-	cache.nextFlushCompletion, completion = new(sync2.Fence), cache.nextFlushCompletion
-	cache.mu.Unlock()
-	completion.Release()
+	defer cache.mu.Unlock()
+
+	cache.nextFlushCompletion.Release()
+	cache.nextFlushCompletion = new(sync2.Fence)
+	cache.flushing = false
 }
 
 func (cache *RollupsWriteCache) updateCacheValue(ctx context.Context, projectID uuid.UUID, bucketName []byte, action pb.PieceAction, allocated, inline, settled int64, intervalStart time.Time) error {
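
At the end of flush, the fence is released and immediately replaced under the lock, and flushing is cleared: releasing wakes both Flush callers parked in the wait loop and anyone blocked on OnNextFlush, while the fresh fence becomes the completion signal for the next cycle. The same hand-off can be written with a plain channel; the completion type below is a hypothetical, sync2-free sketch of that release-then-replace step.

package main

import "sync"

// completion is a hypothetical channel-only stand-in for nextFlushCompletion:
// closing the current channel wakes every waiter, and a fresh channel is
// installed for the next flush cycle.
type completion struct {
	mu      sync.Mutex
	current chan struct{}
}

// onNext returns a channel that is closed when the in-flight cycle finishes,
// mirroring OnNextFlush in the diff.
func (c *completion) onNext() <-chan struct{} {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.current == nil {
		c.current = make(chan struct{})
	}
	return c.current
}

// release wakes all current waiters and starts the next cycle, mirroring the
// release-then-replace step at the end of flush.
func (c *completion) release() {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.current == nil {
		c.current = make(chan struct{})
	}
	close(c.current)
	c.current = make(chan struct{})
}

func main() {
	var c completion
	done := c.onNext()
	go c.release()
	<-done // returns once release has closed the channel we were handed
}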
@@ -146,35 +170,43 @@ func (cache *RollupsWriteCache) updateCacheValue(ctx context.Context, projectID
 		return Error.New("RollupsWriteCache is stopped")
 	}
 
-	if intervalStart.After(cache.latestTime) {
-		cache.latestTime = intervalStart
-	}
-
 	key := CacheKey{
 		ProjectID:  projectID,
 		BucketName: string(bucketName),
 		Action:     action,
 	}
 
+	// prevent unbounded memory growth if we're not flushing fast enough
+	// to keep up with incoming writes.
 	data, ok := cache.pendingRollups[key]
-	if !ok {
-		cache.currentSize++
+	if !ok && len(cache.pendingRollups) >= cache.batchSize {
+		mon.Event("rollups_write_cache_update_lost")
+		cache.log.Error("MONEY LOST! Flushing too slow to keep up with demand.")
+	} else {
+		if cache.latestTime.IsZero() || intervalStart.After(cache.latestTime) {
+			cache.latestTime = intervalStart
+		}
+
+		data.Allocated += allocated
+		data.Inline += inline
+		data.Settled += settled
+		cache.pendingRollups[key] = data
 	}
-	data.Allocated += allocated
-	data.Inline += inline
-	data.Settled += settled
-	cache.pendingRollups[key] = data
 
-	if cache.currentSize < cache.batchSize {
+	if len(cache.pendingRollups) < cache.batchSize {
 		return nil
 	}
 
-	pendingRollups, latestTime, oldSize := cache.resetCache()
-
-	cache.wg.Add(1)
-	go func() {
-		cache.flush(ctx, pendingRollups, latestTime, oldSize)
-		cache.wg.Done()
-	}()
+	if !cache.flushing {
+		cache.flushing = true
+		pendingRollups, latestTime := cache.resetCache()
+
+		cache.wg.Add(1)
+		go func() {
+			defer cache.wg.Done()
+			cache.flush(ctx, pendingRollups, latestTime)
+		}()
+	}
 
 	return nil
 }
@@ -183,22 +215,24 @@ func (cache *RollupsWriteCache) updateCacheValue(ctx context.Context, projectID
 // the returned channel.
 func (cache *RollupsWriteCache) OnNextFlush() <-chan struct{} {
 	cache.mu.Lock()
-	fence := cache.nextFlushCompletion
-	cache.mu.Unlock()
-	return fence.Done()
+	defer cache.mu.Unlock()
+
+	return cache.nextFlushCompletion.Done()
 }
 
 // CurrentSize returns the current size of the cache.
 func (cache *RollupsWriteCache) CurrentSize() int {
 	cache.mu.Lock()
 	defer cache.mu.Unlock()
-	return cache.currentSize
+
+	return len(cache.pendingRollups)
 }
 
 // CurrentData returns the contents of the cache.
 func (cache *RollupsWriteCache) CurrentData() RollupData {
 	cache.mu.Lock()
 	defer cache.mu.Unlock()
+
 	copyCache := RollupData{}
 	for k, v := range cache.pendingRollups {
 		copyCache[k] = v