storj/storagenode/pieces/cache.go
Cameron ab1147afb6
storagenode/pieces: fix race condition in cache service (#2972)
* add mutex lock to PersistCacheTotals, move lock around copyCacheTotals to inside function
2019-09-12 12:42:39 -04:00

302 lines
9.8 KiB
Go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package pieces
import (
"context"
"sync"
"time"
"go.uber.org/zap"
"storj.io/storj/internal/sync2"
"storj.io/storj/pkg/storj"
"storj.io/storj/storage"
)
// CacheService updates the space used cache
//
// architecture: Chore
type CacheService struct {
log *zap.Logger
usageCache *BlobsUsageCache
store *Store
loop sync2.Cycle
}
// NewService creates a new cache service that updates the space usage cache on startup and syncs the cache values to
// persistent storage on an interval
func NewService(log *zap.Logger, usageCache *BlobsUsageCache, pieces *Store, interval time.Duration) *CacheService {
return &CacheService{
log: log,
usageCache: usageCache,
store: pieces,
loop: *sync2.NewCycle(interval),
}
}
// Run recalculates the space used cache once and also runs a loop to sync the space used cache
// to persistent storage on an interval
func (service *CacheService) Run(ctx context.Context) (err error) {
defer mon.Task()(&ctx)(&err)
totalAtStart := service.usageCache.copyCacheTotals()
// recalculate the cache once
newTotal, newTotalBySatellite, err := service.store.SpaceUsedTotalAndBySatellite(ctx)
if err != nil {
service.log.Error("error getting current space used calculation: ", zap.Error(err))
}
if err = service.usageCache.Recalculate(ctx, newTotal,
totalAtStart.totalSpaceUsed,
newTotalBySatellite,
totalAtStart.totalSpaceUsedBySatellite,
); err != nil {
service.log.Error("error during recalculating space usage cache: ", zap.Error(err))
}
if err = service.store.spaceUsedDB.Init(ctx); err != nil {
service.log.Error("error during init space usage db: ", zap.Error(err))
}
return service.loop.Run(ctx, func(ctx context.Context) (err error) {
defer mon.Task()(&ctx)(&err)
// on a loop sync the cache values to the db so that we have the them saved
// in the case that the storagenode restarts
if err := service.PersistCacheTotals(ctx); err != nil {
service.log.Error("error persisting cache totals to the database: ", zap.Error(err))
}
return err
})
}
// PersistCacheTotals saves the current totals of the space used cache to the database
// so that if the storagenode restarts it can retrieve the latest space used
// values without needing to recalculate since that could take a long time
func (service *CacheService) PersistCacheTotals(ctx context.Context) error {
cache := service.usageCache
cache.mu.Lock()
defer cache.mu.Unlock()
if err := service.store.spaceUsedDB.UpdateTotal(ctx, cache.totalSpaceUsed); err != nil {
return err
}
if err := service.store.spaceUsedDB.UpdateTotalsForAllSatellites(ctx, cache.totalSpaceUsedBySatellite); err != nil {
return err
}
return nil
}
// Init initializes the space used cache with the most recent values that were stored persistently
func (service *CacheService) Init(ctx context.Context) (err error) {
total, err := service.store.spaceUsedDB.GetTotal(ctx)
if err != nil {
service.log.Error("CacheServiceInit error during initializing space usage cache GetTotal:", zap.Error(err))
return err
}
totalBySatellite, err := service.store.spaceUsedDB.GetTotalsForAllSatellites(ctx)
if err != nil {
service.log.Error("CacheServiceInit error during initializing space usage cache GetTotalsForAllSatellites:", zap.Error(err))
return err
}
service.usageCache.init(total, totalBySatellite)
return nil
}
// Close closes the loop
func (service *CacheService) Close() (err error) {
service.loop.Close()
return nil
}
// BlobsUsageCache is a blob storage with a cache for storing
// totals of current space used
//
// architecture: Database
type BlobsUsageCache struct {
storage.Blobs
mu sync.Mutex
totalSpaceUsed int64
totalSpaceUsedBySatellite map[storj.NodeID]int64
}
// NewBlobsUsageCache creates a new disk blob store with a space used cache
func NewBlobsUsageCache(blob storage.Blobs) *BlobsUsageCache {
return &BlobsUsageCache{
Blobs: blob,
totalSpaceUsedBySatellite: map[storj.NodeID]int64{},
}
}
// NewBlobsUsageCacheTest creates a new disk blob store with a space used cache
func NewBlobsUsageCacheTest(blob storage.Blobs, total int64, totalSpaceUsedBySatellite map[storj.NodeID]int64) *BlobsUsageCache {
return &BlobsUsageCache{
Blobs: blob,
totalSpaceUsed: total,
totalSpaceUsedBySatellite: totalSpaceUsedBySatellite,
}
}
func (blobs *BlobsUsageCache) init(total int64, totalBySatellite map[storj.NodeID]int64) {
blobs.mu.Lock()
defer blobs.mu.Unlock()
blobs.totalSpaceUsed = total
blobs.totalSpaceUsedBySatellite = totalBySatellite
}
// SpaceUsedBySatellite returns the current total space used for a specific
// satellite for all pieces (not including header bytes)
func (blobs *BlobsUsageCache) SpaceUsedBySatellite(ctx context.Context, satelliteID storj.NodeID) (int64, error) {
blobs.mu.Lock()
defer blobs.mu.Unlock()
return blobs.totalSpaceUsedBySatellite[satelliteID], nil
}
// SpaceUsedForPieces returns the current total used space for
//// all pieces content (not including header bytes)
func (blobs *BlobsUsageCache) SpaceUsedForPieces(ctx context.Context) (int64, error) {
blobs.mu.Lock()
defer blobs.mu.Unlock()
return blobs.totalSpaceUsed, nil
}
// Delete gets the size of the piece that is going to be deleted then deletes it and
// updates the space used cache accordingly
func (blobs *BlobsUsageCache) Delete(ctx context.Context, blobRef storage.BlobRef) error {
blobInfo, err := blobs.Stat(ctx, blobRef)
if err != nil {
return err
}
pieceAccess, err := newStoredPieceAccess(nil, blobInfo)
if err != nil {
return err
}
pieceContentSize, err := pieceAccess.ContentSize(ctx)
if err != nil {
return Error.Wrap(err)
}
if err := blobs.Blobs.Delete(ctx, blobRef); err != nil {
return Error.Wrap(err)
}
satelliteID := storj.NodeID{}
copy(satelliteID[:], blobRef.Namespace)
blobs.Update(ctx, satelliteID, -pieceContentSize)
return nil
}
// Update updates the cache totals with the piece content size
func (blobs *BlobsUsageCache) Update(ctx context.Context, satelliteID storj.NodeID, pieceContentSize int64) {
blobs.mu.Lock()
defer blobs.mu.Unlock()
blobs.totalSpaceUsed += pieceContentSize
blobs.totalSpaceUsedBySatellite[satelliteID] += pieceContentSize
}
func (blobs *BlobsUsageCache) copyCacheTotals() BlobsUsageCache {
blobs.mu.Lock()
defer blobs.mu.Unlock()
var copyMap = map[storj.NodeID]int64{}
for k, v := range blobs.totalSpaceUsedBySatellite {
copyMap[k] = v
}
return BlobsUsageCache{
totalSpaceUsed: blobs.totalSpaceUsed,
totalSpaceUsedBySatellite: copyMap,
}
}
// Recalculate estimates new totals for the space used cache. In order to get new totals for the
// space used cache, we had to iterate over all the pieces on disk. Since that can potentially take
// a long time, here we need to check if we missed any additions/deletions while we were iterating and
// estimate how many bytes missed then add those to the space used result of iteration.
func (blobs *BlobsUsageCache) Recalculate(ctx context.Context, newTotal, totalAtIterationStart int64, newTotalBySatellite, totalBySatelliteAtIterationStart map[storj.NodeID]int64) error {
totalsAtIterationEnd := blobs.copyCacheTotals()
estimatedTotals := estimate(newTotal,
totalAtIterationStart,
totalsAtIterationEnd.totalSpaceUsed,
)
var estimatedTotalsBySatellite = map[storj.NodeID]int64{}
for ID, newTotal := range newTotalBySatellite {
estimatedNewTotal := estimate(newTotal,
totalBySatelliteAtIterationStart[ID],
totalsAtIterationEnd.totalSpaceUsedBySatellite[ID],
)
// if the estimatedNewTotal is zero then there is no data stored
// for this satelliteID so don't add it to the cache
if estimatedNewTotal == 0 {
continue
}
estimatedTotalsBySatellite[ID] = estimatedNewTotal
}
// find any saIDs that are in totalsAtIterationEnd but not in newTotalSpaceUsedBySatellite
missedWhenIterationEnded := getMissed(totalsAtIterationEnd.totalSpaceUsedBySatellite,
newTotalBySatellite,
)
if len(missedWhenIterationEnded) > 0 {
for ID := range missedWhenIterationEnded {
estimatedNewTotal := estimate(0,
totalBySatelliteAtIterationStart[ID],
totalsAtIterationEnd.totalSpaceUsedBySatellite[ID],
)
if estimatedNewTotal == 0 {
continue
}
estimatedTotalsBySatellite[ID] = estimatedNewTotal
}
}
blobs.mu.Lock()
blobs.totalSpaceUsed = estimatedTotals
blobs.totalSpaceUsedBySatellite = estimatedTotalsBySatellite
blobs.mu.Unlock()
return nil
}
func estimate(newSpaceUsedTotal, totalAtIterationStart, totalAtIterationEnd int64) int64 {
if newSpaceUsedTotal == totalAtIterationEnd {
return newSpaceUsedTotal
}
// If we missed writes/deletes while iterating, we will assume that half of those missed occurred before
// the iteration and half occurred after. So here we add half of the delta to the result space used totals
// from the iteration to account for those missed.
spaceUsedDeltaDuringIteration := totalAtIterationEnd - totalAtIterationStart
estimatedTotal := newSpaceUsedTotal + (spaceUsedDeltaDuringIteration / 2)
if estimatedTotal < 0 {
return 0
}
return estimatedTotal
}
func getMissed(endTotals, newTotals map[storj.NodeID]int64) map[storj.NodeID]int64 {
var missed = map[storj.NodeID]int64{}
for id, total := range endTotals {
if _, ok := newTotals[id]; !ok {
missed[id] = total
}
}
return missed
}
// Close satisfies the pieces interface
func (blobs *BlobsUsageCache) Close() error {
return nil
}
// TestCreateV0 creates a new V0 blob that can be written. This is only appropriate in test situations.
func (blobs *BlobsUsageCache) TestCreateV0(ctx context.Context, ref storage.BlobRef) (_ storage.BlobWriter, err error) {
fStore := blobs.Blobs.(interface {
TestCreateV0(ctx context.Context, ref storage.BlobRef) (_ storage.BlobWriter, err error)
})
return fStore.TestCreateV0(ctx, ref)
}