satellite/satellitedb: batch delete storage node tallies

Currently we have a significant number of tallies that need to be
deleted together. Add a limit (by default 10k) to how many will
be deleted at the same time.

Change-Id: If530383f19b4d3bb83ed5fe956610a2e52f130a1
This commit is contained in:
Egon Elbre 2022-07-20 11:10:29 +03:00
parent a9de5ce6c3
commit 82fede2132
7 changed files with 74 additions and 27 deletions

View File

@ -188,7 +188,7 @@ type StoragenodeAccounting interface {
// QueryStorageNodeUsage returns slice of StorageNodeUsage for given period // QueryStorageNodeUsage returns slice of StorageNodeUsage for given period
QueryStorageNodeUsage(ctx context.Context, nodeID storj.NodeID, start time.Time, end time.Time) ([]StorageNodeUsage, error) QueryStorageNodeUsage(ctx context.Context, nodeID storj.NodeID, start time.Time, end time.Time) ([]StorageNodeUsage, error)
// DeleteTalliesBefore deletes all tallies prior to some time // DeleteTalliesBefore deletes all tallies prior to some time
DeleteTalliesBefore(ctx context.Context, latestRollup time.Time) error DeleteTalliesBefore(ctx context.Context, latestRollup time.Time, batchSize int) error
// ArchiveRollupsBefore archives rollups older than a given time and returns num storagenode and bucket bandwidth rollups archived. // ArchiveRollupsBefore archives rollups older than a given time and returns num storagenode and bucket bandwidth rollups archived.
ArchiveRollupsBefore(ctx context.Context, before time.Time, batchSize int) (numArchivedNodeBW int, err error) ArchiveRollupsBefore(ctx context.Context, before time.Time, batchSize int) (numArchivedNodeBW int, err error)
// GetRollupsSince retrieves all archived bandwidth rollup records since a given time. A hard limit batch size is used for results. // GetRollupsSince retrieves all archived bandwidth rollup records since a given time. A hard limit batch size is used for results.

View File

@ -17,29 +17,32 @@ import (
// Config contains configurable values for rollup.
type Config struct {
	Interval               time.Duration `help:"how frequently rollup should run" releaseDefault:"24h" devDefault:"120s" testDefault:"$TESTINTERVAL"`
	DeleteTallies          bool          `help:"option for deleting tallies after they are rolled up" default:"true"`
	DeleteTalliesBatchSize int           `help:"how many tallies to delete in a batch" default:"10000"`
}
// Service is the rollup service for totalling data on storage nodes on daily intervals. // Service is the rollup service for totalling data on storage nodes on daily intervals.
// //
// architecture: Chore // architecture: Chore
type Service struct { type Service struct {
logger *zap.Logger logger *zap.Logger
Loop *sync2.Cycle Loop *sync2.Cycle
sdb accounting.StoragenodeAccounting sdb accounting.StoragenodeAccounting
deleteTallies bool deleteTallies bool
OrderExpiration time.Duration deleteTalliesBatchSize int
OrderExpiration time.Duration
} }
// New creates a new rollup service. // New creates a new rollup service.
func New(logger *zap.Logger, sdb accounting.StoragenodeAccounting, interval time.Duration, deleteTallies bool, orderExpiration time.Duration) *Service { func New(logger *zap.Logger, sdb accounting.StoragenodeAccounting, config Config, orderExpiration time.Duration) *Service {
return &Service{ return &Service{
logger: logger, logger: logger,
Loop: sync2.NewCycle(interval), Loop: sync2.NewCycle(config.Interval),
sdb: sdb, sdb: sdb,
deleteTallies: deleteTallies, deleteTallies: config.DeleteTallies,
OrderExpiration: orderExpiration, deleteTalliesBatchSize: config.DeleteTalliesBatchSize,
OrderExpiration: orderExpiration,
} }
} }
@ -101,7 +104,7 @@ func (r *Service) Rollup(ctx context.Context) (err error) {
if r.deleteTallies { if r.deleteTallies {
// Delete already rolled up tallies // Delete already rolled up tallies
latestTally = latestTally.Add(-r.OrderExpiration) latestTally = latestTally.Add(-r.OrderExpiration)
err = r.sdb.DeleteTalliesBefore(ctx, latestTally) err = r.sdb.DeleteTalliesBefore(ctx, latestTally, r.deleteTalliesBatchSize)
if err != nil { if err != nil {
return Error.Wrap(err) return Error.Wrap(err)
} }

View File

@ -46,7 +46,7 @@ func TestRollupNoDeletes(t *testing.T) {
storageNodes = createNodes(ctx, t, db) storageNodes = createNodes(ctx, t, db)
) )
rollupService := rollup.New(testplanet.NewLogger(t), snAccountingDB, 120*time.Second, false, time.Hour) rollupService := rollup.New(testplanet.NewLogger(t), snAccountingDB, rollup.Config{Interval: 120 * time.Second}, time.Hour)
// disqualifying nodes is unrelated to this test, but it is added here // disqualifying nodes is unrelated to this test, but it is added here
// to confirm the disqualification shows up in the accounting CSVRow // to confirm the disqualification shows up in the accounting CSVRow
@ -147,7 +147,7 @@ func TestRollupDeletes(t *testing.T) {
storageNodes = createNodes(ctx, t, db) storageNodes = createNodes(ctx, t, db)
) )
rollupService := rollup.New(testplanet.NewLogger(t), snAccountingDB, 120*time.Second, true, time.Hour) rollupService := rollup.New(testplanet.NewLogger(t), snAccountingDB, rollup.Config{Interval: 120 * time.Second, DeleteTallies: true}, time.Hour)
// disqualifying nodes is unrelated to this test, but it is added here // disqualifying nodes is unrelated to this test, but it is added here
// to confirm the disqualification shows up in the accounting CSVRow // to confirm the disqualification shows up in the accounting CSVRow

View File

@ -17,7 +17,6 @@ import (
"storj.io/common/testrand" "storj.io/common/testrand"
"storj.io/common/uuid" "storj.io/common/uuid"
"storj.io/storj/private/testplanet" "storj.io/storj/private/testplanet"
"storj.io/storj/private/teststorj"
"storj.io/storj/satellite/accounting" "storj.io/storj/satellite/accounting"
"storj.io/storj/satellite/accounting/tally" "storj.io/storj/satellite/accounting/tally"
"storj.io/storj/satellite/metabase" "storj.io/storj/satellite/metabase"
@ -30,7 +29,7 @@ func TestDeleteTalliesBefore(t *testing.T) {
}{ }{
{ {
eraseBefore: time.Now(), eraseBefore: time.Now(),
expectedRaws: 1, expectedRaws: 3,
}, },
{ {
eraseBefore: time.Now().Add(24 * time.Hour), eraseBefore: time.Now().Add(24 * time.Hour),
@ -43,14 +42,15 @@ func TestDeleteTalliesBefore(t *testing.T) {
testplanet.Run(t, testplanet.Config{ testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 0, UplinkCount: 0, SatelliteCount: 1, StorageNodeCount: 0, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) { }, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
id := teststorj.NodeIDFromBytes([]byte{})
nodeData := make(map[storj.NodeID]float64) nodeData := make(map[storj.NodeID]float64)
nodeData[id] = float64(1000) nodeData[storj.NodeID{1}] = float64(1000)
nodeData[storj.NodeID{2}] = float64(1000)
nodeData[storj.NodeID{3}] = float64(1000)
err := planet.Satellites[0].DB.StoragenodeAccounting().SaveTallies(ctx, time.Now(), nodeData) err := planet.Satellites[0].DB.StoragenodeAccounting().SaveTallies(ctx, time.Now(), nodeData)
require.NoError(t, err) require.NoError(t, err)
err = planet.Satellites[0].DB.StoragenodeAccounting().DeleteTalliesBefore(ctx, test.eraseBefore) err = planet.Satellites[0].DB.StoragenodeAccounting().DeleteTalliesBefore(ctx, test.eraseBefore, 1)
require.NoError(t, err) require.NoError(t, err)
raws, err := planet.Satellites[0].DB.StoragenodeAccounting().GetTallies(ctx) raws, err := planet.Satellites[0].DB.StoragenodeAccounting().GetTallies(ctx)

View File

@ -472,7 +472,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB,
// Lets add 1 more day so we catch any off by one errors when deleting tallies // Lets add 1 more day so we catch any off by one errors when deleting tallies
orderExpirationPlusDay := config.Orders.Expiration + config.Rollup.Interval orderExpirationPlusDay := config.Orders.Expiration + config.Rollup.Interval
peer.Accounting.Rollup = rollup.New(peer.Log.Named("accounting:rollup"), peer.DB.StoragenodeAccounting(), config.Rollup.Interval, config.Rollup.DeleteTallies, orderExpirationPlusDay) peer.Accounting.Rollup = rollup.New(peer.Log.Named("accounting:rollup"), peer.DB.StoragenodeAccounting(), config.Rollup, orderExpirationPlusDay)
peer.Services.Add(lifecycle.Item{ peer.Services.Add(lifecycle.Item{
Name: "accounting:rollup", Name: "accounting:rollup",
Run: peer.Accounting.Rollup.Run, Run: peer.Accounting.Rollup.Run,

View File

@ -519,11 +519,52 @@ func (db *StoragenodeAccounting) QueryStorageNodeUsage(ctx context.Context, node
} }
// DeleteTalliesBefore deletes all raw tallies prior to some time. // DeleteTalliesBefore deletes all raw tallies prior to some time.
func (db *StoragenodeAccounting) DeleteTalliesBefore(ctx context.Context, latestRollup time.Time) (err error) { func (db *StoragenodeAccounting) DeleteTalliesBefore(ctx context.Context, latestRollup time.Time, batchSize int) (err error) {
defer mon.Task()(&ctx)(&err) defer mon.Task()(&ctx)(&err)
deleteRawSQL := `DELETE FROM storagenode_storage_tallies WHERE interval_end_time < ?`
_, err = db.db.DB.ExecContext(ctx, db.db.Rebind(deleteRawSQL), latestRollup) if batchSize <= 0 {
return err batchSize = 10000
}
var query string
switch db.db.impl {
case dbutil.Cockroach:
query = `
DELETE FROM storagenode_storage_tallies
WHERE interval_end_time < ?
LIMIT ?`
case dbutil.Postgres:
query = `
DELETE FROM storagenode_storage_tallies
WHERE ctid IN (
SELECT ctid
FROM storagenode_storage_tallies
WHERE interval_end_time < ?
ORDER BY interval_end_time
LIMIT ?
)`
default:
return Error.New("unsupported database: %v", db.db.impl)
}
query = db.db.Rebind(query)
for {
res, err := db.db.DB.ExecContext(ctx, query, latestRollup, batchSize)
if err != nil {
if errs.Is(err, sql.ErrNoRows) {
return nil
}
return Error.Wrap(err)
}
affected, err := res.RowsAffected()
if err != nil {
return Error.Wrap(err)
}
if affected == 0 {
return nil
}
}
} }
// ArchiveRollupsBefore archives rollups older than a given time. // ArchiveRollupsBefore archives rollups older than a given time.

View File

@ -817,6 +817,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
# option for deleting tallies after they are rolled up # option for deleting tallies after they are rolled up
# rollup.delete-tallies: true # rollup.delete-tallies: true
# how many tallies to delete in a batch
# rollup.delete-tallies-batch-size: 10000
# how frequently rollup should run # how frequently rollup should run
# rollup.interval: 24h0m0s # rollup.interval: 24h0m0s