// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package retain

import (
	"context"
	"os"
	"runtime"
	"sync"
	"time"

	"github.com/spacemonkeygo/monkit/v3"
	"github.com/zeebo/errs"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"

	"storj.io/common/bloomfilter"
	"storj.io/common/storj"
	"storj.io/storj/storagenode/pieces"
)

var (
	mon = monkit.Package()

	// Error is the default error class for retain errors.
	Error = errs.Class("retain")
)

// Config defines parameters for the retain service.
type Config struct {
	MaxTimeSkew time.Duration `help:"allows for small differences in the satellite and storagenode clocks" default:"72h0m0s"`
	Status      Status        `help:"allows configuration to enable, disable, or test retain requests from the satellite. Options: (disabled/enabled/debug)" default:"enabled"`
	Concurrency int           `help:"how many concurrent retain requests can be processed at the same time." default:"5"`
}
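
// When Config is embedded in the storagenode configuration, these fields surface
// as flags derived from the field names and the `help`/`default` struct tags. The
// exact flag names below are an assumed rendering for illustration only; the real
// prefix depends on where Config is mounted in the larger config struct:
//
//	--retain.max-time-skew=72h0m0s
//	--retain.status=debug
//	--retain.concurrency=5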

// Request contains all the info necessary to process a retain request.
type Request struct {
	SatelliteID   storj.NodeID
	CreatedBefore time.Time
	Filter        *bloomfilter.Filter
}
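
// As an illustrative sketch only (not part of the production flow): a test might
// build a Request by hand. The bloomfilter.NewOptimal constructor, the parameter
// values, and the pieceID/satelliteID variables below are assumptions for the
// example, not taken from this file.
//
//	filter := bloomfilter.NewOptimal(10000, 0.1) // expected piece count, false-positive rate
//	filter.Add(pieceID)                          // mark pieces that must be retained
//	req := Request{
//		SatelliteID:   satelliteID,
//		CreatedBefore: time.Now(),
//		Filter:        filter,
//	}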

// Status is a type defining the enabled/disabled status of retain requests.
type Status uint32

const (
	// Disabled means we do not do anything with retain requests.
	Disabled Status = iota + 1
	// Enabled means we fully enable retain requests and delete data not defined by bloom filter.
	Enabled
	// Debug means we partially enable retain requests, and print out pieces we should delete, without actually deleting them.
	Debug
)

// Set implements pflag.Value.
func (v *Status) Set(s string) error {
	switch s {
	case "disabled":
		*v = Disabled
	case "enabled":
		*v = Enabled
	case "debug":
		*v = Debug
	default:
		return Error.New("invalid status %q", s)
	}
	return nil
}

// Type implements pflag.Value.
func (*Status) Type() string { return "storj.Status" }

// String implements pflag.Value.
func (v *Status) String() string {
	switch *v {
	case Disabled:
		return "disabled"
	case Enabled:
		return "enabled"
	case Debug:
		return "debug"
	default:
		return "invalid"
	}
}
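
// Because Status implements pflag.Value, it can be registered directly as a
// command-line flag. A minimal sketch, assuming a standalone pflag.FlagSet and
// the flag name shown here (the real binary wires this up through its own config
// machinery):
//
//	var status Status
//	flags := pflag.NewFlagSet("retain", pflag.ContinueOnError)
//	flags.Var(&status, "retain.status", "retain request handling (disabled/enabled/debug)")
//	_ = flags.Parse([]string{"--retain.status=debug"})
//	fmt.Println(status.String()) // "debug"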

// Service queues and processes retain requests from satellites.
//
// architecture: Worker
type Service struct {
	log    *zap.Logger
	config Config

	cond    sync.Cond
	queued  map[storj.NodeID]Request
	working map[storj.NodeID]struct{}
	group   errgroup.Group

	closedOnce sync.Once
	closed     chan struct{}
	started    bool

	store *pieces.Store
}

// NewService creates a new retain service.
func NewService(log *zap.Logger, store *pieces.Store, config Config) *Service {
	return &Service{
		log:     log,
		config:  config,
		cond:    *sync.NewCond(&sync.Mutex{}),
		queued:  make(map[storj.NodeID]Request),
		working: make(map[storj.NodeID]struct{}),
		closed:  make(chan struct{}),
		store:   store,
	}
}

// Queue adds a retain request to the queue. It discards a request for a
// satellite that already has a queued request. It returns true if the request
// was queued and false if it was discarded.
func (s *Service) Queue(req Request) bool {
	s.cond.L.Lock()
	defer s.cond.L.Unlock()

	select {
	case <-s.closed:
		return false
	default:
	}

	s.queued[req.SatelliteID] = req
	s.cond.Broadcast()
	return true
}
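
// A minimal usage sketch of the service lifecycle (error handling elided; the
// logger, pieces store, context, and request construction are assumed to come
// from the caller):
//
//	svc := NewService(log, store, Config{Status: Enabled, Concurrency: 5, MaxTimeSkew: 72 * time.Hour})
//	go func() { _ = svc.Run(ctx) }()
//	queued := svc.Queue(req) // false if the service is already closed
//	_ = queued
//	defer func() { _ = svc.Close() }()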

// Run listens for queued retain requests and processes them as they come in.
func (s *Service) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	// Hold the lock while we spawn the workers because a concurrent Close call
	// can race and wait for them. We later temporarily drop the lock while we
	// wait for the workers to exit.
	s.cond.L.Lock()
	defer s.cond.L.Unlock()

	// Ensure Run is only ever called once; otherwise there are many subtle
	// concurrency bugs.
	if s.started {
		return Error.New("service already started")
	}
	s.started = true

	// Ensure Run doesn't start after the service has been closed; otherwise we
	// may leak workers.
	select {
	case <-s.closed:
		return Error.New("service Closed")
	default:
	}

	// Create a sub-context that we can cancel.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Start a goroutine that does most of the work of Close in the case
	// the context is canceled and broadcasts to anyone waiting on the condition
	// variable to check the state.
	s.group.Go(func() error {
		defer s.cond.Broadcast()
		defer cancel()

		select {
		case <-s.closed:
			return nil
		case <-ctx.Done():
			s.cond.L.Lock()
			s.closedOnce.Do(func() { close(s.closed) })
			s.cond.L.Unlock()
			return ctx.Err()
		}
	})
2019-08-28 21:35:25 +01:00
for i := 0 ; i < s . config . Concurrency ; i ++ {
s . group . Go ( func ( ) error {
// Grab lock to check things.
s . cond . L . Lock ( )
defer s . cond . L . Unlock ( )
for {
// If we have closed, exit.
select {
case <- ctx . Done ( ) :
return ctx . Err ( )
case <- s . closed :
return nil
default :
}
2019-08-19 19:52:47 +01:00
2019-08-28 21:35:25 +01:00
// Grab next item from queue.
request , ok := s . next ( )
if ! ok {
// Nothing in queue, go to sleep and wait for
// things shutting down or next item.
s . cond . Wait ( )
continue
}
// Temporarily Unlock so others can work on the queue while
// we're working on the retain request.
s . cond . L . Unlock ( )
// Signal to anyone waiting that the queue state changed.
s . cond . Broadcast ( )
// Run retaining process.
err := s . retainPieces ( ctx , request )
if err != nil {
s . log . Error ( "retain pieces failed" , zap . Error ( err ) )
}
2019-08-19 19:52:47 +01:00
2019-08-28 21:35:25 +01:00
// Mark the request as finished. Relock to maintain that
// at the top of the for loop the lock is held.
s . cond . L . Lock ( )
s . finish ( request )
s . cond . Broadcast ( )
}
} )
2019-08-19 19:52:47 +01:00
}

	// Unlock while we wait for the workers to exit.
	s.cond.L.Unlock()

	err = s.group.Wait()

	s.cond.L.Lock()

	// Clear the queue after Wait has exited. We're sure no more entries can be
	// added after we acquire the mutex, because the goroutine spawned above
	// ensures the closed channel is closed before the group exits.
	s.queued = nil
	s.cond.Broadcast()

	return err
}

// next returns the next item from the queue; requires the mutex to be held.
func (s *Service) next() (Request, bool) {
	for id, request := range s.queued {
		// If a worker is already retaining for this satellite, try to get
		// something else from the queue.
		if _, ok := s.working[request.SatelliteID]; ok {
			continue
		}

		delete(s.queued, id)

		// Mark this satellite as being worked on.
		s.working[request.SatelliteID] = struct{}{}
		return request, true
	}
	return Request{}, false
}

// finish marks the request as finished; requires the mutex to be held.
func (s *Service) finish(request Request) {
	delete(s.working, request.SatelliteID)
}

// Close causes any pending Run to exit and waits for any retain requests to
// clean up.
func (s *Service) Close() error {
	s.cond.L.Lock()
	s.closedOnce.Do(func() { close(s.closed) })
	s.cond.L.Unlock()
	s.cond.Broadcast()

	// ignoring error here, because the same error is already returned from Run.
	_ = s.group.Wait()
	return nil
}

// TestWaitUntilEmpty blocks until both the queue and the working set are empty.
// When Run exits, it empties the queue.
func (s *Service) TestWaitUntilEmpty() {
	s.cond.L.Lock()
	defer s.cond.L.Unlock()

	for len(s.queued) > 0 || len(s.working) > 0 {
		s.cond.Wait()
	}
}
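
// In tests, a caller might combine Queue with TestWaitUntilEmpty to block until a
// request has been fully processed. A sketch, under the assumption that Run is
// already executing in another goroutine and req was built by the caller:
//
//	if svc.Queue(req) {
//		svc.TestWaitUntilEmpty()
//	}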

// Status returns the retain status.
func (s *Service) Status() Status {
	return s.config.Status
}

// ------------------------------------------------------------------------------------------------
// On the correctness of using access.ModTime() in place of the more precise access.CreationTime()
// in retainPieces():
// ------------------------------------------------------------------------------------------------
//
// Background: for pieces not stored with storage.FormatV0, the access.CreationTime() value can
// only be retrieved by opening the piece file, and reading and unmarshaling the piece header.
// This is far slower than access.ModTime(), which gets the file modification time from the file
// system and only needs to do a stat(2) on the piece file. If we can make Retain() work with
// ModTime, we should.
//
// Possibility of mismatch: We do not force or require piece file modification times to be equal to
// or close to the CreationTime specified by the uplink, but we do expect that piece files will be
// written to the filesystem _after_ the CreationTime. We make the assumption already that storage
// nodes and satellites and uplinks have system clocks that are very roughly in sync (that is, they
// are out of sync with each other by less than an hour of real time, or whatever is configured as
// MaxTimeSkew). So if an uplink is not lying about CreationTime and it uploads a piece that
// makes it to a storagenode's disk as quickly as possible, even in the worst-synchronized-clocks
// case we can assume that `ModTime > (CreationTime - MaxTimeSkew)`. We also allow for storage
// node operators doing file system manipulations after a piece has been written. If piece files
// are copied between volumes and their attributes are not preserved, it will be possible for their
// modification times to be changed to something later in time. This still preserves the inequality
// relationship mentioned above, `ModTime > (CreationTime - MaxTimeSkew)`. We only stipulate
// that storage node operators must not artificially change blob file modification times to be in
// the past.
//
// If there is a mismatch: in most cases, a mismatch between ModTime and CreationTime has no
// effect. In certain remaining cases, the only effect is that a piece file which _should_ be
// garbage collected survives until the next round of garbage collection. The only really
// problematic case is when there is a relatively new piece file which was created _after_ this
// node's Retain bloom filter started being built on the satellite, and is recorded in this
// storage node's blob store before the Retain operation has completed. Then, it might be possible
// for that new piece to be garbage collected incorrectly, because it does not show up in the
// bloom filter and the node incorrectly thinks that it was created before the bloom filter.
// But if the uplink is not lying about CreationTime and its clock drift versus the storage node
// is less than `MaxTimeSkew`, and the ModTime on a blob file is correctly set from the
// storage node system time, then it is still true that `ModTime > (CreationTime -
// MaxTimeSkew)`.
//
// The rule that storage node operators need to be aware of is only this: do not artificially set
// mtimes on blob files to be in the past. Let the filesystem manage mtimes. If blob files need to
// be moved or copied between locations, and this updates the mtime, that is ok. A secondary effect
// of this rule is that if the storage node's system clock needs to be changed forward by a
// nontrivial amount, mtimes on existing blobs should also be adjusted (by the same interval,
// ideally, but just running "touch" on all blobs is sufficient to avoid incorrect deletion of
// data).
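//
// As an illustrative, worked instance of that bound (the concrete times below are
// made up for the example, not taken from this code): with the default MaxTimeSkew
// of 72h and a satellite-provided CreatedBefore of Jan 10 12:00 UTC, retainPieces
// only considers a blob for trash when its mtime is before
// CreatedBefore - MaxTimeSkew = Jan 7 12:00 UTC; any blob written after that
// instant is kept until a later bloom filter covers it.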
func (s *Service) retainPieces(ctx context.Context, req Request) (err error) {
	// if retain status is disabled, return immediately
	if s.config.Status == Disabled {
		return nil
	}

	defer mon.Task()(&ctx, req.SatelliteID, req.CreatedBefore)(&err)

	var piecesCount int64
	var piecesSkipped int64
	var piecesToDeleteCount int64

	numDeleted := 0
	satelliteID := req.SatelliteID
	filter := req.Filter

	// subtract some time to leave room for clock difference between the satellite and storage node
	createdBefore := req.CreatedBefore.Add(-s.config.MaxTimeSkew)

	started := time.Now().UTC()
	filterHashCount, _ := req.Filter.Parameters()
	mon.IntVal("garbage_collection_created_before").Observe(createdBefore.Unix())
	mon.IntVal("garbage_collection_filter_hash_count").Observe(int64(filterHashCount))
	mon.IntVal("garbage_collection_filter_size").Observe(filter.Size())
	mon.IntVal("garbage_collection_started").Observe(started.Unix())

	s.log.Info("Prepared to run a Retain request.",
		zap.Time("Created Before", createdBefore),
		zap.Int64("Filter Size", filter.Size()),
		zap.Stringer("Satellite ID", satelliteID))

	err = s.store.WalkSatellitePieces(ctx, satelliteID, func(access pieces.StoredPieceAccess) (err error) {
		defer mon.Task()(&ctx)(&err)
		piecesCount++

		// We call Gosched() when done because the GC process is expected to be long and we want to keep it at low priority,
		// so other goroutines can continue serving requests.
		defer runtime.Gosched()

		pieceID := access.PieceID()
		if filter.Contains(pieceID) {
			// This piece is explicitly not trash. Move on.
			return nil
		}

		// If the blob's mtime is at or after the createdBefore line, we can't safely delete it;
		// it might not be trash. If it is, we can expect to get it next time.
		//
		// See the comment above the retainPieces() function for a discussion on the correctness
		// of using ModTime in place of the more precise CreationTime.
		mTime, err := access.ModTime(ctx)
		if err != nil {
			if os.IsNotExist(err) {
				// piece was deleted while we were scanning.
				return nil
			}

			piecesSkipped++
			s.log.Warn("failed to determine mtime of blob", zap.Error(err))
			// but continue iterating.
			return nil
		}
		if !mTime.Before(createdBefore) {
			return nil
		}

		s.log.Debug("About to move piece to trash",
			zap.Stringer("Satellite ID", satelliteID),
			zap.Stringer("Piece ID", pieceID),
			zap.String("Status", s.config.Status.String()))

		piecesToDeleteCount++

		// if retain status is enabled, move the piece to trash
		if s.config.Status == Enabled {
			if err = s.trash(ctx, satelliteID, pieceID); err != nil {
				s.log.Warn("failed to delete piece",
					zap.Stringer("Satellite ID", satelliteID),
					zap.Stringer("Piece ID", pieceID),
					zap.Error(err))
				return nil
			}
		}

		numDeleted++

		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		return nil
	})
	if err != nil {
		return Error.Wrap(err)
	}

	mon.IntVal("garbage_collection_pieces_count").Observe(piecesCount)
	mon.IntVal("garbage_collection_pieces_skipped").Observe(piecesSkipped)
	mon.IntVal("garbage_collection_pieces_to_delete_count").Observe(piecesToDeleteCount)
	mon.IntVal("garbage_collection_pieces_deleted").Observe(int64(numDeleted))
	mon.DurationVal("garbage_collection_loop_duration").Observe(time.Now().UTC().Sub(started))

	s.log.Info("Moved pieces to trash during retain", zap.Int("num deleted", numDeleted), zap.String("Retain Status", s.config.Status.String()))

	return nil
}

// trash wraps the piece store's Trash call so that errors from moving a retained
// piece to trash can be monitored during garbage collection.
func (s *Service) trash(ctx context.Context, satelliteID storj.NodeID, pieceID storj.PieceID) (err error) {
	defer mon.Task()(&ctx, satelliteID)(&err)
	return s.store.Trash(ctx, satelliteID, pieceID)
}

// HowManyQueued peeks at the number of bloom filters queued.
func (s *Service) HowManyQueued() int {
	return len(s.queued)
}