2020-03-12 07:03:46 +00:00
// Copyright (C) 2020 Storj Labs, Inc.
// See LICENSE for copying information.
package piecedeletion
import (
"context"
"time"
"github.com/zeebo/errs"
"go.uber.org/zap"
2020-08-17 19:49:44 +01:00
"golang.org/x/sync/semaphore"
2020-03-12 07:03:46 +00:00
"storj.io/common/rpc"
"storj.io/common/storj"
"storj.io/common/sync2"
2022-07-19 22:04:12 +01:00
"storj.io/storj/satellite/overlay"
2020-03-12 07:03:46 +00:00
)
// Config defines configuration options for Service.
//
// The struct tags carry the help text and the default values for each
// option; Verify checks the resulting values for sanity.
type Config struct {
	// MaxConcurrency is handed to the limited handler created in Run.
	MaxConcurrency int `help:"maximum number of concurrent requests to storage nodes" default:"100"`
	// MaxConcurrentPieces is the capacity of the semaphore that Delete
	// acquires from, one unit per piece.
	MaxConcurrentPieces int `help:"maximum number of concurrent pieces can be processed" default:"1000000" testDefault:"1000"`

	// MaxPiecesPerBatch is the limit given to every queue created by newQueue.
	MaxPiecesPerBatch int `help:"maximum number of pieces per batch" default:"5000" testDefault:"4000"`
	// MaxPiecesPerRequest is passed to the dialer created in Run.
	MaxPiecesPerRequest int `help:"maximum number pieces per single request" default:"1000" testDefault:"2000"`

	// DialTimeout overrides the dial timeout of the rpc dialer given to
	// NewService when it is greater than zero.
	DialTimeout time.Duration `help:"timeout for dialing nodes (0 means satellite default)" default:"3s" testDefault:"2s"`
	// FailThreshold is passed to the dialer created in Run.
	FailThreshold time.Duration `help:"threshold for retrying a failed node" releaseDefault:"10m" devDefault:"2s"`
	// RequestTimeout is passed to the dialer created in Run.
	RequestTimeout time.Duration `help:"timeout for a single delete request" releaseDefault:"15s" devDefault:"2s"`
	// DeleteSuccessThreshold is the success fraction used by Service.Delete.
	DeleteSuccessThreshold float64 `help:"Which fraction of nodes should be contacted successfully until the delete of a batch of pieces is considered completed" default:".75"`
}
// Bounds that Config.Verify applies to the configured timeouts.
const (
	// minTimeout is the lower bound for timeouts.
	minTimeout = 5 * time.Millisecond
	// maxTimeout is the upper bound for timeouts.
	maxTimeout = 5 * time.Minute
)
// Verify verifies configuration sanity.
func ( config * Config ) Verify ( ) errs . Group {
var errlist errs . Group
if config . MaxConcurrency <= 0 {
errlist . Add ( Error . New ( "concurrency %d must be greater than 0" , config . MaxConcurrency ) )
}
2020-08-17 19:49:44 +01:00
if config . MaxConcurrentPieces <= 0 {
errlist . Add ( Error . New ( "max concurrent pieces %d must be greater than 0" , config . MaxConcurrentPieces ) )
}
2020-03-12 07:03:46 +00:00
if config . MaxPiecesPerBatch < config . MaxPiecesPerRequest {
errlist . Add ( Error . New ( "max pieces per batch %d should be larger than max pieces per request %d" , config . MaxPiecesPerBatch , config . MaxPiecesPerRequest ) )
}
if config . MaxPiecesPerBatch <= 0 {
errlist . Add ( Error . New ( "max pieces per batch %d must be greater than 0" , config . MaxPiecesPerBatch ) )
}
if config . MaxPiecesPerRequest <= 0 {
errlist . Add ( Error . New ( "max pieces per request %d must be greater than 0" , config . MaxPiecesPerRequest ) )
}
if config . DialTimeout != 0 && ( config . DialTimeout <= minTimeout || maxTimeout <= config . DialTimeout ) {
errlist . Add ( Error . New ( "dial timeout %v must be between %v and %v" , config . DialTimeout , minTimeout , maxTimeout ) )
}
if config . RequestTimeout < minTimeout || maxTimeout < config . RequestTimeout {
errlist . Add ( Error . New ( "request timeout %v should be between %v and %v" , config . RequestTimeout , minTimeout , maxTimeout ) )
}
return errlist
}
2020-08-07 19:13:55 +01:00
// Nodes stores reliable nodes information.
type Nodes interface {
2022-07-19 22:04:12 +01:00
GetNodes ( ctx context . Context , nodes [ ] storj . NodeID ) ( _ map [ storj . NodeID ] * overlay . SelectedNode , err error )
2020-08-07 19:13:55 +01:00
}
2020-03-12 07:03:46 +00:00
// Service handles combining piece deletion requests.
//
// architecture: Service
type Service struct {
log * zap . Logger
config Config
2020-08-17 19:49:44 +01:00
concurrentRequests * semaphore . Weighted
2020-03-12 07:03:46 +00:00
rpcDialer rpc . Dialer
2020-08-07 19:13:55 +01:00
nodesDB Nodes
2020-03-12 07:03:46 +00:00
running sync2 . Fence
combiner * Combiner
dialer * Dialer
limited * LimitedHandler
}
// NewService creates a new service.
2020-08-07 19:13:55 +01:00
func NewService ( log * zap . Logger , dialer rpc . Dialer , nodesDB Nodes , config Config ) ( * Service , error ) {
2020-03-12 07:03:46 +00:00
var errlist errs . Group
if log == nil {
errlist . Add ( Error . New ( "log is nil" ) )
}
if dialer == ( rpc . Dialer { } ) {
errlist . Add ( Error . New ( "dialer is zero" ) )
}
2020-08-07 19:13:55 +01:00
if nodesDB == nil {
errlist . Add ( Error . New ( "nodesDB is nil" ) )
}
2020-03-12 07:03:46 +00:00
if errs := config . Verify ( ) ; len ( errs ) > 0 {
errlist . Add ( errs ... )
}
if err := errlist . Err ( ) ; err != nil {
return nil , Error . Wrap ( err )
}
dialerClone := dialer
if config . DialTimeout > 0 {
dialerClone . DialTimeout = config . DialTimeout
}
2021-10-06 14:29:03 +01:00
if dialerClone . Pool == nil {
dialerClone . Pool = rpc . NewDefaultConnectionPool ( )
}
2021-10-05 07:45:08 +01:00
2020-03-12 07:03:46 +00:00
return & Service {
2020-08-17 19:49:44 +01:00
log : log ,
config : config ,
concurrentRequests : semaphore . NewWeighted ( int64 ( config . MaxConcurrentPieces ) ) ,
rpcDialer : dialerClone ,
nodesDB : nodesDB ,
2020-03-12 07:03:46 +00:00
} , nil
}
// newQueue returns a fresh queue limited to config.MaxPiecesPerBatch.
// Run hands this method to the combiner as its queue constructor.
func (service *Service) newQueue() Queue {
	batchLimit := service.config.MaxPiecesPerBatch
	return NewLimitedJobs(batchLimit)
}
// Run initializes the service.
//
// It builds the dialer, the concurrency-limited handler and the combiner, in
// that order, and then returns nil without blocking. The running fence is
// released on return, which lets waiting delete calls continue.
func (service *Service) Run(ctx context.Context) error {
	defer service.running.Release()

	cfg := &service.config

	dialer := NewDialer(service.log.Named("dialer"), service.rpcDialer, cfg.RequestTimeout, cfg.FailThreshold, cfg.MaxPiecesPerRequest)
	service.dialer = dialer

	limited := NewLimitedHandler(dialer, cfg.MaxConcurrency)
	service.limited = limited

	service.combiner = NewCombiner(ctx, limited, service.newQueue)

	return nil
}
// Close shuts down the service.
func ( service * Service ) Close ( ) error {
2021-09-24 15:42:54 +01:00
if service . combiner != nil {
service . combiner . Close ( )
}
2020-03-12 07:03:46 +00:00
return nil
}
2022-10-15 01:38:09 +01:00
// DeleteWithCustomThreshold deletes the pieces specified in the requests,
// returning when they have been deleted from the specified fraction of storage nodes.
func ( service * Service ) DeleteWithCustomThreshold ( ctx context . Context , requests [ ] Request , successThreshold float64 ) ( err error ) {
2020-03-12 07:03:46 +00:00
defer mon . Task ( ) ( & ctx , len ( requests ) , requestsPieceCount ( requests ) , successThreshold ) ( & err )
2020-04-16 10:29:48 +01:00
if len ( requests ) == 0 {
return nil
}
2020-03-12 07:03:46 +00:00
// wait for combiner and dialer to set themselves up.
if ! service . running . Wait ( ctx ) {
return Error . Wrap ( ctx . Err ( ) )
}
for i , req := range requests {
if ! req . IsValid ( ) {
return Error . New ( "request #%d is invalid" , i )
}
}
2020-08-17 19:49:44 +01:00
// When number of pieces are more than the maximum limit, we let it overflow,
// so we don't have to split requests in to separate batches.
totalPieceCount := requestsPieceCount ( requests )
if totalPieceCount > service . config . MaxConcurrentPieces {
totalPieceCount = service . config . MaxConcurrentPieces
}
if err := service . concurrentRequests . Acquire ( ctx , int64 ( totalPieceCount ) ) ; err != nil {
return Error . Wrap ( err )
}
defer service . concurrentRequests . Release ( int64 ( totalPieceCount ) )
2020-08-07 19:13:55 +01:00
// Create a map for matching node information with the corresponding
// request.
nodesReqs := make ( map [ storj . NodeID ] Request , len ( requests ) )
nodeIDs := [ ] storj . NodeID { }
2020-03-12 07:03:46 +00:00
for _ , req := range requests {
2020-08-07 19:13:55 +01:00
if req . Node . Address == "" {
nodeIDs = append ( nodeIDs , req . Node . ID )
}
nodesReqs [ req . Node . ID ] = req
}
if len ( nodeIDs ) > 0 {
2022-07-19 22:04:12 +01:00
nodes , err := service . nodesDB . GetNodes ( ctx , nodeIDs )
2020-08-07 19:13:55 +01:00
if err != nil {
// Pieces will be collected by garbage collector
return Error . Wrap ( err )
}
for _ , node := range nodes {
2022-07-19 22:04:12 +01:00
req := nodesReqs [ node . ID ]
2020-08-07 19:13:55 +01:00
2022-07-19 22:04:12 +01:00
nodesReqs [ node . ID ] = Request {
2020-08-07 19:13:55 +01:00
Node : storj . NodeURL {
2022-07-19 22:04:12 +01:00
ID : node . ID ,
2020-08-07 19:13:55 +01:00
Address : node . Address . Address ,
} ,
Pieces : req . Pieces ,
}
}
}
2020-09-02 02:20:48 +01:00
threshold , err := sync2 . NewSuccessThreshold ( len ( nodesReqs ) , successThreshold )
if err != nil {
return Error . Wrap ( err )
}
2020-08-07 19:13:55 +01:00
for _ , req := range nodesReqs {
2020-03-12 07:03:46 +00:00
service . combiner . Enqueue ( req . Node , Job {
Pieces : req . Pieces ,
Resolve : threshold ,
} )
}
threshold . Wait ( ctx )
return nil
}
2022-10-15 01:38:09 +01:00
// Delete deletes the pieces specified in the requests,
// returning when they have been deleted from the default fraction of storage nodes.
//
// The default fraction is config.DeleteSuccessThreshold.
func (service *Service) Delete(ctx context.Context, requests []Request) (err error) {
	defaultThreshold := service.config.DeleteSuccessThreshold
	return service.DeleteWithCustomThreshold(ctx, requests, defaultThreshold)
}
2020-03-12 07:03:46 +00:00
// Request defines a deletion requests for a node.
type Request struct {
2020-05-20 14:10:25 +01:00
Node storj . NodeURL
2020-03-12 07:03:46 +00:00
Pieces [ ] storj . PieceID
}
// IsValid returns whether the request is valid.
func ( req * Request ) IsValid ( ) bool {
2020-05-20 14:10:25 +01:00
return ! req . Node . ID . IsZero ( ) && len ( req . Pieces ) > 0
2020-03-12 07:03:46 +00:00
}
// requestsPieceCount returns the number of piece IDs summed over all requests.
func requestsPieceCount(requests []Request) int {
	var count int
	for i := range requests {
		count += len(requests[i].Pieces)
	}
	return count
}