// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package repairer

import (
	"context"
	"fmt"
	"strconv"
	"strings"
	"time"

	"github.com/spacemonkeygo/monkit/v3"
	"github.com/spf13/pflag"
	"github.com/zeebo/errs"
	"go.uber.org/zap"
	"golang.org/x/sync/semaphore"

	"storj.io/common/memory"
	"storj.io/common/storj"
	"storj.io/common/sync2"
	"storj.io/storj/satellite/repair/queue"
)

// Error is a standard error class for this package.
var (
	Error = errs.Class("repairer")
	mon   = monkit.Package()
)

// Config contains configurable values for repairer.
type Config struct {
	MaxRepair                     int           `help:"maximum segments that can be repaired concurrently" releaseDefault:"5" devDefault:"1" testDefault:"10"`
	Interval                      time.Duration `help:"how frequently repairer should try and repair more data" releaseDefault:"5m0s" devDefault:"1m0s" testDefault:"$TESTINTERVAL"`
	DialTimeout                   time.Duration `help:"time limit for dialing storage node" default:"5s"`
	Timeout                       time.Duration `help:"time limit for uploading repaired pieces to new storage nodes" default:"5m0s" testDefault:"1m"`
	DownloadTimeout               time.Duration `help:"time limit for downloading pieces from a node for repair" default:"5m0s" testDefault:"1m"`
	TotalTimeout                  time.Duration `help:"time limit for an entire repair job, from queue pop to upload completion" default:"45m" testDefault:"10m"`
	MaxBufferMem                  memory.Size   `help:"maximum buffer memory (in bytes) to be allocated for read buffers" default:"4.0 MiB"`
	MaxExcessRateOptimalThreshold float64       `help:"ratio applied to the optimal threshold to calculate the excess of the maximum number of repaired pieces to upload" default:"0.05"`
	InMemoryRepair                bool          `help:"whether to download pieces for repair in memory (true) or download to disk (false)" default:"false"`
	ReputationUpdateEnabled       bool          `help:"whether the audit score of nodes should be updated as a part of repair" default:"false"`
	UseRangedLoop                 bool          `help:"whether to enable repair checker observer with ranged loop" default:"true"`
	RepairExcludedCountryCodes    []string      `help:"list of country codes; nodes from these countries are treated as offline" default:"" hidden:"true"`
	DoDeclumping                  bool          `help:"repair pieces on the same network to other nodes" default:"true"`
	DoPlacementCheck              bool          `help:"repair pieces out of segment placement" default:"true"`
	IncludedPlacements            PlacementList `help:"comma separated placement IDs (numbers), which should be checked by the repairer (other placements are ignored)" default:""`
	ExcludedPlacements            PlacementList `help:"comma separated placement IDs (numbers), which should be ignored by the repairer" default:""`
}
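
// The sketch below is illustrative only and not part of this package's API: it shows the kinds
// of values these fields hold when a Config is constructed directly (for example in a test),
// rather than through cfgstruct defaults or command-line flags. The particular numbers are
// assumptions, not recommendations.
//
//	cfg := Config{
//		MaxRepair:      5,
//		Interval:       5 * time.Minute,
//		TotalTimeout:   45 * time.Minute,
//		MaxBufferMem:   4 * memory.MiB,
//		InMemoryRepair: false,
//	}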

// PlacementList is a configurable, comma separated list of PlacementConstraint IDs.
type PlacementList struct {
	Placements []storj.PlacementConstraint
}

// String implements pflag.Value.
func (p *PlacementList) String() string {
	var s []string
	for _, pl := range p.Placements {
		s = append(s, fmt.Sprintf("%d", pl))
	}
	return strings.Join(s, ",")
}

// Set implements pflag.Value.
func (p *PlacementList) Set(s string) error {
	parts := strings.Split(s, ",")
	for _, pNumStr := range parts {
		pNumStr = strings.TrimSpace(pNumStr)
		if pNumStr == "" {
			continue
		}
		pNum, err := strconv.Atoi(pNumStr)
		if err != nil {
			return errs.New("Placement list should contain numbers: %s", s)
		}
		p.Placements = append(p.Placements, storj.PlacementConstraint(pNum))
	}
	return nil
}

// Type implements pflag.Value.
func (p PlacementList) Type() string {
	return "placement-list"
}

var _ pflag.Value = &PlacementList{}
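
// As a usage sketch (the input values are hypothetical): Set parses a comma separated string,
// trimming spaces and skipping empty entries, and appends to any placements already present
// rather than replacing them.
//
//	var pl PlacementList
//	err := pl.Set("1, 3,,5") // pl.Placements == []storj.PlacementConstraint{1, 3, 5}, err == nil
//	err = pl.Set("7,foo")    // appends 7, then returns an error because "foo" is not a number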

// Service contains the information needed to run the repair service.
//
// architecture: Worker
type Service struct {
	log        *zap.Logger
	queue      queue.RepairQueue
	config     *Config
	JobLimiter *semaphore.Weighted
	Loop       *sync2.Cycle
	repairer   *SegmentRepairer

	nowFn func() time.Time
}

// NewService creates repairing service.
func NewService(log *zap.Logger, queue queue.RepairQueue, config *Config, repairer *SegmentRepairer) *Service {
	return &Service{
		log:        log,
		queue:      queue,
		config:     config,
		JobLimiter: semaphore.NewWeighted(int64(config.MaxRepair)),
		Loop:       sync2.NewCycle(config.Interval),
		repairer:   repairer,
		nowFn:      time.Now,
	}
}

// Close closes resources.
func (service *Service) Close() error { return nil }

// WaitForPendingRepairs waits for all ongoing repairs to complete.
//
// NB: this assumes that service.config.MaxRepair will never be changed once this Service instance
// is initialized. If that is not a valid assumption, we should keep a copy of its initial value to
// use here instead.
func (service *Service) WaitForPendingRepairs() {
	// Acquire and then release the entire capacity of the semaphore, ensuring that
	// it is completely empty (or, at least, it was empty at some point).
	//
	// No error return is possible here; context.Background() can't be canceled.
	_ = service.JobLimiter.Acquire(context.Background(), int64(service.config.MaxRepair))
	service.JobLimiter.Release(int64(service.config.MaxRepair))
}

// TestingSetMinFailures sets the minFailures attribute, which tells the repair machinery that we _expect_
// there to be failures and that we should wait for them if necessary. This is only used in tests.
func (service *Service) TestingSetMinFailures(minFailures int) {
	service.repairer.ec.TestingSetMinFailures(minFailures)
}

// Run runs the repairer service.
func (service *Service) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	// Wait for all repairs to complete.
	defer service.WaitForPendingRepairs()

	return service.Loop.Run(ctx, service.processWhileQueueHasItems)
}

// processWhileQueueHasItems keeps calling process() until the queue is empty or something
// else goes wrong in fetching from the queue.
func (service *Service) processWhileQueueHasItems(ctx context.Context) error {
	for {
		err := service.process(ctx)
		if err != nil {
			if queue.ErrEmpty.Has(err) {
				return nil
			}
			service.log.Error("process", zap.Error(Error.Wrap(err)))
			return err
		}
	}
}

// process picks items from the repair queue and spawns a repair worker.
func (service *Service) process(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	// wait until we are allowed to spawn a new job
	if err := service.JobLimiter.Acquire(ctx, 1); err != nil {
		return err
	}

	// IMPORTANT: this timeout must be started before service.queue.Select(), in case
	// service.queue.Select() takes some non-negligible amount of time, so that we can depend on
	// repair jobs being given up within some set interval after the time in the 'attempted'
	// column in the queue table.
	//
	// This is the reason why we are using a semaphore in this somewhat awkward way instead of
	// using a simpler sync2.Limiter pattern. We don't want this timeout to include the waiting
	// time from the semaphore acquisition, but it _must_ include the queue fetch time. At the
	// same time, we don't want to do the queue pop in a separate goroutine, because we want to
	// return from service.Run when the queue fetch fails.
	ctx, cancel := context.WithTimeout(ctx, service.config.TotalTimeout)

	seg, err := service.queue.Select(ctx, service.config.IncludedPlacements.Placements, service.config.ExcludedPlacements.Placements)
	if err != nil {
		service.JobLimiter.Release(1)
		cancel()
		return err
	}
	service.log.Debug("Retrieved segment from repair queue")

	// this goroutine inherits the JobLimiter semaphore acquisition and is now responsible
	// for releasing it.
	go func() {
		defer service.JobLimiter.Release(1)
		defer cancel()

		if err := service.worker(ctx, seg); err != nil {
			service.log.Error("repair worker failed:", zap.Error(err))
		}
	}()

	return nil
}

func (service *Service) worker(ctx context.Context, seg *queue.InjuredSegment) (err error) {
	defer mon.Task()(&ctx)(&err)

	workerStartTime := service.nowFn().UTC()

	service.log.Debug("Limiter running repair on segment")
	// note that shouldDelete is used even in the case where err is not nil
	shouldDelete, err := service.repairer.Repair(ctx, seg)
	if shouldDelete {
		if err != nil {
			service.log.Error("unexpected error repairing segment!", zap.Error(err))
		} else {
			service.log.Debug("removing repaired segment from repair queue")
		}

		delErr := service.queue.Delete(ctx, seg)
		if delErr != nil {
			err = errs.Combine(err, Error.New("failed to remove segment from queue: %v", delErr))
		}
	}
	if err != nil {
		return Error.Wrap(err)
	}

	repairedTime := service.nowFn().UTC()
	timeForRepair := repairedTime.Sub(workerStartTime)
	mon.FloatVal("time_for_repair").Observe(timeForRepair.Seconds()) //mon:locked

	insertedTime := seg.InsertedAt
	// do not send metrics if segment was added before the InsertedTime field was added
	if !insertedTime.IsZero() {
		timeSinceQueued := workerStartTime.Sub(insertedTime)
		mon.FloatVal("time_since_checker_queue").Observe(timeSinceQueued.Seconds()) //mon:locked
	}

	return nil
}

// SetNow allows tests to have the server act as if the current time is whatever they want.
func (service *Service) SetNow(nowFn func() time.Time) {
	service.nowFn = nowFn
	service.repairer.SetNow(nowFn)
}
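
// A test-only sketch (the variable names and the fixed timestamp are illustrative, not part of
// this file): freezing the clock via SetNow makes the time_for_repair and
// time_since_checker_queue observations above deterministic.
//
//	fixed := time.Date(2023, 1, 1, 0, 0, 0, 0, time.UTC)
//	service.SetNow(func() time.Time { return fixed })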