// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package checker
import (
"fmt"
"strconv"
"strings"
"time"
"storj.io/common/pb"
"storj.io/common/storj"
)
// Config contains configurable values for checker.
type Config struct {
2021-06-15 22:45:31 +01:00
Interval time . Duration ` help:"how frequently checker should check for bad segments" releaseDefault:"30s" devDefault:"0h0m10s" testDefault:"$TESTINTERVAL" `
2020-10-27 18:26:46 +00:00
testplanet/satellite: reduce the number of places default values need to be configured
Satellites set their configuration values to default values using
cfgstruct, however, it turns out our tests don't test these values
at all! Instead, they have a completely separate definition system
that is easy to forget about.
As is to be expected, these values have drifted, and it appears
in a few cases test planet is testing unreasonable values that we
won't see in production, or perhaps worse, features enabled in
production were missed and weren't enabled in testplanet.
This change makes it so all values are configured the same,
systematic way, so it's easy to see when test values are different
than dev values or release values, and it's less hard to forget
to enable features in testplanet.
In terms of reviewing, this change should be actually fairly
easy to review, considering private/testplanet/satellite.go keeps
the current config system and the new one and confirms that they
result in identical configurations, so you can be certain that
nothing was missed and the config is all correct.
You can also check the config lock to see what actual config
values changed.
Change-Id: I6715d0794887f577e21742afcf56fd2b9d12170e
2021-05-31 22:15:00 +01:00
ReliabilityCacheStaleness time . Duration ` help:"how stale reliable node cache can be" releaseDefault:"5m" devDefault:"5m" testDefault:"1m" `
2021-01-28 21:55:16 +00:00
RepairOverrides RepairOverrides ` help:"comma-separated override values for repair threshold in the format k/o/n-override (min/optimal/total-override)" releaseDefault:"29/80/110-52,29/80/95-52,29/80/130-52" devDefault:"" `
2020-10-27 18:26:46 +00:00
// Node failure rate is an estimation based on a 6 hour checker run interval (4 checker iterations per day), a network of about 9200 nodes, and about 2 nodes churning per day.
// This results in `2/9200/4 = 0.00005435` being the probability of any single node going down in the interval of one checker iteration.
2022-04-29 23:33:08 +01:00
NodeFailureRate float64 ` help:"the probability of a single node going down within the next checker iteration" default:"0.00005435" `
RepairQueueInsertBatchSize int ` help:"Number of damaged segments to buffer in-memory before flushing to the repair queue" default:"100" `
2023-05-18 19:47:23 +01:00
DoDeclumping bool ` help:"Treat pieces on the same network as in need of repair" default:"false" `
2023-05-30 15:44:36 +01:00
DoPlacementCheck bool ` help:"Treat pieces out of segment placement as in need of repair" default:"true" `
2020-10-27 18:26:46 +00:00
}
// RepairOverride pairs one RS scheme, identified by its k/o/n numbers
// (min/success/total), with the repair value that overrides the default for
// that scheme.
//
// Can be used as a flag.
type RepairOverride struct {
	// Min, Success and Total are the k, o and n numbers of the RS scheme.
	Min, Success, Total int
	// Override is the override repair value for the scheme above.
	Override int32
}
// Type returns the name of this flag value's type. It implements pflag.Value.
func (RepairOverride) Type() string {
	return "checker.RepairOverride"
}
// String renders the override as "min/success/total-override", the same
// layout that Set accepts. It is required for pflag.Value.
func (ro *RepairOverride) String() string {
	minShares, successShares, totalShares := ro.Min, ro.Success, ro.Total
	return fmt.Sprintf("%d/%d/%d-%d", minShares, successShares, totalShares, ro.Override)
}
// Set sets the value from a string in the format k/o/n-override
// (min/optimal/total-repairOverride).
//
// The three RS numbers must be integers, the first at least 1 and each one no
// smaller than the one before it. The override must be an integer satisfying
// min <= override < success. If any rule is violated an error is returned and
// ro is left unmodified; fields are only written once every check has passed.
func (ro *RepairOverride) Set(s string) error {
	// Split on dash. Expect two items. First item is RS numbers. Second item is Override.
	info := strings.Split(s, "-")
	if len(info) != 2 {
		return Error.New("Invalid default repair override config (expect format k/o/n-override, got %s)", s)
	}
	rsNumbersString, overrideString := info[0], info[1]

	// Split on forward slash. Expect exactly three positive non-decreasing integers.
	rsNumbers := strings.Split(rsNumbersString, "/")
	if len(rsNumbers) != 3 {
		return Error.New("Invalid default RS numbers (wrong size, expect 3): %s", rsNumbersString)
	}

	// minValue is the lower bound for the next number: 1 for the first one,
	// then the previously parsed number.
	minValue := 1
	var values [3]int
	for i, nextValueString := range rsNumbers {
		nextValue, err := strconv.Atoi(nextValueString)
		if err != nil {
			return Error.New("Invalid default RS numbers (should all be valid integers): %s, %w", rsNumbersString, err)
		}
		if nextValue < minValue {
			return Error.New("Invalid default RS numbers (should be non-decreasing): %s", rsNumbersString)
		}
		values[i] = nextValue
		minValue = nextValue
	}
	minShares, successShares, totalShares := values[0], values[1], values[2]

	// Attempt to parse "-override" part of config.
	override, err := strconv.Atoi(overrideString)
	if err != nil {
		return Error.New("Invalid override value (should be valid integer): %s, %w", overrideString, err)
	}
	if override < minShares || override >= successShares {
		return Error.New("Invalid override value (should meet criteria min <= override < success). Min: %d, Override: %d, Success: %d.", minShares, override, successShares)
	}

	// Everything validated: commit to the receiver in one go so that a failed
	// Set never leaves it half-updated.
	ro.Min = minShares
	ro.Success = successShares
	ro.Total = totalShares
	ro.Override = int32(override)
	return nil
}
// RepairOverrides is a configuration struct holding a list of repair override
// values, one per RS combination of k/o/n (min/success/total).
//
// Can be used as a flag.
type RepairOverrides struct {
	// List holds the individual overrides.
	List []RepairOverride
}
// Type returns the name of this flag value's type. It implements pflag.Value.
func (RepairOverrides) Type() string {
	return "checker.RepairOverrides"
}
// String is required for pflag.Value. It renders every entry of the list with
// RepairOverride.String and joins the results with commas; an empty list
// yields the empty string.
func (ros *RepairOverrides) String() string {
	parts := make([]string, 0, len(ros.List))
	for i := range ros.List {
		parts = append(parts, ros.List[i].String())
	}
	return strings.Join(parts, ",")
}
// Set sets the value from a string in the format "k/o/n-override,k/o/n-override,...".
//
// Entries are separated by commas; surrounding whitespace is trimmed and empty
// entries are skipped. Each remaining entry is parsed with RepairOverride.Set.
// On success the list is replaced by the parsed entries (nil when there are
// none). If any entry fails to parse, that error is returned and the existing
// list is left untouched.
func (ros *RepairOverrides) Set(s string) error {
	// Parse into a local slice first so that a bad entry cannot leave the
	// receiver holding a truncated list.
	var parsed []RepairOverride
	for _, roString := range strings.Split(s, ",") {
		roString = strings.TrimSpace(roString)
		if roString == "" {
			continue
		}
		var newRo RepairOverride
		if err := newRo.Set(roString); err != nil {
			return err
		}
		parsed = append(parsed, newRo)
	}
	ros.List = parsed
	return nil
}
// GetMap creates a RepairOverridesMap from the config.
func ( ros * RepairOverrides ) GetMap ( ) RepairOverridesMap {
newMap := RepairOverridesMap {
overrideMap : make ( map [ string ] int32 ) ,
}
for _ , ro := range ros . List {
2020-11-20 22:20:03 +00:00
key := getRepairOverrideKey ( ro . Min , ro . Success , ro . Total )
2020-10-27 18:26:46 +00:00
newMap . overrideMap [ key ] = ro . Override
}
return newMap
}
// RepairOverridesMap is the lookup form of the RepairOverrides config: it is
// derived from that config and used for quickly retrieving repair override
// values.
type RepairOverridesMap struct {
	// overrideMap maps a "k/o/n" key to the override value for that scheme.
	overrideMap map[string]int32
}
// GetOverrideValuePB returns the override value for a pb RS scheme if it exists, or 0 otherwise.
func ( rom * RepairOverridesMap ) GetOverrideValuePB ( rs * pb . RedundancyScheme ) int32 {
2020-11-20 22:20:03 +00:00
key := getRepairOverrideKey ( int ( rs . MinReq ) , int ( rs . SuccessThreshold ) , int ( rs . Total ) )
2020-10-27 18:26:46 +00:00
return rom . overrideMap [ key ]
}
// GetOverrideValue returns the override value for an RS scheme if it exists, or 0 otherwise.
func ( rom * RepairOverridesMap ) GetOverrideValue ( rs storj . RedundancyScheme ) int32 {
2020-11-20 22:20:03 +00:00
key := getRepairOverrideKey ( int ( rs . RequiredShares ) , int ( rs . OptimalShares ) , int ( rs . TotalShares ) )
2020-10-27 18:26:46 +00:00
return rom . overrideMap [ key ]
}
// getRepairOverrideKey builds the "k/o/n" string used as the key of the
// override map from the min, success and total numbers of an RS scheme.
func getRepairOverrideKey(minShares, success, total int) string {
	return fmt.Sprintf("%d/%d/%d", minShares, success, total)
}