satellite/repair: Add flag to allow disabling reputation updates
Reputation updates during repair currently consume a lot of database resources. Sometimes increasing the rate of repair is more important than auditing a node based on whether it has or doesn't have the correct piece during repair; auditing nodes is the job of the audit service. This commit implements an intermediate solution from this issue: https://github.com/storj/storj/issues/5089 This commit does not address the more in-depth fix discussed here: https://github.com/storj/storj/issues/4939 Change-Id: I4163b18d78a96fadf5265789fd73c8aa8def0e9f
This commit is contained in:
parent
94dcfd77ee
commit
3501656e98
@ -132,8 +132,7 @@ func cmdRepairSegment(cmd *cobra.Command, args []string) (err error) {
|
||||
nil, // TODO add noop version
|
||||
ecRepairer,
|
||||
config.Checker.RepairOverrides,
|
||||
config.Repairer.Timeout,
|
||||
config.Repairer.MaxExcessRateOptimalThreshold,
|
||||
config.Repairer,
|
||||
)
|
||||
|
||||
// TODO reorganize to avoid using peer.
|
||||
|
@ -371,6 +371,7 @@ func TestMinRequiredDataRepair(t *testing.T) {
|
||||
func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
||||
config.Repairer.InMemoryRepair = true
|
||||
config.Repairer.ReputationUpdateEnabled = true
|
||||
config.Reputation.InitialAlpha = 1
|
||||
config.Reputation.InitialBeta = 0.01
|
||||
config.Reputation.AuditLambda = 0.95
|
||||
@ -480,6 +481,7 @@ func TestFailedDataRepair(t *testing.T) {
|
||||
func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
||||
config.Repairer.InMemoryRepair = true
|
||||
config.Repairer.ReputationUpdateEnabled = true
|
||||
config.Reputation.InitialAlpha = 1
|
||||
config.Reputation.InitialBeta = 0.01
|
||||
config.Reputation.AuditLambda = 0.95
|
||||
@ -600,6 +602,7 @@ func TestOfflineNodeDataRepair(t *testing.T) {
|
||||
func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
||||
config.Repairer.InMemoryRepair = true
|
||||
config.Repairer.ReputationUpdateEnabled = true
|
||||
config.Reputation.InitialAlpha = 1
|
||||
config.Reputation.InitialBeta = 0.01
|
||||
config.Reputation.AuditLambda = 0.95
|
||||
@ -722,6 +725,7 @@ func TestUnknownErrorDataRepair(t *testing.T) {
|
||||
func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
||||
config.Repairer.InMemoryRepair = true
|
||||
config.Repairer.ReputationUpdateEnabled = true
|
||||
config.Reputation.InitialAlpha = 1
|
||||
config.Reputation.InitialBeta = 0.01
|
||||
config.Reputation.AuditLambda = 0.95
|
||||
@ -843,6 +847,7 @@ func TestMissingPieceDataRepair_Succeed(t *testing.T) {
|
||||
func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
||||
config.Repairer.InMemoryRepair = true
|
||||
config.Repairer.ReputationUpdateEnabled = true
|
||||
config.Reputation.InitialAlpha = 1
|
||||
config.Reputation.InitialBeta = 0.01
|
||||
config.Reputation.AuditLambda = 0.95
|
||||
@ -959,6 +964,7 @@ func TestMissingPieceDataRepair(t *testing.T) {
|
||||
func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
||||
config.Repairer.InMemoryRepair = true
|
||||
config.Repairer.ReputationUpdateEnabled = true
|
||||
config.Reputation.InitialAlpha = 1
|
||||
config.Reputation.AuditLambda = 0.95
|
||||
},
|
||||
@ -1076,6 +1082,7 @@ func TestCorruptDataRepair_Succeed(t *testing.T) {
|
||||
func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
||||
config.Repairer.InMemoryRepair = true
|
||||
config.Repairer.ReputationUpdateEnabled = true
|
||||
config.Reputation.InitialAlpha = 1
|
||||
config.Reputation.AuditLambda = 0.95
|
||||
},
|
||||
@ -1190,6 +1197,7 @@ func TestCorruptDataRepair_Failed(t *testing.T) {
|
||||
func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
||||
config.Repairer.InMemoryRepair = true
|
||||
config.Repairer.ReputationUpdateEnabled = true
|
||||
config.Reputation.InitialAlpha = 1
|
||||
config.Reputation.AuditLambda = 0.95
|
||||
},
|
||||
|
@ -34,6 +34,7 @@ type Config struct {
|
||||
MaxBufferMem memory.Size `help:"maximum buffer memory (in bytes) to be allocated for read buffers" default:"4.0 MiB"`
|
||||
MaxExcessRateOptimalThreshold float64 `help:"ratio applied to the optimal threshold to calculate the excess of the maximum number of repaired pieces to upload" default:"0.05"`
|
||||
InMemoryRepair bool `help:"whether to download pieces for repair in memory (true) or download to disk (false)" default:"false"`
|
||||
ReputationUpdateEnabled bool `help:"whether the audit score of nodes should be updated as a part of repair" default:"false"`
|
||||
}
|
||||
|
||||
// Service contains the information needed to run the repair service.
|
||||
|
@ -84,6 +84,8 @@ type SegmentRepairer struct {
|
||||
timeout time.Duration
|
||||
reporter audit.Reporter
|
||||
|
||||
reputationUpdateEnabled bool
|
||||
|
||||
// multiplierOptimalThreshold is the value that multiplied by the optimal
|
||||
// threshold results in the maximum limit of number of nodes to upload
|
||||
// repaired pieces
|
||||
@ -110,9 +112,10 @@ func NewSegmentRepairer(
|
||||
reporter audit.Reporter,
|
||||
ecRepairer *ECRepairer,
|
||||
repairOverrides checker.RepairOverrides,
|
||||
timeout time.Duration, excessOptimalThreshold float64,
|
||||
config Config,
|
||||
) *SegmentRepairer {
|
||||
|
||||
excessOptimalThreshold := config.MaxExcessRateOptimalThreshold
|
||||
if excessOptimalThreshold < 0 {
|
||||
excessOptimalThreshold = 0
|
||||
}
|
||||
@ -124,10 +127,11 @@ func NewSegmentRepairer(
|
||||
orders: orders,
|
||||
overlay: overlay,
|
||||
ec: ecRepairer,
|
||||
timeout: timeout,
|
||||
timeout: config.Timeout,
|
||||
multiplierOptimalThreshold: 1 + excessOptimalThreshold,
|
||||
repairOverrides: repairOverrides.GetMap(),
|
||||
reporter: reporter,
|
||||
reputationUpdateEnabled: config.ReputationUpdateEnabled,
|
||||
|
||||
nowFn: time.Now,
|
||||
}
|
||||
@ -448,10 +452,12 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
|
||||
for _, outcome := range piecesReport.Unknown {
|
||||
report.Unknown = append(report.Unknown, outcome.Piece.StorageNode)
|
||||
}
|
||||
_, reportErr := repairer.reporter.RecordAudits(ctx, report)
|
||||
if reportErr != nil {
|
||||
// failed updates should not affect repair, therefore we will not return the error
|
||||
repairer.log.Debug("failed to record audit", zap.Error(reportErr))
|
||||
if repairer.reputationUpdateEnabled {
|
||||
_, reportErr := repairer.reporter.RecordAudits(ctx, report)
|
||||
if reportErr != nil {
|
||||
// failed updates should not affect repair, therefore we will not return the error
|
||||
repairer.log.Debug("failed to record audit", zap.Error(reportErr))
|
||||
}
|
||||
}
|
||||
|
||||
// Upload the repaired pieces
|
||||
|
@ -239,8 +239,7 @@ func NewRepairer(log *zap.Logger, full *identity.FullIdentity,
|
||||
peer.Audit.Reporter,
|
||||
peer.EcRepairer,
|
||||
config.Checker.RepairOverrides,
|
||||
config.Repairer.Timeout,
|
||||
config.Repairer.MaxExcessRateOptimalThreshold,
|
||||
config.Repairer,
|
||||
)
|
||||
peer.Repairer = repairer.NewService(log.Named("repairer"), repairQueue, &config.Repairer, peer.SegmentRepairer)
|
||||
|
||||
|
3
scripts/testdata/satellite-config.yaml.lock
vendored
3
scripts/testdata/satellite-config.yaml.lock
vendored
@ -868,6 +868,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
|
||||
# maximum segments that can be repaired concurrently
|
||||
# repairer.max-repair: 5
|
||||
|
||||
# whether the audit score of nodes should be updated as a part of repair
|
||||
# repairer.reputation-update-enabled: false
|
||||
|
||||
# time limit for uploading repaired pieces to new storage nodes
|
||||
# repairer.timeout: 5m0s
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user