satellite/repair: Add flag to allow disabling reputation updates

Reputation updates during repair currently consume a lot of database
resources. Sometimes increasing the rate of repair is more important
than auditing a node based on whether or not it has the correct piece
during repair. Auditing nodes is the job of the audit service.

This commit implements an intermediate solution from this issue: https://github.com/storj/storj/issues/5089
This commit does not address the more in-depth fix discussed here: https://github.com/storj/storj/issues/4939

Change-Id: I4163b18d78a96fadf5265789fd73c8aa8def0e9f
This commit is contained in:
Moby von Briesen 2022-11-24 08:02:08 -05:00
parent 94dcfd77ee
commit 3501656e98
6 changed files with 26 additions and 10 deletions

View File

@ -132,8 +132,7 @@ func cmdRepairSegment(cmd *cobra.Command, args []string) (err error) {
nil, // TODO add noop version
ecRepairer,
config.Checker.RepairOverrides,
config.Repairer.Timeout,
config.Repairer.MaxExcessRateOptimalThreshold,
config.Repairer,
)
// TODO reorganize to avoid using peer.

View File

@ -371,6 +371,7 @@ func TestMinRequiredDataRepair(t *testing.T) {
func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
config.Repairer.InMemoryRepair = true
config.Repairer.ReputationUpdateEnabled = true
config.Reputation.InitialAlpha = 1
config.Reputation.InitialBeta = 0.01
config.Reputation.AuditLambda = 0.95
@ -480,6 +481,7 @@ func TestFailedDataRepair(t *testing.T) {
func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
config.Repairer.InMemoryRepair = true
config.Repairer.ReputationUpdateEnabled = true
config.Reputation.InitialAlpha = 1
config.Reputation.InitialBeta = 0.01
config.Reputation.AuditLambda = 0.95
@ -600,6 +602,7 @@ func TestOfflineNodeDataRepair(t *testing.T) {
func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
config.Repairer.InMemoryRepair = true
config.Repairer.ReputationUpdateEnabled = true
config.Reputation.InitialAlpha = 1
config.Reputation.InitialBeta = 0.01
config.Reputation.AuditLambda = 0.95
@ -722,6 +725,7 @@ func TestUnknownErrorDataRepair(t *testing.T) {
func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
config.Repairer.InMemoryRepair = true
config.Repairer.ReputationUpdateEnabled = true
config.Reputation.InitialAlpha = 1
config.Reputation.InitialBeta = 0.01
config.Reputation.AuditLambda = 0.95
@ -843,6 +847,7 @@ func TestMissingPieceDataRepair_Succeed(t *testing.T) {
func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
config.Repairer.InMemoryRepair = true
config.Repairer.ReputationUpdateEnabled = true
config.Reputation.InitialAlpha = 1
config.Reputation.InitialBeta = 0.01
config.Reputation.AuditLambda = 0.95
@ -959,6 +964,7 @@ func TestMissingPieceDataRepair(t *testing.T) {
func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
config.Repairer.InMemoryRepair = true
config.Repairer.ReputationUpdateEnabled = true
config.Reputation.InitialAlpha = 1
config.Reputation.AuditLambda = 0.95
},
@ -1076,6 +1082,7 @@ func TestCorruptDataRepair_Succeed(t *testing.T) {
func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
config.Repairer.InMemoryRepair = true
config.Repairer.ReputationUpdateEnabled = true
config.Reputation.InitialAlpha = 1
config.Reputation.AuditLambda = 0.95
},
@ -1190,6 +1197,7 @@ func TestCorruptDataRepair_Failed(t *testing.T) {
func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
config.Repairer.InMemoryRepair = true
config.Repairer.ReputationUpdateEnabled = true
config.Reputation.InitialAlpha = 1
config.Reputation.AuditLambda = 0.95
},

View File

@ -34,6 +34,7 @@ type Config struct {
MaxBufferMem memory.Size `help:"maximum buffer memory (in bytes) to be allocated for read buffers" default:"4.0 MiB"`
MaxExcessRateOptimalThreshold float64 `help:"ratio applied to the optimal threshold to calculate the excess of the maximum number of repaired pieces to upload" default:"0.05"`
InMemoryRepair bool `help:"whether to download pieces for repair in memory (true) or download to disk (false)" default:"false"`
ReputationUpdateEnabled bool `help:"whether the audit score of nodes should be updated as a part of repair" default:"false"`
}
// Service contains the information needed to run the repair service.

View File

@ -84,6 +84,8 @@ type SegmentRepairer struct {
timeout time.Duration
reporter audit.Reporter
reputationUpdateEnabled bool
// multiplierOptimalThreshold is the value that multiplied by the optimal
// threshold results in the maximum limit of number of nodes to upload
// repaired pieces
@ -110,9 +112,10 @@ func NewSegmentRepairer(
reporter audit.Reporter,
ecRepairer *ECRepairer,
repairOverrides checker.RepairOverrides,
timeout time.Duration, excessOptimalThreshold float64,
config Config,
) *SegmentRepairer {
excessOptimalThreshold := config.MaxExcessRateOptimalThreshold
if excessOptimalThreshold < 0 {
excessOptimalThreshold = 0
}
@ -124,10 +127,11 @@ func NewSegmentRepairer(
orders: orders,
overlay: overlay,
ec: ecRepairer,
timeout: timeout,
timeout: config.Timeout,
multiplierOptimalThreshold: 1 + excessOptimalThreshold,
repairOverrides: repairOverrides.GetMap(),
reporter: reporter,
reputationUpdateEnabled: config.ReputationUpdateEnabled,
nowFn: time.Now,
}
@ -448,10 +452,12 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
for _, outcome := range piecesReport.Unknown {
report.Unknown = append(report.Unknown, outcome.Piece.StorageNode)
}
_, reportErr := repairer.reporter.RecordAudits(ctx, report)
if reportErr != nil {
// failed updates should not affect repair, therefore we will not return the error
repairer.log.Debug("failed to record audit", zap.Error(reportErr))
if repairer.reputationUpdateEnabled {
_, reportErr := repairer.reporter.RecordAudits(ctx, report)
if reportErr != nil {
// failed updates should not affect repair, therefore we will not return the error
repairer.log.Debug("failed to record audit", zap.Error(reportErr))
}
}
// Upload the repaired pieces

View File

@ -239,8 +239,7 @@ func NewRepairer(log *zap.Logger, full *identity.FullIdentity,
peer.Audit.Reporter,
peer.EcRepairer,
config.Checker.RepairOverrides,
config.Repairer.Timeout,
config.Repairer.MaxExcessRateOptimalThreshold,
config.Repairer,
)
peer.Repairer = repairer.NewService(log.Named("repairer"), repairQueue, &config.Repairer, peer.SegmentRepairer)

View File

@ -868,6 +868,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
# maximum segments that can be repaired concurrently
# repairer.max-repair: 5
# whether the audit score of nodes should be updated as a part of repair
# repairer.reputation-update-enabled: false
# time limit for uploading repaired pieces to new storage nodes
# repairer.timeout: 5m0s