storj/satellite/reputation/config.go
paul cannon 0dcc0a9ee0 satellite/reputation: reconfigure lambda and alpha
This is in response to community feedback that our existing reputation
calculation is too likely to disqualify storage nodes unfairly with
extreme swings up and down.

For details and analysis, please see the data_loss_vs_dq_chance_sim.py
tool, the "tuning reputation further.ipynb" Jupyter notebook in the
storj/datascience repository, and the discussion at

    https://forum.storj.io/t/tuning-audit-scoring/14084

In brief: changing the lambda and initial-alpha parameters in this way
causes the swings in reputation to be smaller and less likely to put a
node past the disqualification threshold unfairly.

Note: this change will cause a one-time reset of all (non-disqualified)
node reputations, because the new initial alpha value of 1000 is
dramatically different, and the disqualification threshold is going to
be much higher.

Change-Id: Id6dc4ba8fde1be3db4255b72282207bab5491ca3
2022-08-17 18:52:53 +00:00

91 lines
5.1 KiB
Go

// Copyright (C) 2021 Storj Labs, Inc.
// See LICENSE for copying information.
package reputation
import (
"fmt"
"time"
"github.com/spacemonkeygo/monkit/v3"
"github.com/zeebo/errs"
"storj.io/common/storj"
)
// Package-level monitoring scope and error classes for the reputation package.
var (
// mon is the monkit scope for this package, as returned by monkit.Package().
mon = monkit.Package()
// Error is the default reputation errs class; its class name is "reputation".
Error = errs.Class("reputation")
// ErrNodeNotFound is the errs class returned if a node does not exist in the database.
ErrNodeNotFound = errs.Class("node not found")
)
// Config contains all config values for the reputation service.
//
// Every tagged field carries its help text and default value(s) in its struct
// tags. Fields tagged releaseDefault/devDefault carry separate default values;
// how one of them is selected is decided by the config loader, which is not
// visible in this file.
type Config struct {
// Weights applied to the audit reputation when computing the total repair
// and total uplink reputation, respectively.
AuditRepairWeight float64 `help:"weight to apply to audit reputation for total repair reputation calculation" default:"1.0"`
AuditUplinkWeight float64 `help:"weight to apply to audit reputation for total uplink reputation calculation" default:"1.0"`
// Audit scoring parameters: forgetting factor, normalization weight, and
// the reputation cut-off used for disqualification.
AuditLambda float64 `help:"the forgetting factor used to update storage node reputation due to audits" default:"0.999"`
AuditWeight float64 `help:"the normalization weight used to calculate the audit SNs reputation" default:"1.0"`
AuditDQ float64 `help:"the reputation cut-off for disqualifying SNs based on audit history" default:"0.96"`
// Scoring parameters for audits in which the node returned an 'unknown' error.
// NOTE(review): the help text of UnknownAuditLambda ends with a stray
// apostrophe ("audit'"). It is a runtime string, so it is left untouched here.
UnknownAuditLambda float64 `help:"the forgetting factor used to update storage node reputation due to returning 'unknown' errors during audit'" default:"0.95"`
UnknownAuditDQ float64 `help:"the reputation cut-off for disqualifying SNs based on returning 'unknown' errors during audit" default:"0.6"`
// Suspension handling: how long a node may stay suspended, and whether
// exceeding that period leads to disqualification.
SuspensionGracePeriod time.Duration `help:"the time period that must pass before suspended nodes will be disqualified" releaseDefault:"168h" devDefault:"1h"`
SuspensionDQEnabled bool `help:"whether nodes will be disqualified if they have been suspended for longer than the suspended grace period" releaseDefault:"false" devDefault:"true"`
// AuditCount is the number of audits after which a node is no longer
// considered a New Node.
AuditCount int64 `help:"the number of times a node has been audited to not be considered a New Node" releaseDefault:"100" devDefault:"0"`
// AuditHistory holds the nested audit-window settings (see AuditHistoryConfig).
AuditHistory AuditHistoryConfig
// Reputation write cache: maximum time between flushes to the database
// (0 disables the cache) and the delay before failed database operations
// are retried.
FlushInterval time.Duration `help:"the maximum amount of time that should elapse before cached reputation writes are flushed to the database (if 0, no reputation cache is used)" releaseDefault:"2h" devDefault:"2m"`
ErrorRetryInterval time.Duration `help:"the amount of time that should elapse before the cache retries failed database operations" releaseDefault:"1m" devDefault:"5s"`
// Values to which the alpha and beta reputation values are initialized.
InitialAlpha float64 `help:"the value to which an alpha reputation value should be initialized" default:"1000"`
InitialBeta float64 `help:"the value to which a beta reputation value should be initialized" default:"0"`
}
// UpdateRequest is used to update a node's reputation status.
type UpdateRequest struct {
// NodeID is the ID of the node the request refers to.
NodeID storj.NodeID
// AuditOutcome is the audit result being reported, expressed as an AuditType.
AuditOutcome AuditType
// Config is a copy of the Config struct from the satellite.
// It is part of the UpdateRequest struct in order to be more easily
// accessible from satellitedb code.
// Because it is embedded, its fields are promoted onto UpdateRequest.
Config
}
// AuditHistoryConfig is a configuration struct defining time periods and thresholds for penalizing nodes for being offline.
// It is used for downtime suspension and disqualification.
type AuditHistoryConfig struct {
// WindowSize is the length of time covered by a single audit window.
// It is the only field here that also carries a testDefault tag.
WindowSize time.Duration `help:"The length of time spanning a single audit window" releaseDefault:"12h" devDefault:"5m" testDefault:"10m"`
// TrackingPeriod is how long audit windows are tracked for suspension
// and disqualification decisions.
TrackingPeriod time.Duration `help:"The length of time to track audit windows for node suspension and disqualification" releaseDefault:"720h" devDefault:"1h"`
// GracePeriod is the time suspended operators get to fix downtime issues;
// after it, one tracking period remains to reach the minimum online score.
GracePeriod time.Duration `help:"The length of time to give suspended SNOs to diagnose and fix issues causing downtime. Afterwards, they will have one tracking period to reach the minimum online score before disqualification" releaseDefault:"168h" devDefault:"1h"`
// OfflineThreshold is the online-score level below which a node is
// punished for offline audits.
OfflineThreshold float64 `help:"The point below which a node is punished for offline audits. Determined by calculating the ratio of online/total audits within each window and finding the average across windows within the tracking period." default:"0.6"`
// OfflineDQEnabled and OfflineSuspensionEnabled toggle disqualification
// and suspension, respectively, for nodes with a low online score.
OfflineDQEnabled bool `help:"whether nodes will be disqualified if they have low online score after a review period" releaseDefault:"false" devDefault:"true"`
OfflineSuspensionEnabled bool `help:"whether nodes will be suspended if they have low online score" releaseDefault:"true" devDefault:"true"`
}
// AuditType is an enum representing the outcome of a particular audit.
type AuditType int

// The audit outcomes. Values are assigned by iota, so they start at 0 and
// follow declaration order.
const (
	// AuditSuccess represents a successful audit.
	AuditSuccess AuditType = iota
	// AuditFailure represents a failed audit.
	AuditFailure
	// AuditUnknown represents an audit that resulted in an unknown error from the node.
	AuditUnknown
	// AuditOffline represents an audit where a node was offline.
	AuditOffline
)

// String returns the name of the audit outcome. A value that is not one of
// the declared constants yields a placeholder that includes its numeric value.
func (t AuditType) String() string {
	// Lookup table indexed by the enum value itself.
	names := [...]string{
		AuditSuccess: "AuditSuccess",
		AuditFailure: "AuditFailure",
		AuditUnknown: "AuditUnknown",
		AuditOffline: "AuditOffline",
	}
	if t >= 0 && int(t) < len(names) {
		return names[t]
	}
	// %d formats the underlying integer and does not invoke String again.
	return fmt.Sprintf("<unregistered audittype %d>", t)
}