storj/satellite/reputation/service.go
Yingrong Zhao 1f8f7ebf06 satellite/{audit, reputation}: fix potential nodes reputation status
inconsistency

The original design had a flaw which can potentially cause discrepancy
for nodes reputation status between reputations table and nodes table.
In the event of a failure(network issue, db failure, satellite failure, etc.)
happens between update to reputations table and update to nodes table, data
can be out of sync.
This PR tries to fix above issue by passing through node's reputation from
the beginning of an audit/repair(this data is from nodes table) to the next
update in reputation service. If the updated reputation status from the service
is different from the existing node status, the service will try to update nodes
table. In the case of a failure, the service will be able to try update nodes
table again since it can see the discrepancy of the data. This will allow
both tables to be in-sync eventually.

Change-Id: Ic22130b4503a594b7177237b18f7e68305c2f122
2022-01-06 21:05:59 +00:00

189 lines
6.4 KiB
Go

// Copyright (C) 2021 Storj Labs, Inc.
// See LICENSE for copying information.
package reputation
import (
"context"
"time"
"go.uber.org/zap"
"storj.io/common/storj"
"storj.io/storj/satellite/overlay"
)
// DB is an interface for storing reputation data.
type DB interface {
Update(ctx context.Context, request UpdateRequest, now time.Time) (_ *overlay.ReputationStatus, err error)
Get(ctx context.Context, nodeID storj.NodeID) (*Info, error)
// UnsuspendNodeUnknownAudit unsuspends a storage node for unknown audits.
UnsuspendNodeUnknownAudit(ctx context.Context, nodeID storj.NodeID) (err error)
// DisqualifyNode disqualifies a storage node.
DisqualifyNode(ctx context.Context, nodeID storj.NodeID) (err error)
// SuspendNodeUnknownAudit suspends a storage node for unknown audits.
SuspendNodeUnknownAudit(ctx context.Context, nodeID storj.NodeID, suspendedAt time.Time) (err error)
// UpdateAuditHistory updates a node's audit history
UpdateAuditHistory(ctx context.Context, oldHistory []byte, updateReq UpdateRequest, auditTime time.Time) (res *UpdateAuditHistoryResponse, err error)
}
// Info contains all reputation data to be stored in DB.
type Info struct {
AuditSuccessCount int64
TotalAuditCount int64
VettedAt *time.Time
Disqualified *time.Time
Suspended *time.Time
UnknownAuditSuspended *time.Time
OfflineSuspended *time.Time
UnderReview *time.Time
OnlineScore float64
AuditHistory AuditHistory
AuditReputationAlpha float64
AuditReputationBeta float64
UnknownAuditReputationAlpha float64
UnknownAuditReputationBeta float64
}
// Service handles storing node reputation data and updating
// the overlay cache when a node's status changes.
type Service struct {
log *zap.Logger
overlay overlay.DB
db DB
config Config
}
// NewService creates a new reputation service.
func NewService(log *zap.Logger, overlay overlay.DB, db DB, config Config) *Service {
return &Service{
log: log,
overlay: overlay,
db: db,
config: config,
}
}
// ApplyAudit receives an audit result and applies it to the relevant node in DB.
func (service *Service) ApplyAudit(ctx context.Context, nodeID storj.NodeID, reputation overlay.ReputationStatus, result AuditType) (err error) {
defer mon.Task()(&ctx)(&err)
now := time.Now()
statusUpdate, err := service.db.Update(ctx, UpdateRequest{
NodeID: nodeID,
AuditOutcome: result,
AuditLambda: service.config.AuditLambda,
AuditWeight: service.config.AuditWeight,
AuditDQ: service.config.AuditDQ,
SuspensionGracePeriod: service.config.SuspensionGracePeriod,
SuspensionDQEnabled: service.config.SuspensionDQEnabled,
AuditsRequiredForVetting: service.config.AuditCount,
AuditHistory: service.config.AuditHistory,
}, now)
if err != nil {
return err
}
// only update node if its health status has changed, or it's a newly vetted
// node.
// this prevents the need to require caller of ApplyAudit() to always know
// the VettedAt time for a node.
// Due to inconsistencies in the precision of time.Now() on different platforms and databases, the time comparison
// for the VettedAt status is done using time values that are truncated to second precision.
if hasReputationChanged(*statusUpdate, reputation, now) {
err = service.overlay.UpdateReputation(ctx, nodeID, statusUpdate)
if err != nil {
return err
}
}
return err
}
// Get returns a node's reputation info from DB.
// If a node is not found in the DB, default reputation information is returned.
func (service *Service) Get(ctx context.Context, nodeID storj.NodeID) (info *Info, err error) {
defer mon.Task()(&ctx)(&err)
info, err = service.db.Get(ctx, nodeID)
if err != nil {
if ErrNodeNotFound.Has(err) {
// if there is no audit reputation for the node, that's fine and we
// return default reputation values
info = &Info{
UnknownAuditReputationAlpha: 1,
AuditReputationAlpha: 1,
OnlineScore: 1,
}
return info, nil
}
return nil, Error.Wrap(err)
}
return info, nil
}
// TestSuspendNodeUnknownAudit suspends a storage node for unknown audits.
func (service *Service) TestSuspendNodeUnknownAudit(ctx context.Context, nodeID storj.NodeID, suspendedAt time.Time) (err error) {
err = service.db.SuspendNodeUnknownAudit(ctx, nodeID, suspendedAt)
if err != nil {
return err
}
return service.overlay.TestSuspendNodeUnknownAudit(ctx, nodeID, suspendedAt)
}
// TestDisqualifyNode disqualifies a storage node.
func (service *Service) TestDisqualifyNode(ctx context.Context, nodeID storj.NodeID) (err error) {
err = service.db.DisqualifyNode(ctx, nodeID)
if err != nil {
return err
}
return service.overlay.DisqualifyNode(ctx, nodeID)
}
// TestUnsuspendNodeUnknownAudit unsuspends a storage node for unknown audits.
func (service *Service) TestUnsuspendNodeUnknownAudit(ctx context.Context, nodeID storj.NodeID) (err error) {
err = service.db.UnsuspendNodeUnknownAudit(ctx, nodeID)
if err != nil {
return err
}
return service.overlay.TestUnsuspendNodeUnknownAudit(ctx, nodeID)
}
// Close closes resources.
func (service *Service) Close() error { return nil }
// hasReputationChanged determines if the current node reputation is different from the newly updated reputation. This
// function will only consider the Disqualified, UnknownAudiSuspended and OfflineSuspended statuses for changes.
func hasReputationChanged(updated, current overlay.ReputationStatus, now time.Time) bool {
if statusChanged(current.Disqualified, updated.Disqualified) ||
statusChanged(current.UnknownAuditSuspended, updated.UnknownAuditSuspended) ||
statusChanged(current.OfflineSuspended, updated.OfflineSuspended) {
return true
}
// check for newly vetted nodes.
// Due to inconsistencies in the precision of time.Now() on different platforms and databases, the time comparison
// for the VettedAt status is done using time values that are truncated to second precision.
if updated.VettedAt != nil && updated.VettedAt.Truncate(time.Second).Equal(now.Truncate(time.Second)) {
return true
}
return false
}
// statusChanged determines if the two given statuses are different.
func statusChanged(s1, s2 *time.Time) bool {
if s1 == nil && s2 == nil {
return false
} else if s1 != nil && s2 != nil {
return !s1.Equal(*s1)
}
return true
}