storj/satellite/audit/reporter.go
paul cannon 1854351da6 satellite/audit: teach Reporter about piecewise audits
The Reporter is responsible for processing results from auditing
operations, logging the results, disqualifying nodes that reached
the maximum reverification count, and passing the results on to
the reputation system.

In this commit, we extend the Reporter so that it knows how to process
the results of piecewise reverification audits.

We also change most reporter-related tests so that reverifications
happen as piecewise reverification audits, exercising the new code.

Note that piecewise reverification audits are not yet being done outside
of tests. In a later commit, we will switch from doing segmentwise
reverifications to piecewise reverifications, as part of the
audit-scaling effort.

Refs: https://github.com/storj/storj/issues/5230
Change-Id: I9438164ce1ea4d9a1790d18d0e1046a8eb04d8e9
2022-12-12 11:28:02 +00:00

303 lines
11 KiB
Go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package audit
import (
"context"
"strings"
"github.com/zeebo/errs"
"go.uber.org/zap"
"storj.io/common/storj"
"storj.io/storj/satellite/overlay"
"storj.io/storj/satellite/reputation"
)
// reporter records audit reports in overlay and implements the Reporter interface.
//
// architecture: Service
type reporter struct {
log *zap.Logger
reputations *reputation.Service
overlay *overlay.Service
containment Containment
// newContainment is temporary, and will replace containment
newContainment NewContainment
maxRetries int
maxReverifyCount int32
}
// Reporter records audit reports in the overlay and database.
type Reporter interface {
RecordAudits(ctx context.Context, req Report)
ReportReverificationNeeded(ctx context.Context, piece *PieceLocator) (err error)
RecordReverificationResult(ctx context.Context, pendingJob *ReverificationJob, outcome Outcome, reputation overlay.ReputationStatus) (err error)
}
// Report contains audit result.
// It records whether an audit is able to be completed, the total number of
// pieces a given audit has conducted for, lists for nodes that
// succeeded, failed, were offline, have pending audits, or failed for unknown
// reasons and their current reputation status.
type Report struct {
Successes storj.NodeIDList
Fails storj.NodeIDList
Offlines storj.NodeIDList
PendingAudits []*PendingAudit
// PieceAudits is temporary and will replace PendingAudits.
PieceAudits []*ReverificationJob
Unknown storj.NodeIDList
NodesReputation map[storj.NodeID]overlay.ReputationStatus
}
// NewReporter instantiates a reporter.
func NewReporter(log *zap.Logger, reputations *reputation.Service, overlay *overlay.Service, containment Containment, newContainment NewContainment, maxRetries int, maxReverifyCount int32) Reporter {
return &reporter{
log: log,
reputations: reputations,
overlay: overlay,
containment: containment,
newContainment: newContainment,
maxRetries: maxRetries,
maxReverifyCount: maxReverifyCount,
}
}
// RecordAudits saves audit results, applying reputation changes as appropriate.
// If some records can not be updated after a number of attempts, the failures
// are logged at level ERROR, but are otherwise thrown away.
func (reporter *reporter) RecordAudits(ctx context.Context, req Report) {
defer mon.Task()(&ctx)(nil)
successes := req.Successes
fails := req.Fails
unknowns := req.Unknown
offlines := req.Offlines
pendingAudits := req.PendingAudits
pieceAudits := req.PieceAudits
reporter.log.Debug("Reporting audits",
zap.Int("successes", len(successes)),
zap.Int("failures", len(fails)),
zap.Int("unknowns", len(unknowns)),
zap.Int("offlines", len(offlines)),
zap.Int("pending", len(pendingAudits)),
zap.Int("piece-pending", len(pieceAudits)),
)
nodesReputation := req.NodesReputation
reportFailures := func(tries int, resultType string, err error, nodes storj.NodeIDList, pending []*PendingAudit, pieces []*ReverificationJob) {
if err == nil || tries < reporter.maxRetries {
// don't need to report anything until the last time through
return
}
reporter.log.Error("failed to update reputation information with audit results",
zap.String("result type", resultType),
zap.Error(err),
zap.String("node IDs", strings.Join(nodes.Strings(), ", ")),
zap.Any("pending segment audits", pending),
zap.Any("pending piece audits", pieces))
}
var err error
for tries := 0; tries <= reporter.maxRetries; tries++ {
if len(successes) == 0 && len(fails) == 0 && len(unknowns) == 0 && len(offlines) == 0 && len(pendingAudits) == 0 && len(pieceAudits) == 0 {
return
}
successes, err = reporter.recordAuditStatus(ctx, successes, nodesReputation, reputation.AuditSuccess)
reportFailures(tries, "successful", err, successes, nil, nil)
fails, err = reporter.recordAuditStatus(ctx, fails, nodesReputation, reputation.AuditFailure)
reportFailures(tries, "failed", err, fails, nil, nil)
unknowns, err = reporter.recordAuditStatus(ctx, unknowns, nodesReputation, reputation.AuditUnknown)
reportFailures(tries, "unknown", err, unknowns, nil, nil)
offlines, err = reporter.recordAuditStatus(ctx, offlines, nodesReputation, reputation.AuditOffline)
reportFailures(tries, "offline", err, offlines, nil, nil)
pendingAudits, err = reporter.recordPendingAudits(ctx, pendingAudits, nodesReputation)
reportFailures(tries, "pending", err, nil, pendingAudits, nil)
pieceAudits, err = reporter.recordPendingPieceAudits(ctx, pieceAudits, nodesReputation)
reportFailures(tries, "pending", err, nil, nil, pieceAudits)
}
}
func (reporter *reporter) recordAuditStatus(ctx context.Context, nodeIDs storj.NodeIDList, nodesReputation map[storj.NodeID]overlay.ReputationStatus, auditOutcome reputation.AuditType) (failed storj.NodeIDList, err error) {
defer mon.Task()(&ctx)(&err)
if len(nodeIDs) == 0 {
return nil, nil
}
var errors errs.Group
for _, nodeID := range nodeIDs {
err = reporter.reputations.ApplyAudit(ctx, nodeID, nodesReputation[nodeID], auditOutcome)
if err != nil {
failed = append(failed, nodeID)
errors.Add(Error.New("failed to record audit status %s in overlay for node %s: %w", auditOutcome.String(), nodeID.String(), err))
}
}
return failed, errors.Err()
}
// recordPendingPieceAudits updates the containment status of nodes with pending piece audits.
// This function is temporary and will replace recordPendingAudits later in this commit chain.
func (reporter *reporter) recordPendingPieceAudits(ctx context.Context, pendingAudits []*ReverificationJob, nodesReputation map[storj.NodeID]overlay.ReputationStatus) (failed []*ReverificationJob, err error) {
defer mon.Task()(&ctx)(&err)
var errlist errs.Group
for _, pendingAudit := range pendingAudits {
logger := reporter.log.With(
zap.Stringer("Node ID", pendingAudit.Locator.NodeID),
zap.Stringer("Stream ID", pendingAudit.Locator.StreamID),
zap.Uint64("Position", pendingAudit.Locator.Position.Encode()),
zap.Int("Piece Num", pendingAudit.Locator.PieceNum))
if pendingAudit.ReverifyCount < int(reporter.maxReverifyCount) {
err := reporter.ReportReverificationNeeded(ctx, &pendingAudit.Locator)
if err != nil {
failed = append(failed, pendingAudit)
errlist.Add(err)
continue
}
logger.Info("reverification queued")
continue
}
// record failure -- max reverify count reached
logger.Info("max reverify count reached (audit failed)")
err = reporter.reputations.ApplyAudit(ctx, pendingAudit.Locator.NodeID, nodesReputation[pendingAudit.Locator.NodeID], reputation.AuditFailure)
if err != nil {
logger.Info("failed to update reputation information", zap.Error(err))
errlist.Add(err)
failed = append(failed, pendingAudit)
continue
}
_, stillContained, err := reporter.newContainment.Delete(ctx, &pendingAudit.Locator)
if err != nil {
if !ErrContainedNotFound.Has(err) {
errlist.Add(err)
}
continue
}
if !stillContained {
err = reporter.overlay.SetNodeContained(ctx, pendingAudit.Locator.NodeID, false)
if err != nil {
logger.Error("failed to mark node as not contained", zap.Error(err))
}
}
}
if len(failed) > 0 {
return failed, errs.Combine(Error.New("failed to record some pending audits"), errlist.Err())
}
return nil, nil
}
// recordPendingAudits updates the containment status of nodes with pending audits.
func (reporter *reporter) recordPendingAudits(ctx context.Context, pendingAudits []*PendingAudit, nodesReputation map[storj.NodeID]overlay.ReputationStatus) (failed []*PendingAudit, err error) {
defer mon.Task()(&ctx)(&err)
var errlist errs.Group
for _, pendingAudit := range pendingAudits {
if pendingAudit.ReverifyCount < reporter.maxReverifyCount {
err := reporter.containment.IncrementPending(ctx, pendingAudit)
if err != nil {
failed = append(failed, pendingAudit)
errlist.Add(err)
}
reporter.log.Info("Audit pending",
zap.Stringer("Piece ID", pendingAudit.PieceID),
zap.Stringer("Node ID", pendingAudit.NodeID))
} else {
// record failure -- max reverify count reached
reporter.log.Info("max reverify count reached (audit failed)", zap.Stringer("Node ID", pendingAudit.NodeID))
err = reporter.reputations.ApplyAudit(ctx, pendingAudit.NodeID, nodesReputation[pendingAudit.NodeID], reputation.AuditFailure)
if err != nil {
errlist.Add(err)
failed = append(failed, pendingAudit)
} else {
_, err = reporter.containment.Delete(ctx, pendingAudit.NodeID)
if err != nil && !ErrContainedNotFound.Has(err) {
errlist.Add(err)
}
}
}
}
if len(failed) > 0 {
for _, v := range failed {
reporter.log.Debug("failed to record Pending Nodes ",
zap.Stringer("NodeID", v.NodeID),
zap.String("Segment StreamID", v.StreamID.String()),
zap.Uint64("Segment Position", v.Position.Encode()))
}
return failed, errs.Combine(Error.New("failed to record some pending audits"), errlist.Err())
}
return nil, nil
}
func (reporter *reporter) ReportReverificationNeeded(ctx context.Context, piece *PieceLocator) (err error) {
defer mon.Task()(&ctx)(&err)
err = reporter.newContainment.Insert(ctx, piece)
if err != nil {
return Error.New("failed to queue reverification audit for node: %w", err)
}
err = reporter.overlay.SetNodeContained(ctx, piece.NodeID, true)
if err != nil {
return Error.New("failed to update contained status: %w", err)
}
return nil
}
func (reporter *reporter) RecordReverificationResult(ctx context.Context, pendingJob *ReverificationJob, outcome Outcome, reputation overlay.ReputationStatus) (err error) {
defer mon.Task()(&ctx)(&err)
keepInQueue := true
report := Report{
NodesReputation: map[storj.NodeID]overlay.ReputationStatus{
pendingJob.Locator.NodeID: reputation,
},
}
switch outcome {
case OutcomeNotPerformed:
case OutcomeNotNecessary:
keepInQueue = false
case OutcomeSuccess:
report.Successes = append(report.Successes, pendingJob.Locator.NodeID)
keepInQueue = false
case OutcomeFailure:
report.Fails = append(report.Fails, pendingJob.Locator.NodeID)
keepInQueue = false
case OutcomeTimedOut:
// This will get re-added to the reverification queue, but that is idempotent
// and fine. We do need to add it to PendingAudits in order to get the
// maxReverifyCount check.
report.PieceAudits = append(report.PieceAudits, pendingJob)
case OutcomeUnknownError:
report.Unknown = append(report.Unknown, pendingJob.Locator.NodeID)
keepInQueue = false
case OutcomeNodeOffline:
report.Offlines = append(report.Offlines, pendingJob.Locator.NodeID)
}
var errList errs.Group
// apply any necessary reputation changes
reporter.RecordAudits(ctx, report)
// remove from reverifications queue if appropriate
if !keepInQueue {
_, stillContained, err := reporter.newContainment.Delete(ctx, &pendingJob.Locator)
if err != nil {
if !ErrContainedNotFound.Has(err) {
errList.Add(err)
}
} else if !stillContained {
err = reporter.overlay.SetNodeContained(ctx, pendingJob.Locator.NodeID, false)
errList.Add(err)
}
}
return errList.Err()
}