2019-01-24 20:15:10 +00:00
|
|
|
// Copyright (C) 2019 Storj Labs, Inc.
|
2018-10-16 18:40:34 +01:00
|
|
|
// See LICENSE for copying information.
|
|
|
|
|
|
|
|
package audit
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2022-11-22 21:55:19 +00:00
|
|
|
"strings"
|
2018-10-16 18:40:34 +01:00
|
|
|
|
2019-05-23 21:07:19 +01:00
|
|
|
"github.com/zeebo/errs"
|
2019-06-07 13:38:41 +01:00
|
|
|
"go.uber.org/zap"
|
2019-05-23 21:07:19 +01:00
|
|
|
|
2019-12-27 11:48:47 +00:00
|
|
|
"storj.io/common/storj"
|
2021-11-08 20:51:04 +00:00
|
|
|
"storj.io/storj/satellite/overlay"
|
2021-06-23 00:09:39 +01:00
|
|
|
"storj.io/storj/satellite/reputation"
|
2018-10-16 18:40:34 +01:00
|
|
|
)
|
|
|
|
|
2022-04-11 17:47:14 +01:00
|
|
|
// reporter records audit reports in overlay and implements the Reporter interface.
|
2019-09-10 14:24:16 +01:00
|
|
|
//
|
|
|
|
// architecture: Service
|
2022-04-11 17:47:14 +01:00
|
|
|
type reporter struct {
|
2022-11-23 15:24:30 +00:00
|
|
|
log *zap.Logger
|
|
|
|
reputations *reputation.Service
|
|
|
|
overlay *overlay.Service
|
|
|
|
containment Containment
|
2019-05-31 16:23:00 +01:00
|
|
|
maxRetries int
|
|
|
|
maxReverifyCount int32
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2022-04-11 17:47:14 +01:00
|
|
|
// Reporter records audit reports in the overlay and database.
|
|
|
|
type Reporter interface {
|
2022-11-22 21:55:19 +00:00
|
|
|
RecordAudits(ctx context.Context, req Report)
|
|
|
|
ReportReverificationNeeded(ctx context.Context, piece *PieceLocator) (err error)
|
|
|
|
RecordReverificationResult(ctx context.Context, pendingJob *ReverificationJob, outcome Outcome, reputation overlay.ReputationStatus) (err error)
|
2022-04-11 17:47:14 +01:00
|
|
|
}
|
|
|
|
|
2021-08-11 21:02:54 +01:00
|
|
|
// Report contains audit result.
|
|
|
|
// It records whether an audit is able to be completed, the total number of
|
|
|
|
// pieces a given audit has conducted for, lists for nodes that
|
|
|
|
// succeeded, failed, were offline, have pending audits, or failed for unknown
|
2021-11-08 20:51:04 +00:00
|
|
|
// reasons and their current reputation status.
|
2019-05-23 23:32:19 +01:00
|
|
|
type Report struct {
|
2022-11-23 15:24:30 +00:00
|
|
|
Successes storj.NodeIDList
|
|
|
|
Fails storj.NodeIDList
|
|
|
|
Offlines storj.NodeIDList
|
|
|
|
PendingAudits []*ReverificationJob
|
2021-11-08 20:51:04 +00:00
|
|
|
Unknown storj.NodeIDList
|
|
|
|
NodesReputation map[storj.NodeID]overlay.ReputationStatus
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
|
|
|
|
2020-07-16 15:18:02 +01:00
|
|
|
// NewReporter instantiates a reporter.
|
2022-11-23 15:24:30 +00:00
|
|
|
func NewReporter(log *zap.Logger, reputations *reputation.Service, overlay *overlay.Service, containment Containment, maxRetries int, maxReverifyCount int32) Reporter {
|
2022-04-11 17:47:14 +01:00
|
|
|
return &reporter{
|
2019-06-07 13:38:41 +01:00
|
|
|
log: log,
|
2021-06-23 00:09:39 +01:00
|
|
|
reputations: reputations,
|
2022-11-22 21:55:19 +00:00
|
|
|
overlay: overlay,
|
2022-11-23 15:24:30 +00:00
|
|
|
containment: containment,
|
2019-06-07 13:38:41 +01:00
|
|
|
maxRetries: maxRetries,
|
2021-09-15 21:31:33 +01:00
|
|
|
maxReverifyCount: maxReverifyCount,
|
|
|
|
}
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2022-11-22 21:55:19 +00:00
|
|
|
// RecordAudits saves audit results, applying reputation changes as appropriate.
|
|
|
|
// If some records can not be updated after a number of attempts, the failures
|
|
|
|
// are logged at level ERROR, but are otherwise thrown away.
|
|
|
|
func (reporter *reporter) RecordAudits(ctx context.Context, req Report) {
|
|
|
|
defer mon.Task()(&ctx)(nil)
|
2019-05-27 12:13:47 +01:00
|
|
|
|
2019-05-23 23:32:19 +01:00
|
|
|
successes := req.Successes
|
|
|
|
fails := req.Fails
|
2020-03-09 15:35:54 +00:00
|
|
|
unknowns := req.Unknown
|
2019-05-23 23:32:19 +01:00
|
|
|
offlines := req.Offlines
|
2022-11-23 15:24:30 +00:00
|
|
|
pendingAudits := req.PendingAudits
|
2018-12-19 18:44:03 +00:00
|
|
|
|
2019-06-07 13:38:41 +01:00
|
|
|
reporter.log.Debug("Reporting audits",
|
|
|
|
zap.Int("successes", len(successes)),
|
|
|
|
zap.Int("failures", len(fails)),
|
2020-03-09 15:35:54 +00:00
|
|
|
zap.Int("unknowns", len(unknowns)),
|
2019-06-07 13:38:41 +01:00
|
|
|
zap.Int("offlines", len(offlines)),
|
|
|
|
zap.Int("pending", len(pendingAudits)),
|
|
|
|
)
|
|
|
|
|
2021-11-08 20:51:04 +00:00
|
|
|
nodesReputation := req.NodesReputation
|
2018-12-19 18:44:03 +00:00
|
|
|
|
2022-11-23 15:14:27 +00:00
|
|
|
reportFailures := func(tries int, resultType string, err error, nodes storj.NodeIDList, pending []*ReverificationJob) {
|
2022-11-22 21:55:19 +00:00
|
|
|
if err == nil || tries < reporter.maxRetries {
|
|
|
|
// don't need to report anything until the last time through
|
|
|
|
return
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
2022-11-22 21:55:19 +00:00
|
|
|
reporter.log.Error("failed to update reputation information with audit results",
|
|
|
|
zap.String("result type", resultType),
|
|
|
|
zap.Error(err),
|
|
|
|
zap.String("node IDs", strings.Join(nodes.Strings(), ", ")),
|
2022-11-23 15:14:27 +00:00
|
|
|
zap.Any("pending segment audits", pending))
|
2022-11-22 21:55:19 +00:00
|
|
|
}
|
2018-12-19 18:44:03 +00:00
|
|
|
|
2022-11-22 21:55:19 +00:00
|
|
|
var err error
|
|
|
|
for tries := 0; tries <= reporter.maxRetries; tries++ {
|
2022-11-23 15:14:27 +00:00
|
|
|
if len(successes) == 0 && len(fails) == 0 && len(unknowns) == 0 && len(offlines) == 0 && len(pendingAudits) == 0 {
|
2022-11-22 21:55:19 +00:00
|
|
|
return
|
|
|
|
}
|
2018-12-19 18:44:03 +00:00
|
|
|
|
2022-04-11 17:47:14 +01:00
|
|
|
successes, err = reporter.recordAuditStatus(ctx, successes, nodesReputation, reputation.AuditSuccess)
|
2022-11-23 15:14:27 +00:00
|
|
|
reportFailures(tries, "successful", err, successes, nil)
|
2022-04-11 17:47:14 +01:00
|
|
|
fails, err = reporter.recordAuditStatus(ctx, fails, nodesReputation, reputation.AuditFailure)
|
2022-11-23 15:14:27 +00:00
|
|
|
reportFailures(tries, "failed", err, fails, nil)
|
2022-04-11 17:47:14 +01:00
|
|
|
unknowns, err = reporter.recordAuditStatus(ctx, unknowns, nodesReputation, reputation.AuditUnknown)
|
2022-11-23 15:14:27 +00:00
|
|
|
reportFailures(tries, "unknown", err, unknowns, nil)
|
2022-04-11 17:47:14 +01:00
|
|
|
offlines, err = reporter.recordAuditStatus(ctx, offlines, nodesReputation, reputation.AuditOffline)
|
2022-11-23 15:14:27 +00:00
|
|
|
reportFailures(tries, "offline", err, offlines, nil)
|
2022-11-23 15:24:30 +00:00
|
|
|
pendingAudits, err = reporter.recordPendingAudits(ctx, pendingAudits, nodesReputation)
|
2022-11-23 15:14:27 +00:00
|
|
|
reportFailures(tries, "pending", err, nil, pendingAudits)
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-04-11 17:47:14 +01:00
|
|
|
func (reporter *reporter) recordAuditStatus(ctx context.Context, nodeIDs storj.NodeIDList, nodesReputation map[storj.NodeID]overlay.ReputationStatus, auditOutcome reputation.AuditType) (failed storj.NodeIDList, err error) {
|
2019-06-04 12:36:27 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2020-10-22 22:02:48 +01:00
|
|
|
|
2022-04-11 17:47:14 +01:00
|
|
|
if len(nodeIDs) == 0 {
|
|
|
|
return nil, nil
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
2021-11-08 20:51:04 +00:00
|
|
|
var errors errs.Group
|
2022-04-11 17:47:14 +01:00
|
|
|
for _, nodeID := range nodeIDs {
|
|
|
|
err = reporter.reputations.ApplyAudit(ctx, nodeID, nodesReputation[nodeID], auditOutcome)
|
2021-06-23 00:09:39 +01:00
|
|
|
if err != nil {
|
|
|
|
failed = append(failed, nodeID)
|
2022-04-11 17:47:14 +01:00
|
|
|
errors.Add(Error.New("failed to record audit status %s in overlay for node %s: %w", auditOutcome.String(), nodeID.String(), err))
|
2019-05-23 21:07:19 +01:00
|
|
|
}
|
|
|
|
}
|
2021-11-08 20:51:04 +00:00
|
|
|
return failed, errors.Err()
|
2019-05-23 21:07:19 +01:00
|
|
|
}
|
|
|
|
|
2022-11-23 15:24:30 +00:00
|
|
|
// recordPendingAudits updates the containment status of nodes with pending piece audits.
|
|
|
|
func (reporter *reporter) recordPendingAudits(ctx context.Context, pendingAudits []*ReverificationJob, nodesReputation map[storj.NodeID]overlay.ReputationStatus) (failed []*ReverificationJob, err error) {
|
2022-11-22 21:55:19 +00:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
var errlist errs.Group
|
|
|
|
|
|
|
|
for _, pendingAudit := range pendingAudits {
|
|
|
|
logger := reporter.log.With(
|
|
|
|
zap.Stringer("Node ID", pendingAudit.Locator.NodeID),
|
|
|
|
zap.Stringer("Stream ID", pendingAudit.Locator.StreamID),
|
|
|
|
zap.Uint64("Position", pendingAudit.Locator.Position.Encode()),
|
|
|
|
zap.Int("Piece Num", pendingAudit.Locator.PieceNum))
|
|
|
|
|
|
|
|
if pendingAudit.ReverifyCount < int(reporter.maxReverifyCount) {
|
|
|
|
err := reporter.ReportReverificationNeeded(ctx, &pendingAudit.Locator)
|
|
|
|
if err != nil {
|
|
|
|
failed = append(failed, pendingAudit)
|
|
|
|
errlist.Add(err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
logger.Info("reverification queued")
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// record failure -- max reverify count reached
|
|
|
|
logger.Info("max reverify count reached (audit failed)")
|
|
|
|
err = reporter.reputations.ApplyAudit(ctx, pendingAudit.Locator.NodeID, nodesReputation[pendingAudit.Locator.NodeID], reputation.AuditFailure)
|
|
|
|
if err != nil {
|
|
|
|
logger.Info("failed to update reputation information", zap.Error(err))
|
|
|
|
errlist.Add(err)
|
|
|
|
failed = append(failed, pendingAudit)
|
|
|
|
continue
|
|
|
|
}
|
2022-11-23 15:24:30 +00:00
|
|
|
_, stillContained, err := reporter.containment.Delete(ctx, &pendingAudit.Locator)
|
2022-11-22 21:55:19 +00:00
|
|
|
if err != nil {
|
|
|
|
if !ErrContainedNotFound.Has(err) {
|
|
|
|
errlist.Add(err)
|
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if !stillContained {
|
|
|
|
err = reporter.overlay.SetNodeContained(ctx, pendingAudit.Locator.NodeID, false)
|
|
|
|
if err != nil {
|
|
|
|
logger.Error("failed to mark node as not contained", zap.Error(err))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(failed) > 0 {
|
|
|
|
return failed, errs.Combine(Error.New("failed to record some pending audits"), errlist.Err())
|
|
|
|
}
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (reporter *reporter) ReportReverificationNeeded(ctx context.Context, piece *PieceLocator) (err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
2022-11-23 15:24:30 +00:00
|
|
|
err = reporter.containment.Insert(ctx, piece)
|
2022-11-22 21:55:19 +00:00
|
|
|
if err != nil {
|
|
|
|
return Error.New("failed to queue reverification audit for node: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
err = reporter.overlay.SetNodeContained(ctx, piece.NodeID, true)
|
|
|
|
if err != nil {
|
|
|
|
return Error.New("failed to update contained status: %w", err)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (reporter *reporter) RecordReverificationResult(ctx context.Context, pendingJob *ReverificationJob, outcome Outcome, reputation overlay.ReputationStatus) (err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
|
|
|
keepInQueue := true
|
|
|
|
report := Report{
|
|
|
|
NodesReputation: map[storj.NodeID]overlay.ReputationStatus{
|
|
|
|
pendingJob.Locator.NodeID: reputation,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
switch outcome {
|
|
|
|
case OutcomeNotPerformed:
|
|
|
|
case OutcomeNotNecessary:
|
|
|
|
keepInQueue = false
|
|
|
|
case OutcomeSuccess:
|
|
|
|
report.Successes = append(report.Successes, pendingJob.Locator.NodeID)
|
|
|
|
keepInQueue = false
|
|
|
|
case OutcomeFailure:
|
|
|
|
report.Fails = append(report.Fails, pendingJob.Locator.NodeID)
|
|
|
|
keepInQueue = false
|
|
|
|
case OutcomeTimedOut:
|
|
|
|
// This will get re-added to the reverification queue, but that is idempotent
|
|
|
|
// and fine. We do need to add it to PendingAudits in order to get the
|
|
|
|
// maxReverifyCount check.
|
2022-11-23 15:24:30 +00:00
|
|
|
report.PendingAudits = append(report.PendingAudits, pendingJob)
|
2022-11-22 21:55:19 +00:00
|
|
|
case OutcomeUnknownError:
|
|
|
|
report.Unknown = append(report.Unknown, pendingJob.Locator.NodeID)
|
|
|
|
keepInQueue = false
|
|
|
|
case OutcomeNodeOffline:
|
|
|
|
report.Offlines = append(report.Offlines, pendingJob.Locator.NodeID)
|
|
|
|
}
|
|
|
|
var errList errs.Group
|
|
|
|
|
|
|
|
// apply any necessary reputation changes
|
|
|
|
reporter.RecordAudits(ctx, report)
|
|
|
|
|
|
|
|
// remove from reverifications queue if appropriate
|
|
|
|
if !keepInQueue {
|
2022-11-23 15:24:30 +00:00
|
|
|
_, stillContained, err := reporter.containment.Delete(ctx, &pendingJob.Locator)
|
2022-11-22 21:55:19 +00:00
|
|
|
if err != nil {
|
|
|
|
if !ErrContainedNotFound.Has(err) {
|
|
|
|
errList.Add(err)
|
|
|
|
}
|
|
|
|
} else if !stillContained {
|
|
|
|
err = reporter.overlay.SetNodeContained(ctx, pendingJob.Locator.NodeID, false)
|
|
|
|
errList.Add(err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return errList.Err()
|
|
|
|
}
|