2019-01-24 20:15:10 +00:00
|
|
|
// Copyright (C) 2019 Storj Labs, Inc.
|
2018-10-16 18:40:34 +01:00
|
|
|
// See LICENSE for copying information.
|
|
|
|
|
|
|
|
package audit
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
|
2019-05-23 21:07:19 +01:00
|
|
|
"github.com/zeebo/errs"
|
2019-06-07 13:38:41 +01:00
|
|
|
"go.uber.org/zap"
|
2019-05-23 21:07:19 +01:00
|
|
|
|
2019-12-27 11:48:47 +00:00
|
|
|
"storj.io/common/storj"
|
2021-06-23 00:09:39 +01:00
|
|
|
"storj.io/storj/satellite/reputation"
|
2018-10-16 18:40:34 +01:00
|
|
|
)
|
|
|
|
|
2020-12-05 16:01:42 +00:00
|
|
|
// Reporter records audit reports in overlay and implements the reporter interface.
|
2019-09-10 14:24:16 +01:00
|
|
|
//
|
|
|
|
// architecture: Service
|
2018-10-16 18:40:34 +01:00
|
|
|
type Reporter struct {
|
2019-06-07 13:38:41 +01:00
|
|
|
log *zap.Logger
|
2021-06-23 00:09:39 +01:00
|
|
|
reputations *reputation.Service
|
2019-05-31 16:23:00 +01:00
|
|
|
containment Containment
|
|
|
|
maxRetries int
|
|
|
|
maxReverifyCount int32
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2021-08-11 21:02:54 +01:00
|
|
|
// Report contains audit result.
|
|
|
|
// It records whether an audit is able to be completed, the total number of
|
|
|
|
// pieces a given audit has conducted for, lists for nodes that
|
|
|
|
// succeeded, failed, were offline, have pending audits, or failed for unknown
|
|
|
|
// reasons.
|
2019-05-23 23:32:19 +01:00
|
|
|
type Report struct {
|
|
|
|
Successes storj.NodeIDList
|
|
|
|
Fails storj.NodeIDList
|
|
|
|
Offlines storj.NodeIDList
|
|
|
|
PendingAudits []*PendingAudit
|
2019-11-19 16:30:28 +00:00
|
|
|
Unknown storj.NodeIDList
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
|
|
|
|
2020-07-16 15:18:02 +01:00
|
|
|
// NewReporter instantiates a reporter.
|
2021-06-23 00:09:39 +01:00
|
|
|
func NewReporter(log *zap.Logger, reputations *reputation.Service, containment Containment, maxRetries int, maxReverifyCount int32) *Reporter {
|
2019-06-07 13:38:41 +01:00
|
|
|
return &Reporter{
|
|
|
|
log: log,
|
2021-06-23 00:09:39 +01:00
|
|
|
reputations: reputations,
|
2019-06-07 13:38:41 +01:00
|
|
|
containment: containment,
|
|
|
|
maxRetries: maxRetries,
|
2021-09-15 21:31:33 +01:00
|
|
|
maxReverifyCount: maxReverifyCount,
|
|
|
|
}
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2019-08-06 17:35:59 +01:00
|
|
|
// RecordAudits saves audit results to overlay. When no error, it returns
|
2019-06-25 10:23:41 +01:00
|
|
|
// nil for both return values, otherwise it returns the report with the fields
|
|
|
|
// set to the values which have been saved and the error.
|
2020-12-14 12:54:22 +00:00
|
|
|
func (reporter *Reporter) RecordAudits(ctx context.Context, req Report) (_ Report, err error) {
|
2019-06-04 12:36:27 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-05-27 12:13:47 +01:00
|
|
|
|
2019-05-23 23:32:19 +01:00
|
|
|
successes := req.Successes
|
|
|
|
fails := req.Fails
|
2020-03-09 15:35:54 +00:00
|
|
|
unknowns := req.Unknown
|
2019-05-23 23:32:19 +01:00
|
|
|
offlines := req.Offlines
|
2019-05-23 21:07:19 +01:00
|
|
|
pendingAudits := req.PendingAudits
|
2018-12-19 18:44:03 +00:00
|
|
|
|
2019-06-07 13:38:41 +01:00
|
|
|
reporter.log.Debug("Reporting audits",
|
|
|
|
zap.Int("successes", len(successes)),
|
|
|
|
zap.Int("failures", len(fails)),
|
2020-03-09 15:35:54 +00:00
|
|
|
zap.Int("unknowns", len(unknowns)),
|
2019-06-07 13:38:41 +01:00
|
|
|
zap.Int("offlines", len(offlines)),
|
|
|
|
zap.Int("pending", len(pendingAudits)),
|
|
|
|
)
|
|
|
|
|
2019-05-23 21:07:19 +01:00
|
|
|
var errlist errs.Group
|
2018-12-19 18:44:03 +00:00
|
|
|
|
2019-06-21 16:10:03 +01:00
|
|
|
tries := 0
|
|
|
|
for tries <= reporter.maxRetries {
|
2020-03-09 15:35:54 +00:00
|
|
|
if len(successes) == 0 && len(fails) == 0 && len(unknowns) == 0 && len(offlines) == 0 && len(pendingAudits) == 0 {
|
2019-10-09 15:06:58 +01:00
|
|
|
return Report{}, nil
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
2018-12-19 18:44:03 +00:00
|
|
|
|
2019-05-23 21:07:19 +01:00
|
|
|
errlist = errs.Group{}
|
2018-12-19 18:44:03 +00:00
|
|
|
|
2019-05-23 23:32:19 +01:00
|
|
|
if len(successes) > 0 {
|
|
|
|
successes, err = reporter.recordAuditSuccessStatus(ctx, successes)
|
2018-12-19 18:44:03 +00:00
|
|
|
if err != nil {
|
2019-05-23 21:07:19 +01:00
|
|
|
errlist.Add(err)
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
|
|
|
}
|
2019-05-23 23:32:19 +01:00
|
|
|
if len(fails) > 0 {
|
|
|
|
fails, err = reporter.recordAuditFailStatus(ctx, fails)
|
2018-12-19 18:44:03 +00:00
|
|
|
if err != nil {
|
2019-05-23 21:07:19 +01:00
|
|
|
errlist.Add(err)
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
|
|
|
}
|
2020-03-09 15:35:54 +00:00
|
|
|
if len(unknowns) > 0 {
|
|
|
|
unknowns, err = reporter.recordAuditUnknownStatus(ctx, unknowns)
|
|
|
|
if err != nil {
|
|
|
|
errlist.Add(err)
|
|
|
|
}
|
|
|
|
}
|
2020-10-22 22:02:48 +01:00
|
|
|
if len(offlines) > 0 {
|
2019-05-23 23:32:19 +01:00
|
|
|
offlines, err = reporter.recordOfflineStatus(ctx, offlines)
|
2018-12-19 18:44:03 +00:00
|
|
|
if err != nil {
|
2019-05-23 21:07:19 +01:00
|
|
|
errlist.Add(err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if len(pendingAudits) > 0 {
|
|
|
|
pendingAudits, err = reporter.recordPendingAudits(ctx, pendingAudits)
|
|
|
|
if err != nil {
|
|
|
|
errlist.Add(err)
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-21 16:10:03 +01:00
|
|
|
tries++
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
2019-05-23 21:07:19 +01:00
|
|
|
|
|
|
|
err = errlist.Err()
|
2019-06-21 16:10:03 +01:00
|
|
|
if tries >= reporter.maxRetries && err != nil {
|
2019-10-09 15:06:58 +01:00
|
|
|
return Report{
|
2019-05-23 23:32:19 +01:00
|
|
|
Successes: successes,
|
|
|
|
Fails: fails,
|
|
|
|
Offlines: offlines,
|
2020-03-09 15:35:54 +00:00
|
|
|
Unknown: unknowns,
|
2019-05-23 23:32:19 +01:00
|
|
|
PendingAudits: pendingAudits,
|
2019-05-23 21:07:19 +01:00
|
|
|
}, errs.Combine(Error.New("some nodes failed to be updated in overlay"), err)
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
2019-10-09 15:06:58 +01:00
|
|
|
return Report{}, nil
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2020-07-16 15:18:02 +01:00
|
|
|
// recordAuditFailStatus updates nodeIDs in overlay with isup=true, auditoutcome=fail.
|
2018-12-19 18:44:03 +00:00
|
|
|
func (reporter *Reporter) recordAuditFailStatus(ctx context.Context, failedAuditNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
2019-06-04 12:36:27 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-07-31 18:21:06 +01:00
|
|
|
|
2021-06-23 00:09:39 +01:00
|
|
|
var errors error
|
|
|
|
for _, nodeID := range failedAuditNodeIDs {
|
|
|
|
err = reporter.reputations.ApplyAudit(ctx, nodeID, reputation.AuditFailure)
|
|
|
|
if err != nil {
|
|
|
|
failed = append(failed, nodeID)
|
|
|
|
errors = errs.Combine(Error.New("failed to record some audit fail statuses in overlay"), err)
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
}
|
2021-06-23 00:09:39 +01:00
|
|
|
return failed, errors
|
2020-03-09 15:35:54 +00:00
|
|
|
}
|
|
|
|
|
2020-07-16 15:18:02 +01:00
|
|
|
// recordAuditUnknownStatus updates nodeIDs in overlay with isup=true, auditoutcome=unknown.
|
2020-03-09 15:35:54 +00:00
|
|
|
func (reporter *Reporter) recordAuditUnknownStatus(ctx context.Context, unknownAuditNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
2021-06-23 00:09:39 +01:00
|
|
|
var errors error
|
|
|
|
for _, nodeID := range unknownAuditNodeIDs {
|
|
|
|
err = reporter.reputations.ApplyAudit(ctx, nodeID, reputation.AuditUnknown)
|
|
|
|
if err != nil {
|
|
|
|
failed = append(failed, nodeID)
|
|
|
|
errors = errs.Combine(Error.New("failed to record some audit unknown statuses in overlay"), err)
|
2019-07-31 18:21:06 +01:00
|
|
|
}
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
2021-06-23 00:09:39 +01:00
|
|
|
return failed, errors
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2020-10-22 22:02:48 +01:00
|
|
|
// recordOfflineStatus updates nodeIDs in overlay with isup=false, auditoutcome=offline.
|
2018-12-19 18:44:03 +00:00
|
|
|
func (reporter *Reporter) recordOfflineStatus(ctx context.Context, offlineNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
2019-06-04 12:36:27 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2020-10-22 22:02:48 +01:00
|
|
|
|
2021-06-23 00:09:39 +01:00
|
|
|
var errors error
|
|
|
|
for _, nodeID := range offlineNodeIDs {
|
|
|
|
err = reporter.reputations.ApplyAudit(ctx, nodeID, reputation.AuditOffline)
|
|
|
|
if err != nil {
|
|
|
|
failed = append(failed, nodeID)
|
|
|
|
errors = errs.Combine(Error.New("failed to record some audit offline statuses in overlay"), err)
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
}
|
2021-06-23 00:09:39 +01:00
|
|
|
return failed, errors
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2020-07-16 15:18:02 +01:00
|
|
|
// recordAuditSuccessStatus updates nodeIDs in overlay with isup=true, auditoutcome=success.
|
2018-12-19 18:44:03 +00:00
|
|
|
func (reporter *Reporter) recordAuditSuccessStatus(ctx context.Context, successNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
2019-06-04 12:36:27 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-07-31 18:21:06 +01:00
|
|
|
|
2021-06-23 00:09:39 +01:00
|
|
|
var errors error
|
|
|
|
for _, nodeID := range successNodeIDs {
|
|
|
|
err = reporter.reputations.ApplyAudit(ctx, nodeID, reputation.AuditSuccess)
|
|
|
|
if err != nil {
|
|
|
|
failed = append(failed, nodeID)
|
|
|
|
errors = errs.Combine(Error.New("failed to record some audit success statuses in overlay"), err)
|
2019-05-23 21:07:19 +01:00
|
|
|
}
|
|
|
|
}
|
2021-06-23 00:09:39 +01:00
|
|
|
return failed, errors
|
2019-05-23 21:07:19 +01:00
|
|
|
}
|
|
|
|
|
2020-07-16 15:18:02 +01:00
|
|
|
// recordPendingAudits updates the containment status of nodes with pending audits.
|
2019-05-23 21:07:19 +01:00
|
|
|
func (reporter *Reporter) recordPendingAudits(ctx context.Context, pendingAudits []*PendingAudit) (failed []*PendingAudit, err error) {
|
2019-06-04 12:36:27 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-05-23 21:07:19 +01:00
|
|
|
var errlist errs.Group
|
2019-07-31 18:21:06 +01:00
|
|
|
|
2019-05-23 21:07:19 +01:00
|
|
|
for _, pendingAudit := range pendingAudits {
|
2019-05-31 16:23:00 +01:00
|
|
|
if pendingAudit.ReverifyCount < reporter.maxReverifyCount {
|
2019-05-27 12:13:47 +01:00
|
|
|
err := reporter.containment.IncrementPending(ctx, pendingAudit)
|
|
|
|
if err != nil {
|
|
|
|
failed = append(failed, pendingAudit)
|
|
|
|
errlist.Add(err)
|
|
|
|
}
|
2021-03-09 22:05:37 +00:00
|
|
|
reporter.log.Info("Audit pending",
|
|
|
|
zap.Stringer("Piece ID", pendingAudit.PieceID),
|
|
|
|
zap.Stringer("Node ID", pendingAudit.NodeID))
|
2019-05-27 12:13:47 +01:00
|
|
|
} else {
|
|
|
|
// record failure -- max reverify count reached
|
2021-03-09 22:05:37 +00:00
|
|
|
reporter.log.Info("max reverify count reached (audit failed)", zap.Stringer("Node ID", pendingAudit.NodeID))
|
2021-06-23 00:09:39 +01:00
|
|
|
err = reporter.reputations.ApplyAudit(ctx, pendingAudit.NodeID, reputation.AuditFailure)
|
|
|
|
if err != nil {
|
|
|
|
errlist.Add(err)
|
|
|
|
failed = append(failed, pendingAudit)
|
2021-08-18 18:21:52 +01:00
|
|
|
} else {
|
|
|
|
_, err = reporter.containment.Delete(ctx, pendingAudit.NodeID)
|
|
|
|
if err != nil && !ErrContainedNotFound.Has(err) {
|
|
|
|
errlist.Add(err)
|
|
|
|
}
|
2019-07-31 18:21:06 +01:00
|
|
|
}
|
|
|
|
}
|
2020-03-09 15:35:54 +00:00
|
|
|
}
|
2019-07-31 18:21:06 +01:00
|
|
|
|
2020-03-09 15:35:54 +00:00
|
|
|
if len(failed) > 0 {
|
|
|
|
for _, v := range failed {
|
2021-06-11 15:34:46 +01:00
|
|
|
reporter.log.Debug("failed to record Pending Nodes ",
|
|
|
|
zap.Stringer("NodeID", v.NodeID),
|
|
|
|
zap.String("Segment StreamID", v.StreamID.String()),
|
|
|
|
zap.Uint64("Segment Position", v.Position.Encode()))
|
2019-07-30 17:03:25 +01:00
|
|
|
}
|
2020-03-09 15:35:54 +00:00
|
|
|
return failed, errs.Combine(Error.New("failed to record some pending audits"), errlist.Err())
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
|
|
|
return nil, nil
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|