8ce619706b
Currently, pending audit is finding segment by segment location (path) because we want to move audit to segmentloop and we will have only StreamID and Position we need to add columns for those fields. Altering existing table can cause issues while migration and deployment. Cleaner choise is to make new table. This change contains migration with new segment_pending_audit table that will replace pending_audits table and adjustments to use new table in the code. Table pending_audits will be dropped with next release. Change-Id: Id507e29c152da594bac1fd812c78d7ecf45ec51f
257 lines
8.2 KiB
Go
257 lines
8.2 KiB
Go
// Copyright (C) 2019 Storj Labs, Inc.
|
|
// See LICENSE for copying information.
|
|
|
|
package audit
|
|
|
|
import (
|
|
"context"
|
|
|
|
"github.com/zeebo/errs"
|
|
"go.uber.org/zap"
|
|
|
|
"storj.io/common/storj"
|
|
"storj.io/storj/satellite/overlay"
|
|
)
|
|
|
|
// Reporter records audit reports in overlay and implements the reporter interface.
|
|
//
|
|
// architecture: Service
|
|
type Reporter struct {
|
|
log *zap.Logger
|
|
overlay *overlay.Service
|
|
containment Containment
|
|
maxRetries int
|
|
maxReverifyCount int32
|
|
}
|
|
|
|
// Report contains audit result lists for nodes that succeeded, failed, were offline, have pending audits, or failed for unknown reasons.
|
|
type Report struct {
|
|
Successes storj.NodeIDList
|
|
Fails storj.NodeIDList
|
|
Offlines storj.NodeIDList
|
|
PendingAudits []*PendingAudit
|
|
Unknown storj.NodeIDList
|
|
}
|
|
|
|
// NewReporter instantiates a reporter.
|
|
func NewReporter(log *zap.Logger, overlay *overlay.Service, containment Containment, maxRetries int, maxReverifyCount int32) *Reporter {
|
|
return &Reporter{
|
|
log: log,
|
|
overlay: overlay,
|
|
containment: containment,
|
|
maxRetries: maxRetries,
|
|
maxReverifyCount: maxReverifyCount}
|
|
}
|
|
|
|
// RecordAudits saves audit results to overlay. When no error, it returns
|
|
// nil for both return values, otherwise it returns the report with the fields
|
|
// set to the values which have been saved and the error.
|
|
func (reporter *Reporter) RecordAudits(ctx context.Context, req Report) (_ Report, err error) {
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
successes := req.Successes
|
|
fails := req.Fails
|
|
unknowns := req.Unknown
|
|
offlines := req.Offlines
|
|
pendingAudits := req.PendingAudits
|
|
|
|
reporter.log.Debug("Reporting audits",
|
|
zap.Int("successes", len(successes)),
|
|
zap.Int("failures", len(fails)),
|
|
zap.Int("unknowns", len(unknowns)),
|
|
zap.Int("offlines", len(offlines)),
|
|
zap.Int("pending", len(pendingAudits)),
|
|
)
|
|
|
|
var errlist errs.Group
|
|
|
|
tries := 0
|
|
for tries <= reporter.maxRetries {
|
|
if len(successes) == 0 && len(fails) == 0 && len(unknowns) == 0 && len(offlines) == 0 && len(pendingAudits) == 0 {
|
|
return Report{}, nil
|
|
}
|
|
|
|
errlist = errs.Group{}
|
|
|
|
if len(successes) > 0 {
|
|
successes, err = reporter.recordAuditSuccessStatus(ctx, successes)
|
|
if err != nil {
|
|
errlist.Add(err)
|
|
}
|
|
}
|
|
if len(fails) > 0 {
|
|
fails, err = reporter.recordAuditFailStatus(ctx, fails)
|
|
if err != nil {
|
|
errlist.Add(err)
|
|
}
|
|
}
|
|
if len(unknowns) > 0 {
|
|
unknowns, err = reporter.recordAuditUnknownStatus(ctx, unknowns)
|
|
if err != nil {
|
|
errlist.Add(err)
|
|
}
|
|
}
|
|
if len(offlines) > 0 {
|
|
offlines, err = reporter.recordOfflineStatus(ctx, offlines)
|
|
if err != nil {
|
|
errlist.Add(err)
|
|
}
|
|
}
|
|
if len(pendingAudits) > 0 {
|
|
pendingAudits, err = reporter.recordPendingAudits(ctx, pendingAudits)
|
|
if err != nil {
|
|
errlist.Add(err)
|
|
}
|
|
}
|
|
|
|
tries++
|
|
}
|
|
|
|
err = errlist.Err()
|
|
if tries >= reporter.maxRetries && err != nil {
|
|
return Report{
|
|
Successes: successes,
|
|
Fails: fails,
|
|
Offlines: offlines,
|
|
Unknown: unknowns,
|
|
PendingAudits: pendingAudits,
|
|
}, errs.Combine(Error.New("some nodes failed to be updated in overlay"), err)
|
|
}
|
|
return Report{}, nil
|
|
}
|
|
|
|
// recordAuditFailStatus updates nodeIDs in overlay with isup=true, auditoutcome=fail.
|
|
func (reporter *Reporter) recordAuditFailStatus(ctx context.Context, failedAuditNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
updateRequests := make([]*overlay.UpdateRequest, len(failedAuditNodeIDs))
|
|
for i, nodeID := range failedAuditNodeIDs {
|
|
updateRequests[i] = &overlay.UpdateRequest{
|
|
NodeID: nodeID,
|
|
AuditOutcome: overlay.AuditFailure,
|
|
}
|
|
}
|
|
failed, err = reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
|
if err != nil || len(failed) > 0 {
|
|
reporter.log.Debug("failed to record Failed Nodes ", zap.Strings("NodeIDs", failed.Strings()))
|
|
return failed, errs.Combine(Error.New("failed to record some audit fail statuses in overlay"), err)
|
|
}
|
|
return nil, nil
|
|
}
|
|
|
|
// recordAuditUnknownStatus updates nodeIDs in overlay with isup=true, auditoutcome=unknown.
|
|
func (reporter *Reporter) recordAuditUnknownStatus(ctx context.Context, unknownAuditNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
updateRequests := make([]*overlay.UpdateRequest, len(unknownAuditNodeIDs))
|
|
for i, nodeID := range unknownAuditNodeIDs {
|
|
updateRequests[i] = &overlay.UpdateRequest{
|
|
NodeID: nodeID,
|
|
AuditOutcome: overlay.AuditUnknown,
|
|
}
|
|
}
|
|
|
|
failed, err = reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
|
if err != nil || len(failed) > 0 {
|
|
reporter.log.Debug("failed to record Unknown Nodes ", zap.Strings("NodeIDs", failed.Strings()))
|
|
return failed, errs.Combine(Error.New("failed to record some audit unknown statuses in overlay"), err)
|
|
}
|
|
return nil, nil
|
|
}
|
|
|
|
// recordOfflineStatus updates nodeIDs in overlay with isup=false, auditoutcome=offline.
|
|
func (reporter *Reporter) recordOfflineStatus(ctx context.Context, offlineNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
updateRequests := make([]*overlay.UpdateRequest, len(offlineNodeIDs))
|
|
for i, nodeID := range offlineNodeIDs {
|
|
updateRequests[i] = &overlay.UpdateRequest{
|
|
NodeID: nodeID,
|
|
AuditOutcome: overlay.AuditOffline,
|
|
}
|
|
}
|
|
|
|
failed, err = reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
|
if err != nil || len(failed) > 0 {
|
|
reporter.log.Debug("failed to record Offline Nodes ", zap.Strings("NodeIDs", failed.Strings()))
|
|
return failed, errs.Combine(Error.New("failed to record some audit offline statuses in overlay"), err)
|
|
}
|
|
|
|
return nil, nil
|
|
}
|
|
|
|
// recordAuditSuccessStatus updates nodeIDs in overlay with isup=true, auditoutcome=success.
|
|
func (reporter *Reporter) recordAuditSuccessStatus(ctx context.Context, successNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
updateRequests := make([]*overlay.UpdateRequest, len(successNodeIDs))
|
|
for i, nodeID := range successNodeIDs {
|
|
updateRequests[i] = &overlay.UpdateRequest{
|
|
NodeID: nodeID,
|
|
AuditOutcome: overlay.AuditSuccess,
|
|
}
|
|
}
|
|
|
|
failed, err = reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
|
if err != nil || len(failed) > 0 {
|
|
reporter.log.Debug("failed to record Success Nodes ", zap.Strings("NodeIDs", failed.Strings()))
|
|
return failed, errs.Combine(Error.New("failed to record some audit success statuses in overlay"), err)
|
|
}
|
|
return nil, nil
|
|
}
|
|
|
|
// recordPendingAudits updates the containment status of nodes with pending audits.
|
|
func (reporter *Reporter) recordPendingAudits(ctx context.Context, pendingAudits []*PendingAudit) (failed []*PendingAudit, err error) {
|
|
defer mon.Task()(&ctx)(&err)
|
|
var errlist errs.Group
|
|
|
|
var updateRequests []*overlay.UpdateRequest
|
|
for _, pendingAudit := range pendingAudits {
|
|
if pendingAudit.ReverifyCount < reporter.maxReverifyCount {
|
|
err := reporter.containment.IncrementPending(ctx, pendingAudit)
|
|
if err != nil {
|
|
failed = append(failed, pendingAudit)
|
|
errlist.Add(err)
|
|
}
|
|
reporter.log.Info("Audit pending",
|
|
zap.Stringer("Piece ID", pendingAudit.PieceID),
|
|
zap.Stringer("Node ID", pendingAudit.NodeID))
|
|
} else {
|
|
// record failure -- max reverify count reached
|
|
reporter.log.Info("max reverify count reached (audit failed)", zap.Stringer("Node ID", pendingAudit.NodeID))
|
|
updateRequests = append(updateRequests, &overlay.UpdateRequest{
|
|
NodeID: pendingAudit.NodeID,
|
|
AuditOutcome: overlay.AuditFailure,
|
|
})
|
|
}
|
|
}
|
|
|
|
failedBatch, err := reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
|
if err != nil {
|
|
errlist.Add(err)
|
|
}
|
|
if len(failedBatch) > 0 {
|
|
pendingMap := make(map[storj.NodeID]*PendingAudit)
|
|
for _, pendingAudit := range pendingAudits {
|
|
pendingMap[pendingAudit.NodeID] = pendingAudit
|
|
}
|
|
for _, nodeID := range failedBatch {
|
|
pending, ok := pendingMap[nodeID]
|
|
if ok {
|
|
failed = append(failed, pending)
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(failed) > 0 {
|
|
for _, v := range failed {
|
|
reporter.log.Debug("failed to record Pending Nodes ",
|
|
zap.Stringer("NodeID", v.NodeID),
|
|
zap.String("Segment StreamID", v.StreamID.String()),
|
|
zap.Uint64("Segment Position", v.Position.Encode()))
|
|
}
|
|
return failed, errs.Combine(Error.New("failed to record some pending audits"), errlist.Err())
|
|
}
|
|
return nil, nil
|
|
}
|