2019-01-24 20:15:10 +00:00
|
|
|
// Copyright (C) 2019 Storj Labs, Inc.
|
2018-10-16 18:40:34 +01:00
|
|
|
// See LICENSE for copying information.
|
|
|
|
|
|
|
|
package audit
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
|
2019-05-23 21:07:19 +01:00
|
|
|
"github.com/zeebo/errs"
|
2019-06-07 13:38:41 +01:00
|
|
|
"go.uber.org/zap"
|
2019-05-23 21:07:19 +01:00
|
|
|
|
2018-11-29 18:39:27 +00:00
|
|
|
"storj.io/storj/pkg/storj"
|
2019-07-28 06:55:36 +01:00
|
|
|
"storj.io/storj/satellite/overlay"
|
2018-10-16 18:40:34 +01:00
|
|
|
)
|
|
|
|
|
2019-12-18 10:51:24 +00:00
|
|
|
// reportOfflineDuringAudit gates whether RecordAudits forwards offline nodes
// to the overlay. Reporting them is disabled at this time; see V3-3025.
const reportOfflineDuringAudit = false
|
|
|
|
|
2019-03-25 22:25:09 +00:00
|
|
|
// Reporter records audit reports in overlay and implements the reporter interface
|
2019-09-10 14:24:16 +01:00
|
|
|
//
|
|
|
|
// architecture: Service
|
2018-10-16 18:40:34 +01:00
|
|
|
type Reporter struct {
|
2019-06-07 13:38:41 +01:00
|
|
|
log *zap.Logger
|
2019-08-06 17:35:59 +01:00
|
|
|
overlay *overlay.Service
|
2019-05-31 16:23:00 +01:00
|
|
|
containment Containment
|
|
|
|
maxRetries int
|
|
|
|
maxReverifyCount int32
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2019-11-19 16:30:28 +00:00
|
|
|
// Report contains audit result lists for nodes that succeeded, failed, were offline, have pending audits, or failed for unknown reasons
|
2019-05-23 23:32:19 +01:00
|
|
|
type Report struct {
|
|
|
|
Successes storj.NodeIDList
|
|
|
|
Fails storj.NodeIDList
|
|
|
|
Offlines storj.NodeIDList
|
|
|
|
PendingAudits []*PendingAudit
|
2019-11-19 16:30:28 +00:00
|
|
|
Unknown storj.NodeIDList
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
|
|
|
|
2018-10-16 18:40:34 +01:00
|
|
|
// NewReporter instantiates a reporter
|
2019-08-06 17:35:59 +01:00
|
|
|
func NewReporter(log *zap.Logger, overlay *overlay.Service, containment Containment, maxRetries int, maxReverifyCount int32) *Reporter {
|
2019-06-07 13:38:41 +01:00
|
|
|
return &Reporter{
|
|
|
|
log: log,
|
|
|
|
overlay: overlay,
|
|
|
|
containment: containment,
|
|
|
|
maxRetries: maxRetries,
|
|
|
|
maxReverifyCount: maxReverifyCount}
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2019-08-06 17:35:59 +01:00
|
|
|
// RecordAudits saves audit results to overlay. When no error, it returns
|
2019-06-25 10:23:41 +01:00
|
|
|
// nil for both return values, otherwise it returns the report with the fields
|
|
|
|
// set to the values which have been saved and the error.
|
2019-10-16 12:48:05 +01:00
|
|
|
func (reporter *Reporter) RecordAudits(ctx context.Context, req Report, path storj.Path) (_ Report, err error) {
|
2019-06-04 12:36:27 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-05-27 12:13:47 +01:00
|
|
|
|
2019-05-23 23:32:19 +01:00
|
|
|
successes := req.Successes
|
|
|
|
fails := req.Fails
|
|
|
|
offlines := req.Offlines
|
2019-05-23 21:07:19 +01:00
|
|
|
pendingAudits := req.PendingAudits
|
2018-12-19 18:44:03 +00:00
|
|
|
|
2019-06-07 13:38:41 +01:00
|
|
|
reporter.log.Debug("Reporting audits",
|
|
|
|
zap.Int("successes", len(successes)),
|
|
|
|
zap.Int("failures", len(fails)),
|
|
|
|
zap.Int("offlines", len(offlines)),
|
|
|
|
zap.Int("pending", len(pendingAudits)),
|
2019-10-16 12:48:05 +01:00
|
|
|
zap.Binary("Segment", []byte(path)),
|
|
|
|
zap.String("Segment Path", path),
|
2019-06-07 13:38:41 +01:00
|
|
|
)
|
|
|
|
|
2019-05-23 21:07:19 +01:00
|
|
|
var errlist errs.Group
|
2018-12-19 18:44:03 +00:00
|
|
|
|
2019-06-21 16:10:03 +01:00
|
|
|
tries := 0
|
|
|
|
for tries <= reporter.maxRetries {
|
2019-06-07 13:38:41 +01:00
|
|
|
if len(successes) == 0 && len(fails) == 0 && len(offlines) == 0 && len(pendingAudits) == 0 {
|
2019-10-09 15:06:58 +01:00
|
|
|
return Report{}, nil
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
2018-12-19 18:44:03 +00:00
|
|
|
|
2019-05-23 21:07:19 +01:00
|
|
|
errlist = errs.Group{}
|
2018-12-19 18:44:03 +00:00
|
|
|
|
2019-05-23 23:32:19 +01:00
|
|
|
if len(successes) > 0 {
|
|
|
|
successes, err = reporter.recordAuditSuccessStatus(ctx, successes)
|
2018-12-19 18:44:03 +00:00
|
|
|
if err != nil {
|
2019-05-23 21:07:19 +01:00
|
|
|
errlist.Add(err)
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
|
|
|
}
|
2019-05-23 23:32:19 +01:00
|
|
|
if len(fails) > 0 {
|
|
|
|
fails, err = reporter.recordAuditFailStatus(ctx, fails)
|
2018-12-19 18:44:03 +00:00
|
|
|
if err != nil {
|
2019-05-23 21:07:19 +01:00
|
|
|
errlist.Add(err)
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
|
|
|
}
|
2019-12-18 10:51:24 +00:00
|
|
|
// We do not report offline nodes to the overlay at this time; see V3-3025.
|
|
|
|
if len(offlines) > 0 && reportOfflineDuringAudit {
|
2019-05-23 23:32:19 +01:00
|
|
|
offlines, err = reporter.recordOfflineStatus(ctx, offlines)
|
2018-12-19 18:44:03 +00:00
|
|
|
if err != nil {
|
2019-05-23 21:07:19 +01:00
|
|
|
errlist.Add(err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if len(pendingAudits) > 0 {
|
|
|
|
pendingAudits, err = reporter.recordPendingAudits(ctx, pendingAudits)
|
|
|
|
if err != nil {
|
|
|
|
errlist.Add(err)
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-21 16:10:03 +01:00
|
|
|
tries++
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
2019-05-23 21:07:19 +01:00
|
|
|
|
|
|
|
err = errlist.Err()
|
2019-06-21 16:10:03 +01:00
|
|
|
if tries >= reporter.maxRetries && err != nil {
|
2019-10-09 15:06:58 +01:00
|
|
|
return Report{
|
2019-05-23 23:32:19 +01:00
|
|
|
Successes: successes,
|
|
|
|
Fails: fails,
|
|
|
|
Offlines: offlines,
|
|
|
|
PendingAudits: pendingAudits,
|
2019-05-23 21:07:19 +01:00
|
|
|
}, errs.Combine(Error.New("some nodes failed to be updated in overlay"), err)
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
2019-10-09 15:06:58 +01:00
|
|
|
return Report{}, nil
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2019-03-25 22:25:09 +00:00
|
|
|
// recordAuditFailStatus updates nodeIDs in overlay with isup=true, auditsuccess=false
|
2018-12-19 18:44:03 +00:00
|
|
|
func (reporter *Reporter) recordAuditFailStatus(ctx context.Context, failedAuditNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
2019-06-04 12:36:27 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-07-31 18:21:06 +01:00
|
|
|
|
|
|
|
updateRequests := make([]*overlay.UpdateRequest, len(failedAuditNodeIDs))
|
|
|
|
for i, nodeID := range failedAuditNodeIDs {
|
|
|
|
updateRequests[i] = &overlay.UpdateRequest{
|
2018-12-19 18:44:03 +00:00
|
|
|
NodeID: nodeID,
|
|
|
|
IsUp: true,
|
|
|
|
AuditSuccess: false,
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
}
|
2019-07-31 18:21:06 +01:00
|
|
|
if len(updateRequests) > 0 {
|
|
|
|
failed, err = reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
|
|
|
if err != nil || len(failed) > 0 {
|
|
|
|
reporter.log.Debug("failed to record Failed Nodes ", zap.Strings("NodeIDs", failed.Strings()))
|
|
|
|
return failed, errs.Combine(Error.New("failed to record some audit fail statuses in overlay"), err)
|
|
|
|
}
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
|
|
|
return nil, nil
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2019-08-22 13:13:43 +01:00
|
|
|
// recordOfflineStatus updates nodeIDs in overlay with isup=false. When there
|
|
|
|
// is any error the function return the list of nodes which haven't been
|
|
|
|
// recorded.
|
2018-12-19 18:44:03 +00:00
|
|
|
func (reporter *Reporter) recordOfflineStatus(ctx context.Context, offlineNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
2019-06-04 12:36:27 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-05-23 21:07:19 +01:00
|
|
|
var errlist errs.Group
|
2018-12-19 18:44:03 +00:00
|
|
|
for _, nodeID := range offlineNodeIDs {
|
2019-03-25 22:25:09 +00:00
|
|
|
_, err := reporter.overlay.UpdateUptime(ctx, nodeID, false)
|
2018-12-19 18:44:03 +00:00
|
|
|
if err != nil {
|
2019-05-23 21:07:19 +01:00
|
|
|
failed = append(failed, nodeID)
|
|
|
|
errlist.Add(err)
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
}
|
2019-05-23 21:07:19 +01:00
|
|
|
if len(failed) > 0 {
|
2019-07-30 17:03:25 +01:00
|
|
|
reporter.log.Debug("failed to record Offline Nodes ", zap.Strings("NodeIDs", failed.Strings()))
|
2019-05-23 21:07:19 +01:00
|
|
|
return failed, errs.Combine(Error.New("failed to record some audit offline statuses in overlay"), errlist.Err())
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
2019-07-31 18:21:06 +01:00
|
|
|
|
2018-12-19 18:44:03 +00:00
|
|
|
return nil, nil
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
|
2019-03-25 22:25:09 +00:00
|
|
|
// recordAuditSuccessStatus updates nodeIDs in overlay with isup=true, auditsuccess=true
|
2018-12-19 18:44:03 +00:00
|
|
|
func (reporter *Reporter) recordAuditSuccessStatus(ctx context.Context, successNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
2019-06-04 12:36:27 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-07-31 18:21:06 +01:00
|
|
|
|
|
|
|
updateRequests := make([]*overlay.UpdateRequest, len(successNodeIDs))
|
|
|
|
for i, nodeID := range successNodeIDs {
|
|
|
|
updateRequests[i] = &overlay.UpdateRequest{
|
2018-12-19 18:44:03 +00:00
|
|
|
NodeID: nodeID,
|
|
|
|
IsUp: true,
|
|
|
|
AuditSuccess: true,
|
2019-05-23 21:07:19 +01:00
|
|
|
}
|
|
|
|
}
|
2019-07-31 18:21:06 +01:00
|
|
|
|
|
|
|
if len(updateRequests) > 0 {
|
|
|
|
failed, err = reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
|
|
|
if err != nil || len(failed) > 0 {
|
|
|
|
reporter.log.Debug("failed to record Success Nodes ", zap.Strings("NodeIDs", failed.Strings()))
|
|
|
|
return failed, errs.Combine(Error.New("failed to record some audit success statuses in overlay"), err)
|
|
|
|
}
|
2019-05-23 21:07:19 +01:00
|
|
|
}
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// recordPendingAudits updates the containment status of nodes with pending audits
|
|
|
|
func (reporter *Reporter) recordPendingAudits(ctx context.Context, pendingAudits []*PendingAudit) (failed []*PendingAudit, err error) {
|
2019-06-04 12:36:27 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-05-23 21:07:19 +01:00
|
|
|
var errlist errs.Group
|
2019-07-31 18:21:06 +01:00
|
|
|
|
|
|
|
var updateRequests []*overlay.UpdateRequest
|
2019-05-23 21:07:19 +01:00
|
|
|
for _, pendingAudit := range pendingAudits {
|
2019-05-31 16:23:00 +01:00
|
|
|
if pendingAudit.ReverifyCount < reporter.maxReverifyCount {
|
2019-05-27 12:13:47 +01:00
|
|
|
err := reporter.containment.IncrementPending(ctx, pendingAudit)
|
|
|
|
if err != nil {
|
|
|
|
failed = append(failed, pendingAudit)
|
|
|
|
errlist.Add(err)
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// record failure -- max reverify count reached
|
2019-07-31 18:21:06 +01:00
|
|
|
updateRequests = append(updateRequests, &overlay.UpdateRequest{
|
2019-05-27 12:13:47 +01:00
|
|
|
NodeID: pendingAudit.NodeID,
|
|
|
|
IsUp: true,
|
|
|
|
AuditSuccess: false,
|
|
|
|
})
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|
|
|
|
}
|
2019-07-31 18:21:06 +01:00
|
|
|
|
|
|
|
if len(updateRequests) > 0 {
|
|
|
|
failedBatch, err := reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
|
|
|
if err != nil {
|
|
|
|
errlist.Add(err)
|
|
|
|
}
|
|
|
|
if len(failedBatch) > 0 {
|
|
|
|
pendingMap := make(map[storj.NodeID]*PendingAudit)
|
|
|
|
for _, pendingAudit := range pendingAudits {
|
|
|
|
pendingMap[pendingAudit.NodeID] = pendingAudit
|
|
|
|
}
|
|
|
|
for _, nodeID := range failedBatch {
|
|
|
|
pending, ok := pendingMap[nodeID]
|
|
|
|
if ok {
|
|
|
|
failed = append(failed, pending)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(failed) > 0 {
|
|
|
|
for _, v := range failed {
|
|
|
|
reporter.log.Debug("failed to record Pending Nodes ", zap.Stringer("NodeID", v.NodeID), zap.String("Path", v.Path))
|
|
|
|
}
|
|
|
|
return failed, errs.Combine(Error.New("failed to record some pending audits"), errlist.Err())
|
2019-07-30 17:03:25 +01:00
|
|
|
}
|
2018-12-19 18:44:03 +00:00
|
|
|
}
|
|
|
|
return nil, nil
|
2018-10-16 18:40:34 +01:00
|
|
|
}
|