satellite/{audit,overlay,satellitedb}: enable reporting offline audits

- Remove flag for switching off offline audit reporting.
- Change the overlay method used from UpdateUptime to BatchUpdateStats, as this
is where the new online scoring is done.
- Add a new overlay.AuditOutcome type: AuditOffline. Since we now record offline
audits with the same method that records success, failure, and unknown audits,
we need to distinguish offline audits from the rest.

Change-Id: Iadcfe10cf13466fa1a1c2dc542db8994a6423355
This commit is contained in:
Cameron Ayer 2020-10-22 17:02:48 -04:00 committed by jens
parent 2fbb4095b2
commit bb7be23115
5 changed files with 48 additions and 20 deletions

View File

@ -13,9 +13,6 @@ import (
"storj.io/storj/satellite/overlay"
)
// We do not report offline nodes to the overlay at this time; see V3-3025.
const reportOfflineDuringAudit = false
// Reporter records audit reports in overlay and implements the reporter interface
//
// architecture: Service
@ -94,8 +91,7 @@ func (reporter *Reporter) RecordAudits(ctx context.Context, req Report, path sto
errlist.Add(err)
}
}
// We do not report offline nodes to the overlay at this time; see V3-3025.
if len(offlines) > 0 && reportOfflineDuringAudit {
if len(offlines) > 0 {
offlines, err = reporter.recordOfflineStatus(ctx, offlines)
if err != nil {
errlist.Add(err)
@ -165,22 +161,23 @@ func (reporter *Reporter) recordAuditUnknownStatus(ctx context.Context, unknownA
return nil, nil
}
// recordOfflineStatus updates nodeIDs in overlay with isup=false. When there
// is any error the function return the list of nodes which haven't been
// recorded.
// recordOfflineStatus updates nodeIDs in overlay with isup=false, auditoutcome=offline.
func (reporter *Reporter) recordOfflineStatus(ctx context.Context, offlineNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
defer mon.Task()(&ctx)(&err)
var errlist errs.Group
for _, nodeID := range offlineNodeIDs {
_, err := reporter.overlay.UpdateUptime(ctx, nodeID, false)
if err != nil {
failed = append(failed, nodeID)
errlist.Add(err)
updateRequests := make([]*overlay.UpdateRequest, len(offlineNodeIDs))
for i, nodeID := range offlineNodeIDs {
updateRequests[i] = &overlay.UpdateRequest{
NodeID: nodeID,
IsUp: false,
AuditOutcome: overlay.AuditOffline,
}
}
if len(failed) > 0 {
failed, err = reporter.overlay.BatchUpdateStats(ctx, updateRequests)
if err != nil || len(failed) > 0 {
reporter.log.Debug("failed to record Offline Nodes ", zap.Strings("NodeIDs", failed.Strings()))
return failed, errs.Combine(Error.New("failed to record some audit offline statuses in overlay"), errlist.Err())
return failed, errs.Combine(Error.New("failed to record some audit offline statuses in overlay"), err)
}
return nil, nil

View File

@ -242,3 +242,29 @@ func TestGracefullyExitedNotUpdated(t *testing.T) {
}
})
}
// TestReportOfflineAudits submits an audit report that lists a single storage
// node as offline and then reads the node back from the overlay. It expects the
// total audit count to be 1 while the success count and the audit/unknown-audit
// reputation alpha and beta values stay at 0, 1, 0, 1 and 0 respectively.
func TestReportOfflineAudits(t *testing.T) {
	config := testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 1,
		UplinkCount:      0,
	}

	testplanet.Run(t, config, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		sat := planet.Satellites[0]
		offlineNode := planet.StorageNodes[0]

		// Pause the audit worker loop before recording the report by hand.
		sat.Audit.Worker.Loop.Pause()

		overlayDB := sat.Overlay.DB

		report := audit.Report{Offlines: storj.NodeIDList{offlineNode.ID()}}
		_, err := sat.Audit.Reporter.RecordAudits(ctx, report, "")
		require.NoError(t, err)

		dossier, err := overlayDB.Get(ctx, offlineNode.ID())
		require.NoError(t, err)

		// The offline audit is counted in the total...
		require.Equal(t, int64(1), dossier.Reputation.AuditCount)

		// ...but must leave every other reputation stat untouched.
		require.EqualValues(t, 0, dossier.Reputation.AuditSuccessCount)
		require.EqualValues(t, 1, dossier.Reputation.AuditReputationAlpha)
		require.EqualValues(t, 0, dossier.Reputation.AuditReputationBeta)
		require.EqualValues(t, 1, dossier.Reputation.UnknownAuditReputationAlpha)
		require.EqualValues(t, 0, dossier.Reputation.UnknownAuditReputationBeta)
	})
}

View File

@ -155,6 +155,8 @@ const (
AuditFailure
// AuditUnknown represents an audit that resulted in an unknown error from the node.
AuditUnknown
// AuditOffline represents an audit where a node was offline.
AuditOffline
)
// UpdateRequest is used to update a node status.

View File

@ -247,10 +247,10 @@ func TestAuditSuspendDQDisabled(t *testing.T) {
require.Nil(t, n.UnknownAuditSuspended)
require.NotNil(t, n.Disqualified)
// offline node should still be suspended but not disqualified
// offline node should not be suspended or disqualified
n, err = oc.Get(ctx, offlineNodeID)
require.NoError(t, err)
require.NotNil(t, n.UnknownAuditSuspended)
require.Nil(t, n.UnknownAuditSuspended)
require.Nil(t, n.Disqualified)
// unknown node should still be suspended but not disqualified
@ -342,7 +342,7 @@ func TestOfflineSuspend(t *testing.T) {
updateReq := &overlay.UpdateRequest{
NodeID: nodeID,
AuditOutcome: overlay.AuditSuccess,
AuditOutcome: overlay.AuditOffline,
IsUp: false,
AuditHistory: overlay.AuditHistoryConfig{
WindowSize: time.Hour,

View File

@ -1246,8 +1246,11 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq *
updateReq.AuditWeight,
totalAuditCount,
)
case overlay.AuditOffline:
// for audit offline, only update total audit count
updatedTotalAuditCount = totalAuditCount + 1
}
mon.FloatVal("audit_reputation_alpha").Observe(auditAlpha) //mon:locked
mon.FloatVal("audit_reputation_beta").Observe(auditBeta) //mon:locked
mon.FloatVal("unknown_audit_reputation_alpha").Observe(unknownAuditAlpha) //mon:locked