satellite/{audit,overlay,satellitedb}: enable reporting offline audits
- Remove the flag for switching off offline audit reporting.
- Change the overlay method used from UpdateUptime to BatchUpdateStats, as this is where the new online scoring is done.
- Add a new overlay.AuditOutcome type: AuditOffline. Since we now use the same method to record offline audits as we do for success, failure, and unknown audits, we need to distinguish offline audits from the rest.

Change-Id: Iadcfe10cf13466fa1a1c2dc542db8994a6423355
This commit is contained in:
parent
2fbb4095b2
commit
bb7be23115
@ -13,9 +13,6 @@ import (
|
||||
"storj.io/storj/satellite/overlay"
|
||||
)
|
||||
|
||||
// We do not report offline nodes to the overlay at this time; see V3-3025.
|
||||
const reportOfflineDuringAudit = false
|
||||
|
||||
// Reporter records audit reports in overlay and implements the reporter interface
|
||||
//
|
||||
// architecture: Service
|
||||
@ -94,8 +91,7 @@ func (reporter *Reporter) RecordAudits(ctx context.Context, req Report, path sto
|
||||
errlist.Add(err)
|
||||
}
|
||||
}
|
||||
// We do not report offline nodes to the overlay at this time; see V3-3025.
|
||||
if len(offlines) > 0 && reportOfflineDuringAudit {
|
||||
if len(offlines) > 0 {
|
||||
offlines, err = reporter.recordOfflineStatus(ctx, offlines)
|
||||
if err != nil {
|
||||
errlist.Add(err)
|
||||
@ -165,22 +161,23 @@ func (reporter *Reporter) recordAuditUnknownStatus(ctx context.Context, unknownA
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// recordOfflineStatus updates nodeIDs in overlay with isup=false. When there
|
||||
// is any error the function return the list of nodes which haven't been
|
||||
// recorded.
|
||||
// recordOfflineStatus updates nodeIDs in overlay with isup=false, auditoutcome=offline.
|
||||
func (reporter *Reporter) recordOfflineStatus(ctx context.Context, offlineNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
||||
defer mon.Task()(&ctx)(&err)
|
||||
var errlist errs.Group
|
||||
for _, nodeID := range offlineNodeIDs {
|
||||
_, err := reporter.overlay.UpdateUptime(ctx, nodeID, false)
|
||||
if err != nil {
|
||||
failed = append(failed, nodeID)
|
||||
errlist.Add(err)
|
||||
|
||||
updateRequests := make([]*overlay.UpdateRequest, len(offlineNodeIDs))
|
||||
for i, nodeID := range offlineNodeIDs {
|
||||
updateRequests[i] = &overlay.UpdateRequest{
|
||||
NodeID: nodeID,
|
||||
IsUp: false,
|
||||
AuditOutcome: overlay.AuditOffline,
|
||||
}
|
||||
}
|
||||
if len(failed) > 0 {
|
||||
|
||||
failed, err = reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
||||
if err != nil || len(failed) > 0 {
|
||||
reporter.log.Debug("failed to record Offline Nodes ", zap.Strings("NodeIDs", failed.Strings()))
|
||||
return failed, errs.Combine(Error.New("failed to record some audit offline statuses in overlay"), errlist.Err())
|
||||
return failed, errs.Combine(Error.New("failed to record some audit offline statuses in overlay"), err)
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
|
@ -242,3 +242,29 @@ func TestGracefullyExitedNotUpdated(t *testing.T) {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestReportOfflineAudits(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
|
||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||
satellite := planet.Satellites[0]
|
||||
node := planet.StorageNodes[0]
|
||||
audits := satellite.Audit
|
||||
audits.Worker.Loop.Pause()
|
||||
cache := satellite.Overlay.DB
|
||||
|
||||
_, err := audits.Reporter.RecordAudits(ctx, audit.Report{Offlines: storj.NodeIDList{node.ID()}}, "")
|
||||
require.NoError(t, err)
|
||||
|
||||
d, err := cache.Get(ctx, node.ID())
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, int64(1), d.Reputation.AuditCount)
|
||||
|
||||
// check that other reputation stats were not incorrectly updated by offline audit
|
||||
require.EqualValues(t, 0, d.Reputation.AuditSuccessCount)
|
||||
require.EqualValues(t, 1, d.Reputation.AuditReputationAlpha)
|
||||
require.EqualValues(t, 0, d.Reputation.AuditReputationBeta)
|
||||
require.EqualValues(t, 1, d.Reputation.UnknownAuditReputationAlpha)
|
||||
require.EqualValues(t, 0, d.Reputation.UnknownAuditReputationBeta)
|
||||
})
|
||||
}
|
||||
|
@ -155,6 +155,8 @@ const (
|
||||
AuditFailure
|
||||
// AuditUnknown represents an audit that resulted in an unknown error from the node.
|
||||
AuditUnknown
|
||||
// AuditOffline represents an audit where a node was offline.
|
||||
AuditOffline
|
||||
)
|
||||
|
||||
// UpdateRequest is used to update a node status.
|
||||
|
@ -247,10 +247,10 @@ func TestAuditSuspendDQDisabled(t *testing.T) {
|
||||
require.Nil(t, n.UnknownAuditSuspended)
|
||||
require.NotNil(t, n.Disqualified)
|
||||
|
||||
// offline node should still be suspended but not disqualified
|
||||
// offline node should not be suspended or disqualified
|
||||
n, err = oc.Get(ctx, offlineNodeID)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, n.UnknownAuditSuspended)
|
||||
require.Nil(t, n.UnknownAuditSuspended)
|
||||
require.Nil(t, n.Disqualified)
|
||||
|
||||
// unknown node should still be suspended but not disqualified
|
||||
@ -342,7 +342,7 @@ func TestOfflineSuspend(t *testing.T) {
|
||||
|
||||
updateReq := &overlay.UpdateRequest{
|
||||
NodeID: nodeID,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
AuditOutcome: overlay.AuditOffline,
|
||||
IsUp: false,
|
||||
AuditHistory: overlay.AuditHistoryConfig{
|
||||
WindowSize: time.Hour,
|
||||
|
@ -1246,8 +1246,11 @@ func (cache *overlaycache) populateUpdateNodeStats(dbNode *dbx.Node, updateReq *
|
||||
updateReq.AuditWeight,
|
||||
totalAuditCount,
|
||||
)
|
||||
|
||||
case overlay.AuditOffline:
|
||||
// for audit offline, only update total audit count
|
||||
updatedTotalAuditCount = totalAuditCount + 1
|
||||
}
|
||||
|
||||
mon.FloatVal("audit_reputation_alpha").Observe(auditAlpha) //mon:locked
|
||||
mon.FloatVal("audit_reputation_beta").Observe(auditBeta) //mon:locked
|
||||
mon.FloatVal("unknown_audit_reputation_alpha").Observe(unknownAuditAlpha) //mon:locked
|
||||
|
Loading…
Reference in New Issue
Block a user