satellite/{audit,overlay,satellitedb}: implement unknown audit reputation and suspension
* change overlay.UpdateStats to allow a third audit outcome. Now it can handle successful, failed, and unknown audits. * when "unknown audit reputation" (unknownAuditAlpha/(unknownAuditAlpha+unknownAuditBeta)) falls below the DQ threshold, put node into suspension. * when unknown audit reputation goes above the DQ threshold, remove node from suspension. * record unknown audits from audit reporter. * add basic tests around unknown audits and suspension. Change-Id: I125f06f3af52e8a29ba48dc19361821a9ff1daa1
This commit is contained in:
parent
52590197c2
commit
8b72181a1f
@ -89,6 +89,8 @@ storj.io/storj/satellite/repair/repairer."time_for_repair" FloatVal
|
||||
storj.io/storj/satellite/repair/repairer."time_since_checker_queue" FloatVal
|
||||
storj.io/storj/satellite/satellitedb."audit_reputation_alpha" FloatVal
|
||||
storj.io/storj/satellite/satellitedb."audit_reputation_beta" FloatVal
|
||||
storj.io/storj/satellite/satellitedb."unknown_audit_reputation_alpha" FloatVal
|
||||
storj.io/storj/satellite/satellitedb."unknown_audit_reputation_beta" FloatVal
|
||||
storj.io/storj/storage/filestore."open_file_in_trash" Meter
|
||||
storj.io/storj/storagenode/contact."satellite_contact_request" Meter
|
||||
storj.io/storj/storagenode/gracefulexit."satellite_gracefulexit_request" Meter
|
||||
|
@ -218,7 +218,7 @@ func dqNodes(ctx *testcontext.Context, planet *testplanet.Planet) (map[storj.Nod
|
||||
updateRequests = append(updateRequests, &overlay.UpdateRequest{
|
||||
NodeID: n.ID(),
|
||||
IsUp: true,
|
||||
AuditSuccess: false,
|
||||
AuditOutcome: overlay.AuditFailure,
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -227,7 +227,7 @@ func TestDisqualifiedNodeRemainsDisqualified(t *testing.T) {
|
||||
_, err = satellitePeer.DB.OverlayCache().BatchUpdateStats(ctx, []*overlay.UpdateRequest{{
|
||||
NodeID: disqualifiedNode.ID(),
|
||||
IsUp: true,
|
||||
AuditSuccess: true,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
AuditLambda: 0, // forget about history
|
||||
AuditWeight: 1,
|
||||
AuditDQ: 0, // make sure new reputation scores are larger than the DQ thresholds
|
||||
|
@ -54,12 +54,14 @@ func (reporter *Reporter) RecordAudits(ctx context.Context, req Report, path sto
|
||||
|
||||
successes := req.Successes
|
||||
fails := req.Fails
|
||||
unknowns := req.Unknown
|
||||
offlines := req.Offlines
|
||||
pendingAudits := req.PendingAudits
|
||||
|
||||
reporter.log.Debug("Reporting audits",
|
||||
zap.Int("successes", len(successes)),
|
||||
zap.Int("failures", len(fails)),
|
||||
zap.Int("unknowns", len(unknowns)),
|
||||
zap.Int("offlines", len(offlines)),
|
||||
zap.Int("pending", len(pendingAudits)),
|
||||
zap.Binary("Segment", []byte(path)),
|
||||
@ -70,7 +72,7 @@ func (reporter *Reporter) RecordAudits(ctx context.Context, req Report, path sto
|
||||
|
||||
tries := 0
|
||||
for tries <= reporter.maxRetries {
|
||||
if len(successes) == 0 && len(fails) == 0 && len(offlines) == 0 && len(pendingAudits) == 0 {
|
||||
if len(successes) == 0 && len(fails) == 0 && len(unknowns) == 0 && len(offlines) == 0 && len(pendingAudits) == 0 {
|
||||
return Report{}, nil
|
||||
}
|
||||
|
||||
@ -88,6 +90,12 @@ func (reporter *Reporter) RecordAudits(ctx context.Context, req Report, path sto
|
||||
errlist.Add(err)
|
||||
}
|
||||
}
|
||||
if len(unknowns) > 0 {
|
||||
unknowns, err = reporter.recordAuditUnknownStatus(ctx, unknowns)
|
||||
if err != nil {
|
||||
errlist.Add(err)
|
||||
}
|
||||
}
|
||||
// We do not report offline nodes to the overlay at this time; see V3-3025.
|
||||
if len(offlines) > 0 && reportOfflineDuringAudit {
|
||||
offlines, err = reporter.recordOfflineStatus(ctx, offlines)
|
||||
@ -111,13 +119,14 @@ func (reporter *Reporter) RecordAudits(ctx context.Context, req Report, path sto
|
||||
Successes: successes,
|
||||
Fails: fails,
|
||||
Offlines: offlines,
|
||||
Unknown: unknowns,
|
||||
PendingAudits: pendingAudits,
|
||||
}, errs.Combine(Error.New("some nodes failed to be updated in overlay"), err)
|
||||
}
|
||||
return Report{}, nil
|
||||
}
|
||||
|
||||
// recordAuditFailStatus updates nodeIDs in overlay with isup=true, auditsuccess=false
|
||||
// recordAuditFailStatus updates nodeIDs in overlay with isup=true, auditoutcome=fail
|
||||
func (reporter *Reporter) recordAuditFailStatus(ctx context.Context, failedAuditNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
||||
defer mon.Task()(&ctx)(&err)
|
||||
|
||||
@ -126,15 +135,34 @@ func (reporter *Reporter) recordAuditFailStatus(ctx context.Context, failedAudit
|
||||
updateRequests[i] = &overlay.UpdateRequest{
|
||||
NodeID: nodeID,
|
||||
IsUp: true,
|
||||
AuditSuccess: false,
|
||||
AuditOutcome: overlay.AuditFailure,
|
||||
}
|
||||
}
|
||||
if len(updateRequests) > 0 {
|
||||
failed, err = reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
||||
if err != nil || len(failed) > 0 {
|
||||
reporter.log.Debug("failed to record Failed Nodes ", zap.Strings("NodeIDs", failed.Strings()))
|
||||
return failed, errs.Combine(Error.New("failed to record some audit fail statuses in overlay"), err)
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// recordAuditUnknownStatus updates nodeIDs in overlay with isup=true, auditoutcome=unknown
|
||||
func (reporter *Reporter) recordAuditUnknownStatus(ctx context.Context, unknownAuditNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
||||
defer mon.Task()(&ctx)(&err)
|
||||
|
||||
updateRequests := make([]*overlay.UpdateRequest, len(unknownAuditNodeIDs))
|
||||
for i, nodeID := range unknownAuditNodeIDs {
|
||||
updateRequests[i] = &overlay.UpdateRequest{
|
||||
NodeID: nodeID,
|
||||
IsUp: true,
|
||||
AuditOutcome: overlay.AuditUnknown,
|
||||
}
|
||||
}
|
||||
|
||||
failed, err = reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
||||
if err != nil || len(failed) > 0 {
|
||||
reporter.log.Debug("failed to record Unknown Nodes ", zap.Strings("NodeIDs", failed.Strings()))
|
||||
return failed, errs.Combine(Error.New("failed to record some audit unknown statuses in overlay"), err)
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
@ -160,7 +188,7 @@ func (reporter *Reporter) recordOfflineStatus(ctx context.Context, offlineNodeID
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// recordAuditSuccessStatus updates nodeIDs in overlay with isup=true, auditsuccess=true
|
||||
// recordAuditSuccessStatus updates nodeIDs in overlay with isup=true, auditoutcome=success
|
||||
func (reporter *Reporter) recordAuditSuccessStatus(ctx context.Context, successNodeIDs storj.NodeIDList) (failed storj.NodeIDList, err error) {
|
||||
defer mon.Task()(&ctx)(&err)
|
||||
|
||||
@ -169,17 +197,15 @@ func (reporter *Reporter) recordAuditSuccessStatus(ctx context.Context, successN
|
||||
updateRequests[i] = &overlay.UpdateRequest{
|
||||
NodeID: nodeID,
|
||||
IsUp: true,
|
||||
AuditSuccess: true,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
}
|
||||
}
|
||||
|
||||
if len(updateRequests) > 0 {
|
||||
failed, err = reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
||||
if err != nil || len(failed) > 0 {
|
||||
reporter.log.Debug("failed to record Success Nodes ", zap.Strings("NodeIDs", failed.Strings()))
|
||||
return failed, errs.Combine(Error.New("failed to record some audit success statuses in overlay"), err)
|
||||
}
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
@ -201,12 +227,11 @@ func (reporter *Reporter) recordPendingAudits(ctx context.Context, pendingAudits
|
||||
updateRequests = append(updateRequests, &overlay.UpdateRequest{
|
||||
NodeID: pendingAudit.NodeID,
|
||||
IsUp: true,
|
||||
AuditSuccess: false,
|
||||
AuditOutcome: overlay.AuditFailure,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if len(updateRequests) > 0 {
|
||||
failedBatch, err := reporter.overlay.BatchUpdateStats(ctx, updateRequests)
|
||||
if err != nil {
|
||||
errlist.Add(err)
|
||||
@ -230,6 +255,5 @@ func (reporter *Reporter) recordPendingAudits(ctx context.Context, pendingAudits
|
||||
}
|
||||
return failed, errs.Combine(Error.New("failed to record some pending audits"), errlist.Err())
|
||||
}
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
@ -1258,13 +1258,12 @@ func TestReverifyUnknownError(t *testing.T) {
|
||||
require.Len(t, report.Unknown, 1)
|
||||
require.Equal(t, report.Unknown[0], badNode)
|
||||
|
||||
// TODO uncomment this stuff when suspension mode is implemented
|
||||
//// record audit
|
||||
//_, err = audits.Reporter.RecordAudits(ctx, report, path)
|
||||
//require.NoError(t, err)
|
||||
//
|
||||
//// make sure that pending audit is removed by the reporter when audit is recorded
|
||||
//_, err = containment.Get(ctx, pending.NodeID)
|
||||
//require.True(t, audit.ErrContainedNotFound.Has(err))
|
||||
// record audit
|
||||
_, err = audits.Reporter.RecordAudits(ctx, report, path)
|
||||
require.NoError(t, err)
|
||||
|
||||
// make sure that pending audit is removed by the reporter when audit is recorded
|
||||
_, err = containment.Get(ctx, pending.NodeID)
|
||||
require.True(t, audit.ErrContainedNotFound.Has(err))
|
||||
})
|
||||
}
|
||||
|
@ -77,9 +77,13 @@ func BenchmarkOverlay(b *testing.B) {
|
||||
b.Run("UpdateStats", func(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
id := all[i%len(all)]
|
||||
outcome := overlay.AuditFailure
|
||||
if i&1 == 0 {
|
||||
outcome = overlay.AuditSuccess
|
||||
}
|
||||
_, err := overlaydb.UpdateStats(ctx, &overlay.UpdateRequest{
|
||||
NodeID: id,
|
||||
AuditSuccess: i&1 == 0,
|
||||
AuditOutcome: outcome,
|
||||
IsUp: i&2 == 0,
|
||||
})
|
||||
require.NoError(b, err)
|
||||
@ -90,9 +94,13 @@ func BenchmarkOverlay(b *testing.B) {
|
||||
var updateRequests []*overlay.UpdateRequest
|
||||
for i := 0; i < b.N; i++ {
|
||||
id := all[i%len(all)]
|
||||
outcome := overlay.AuditFailure
|
||||
if i&1 == 0 {
|
||||
outcome = overlay.AuditSuccess
|
||||
}
|
||||
updateRequests = append(updateRequests, &overlay.UpdateRequest{
|
||||
NodeID: id,
|
||||
AuditSuccess: i&1 == 0,
|
||||
AuditOutcome: outcome,
|
||||
IsUp: i&2 == 0,
|
||||
})
|
||||
|
||||
|
@ -137,7 +137,7 @@ func TestNodeSelection(t *testing.T) {
|
||||
_, err := satellite.DB.OverlayCache().UpdateStats(ctx, &overlay.UpdateRequest{
|
||||
NodeID: node.ID(),
|
||||
IsUp: true,
|
||||
AuditSuccess: true,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
AuditLambda: 1, AuditWeight: 1, AuditDQ: 0.5,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
@ -162,7 +162,7 @@ func TestNodeSelectionWithBatch(t *testing.T) {
|
||||
_, err := satellite.DB.OverlayCache().BatchUpdateStats(ctx, []*overlay.UpdateRequest{{
|
||||
NodeID: node.ID(),
|
||||
IsUp: true,
|
||||
AuditSuccess: true,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
AuditLambda: 1, AuditWeight: 1, AuditDQ: 0.5,
|
||||
}}, 1)
|
||||
require.NoError(t, err)
|
||||
@ -282,7 +282,7 @@ func TestNodeSelectionGracefulExit(t *testing.T) {
|
||||
_, err := satellite.DB.OverlayCache().UpdateStats(ctx, &overlay.UpdateRequest{
|
||||
NodeID: node.ID(),
|
||||
IsUp: true,
|
||||
AuditSuccess: true,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
AuditLambda: 1, AuditWeight: 1, AuditDQ: 0.5,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
@ -485,7 +485,7 @@ func TestDistinctIPs(t *testing.T) {
|
||||
_, err := satellite.DB.OverlayCache().UpdateStats(ctx, &overlay.UpdateRequest{
|
||||
NodeID: planet.StorageNodes[i].ID(),
|
||||
IsUp: true,
|
||||
AuditSuccess: true,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
AuditLambda: 1,
|
||||
AuditWeight: 1,
|
||||
AuditDQ: 0.5,
|
||||
@ -513,7 +513,7 @@ func TestDistinctIPsWithBatch(t *testing.T) {
|
||||
_, err := satellite.DB.OverlayCache().BatchUpdateStats(ctx, []*overlay.UpdateRequest{{
|
||||
NodeID: planet.StorageNodes[i].ID(),
|
||||
IsUp: true,
|
||||
AuditSuccess: true,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
AuditLambda: 1,
|
||||
AuditWeight: 1,
|
||||
AuditDQ: 0.5,
|
||||
|
@ -94,6 +94,11 @@ type DB interface {
|
||||
|
||||
// DisqualifyNode disqualifies a storage node.
|
||||
DisqualifyNode(ctx context.Context, nodeID storj.NodeID) (err error)
|
||||
|
||||
// SuspendNode suspends a storage node.
|
||||
SuspendNode(ctx context.Context, nodeID storj.NodeID, suspendedAt time.Time) (err error)
|
||||
// UnsuspendNode unsuspends a storage node.
|
||||
UnsuspendNode(ctx context.Context, nodeID storj.NodeID) (err error)
|
||||
}
|
||||
|
||||
// NodeCheckInInfo contains all the info that will be updated when a node checkins
|
||||
@ -128,10 +133,22 @@ type NodeCriteria struct {
|
||||
DistinctIP bool
|
||||
}
|
||||
|
||||
// AuditType is an enum representing the outcome of a particular audit reported to the overlay.
|
||||
type AuditType int
|
||||
|
||||
const (
|
||||
// AuditSuccess represents a successful audit.
|
||||
AuditSuccess AuditType = iota
|
||||
// AuditFailure represents a failed audit.
|
||||
AuditFailure
|
||||
// AuditUnknown represents an audit that resulted in an unknown error from the node.
|
||||
AuditUnknown
|
||||
)
|
||||
|
||||
// UpdateRequest is used to update a node status.
|
||||
type UpdateRequest struct {
|
||||
NodeID storj.NodeID
|
||||
AuditSuccess bool
|
||||
AuditOutcome AuditType
|
||||
IsUp bool
|
||||
// n.b. these are set values from the satellite.
|
||||
// They are part of the UpdateRequest struct in order to be
|
||||
@ -169,6 +186,7 @@ type NodeDossier struct {
|
||||
Version pb.NodeVersion
|
||||
Contained bool
|
||||
Disqualified *time.Time
|
||||
Suspended *time.Time
|
||||
PieceCount int64
|
||||
ExitStatus ExitStatus
|
||||
CreatedAt time.Time
|
||||
@ -188,6 +206,9 @@ type NodeStats struct {
|
||||
AuditReputationAlpha float64
|
||||
AuditReputationBeta float64
|
||||
Disqualified *time.Time
|
||||
UnknownAuditReputationAlpha float64
|
||||
UnknownAuditReputationBeta float64
|
||||
Suspended *time.Time
|
||||
}
|
||||
|
||||
// NodeLastContact contains the ID, address, and timestamp
|
||||
|
@ -134,7 +134,7 @@ func testCache(ctx context.Context, t *testing.T, store overlay.DB) {
|
||||
stats, err := service.UpdateStats(ctx, &overlay.UpdateRequest{
|
||||
NodeID: valid1ID,
|
||||
IsUp: true,
|
||||
AuditSuccess: false,
|
||||
AuditOutcome: overlay.AuditFailure,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
newAuditAlpha := 1
|
||||
@ -151,7 +151,7 @@ func testCache(ctx context.Context, t *testing.T, store overlay.DB) {
|
||||
_, err = service.BatchUpdateStats(ctx, []*overlay.UpdateRequest{{
|
||||
NodeID: valid2ID,
|
||||
IsUp: false,
|
||||
AuditSuccess: true,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
}})
|
||||
require.NoError(t, err)
|
||||
dossier, err := service.Get(ctx, valid2ID)
|
||||
@ -197,7 +197,7 @@ func TestRandomizedSelection(t *testing.T) {
|
||||
_, err = cache.UpdateStats(ctx, &overlay.UpdateRequest{
|
||||
NodeID: newID,
|
||||
IsUp: true,
|
||||
AuditSuccess: true,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
AuditLambda: 1,
|
||||
AuditWeight: 1,
|
||||
AuditDQ: 0.5,
|
||||
@ -299,7 +299,7 @@ func TestKnownReliable(t *testing.T) {
|
||||
// Disqualify storage node #0
|
||||
stats, err := service.UpdateStats(ctx, &overlay.UpdateRequest{
|
||||
NodeID: planet.StorageNodes[0].ID(),
|
||||
AuditSuccess: false,
|
||||
AuditOutcome: overlay.AuditFailure,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, stats.Disqualified)
|
||||
|
@ -51,7 +51,7 @@ func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
|
||||
// update stats so node disqualification is triggered
|
||||
_, err = cache.UpdateStats(ctx, &overlay.UpdateRequest{
|
||||
NodeID: tt.nodeID,
|
||||
AuditSuccess: true,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
IsUp: true,
|
||||
AuditLambda: 1, AuditWeight: 1,
|
||||
AuditDQ: 0.9,
|
||||
@ -133,7 +133,7 @@ func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
|
||||
|
||||
updateReq := &overlay.UpdateRequest{
|
||||
NodeID: nodeID,
|
||||
AuditSuccess: true,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
IsUp: true,
|
||||
AuditLambda: 0.123, AuditWeight: 0.456,
|
||||
AuditDQ: 0, // don't disqualify for any reason
|
||||
@ -149,7 +149,7 @@ func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
|
||||
auditAlpha = expectedAuditAlpha
|
||||
auditBeta = expectedAuditBeta
|
||||
|
||||
updateReq.AuditSuccess = false
|
||||
updateReq.AuditOutcome = overlay.AuditFailure
|
||||
updateReq.IsUp = false
|
||||
stats, err = cache.UpdateStats(ctx, updateReq)
|
||||
require.NoError(t, err)
|
||||
|
131
satellite/overlay/suspension_test.go
Normal file
131
satellite/overlay/suspension_test.go
Normal file
@ -0,0 +1,131 @@
|
||||
// Copyright (C) 2020 Storj Labs, Inc.
|
||||
// See LICENSE for copying information.
|
||||
|
||||
package overlay_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"storj.io/common/testcontext"
|
||||
"storj.io/storj/private/testplanet"
|
||||
"storj.io/storj/satellite/overlay"
|
||||
)
|
||||
|
||||
// TestSuspendBasic ensures that we can suspend a node using overlayService.SuspendNode and that we can unsuspend a node using overlayservice.UnsuspendNode
|
||||
func TestSuspendBasic(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
|
||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||
nodeID := planet.StorageNodes[0].ID()
|
||||
oc := planet.Satellites[0].Overlay.DB
|
||||
|
||||
node, err := oc.Get(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
require.Nil(t, node.Suspended)
|
||||
|
||||
timeToSuspend := time.Now().UTC().Truncate(time.Second)
|
||||
err = oc.SuspendNode(ctx, nodeID, timeToSuspend)
|
||||
require.NoError(t, err)
|
||||
|
||||
node, err = oc.Get(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, node.Suspended)
|
||||
require.True(t, node.Suspended.Equal(timeToSuspend))
|
||||
|
||||
err = oc.UnsuspendNode(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
|
||||
node, err = oc.Get(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
require.Nil(t, node.Suspended)
|
||||
})
|
||||
}
|
||||
|
||||
// TestSuspendWithUpdateStats ensures that a node goes into suspension node from getting enough unknown audits, and gets removed from getting enough successful audits.
|
||||
func TestSuspendWithUpdateStats(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
|
||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||
nodeID := planet.StorageNodes[0].ID()
|
||||
oc := planet.Satellites[0].Overlay.Service
|
||||
|
||||
node, err := oc.Get(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
require.Nil(t, node.Suspended)
|
||||
|
||||
testStartTime := time.Now()
|
||||
|
||||
// give node one unknown audit - bringing unknown audit rep to 0.5, and suspending node
|
||||
_, err = oc.UpdateStats(ctx, &overlay.UpdateRequest{
|
||||
NodeID: nodeID,
|
||||
AuditOutcome: overlay.AuditUnknown,
|
||||
IsUp: true,
|
||||
AuditLambda: 1,
|
||||
AuditWeight: 1,
|
||||
AuditDQ: 0.6,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
node, err = oc.Get(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, node.Suspended)
|
||||
require.True(t, node.Suspended.After(testStartTime))
|
||||
// expect node is not disqualified and that normal audit alpha/beta remain unchanged
|
||||
require.Nil(t, node.Disqualified)
|
||||
require.EqualValues(t, node.Reputation.AuditReputationAlpha, 1)
|
||||
require.EqualValues(t, node.Reputation.AuditReputationBeta, 0)
|
||||
|
||||
// give node two successful audits - bringing unknown audit rep to 0.75, and unsuspending node
|
||||
for i := 0; i < 2; i++ {
|
||||
_, err = oc.UpdateStats(ctx, &overlay.UpdateRequest{
|
||||
NodeID: nodeID,
|
||||
AuditOutcome: overlay.AuditSuccess,
|
||||
IsUp: true,
|
||||
AuditLambda: 1,
|
||||
AuditWeight: 1,
|
||||
AuditDQ: 0.6,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
}
|
||||
node, err = oc.Get(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
require.Nil(t, node.Suspended)
|
||||
})
|
||||
}
|
||||
|
||||
// TestSuspendFailedAudit ensures that a node is not suspended for a failed audit.
|
||||
func TestSuspendFailedAudit(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
|
||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||
nodeID := planet.StorageNodes[0].ID()
|
||||
oc := planet.Satellites[0].Overlay.DB
|
||||
|
||||
node, err := oc.Get(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
require.Nil(t, node.Disqualified)
|
||||
require.Nil(t, node.Suspended)
|
||||
|
||||
// give node one failed audit - bringing audit rep to 0.5, and disqualifying node
|
||||
// expect that suspended field and unknown audit reputation remain unchanged
|
||||
_, err = oc.UpdateStats(ctx, &overlay.UpdateRequest{
|
||||
NodeID: nodeID,
|
||||
AuditOutcome: overlay.AuditFailure,
|
||||
IsUp: true,
|
||||
AuditLambda: 1,
|
||||
AuditWeight: 1,
|
||||
AuditDQ: 0.6,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
node, err = oc.Get(ctx, nodeID)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, node.Disqualified)
|
||||
require.Nil(t, node.Suspended)
|
||||
require.EqualValues(t, node.Reputation.UnknownAuditReputationAlpha, 1)
|
||||
require.EqualValues(t, node.Reputation.UnknownAuditReputationBeta, 0)
|
||||
})
|
||||
}
|
@ -328,7 +328,7 @@ func (repairer *SegmentRepairer) updateAuditFailStatus(ctx context.Context, fail
|
||||
updateRequests[i] = &overlay.UpdateRequest{
|
||||
NodeID: nodeID,
|
||||
IsUp: true,
|
||||
AuditSuccess: false,
|
||||
AuditOutcome: overlay.AuditFailure,
|
||||
}
|
||||
}
|
||||
if len(updateRequests) > 0 {
|
||||
|
@ -802,6 +802,38 @@ func (cache *overlaycache) DisqualifyNode(ctx context.Context, nodeID storj.Node
|
||||
return nil
|
||||
}
|
||||
|
||||
// SuspendNode suspends a storage node.
|
||||
func (cache *overlaycache) SuspendNode(ctx context.Context, nodeID storj.NodeID, suspendedAt time.Time) (err error) {
|
||||
defer mon.Task()(&ctx)(&err)
|
||||
updateFields := dbx.Node_Update_Fields{}
|
||||
updateFields.Suspended = dbx.Node_Suspended(suspendedAt.UTC())
|
||||
|
||||
dbNode, err := cache.db.Update_Node_By_Id(ctx, dbx.Node_Id(nodeID.Bytes()), updateFields)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if dbNode == nil {
|
||||
return errs.New("unable to get node by ID: %v", nodeID)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// UnsuspendNode unsuspends a storage node.
|
||||
func (cache *overlaycache) UnsuspendNode(ctx context.Context, nodeID storj.NodeID) (err error) {
|
||||
defer mon.Task()(&ctx)(&err)
|
||||
updateFields := dbx.Node_Update_Fields{}
|
||||
updateFields.Suspended = dbx.Node_Suspended_Null()
|
||||
|
||||
dbNode, err := cache.db.Update_Node_By_Id(ctx, dbx.Node_Id(nodeID.Bytes()), updateFields)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if dbNode == nil {
|
||||
return errs.New("unable to get node by ID: %v", nodeID)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// AllPieceCounts returns a map of node IDs to piece counts from the db.
|
||||
// NB: a valid, partial piece map can be returned even if node ID parsing error(s) are returned.
|
||||
func (cache *overlaycache) AllPieceCounts(ctx context.Context) (_ map[storj.NodeID]int, err error) {
|
||||
@ -1126,6 +1158,7 @@ func convertDBNode(ctx context.Context, info *dbx.Node) (_ *overlay.NodeDossier,
|
||||
},
|
||||
Contained: info.Contained,
|
||||
Disqualified: info.Disqualified,
|
||||
Suspended: info.Suspended,
|
||||
PieceCount: info.PieceCount,
|
||||
ExitStatus: exitStatus,
|
||||
CreatedAt: info.CreatedAt,
|
||||
@ -1169,6 +1202,9 @@ func getNodeStats(dbNode *dbx.Node) *overlay.NodeStats {
|
||||
AuditReputationAlpha: dbNode.AuditReputationAlpha,
|
||||
AuditReputationBeta: dbNode.AuditReputationBeta,
|
||||
Disqualified: dbNode.Disqualified,
|
||||
UnknownAuditReputationAlpha: dbNode.UnknownAuditReputationAlpha,
|
||||
UnknownAuditReputationBeta: dbNode.UnknownAuditReputationBeta,
|
||||
Suspended: dbNode.Suspended,
|
||||
}
|
||||
return nodeStats
|
||||
}
|
||||
@ -1225,6 +1261,13 @@ func buildUpdateStatement(update updateNodeStats) string {
|
||||
atLeastOne = true
|
||||
sql += fmt.Sprintf("disqualified = '%v'", update.Disqualified.value.Format(time.RFC3339Nano))
|
||||
}
|
||||
if update.Suspended.set {
|
||||
if atLeastOne {
|
||||
sql += ","
|
||||
}
|
||||
atLeastOne = true
|
||||
sql += fmt.Sprintf("suspended = '%v'", update.Suspended.value.Format(time.RFC3339Nano))
|
||||
}
|
||||
if update.UptimeSuccessCount.set {
|
||||
if atLeastOne {
|
||||
sql += ","
|
||||
@ -1289,6 +1332,7 @@ type boolField struct {
|
||||
|
||||
type timeField struct {
|
||||
set bool
|
||||
isNil bool
|
||||
value time.Time
|
||||
}
|
||||
|
||||
@ -1299,6 +1343,9 @@ type updateNodeStats struct {
|
||||
AuditReputationAlpha float64Field
|
||||
AuditReputationBeta float64Field
|
||||
Disqualified timeField
|
||||
UnknownAuditReputationAlpha float64Field
|
||||
UnknownAuditReputationBeta float64Field
|
||||
Suspended timeField
|
||||
UptimeSuccessCount int64Field
|
||||
LastContactSuccess timeField
|
||||
LastContactFailure timeField
|
||||
@ -1307,16 +1354,62 @@ type updateNodeStats struct {
|
||||
}
|
||||
|
||||
func populateUpdateNodeStats(dbNode *dbx.Node, updateReq *overlay.UpdateRequest) updateNodeStats {
|
||||
auditAlpha, auditBeta, totalAuditCount := updateReputation(
|
||||
updateReq.AuditSuccess,
|
||||
dbNode.AuditReputationAlpha,
|
||||
dbNode.AuditReputationBeta,
|
||||
// there are three audit outcomes: success, failure, and unknown
|
||||
// if a node fails enough audits, it gets disqualified
|
||||
// if a node gets enough "unknown" audits, it gets put into suspension
|
||||
// if a node gets enough successful audits, and is in suspension, it gets removed from suspension
|
||||
|
||||
auditAlpha := dbNode.AuditReputationAlpha
|
||||
auditBeta := dbNode.AuditReputationBeta
|
||||
unknownAuditAlpha := dbNode.UnknownAuditReputationAlpha
|
||||
unknownAuditBeta := dbNode.UnknownAuditReputationBeta
|
||||
totalAuditCount := dbNode.TotalAuditCount
|
||||
|
||||
switch updateReq.AuditOutcome {
|
||||
case overlay.AuditSuccess:
|
||||
// for a successful audit, increase reputation for normal *and* unknown audits
|
||||
auditAlpha, auditBeta, totalAuditCount = updateReputation(
|
||||
true,
|
||||
auditAlpha,
|
||||
auditBeta,
|
||||
updateReq.AuditLambda,
|
||||
updateReq.AuditWeight,
|
||||
dbNode.TotalAuditCount,
|
||||
totalAuditCount,
|
||||
)
|
||||
unknownAuditAlpha, unknownAuditBeta, totalAuditCount = updateReputation(
|
||||
true,
|
||||
unknownAuditAlpha,
|
||||
unknownAuditBeta,
|
||||
updateReq.AuditLambda,
|
||||
updateReq.AuditWeight,
|
||||
totalAuditCount-1, // subtract one because this is still a single audit
|
||||
)
|
||||
case overlay.AuditFailure:
|
||||
// for audit failure, only update normal alpha/beta
|
||||
auditAlpha, auditBeta, totalAuditCount = updateReputation(
|
||||
false,
|
||||
auditAlpha,
|
||||
auditBeta,
|
||||
updateReq.AuditLambda,
|
||||
updateReq.AuditWeight,
|
||||
totalAuditCount,
|
||||
)
|
||||
case overlay.AuditUnknown:
|
||||
// for audit unknown, only update unknown alpha/beta
|
||||
unknownAuditAlpha, unknownAuditBeta, totalAuditCount = updateReputation(
|
||||
false,
|
||||
unknownAuditAlpha,
|
||||
unknownAuditBeta,
|
||||
updateReq.AuditLambda,
|
||||
updateReq.AuditWeight,
|
||||
totalAuditCount,
|
||||
)
|
||||
|
||||
}
|
||||
mon.FloatVal("audit_reputation_alpha").Observe(auditAlpha) //locked
|
||||
mon.FloatVal("audit_reputation_beta").Observe(auditBeta) //locked
|
||||
mon.FloatVal("unknown_audit_reputation_alpha").Observe(unknownAuditAlpha) //locked
|
||||
mon.FloatVal("unknown_audit_reputation_beta").Observe(unknownAuditBeta) //locked
|
||||
|
||||
totalUptimeCount := dbNode.TotalUptimeCount
|
||||
if updateReq.IsUp {
|
||||
@ -1329,6 +1422,8 @@ func populateUpdateNodeStats(dbNode *dbx.Node, updateReq *overlay.UpdateRequest)
|
||||
TotalUptimeCount: int64Field{set: true, value: totalUptimeCount},
|
||||
AuditReputationAlpha: float64Field{set: true, value: auditAlpha},
|
||||
AuditReputationBeta: float64Field{set: true, value: auditBeta},
|
||||
UnknownAuditReputationAlpha: float64Field{set: true, value: unknownAuditAlpha},
|
||||
UnknownAuditReputationBeta: float64Field{set: true, value: unknownAuditBeta},
|
||||
}
|
||||
|
||||
auditRep := auditAlpha / (auditAlpha + auditBeta)
|
||||
@ -1336,6 +1431,16 @@ func populateUpdateNodeStats(dbNode *dbx.Node, updateReq *overlay.UpdateRequest)
|
||||
updateFields.Disqualified = timeField{set: true, value: time.Now().UTC()}
|
||||
}
|
||||
|
||||
// if unknown audit rep goes below threshold, suspend node. Otherwise unsuspend node.
|
||||
unknownAuditRep := unknownAuditAlpha / (unknownAuditAlpha + unknownAuditBeta)
|
||||
if unknownAuditRep <= updateReq.AuditDQ {
|
||||
updateFields.Suspended = timeField{set: true, value: time.Now().UTC()}
|
||||
} else {
|
||||
updateFields.Suspended = timeField{set: true, isNil: true}
|
||||
}
|
||||
|
||||
// TODO if node has been suspended for longer than threshold, and audit outcome is failure or unknown, disqualify node.
|
||||
|
||||
if updateReq.IsUp {
|
||||
updateFields.UptimeSuccessCount = int64Field{set: true, value: dbNode.UptimeSuccessCount + 1}
|
||||
updateFields.LastContactSuccess = timeField{set: true, value: time.Now()}
|
||||
@ -1343,7 +1448,7 @@ func populateUpdateNodeStats(dbNode *dbx.Node, updateReq *overlay.UpdateRequest)
|
||||
updateFields.LastContactFailure = timeField{set: true, value: time.Now()}
|
||||
}
|
||||
|
||||
if updateReq.AuditSuccess {
|
||||
if updateReq.AuditOutcome == overlay.AuditSuccess {
|
||||
updateFields.AuditSuccessCount = int64Field{set: true, value: dbNode.AuditSuccessCount + 1}
|
||||
}
|
||||
|
||||
@ -1372,6 +1477,19 @@ func populateUpdateFields(dbNode *dbx.Node, updateReq *overlay.UpdateRequest) db
|
||||
if update.Disqualified.set {
|
||||
updateFields.Disqualified = dbx.Node_Disqualified(update.Disqualified.value)
|
||||
}
|
||||
if update.UnknownAuditReputationAlpha.set {
|
||||
updateFields.UnknownAuditReputationAlpha = dbx.Node_UnknownAuditReputationAlpha(update.UnknownAuditReputationAlpha.value)
|
||||
}
|
||||
if update.UnknownAuditReputationBeta.set {
|
||||
updateFields.UnknownAuditReputationBeta = dbx.Node_UnknownAuditReputationBeta(update.UnknownAuditReputationBeta.value)
|
||||
}
|
||||
if update.Suspended.set {
|
||||
if update.Suspended.isNil {
|
||||
updateFields.Suspended = dbx.Node_Suspended_Null()
|
||||
} else {
|
||||
updateFields.Suspended = dbx.Node_Suspended(update.Suspended.value)
|
||||
}
|
||||
}
|
||||
if update.UptimeSuccessCount.set {
|
||||
updateFields.UptimeSuccessCount = dbx.Node_UptimeSuccessCount(update.UptimeSuccessCount.value)
|
||||
}
|
||||
@ -1387,7 +1505,7 @@ func populateUpdateFields(dbNode *dbx.Node, updateReq *overlay.UpdateRequest) db
|
||||
if update.Contained.set {
|
||||
updateFields.Contained = dbx.Node_Contained(update.Contained.value)
|
||||
}
|
||||
if updateReq.AuditSuccess {
|
||||
if updateReq.AuditOutcome == overlay.AuditSuccess {
|
||||
updateFields.AuditSuccessCount = dbx.Node_AuditSuccessCount(dbNode.AuditSuccessCount + 1)
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user