// Copyright (C) 2020 Storj Labs, Inc. // See LICENSE for copying information. package overlay_test import ( "testing" "time" "github.com/stretchr/testify/require" "go.uber.org/zap" "storj.io/common/storj" "storj.io/common/testcontext" "storj.io/storj/private/testplanet" "storj.io/storj/satellite" "storj.io/storj/satellite/audit" "storj.io/storj/satellite/overlay" ) // TestSuspendBasic ensures that we can suspend a node using overlayService.SuspendNode and that we can unsuspend a node using overlayservice.UnsuspendNode. func TestSuspendBasic(t *testing.T) { testplanet.Run(t, testplanet.Config{ SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0, }, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) { nodeID := planet.StorageNodes[0].ID() oc := planet.Satellites[0].Overlay.DB node, err := oc.Get(ctx, nodeID) require.NoError(t, err) require.Nil(t, node.UnknownAuditSuspended) timeToSuspend := time.Now().UTC().Truncate(time.Second) err = oc.SuspendNodeUnknownAudit(ctx, nodeID, timeToSuspend) require.NoError(t, err) node, err = oc.Get(ctx, nodeID) require.NoError(t, err) require.NotNil(t, node.UnknownAuditSuspended) require.True(t, node.UnknownAuditSuspended.Equal(timeToSuspend)) err = oc.UnsuspendNodeUnknownAudit(ctx, nodeID) require.NoError(t, err) node, err = oc.Get(ctx, nodeID) require.NoError(t, err) require.Nil(t, node.UnknownAuditSuspended) }) } // TestSuspendWithUpdateStats ensures that a node goes into suspension node from getting enough unknown audits, and gets removed from getting enough successful audits. func TestSuspendWithUpdateStats(t *testing.T) { testplanet.Run(t, testplanet.Config{ SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0, }, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) { nodeID := planet.StorageNodes[0].ID() oc := planet.Satellites[0].Overlay.Service node, err := oc.Get(ctx, nodeID) require.NoError(t, err) require.Nil(t, node.UnknownAuditSuspended) testStartTime := time.Now() // give node one unknown audit - bringing unknown audit rep to 0.5, and suspending node _, err = oc.UpdateStats(ctx, &overlay.UpdateRequest{ NodeID: nodeID, AuditOutcome: overlay.AuditUnknown, IsUp: true, AuditLambda: 1, AuditWeight: 1, AuditDQ: 0.6, }) require.NoError(t, err) node, err = oc.Get(ctx, nodeID) require.NoError(t, err) // expect unknown audit alpha/beta to change and suspended to be set require.True(t, node.Reputation.UnknownAuditReputationAlpha < 1) require.True(t, node.Reputation.UnknownAuditReputationBeta > 0) require.NotNil(t, node.UnknownAuditSuspended) require.True(t, node.UnknownAuditSuspended.After(testStartTime)) // expect node is not disqualified and that normal audit alpha/beta remain unchanged require.Nil(t, node.Disqualified) require.EqualValues(t, node.Reputation.AuditReputationAlpha, 1) require.EqualValues(t, node.Reputation.AuditReputationBeta, 0) // give node two successful audits - bringing unknown audit rep to 0.75, and unsuspending node for i := 0; i < 2; i++ { _, err = oc.UpdateStats(ctx, &overlay.UpdateRequest{ NodeID: nodeID, AuditOutcome: overlay.AuditSuccess, IsUp: true, AuditLambda: 1, AuditWeight: 1, AuditDQ: 0.6, }) require.NoError(t, err) } node, err = oc.Get(ctx, nodeID) require.NoError(t, err) require.Nil(t, node.UnknownAuditSuspended) }) } // TestSuspendFailedAudit ensures that a node is not suspended for a failed audit. func TestSuspendFailedAudit(t *testing.T) { testplanet.Run(t, testplanet.Config{ SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0, }, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) { nodeID := planet.StorageNodes[0].ID() oc := planet.Satellites[0].Overlay.DB node, err := oc.Get(ctx, nodeID) require.NoError(t, err) require.Nil(t, node.Disqualified) require.Nil(t, node.UnknownAuditSuspended) // give node one failed audit - bringing audit rep to 0.5, and disqualifying node // expect that suspended field and unknown audit reputation remain unchanged _, err = oc.UpdateStats(ctx, &overlay.UpdateRequest{ NodeID: nodeID, AuditOutcome: overlay.AuditFailure, IsUp: true, AuditLambda: 1, AuditWeight: 1, AuditDQ: 0.6, }) require.NoError(t, err) node, err = oc.Get(ctx, nodeID) require.NoError(t, err) require.NotNil(t, node.Disqualified) require.Nil(t, node.UnknownAuditSuspended) require.EqualValues(t, node.Reputation.UnknownAuditReputationAlpha, 1) require.EqualValues(t, node.Reputation.UnknownAuditReputationBeta, 0) }) } // TestSuspendExceedGracePeriod ensures that a node is disqualified when it receives a failing or unknown audit after the grace period expires. func TestSuspendExceedGracePeriod(t *testing.T) { testplanet.Run(t, testplanet.Config{ SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 0, Reconfigure: testplanet.Reconfigure{ Satellite: func(log *zap.Logger, index int, config *satellite.Config) { config.Overlay.Node.SuspensionGracePeriod = time.Hour }, }, }, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) { successNodeID := planet.StorageNodes[0].ID() failNodeID := planet.StorageNodes[1].ID() offlineNodeID := planet.StorageNodes[2].ID() unknownNodeID := planet.StorageNodes[3].ID() // suspend each node two hours ago (more than grace period) oc := planet.Satellites[0].DB.OverlayCache() for _, node := range (storj.NodeIDList{successNodeID, failNodeID, offlineNodeID, unknownNodeID}) { err := oc.SuspendNodeUnknownAudit(ctx, node, time.Now().Add(-2*time.Hour)) require.NoError(t, err) } // no nodes should be disqualified for _, node := range (storj.NodeIDList{successNodeID, failNodeID, offlineNodeID, unknownNodeID}) { n, err := oc.Get(ctx, node) require.NoError(t, err) require.Nil(t, n.Disqualified) } // give one node a successful audit, one a failed audit, one an offline audit, and one an unknown audit report := audit.Report{ Successes: storj.NodeIDList{successNodeID}, Fails: storj.NodeIDList{failNodeID}, Offlines: storj.NodeIDList{offlineNodeID}, Unknown: storj.NodeIDList{unknownNodeID}, } auditService := planet.Satellites[0].Audit _, err := auditService.Reporter.RecordAudits(ctx, report, "") require.NoError(t, err) // success and offline nodes should not be disqualified // fail and unknown nodes should be disqualified for _, node := range (storj.NodeIDList{successNodeID, offlineNodeID}) { n, err := oc.Get(ctx, node) require.NoError(t, err) require.Nil(t, n.Disqualified) } for _, node := range (storj.NodeIDList{failNodeID, unknownNodeID}) { n, err := oc.Get(ctx, node) require.NoError(t, err) require.NotNil(t, n.Disqualified) } }) } // TestSuspendDQDisabled ensures that a node is not disqualified from suspended mode if the suspension DQ enabled flag is false. func TestSuspendDQDisabled(t *testing.T) { testplanet.Run(t, testplanet.Config{ SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 0, Reconfigure: testplanet.Reconfigure{ Satellite: func(log *zap.Logger, index int, config *satellite.Config) { config.Overlay.Node.SuspensionGracePeriod = time.Hour config.Overlay.Node.SuspensionDQEnabled = false }, }, }, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) { successNodeID := planet.StorageNodes[0].ID() failNodeID := planet.StorageNodes[1].ID() offlineNodeID := planet.StorageNodes[2].ID() unknownNodeID := planet.StorageNodes[3].ID() // suspend each node two hours ago (more than grace period) oc := planet.Satellites[0].DB.OverlayCache() for _, node := range (storj.NodeIDList{successNodeID, failNodeID, offlineNodeID, unknownNodeID}) { err := oc.SuspendNodeUnknownAudit(ctx, node, time.Now().Add(-2*time.Hour)) require.NoError(t, err) } // no nodes should be disqualified for _, node := range (storj.NodeIDList{successNodeID, failNodeID, offlineNodeID, unknownNodeID}) { n, err := oc.Get(ctx, node) require.NoError(t, err) require.Nil(t, n.Disqualified) } // give one node a successful audit, one a failed audit, one an offline audit, and one an unknown audit report := audit.Report{ Successes: storj.NodeIDList{successNodeID}, Fails: storj.NodeIDList{failNodeID}, Offlines: storj.NodeIDList{offlineNodeID}, Unknown: storj.NodeIDList{unknownNodeID}, } auditService := planet.Satellites[0].Audit _, err := auditService.Reporter.RecordAudits(ctx, report, "") require.NoError(t, err) // successful node should not be suspended or disqualified n, err := oc.Get(ctx, successNodeID) require.NoError(t, err) require.Nil(t, n.UnknownAuditSuspended) require.Nil(t, n.Disqualified) // failed node should not be suspended but should be disqualified // (disqualified because of a failed audit, not because of exceeding suspension grace period) n, err = oc.Get(ctx, failNodeID) require.NoError(t, err) require.Nil(t, n.UnknownAuditSuspended) require.NotNil(t, n.Disqualified) // offline node should still be suspended but not disqualified n, err = oc.Get(ctx, offlineNodeID) require.NoError(t, err) require.NotNil(t, n.UnknownAuditSuspended) require.Nil(t, n.Disqualified) // unknown node should still be suspended but not disqualified n, err = oc.Get(ctx, unknownNodeID) require.NoError(t, err) require.NotNil(t, n.UnknownAuditSuspended) require.Nil(t, n.Disqualified) }) } // TestSuspendBatchUpdateStats ensures that suspension and alpha/beta fields are properly updated from batch update stats. func TestSuspendBatchUpdateStats(t *testing.T) { testplanet.Run(t, testplanet.Config{ SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0, }, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) { nodeID := planet.StorageNodes[0].ID() oc := planet.Satellites[0].Overlay.Service node, err := oc.Get(ctx, nodeID) require.NoError(t, err) require.Nil(t, node.UnknownAuditSuspended) testStartTime := time.Now() nodeUpdateReq := &overlay.UpdateRequest{ NodeID: nodeID, AuditOutcome: overlay.AuditSuccess, IsUp: true, AuditLambda: 1, AuditWeight: 1, AuditDQ: 0.6, } // give node successful audit - expect alpha to be > 1 and beta to be 0 _, err = oc.BatchUpdateStats(ctx, []*overlay.UpdateRequest{nodeUpdateReq}) require.NoError(t, err) node, err = oc.Get(ctx, nodeID) require.NoError(t, err) // expect unknown audit alpha/beta to change and suspended to be nil require.True(t, node.Reputation.UnknownAuditReputationAlpha > 1) require.True(t, node.Reputation.UnknownAuditReputationBeta == 0) require.Nil(t, node.UnknownAuditSuspended) // expect audit alpha/beta to change and disqualified to be nil require.True(t, node.Reputation.AuditReputationAlpha > 1) require.True(t, node.Reputation.AuditReputationBeta == 0) require.Nil(t, node.Disqualified) require.EqualValues(t, node.Reputation.AuditReputationAlpha, 1) require.EqualValues(t, node.Reputation.AuditReputationBeta, 0) oldReputation := node.Reputation // give node two unknown audits to suspend node nodeUpdateReq.AuditOutcome = overlay.AuditUnknown _, err = oc.BatchUpdateStats(ctx, []*overlay.UpdateRequest{nodeUpdateReq}) require.NoError(t, err) _, err = oc.BatchUpdateStats(ctx, []*overlay.UpdateRequest{nodeUpdateReq}) require.NoError(t, err) node, err = oc.Get(ctx, nodeID) require.NoError(t, err) require.True(t, node.Reputation.UnknownAuditReputationAlpha < oldReputation.UnknownAuditReputationAlpha) require.True(t, node.Reputation.UnknownAuditReputationBeta > oldReputation.UnknownAuditReputationBeta) require.NotNil(t, node.UnknownAuditSuspended) require.True(t, node.Reputation.UnknownAuditSuspended.After(testStartTime)) // node should not be disqualified and normal audit reputation should not change require.EqualValues(t, node.Reputation.AuditReputationAlpha, oldReputation.AuditReputationAlpha) require.EqualValues(t, node.Reputation.AuditReputationBeta, oldReputation.AuditReputationBeta) require.Nil(t, node.Disqualified) }) }