satellite/satellitedb/overlaycache: fix behavior around gracefully exited nodes

Sometimes nodes that have gracefully exited are still holding pieces
according to the satellite. This currently has unintended side effects,
such as nodes getting disqualified after having successfully exited.
This change addresses that in two ways (a brief sketch follows the list):
* When the audit reporter attempts to update node stats, do not update
stats (alpha, beta, suspension, disqualification) if the node has
finished graceful exit (audit/reporter_test.go TestGracefullyExitedNotUpdated)
* Treat gracefully exited nodes as "not reputable" so that the repairer
and checker do not count them as healthy (overlay/statdb_test.go
TestKnownUnreliableOrOffline, repair/repair_test.go
TestRepairGracefullyExited)
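
For orientation, here is a minimal sketch of both pieces. The names
shouldUpdateReputation and reliableNodesQuery are invented for this sketch
and do not exist in the codebase; the real guards and query predicates are
added to the *overlaycache methods shown in the diff below.

package overlaysketch

import "time"

// shouldUpdateReputation mirrors the guard that BatchUpdateStats, UpdateStats,
// and UpdateUptime gain: a node that is disqualified or has finished graceful
// exit keeps its alpha/beta, suspension, and disqualification values untouched.
func shouldUpdateReputation(disqualified, exitFinishedAt *time.Time) bool {
	if disqualified != nil {
		return false // pre-existing behavior: never update a disqualified node
	}
	if exitFinishedAt != nil {
		return false // new: skip reputation updates for gracefully exited nodes
	}
	return true
}

// reliableNodesQuery sketches the extra predicate that KnownUnreliableOrOffline,
// KnownReliable, and Reliable gain, so the checker and repairer stop counting
// gracefully exited nodes as healthy.
const reliableNodesQuery = `
	SELECT id FROM nodes
	WHERE disqualified IS NULL
	  AND suspended IS NULL
	  AND exit_finished_at IS NULL -- added by this change
	  AND last_contact_success > $1
`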

Change-Id: I1920d60dd35de5b2385a9b06989397628a2f1272
Moby von Briesen 2020-04-23 15:46:16 -04:00 committed by Maximillian von Briesen
parent baccfd36b1
commit de366537a8
4 changed files with 222 additions and 11 deletions


@@ -5,6 +5,7 @@ package audit_test
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -16,6 +17,7 @@ import (
"storj.io/common/testrand"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite/audit"
"storj.io/storj/satellite/overlay"
)
func TestReportPendingAudits(t *testing.T) {
@@ -177,3 +179,66 @@ func TestSuspensionTimeNotResetBySuccessiveAudit(t *testing.T) {
require.Equal(t, suspendedAt, node.Suspended)
})
}
// TestGracefullyExitedNotUpdated verifies that a gracefully exited node's reputation, suspension,
// and disqualification flags are not updated when an audit is reported for that node.
func TestGracefullyExitedNotUpdated(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 0,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
audits := satellite.Audit
audits.Worker.Loop.Pause()
cache := satellite.Overlay.DB
successNode := planet.StorageNodes[0]
failedNode := planet.StorageNodes[1]
containedNode := planet.StorageNodes[2]
unknownNode := planet.StorageNodes[3]
offlineNode := planet.StorageNodes[4]
nodeList := []*testplanet.StorageNode{successNode, failedNode, containedNode, unknownNode, offlineNode}
// mark each node as having gracefully exited
for _, node := range nodeList {
req := &overlay.ExitStatusRequest{
NodeID: node.ID(),
ExitInitiatedAt: time.Now(),
ExitLoopCompletedAt: time.Now(),
ExitFinishedAt: time.Now(),
}
_, err := cache.UpdateExitStatus(ctx, req)
require.NoError(t, err)
}
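// create a pending (contained) audit for one node and build a report covering all five nodes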
pending := audit.PendingAudit{
NodeID: containedNode.ID(),
PieceID: storj.NewPieceID(),
StripeIndex: 1,
ShareSize: 1 * memory.KiB.Int32(),
ExpectedShareHash: pkcrypto.SHA256Hash([]byte("test")),
}
report := audit.Report{
Successes: storj.NodeIDList{successNode.ID()},
Fails: storj.NodeIDList{failedNode.ID()},
Offlines: storj.NodeIDList{offlineNode.ID()},
PendingAudits: []*audit.PendingAudit{&pending},
Unknown: storj.NodeIDList{unknownNode.ID()},
}
failed, err := audits.Reporter.RecordAudits(ctx, report, "")
require.NoError(t, err)
assert.Zero(t, failed)
// since every node has gracefully exited, reputation, disqualification, and suspension should remain at default values
for _, node := range nodeList {
nodeCacheInfo, err := cache.Get(ctx, node.ID())
require.NoError(t, err)
require.EqualValues(t, 1, nodeCacheInfo.Reputation.AuditReputationAlpha)
require.EqualValues(t, 0, nodeCacheInfo.Reputation.AuditReputationBeta)
require.EqualValues(t, 1, nodeCacheInfo.Reputation.UnknownAuditReputationAlpha)
require.EqualValues(t, 0, nodeCacheInfo.Reputation.UnknownAuditReputationBeta)
require.Nil(t, nodeCacheInfo.Suspended)
require.Nil(t, nodeCacheInfo.Disqualified)
}
})
}


@@ -31,15 +31,17 @@ func TestStatDB(t *testing.T) {
func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
{ // TestKnownUnreliableOrOffline
for _, tt := range []struct {
nodeID storj.NodeID
suspended bool
disqualified bool
offline bool
nodeID storj.NodeID
suspended bool
disqualified bool
offline bool
gracefullyexited bool
}{
{storj.NodeID{1}, false, false, false}, // good
{storj.NodeID{2}, false, true, false}, // disqualified
{storj.NodeID{3}, true, false, false}, // suspended
{storj.NodeID{4}, false, false, true}, // offline
{storj.NodeID{1}, false, false, false, false}, // good
{storj.NodeID{2}, false, true, false, false}, // disqualified
{storj.NodeID{3}, true, false, false, false}, // suspended
{storj.NodeID{4}, false, false, true, false}, // offline
{storj.NodeID{5}, false, false, false, true}, // gracefully exited
} {
startingRep := overlay.NodeSelectionConfig{}
n := pb.Node{Id: tt.nodeID}
@@ -61,12 +63,22 @@ func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
err = cache.UpdateCheckIn(ctx, checkInInfo, time.Now().Add(-2*time.Hour), overlay.NodeSelectionConfig{})
require.NoError(t, err)
}
if tt.gracefullyexited {
req := &overlay.ExitStatusRequest{
NodeID: tt.nodeID,
ExitInitiatedAt: time.Now(),
ExitLoopCompletedAt: time.Now(),
ExitFinishedAt: time.Now(),
}
_, err := cache.UpdateExitStatus(ctx, req)
require.NoError(t, err)
}
}
nodeIds := storj.NodeIDList{
storj.NodeID{1}, storj.NodeID{2},
storj.NodeID{3}, storj.NodeID{4},
storj.NodeID{5},
storj.NodeID{5}, storj.NodeID{6},
}
criteria := &overlay.NodeCriteria{OnlineWindow: time.Hour}
@@ -76,8 +88,9 @@ func testDatabase(ctx context.Context, t *testing.T, cache overlay.DB) {
require.Contains(t, invalid, storj.NodeID{2}) // disqualified
require.Contains(t, invalid, storj.NodeID{3}) // suspended
require.Contains(t, invalid, storj.NodeID{4}) // offline
require.Contains(t, invalid, storj.NodeID{5}) // not in db
require.Len(t, invalid, 4)
require.Contains(t, invalid, storj.NodeID{5}) // gracefully exited
require.Contains(t, invalid, storj.NodeID{6}) // not in db
require.Len(t, invalid, 5)
}
{ // TestUpdateOperator


@@ -1238,6 +1238,124 @@ func testDataRepairUploadLimit(t *testing.T, inMemoryRepair bool) {
})
}
// TestRepairGracefullyExited does the following:
// - Uploads test data to 7 nodes
// - Sets 3 of those nodes as gracefully exited
// - Triggers data repair, which repairs the data from the remaining 4 nodes to 3 additional new nodes
// - Shuts down the 4 nodes from which the data was repaired
// - Now only the 3 new nodes to which the data was repaired remain
// - Downloads the data from these 3 nodes (succeeds because 3 nodes are enough for download)
// - Expects the newly repaired pointer not to contain the gracefully exited nodes
func TestRepairGracefullyExitedInMemory(t *testing.T) {
testRepairGracefullyExited(t, true)
}
func TestRepairGracefullyExitedToDisk(t *testing.T) {
testRepairGracefullyExited(t, false)
}
func testRepairGracefullyExited(t *testing.T, inMemoryRepair bool) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1,
StorageNodeCount: 12,
UplinkCount: 1,
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.InMemoryRepair = inMemoryRepair
config.Metainfo.RS.MinThreshold = 3
config.Metainfo.RS.RepairThreshold = 5
config.Metainfo.RS.SuccessThreshold = 7
config.Metainfo.RS.TotalThreshold = 7
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
// first, upload some remote data
uplinkPeer := planet.Uplinks[0]
satellite := planet.Satellites[0]
satellite.Repair.Checker.Loop.Pause()
satellite.Repair.Repairer.Loop.Pause()
testData := testrand.Bytes(8 * memory.KiB)
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
require.NoError(t, err)
// get a remote segment from metainfo
metainfo := satellite.Metainfo.Service
listResponse, _, err := metainfo.List(ctx, "", "", true, 0, 0)
require.NoError(t, err)
var path string
var pointer *pb.Pointer
for _, v := range listResponse {
path = v.GetPath()
pointer, err = metainfo.Get(ctx, path)
require.NoError(t, err)
if pointer.GetType() == pb.Pointer_REMOTE {
break
}
}
numStorageNodes := len(planet.StorageNodes)
remotePieces := pointer.GetRemote().GetRemotePieces()
numPieces := len(remotePieces)
// sanity check
require.EqualValues(t, numPieces, 7)
toExit := 3
// we should have enough storage nodes to repair on
require.True(t, (numStorageNodes-toExit) >= numPieces)
// gracefully exit nodes and track lost pieces
nodesToExit := make(map[storj.NodeID]bool)
nodesToKeepAlive := make(map[storj.NodeID]bool)
// exit nodes
for i := 0; i < toExit; i++ {
nodesToExit[remotePieces[i].NodeId] = true
req := &overlay.ExitStatusRequest{
NodeID: remotePieces[i].NodeId,
ExitInitiatedAt: time.Now(),
ExitLoopCompletedAt: time.Now(),
ExitFinishedAt: time.Now(),
}
_, err := satellite.DB.OverlayCache().UpdateExitStatus(ctx, req)
require.NoError(t, err)
}
for i := toExit; i < len(remotePieces); i++ {
nodesToKeepAlive[remotePieces[i].NodeId] = true
}
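// trigger the checker and repairer; refresh the reliability cache first so the checker sees the new exit statuses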
err = satellite.Repair.Checker.RefreshReliabilityCache(ctx)
require.NoError(t, err)
satellite.Repair.Checker.Loop.TriggerWait()
satellite.Repair.Repairer.Loop.TriggerWait()
satellite.Repair.Repairer.WaitForPendingRepairs()
// kill nodes kept alive to ensure repair worked
for _, node := range planet.StorageNodes {
if nodesToKeepAlive[node.ID()] {
stopNodeByID(t, ctx, planet, node.ID())
}
}
// we should be able to download data without any of the original nodes
newData, err := uplinkPeer.Download(ctx, satellite, "testbucket", "test/path")
require.NoError(t, err)
require.Equal(t, newData, testData)
// updated pointer should not contain any of the gracefully exited nodes
pointer, err = metainfo.Get(ctx, path)
require.NoError(t, err)
remotePieces = pointer.GetRemote().GetRemotePieces()
for _, piece := range remotePieces {
require.False(t, nodesToExit[piece.NodeId])
}
})
}
// getRemoteSegment returns a remote pointer and its path from the satellite.
// nolint:golint
func getRemoteSegment(


@@ -229,6 +229,7 @@ func (cache *overlaycache) KnownUnreliableOrOffline(ctx context.Context, criteri
WHERE id = any($1::bytea[])
AND disqualified IS NULL
AND suspended IS NULL
AND exit_finished_at IS NULL
AND last_contact_success > $2
`), postgresNodeIDList(nodeIds), time.Now().Add(-criteria.OnlineWindow),
)
@@ -269,6 +270,7 @@ func (cache *overlaycache) KnownReliable(ctx context.Context, onlineWindow time.
WHERE id = any($1::bytea[])
AND disqualified IS NULL
AND suspended IS NULL
AND exit_finished_at IS NULL
AND last_contact_success > $2
`), postgresNodeIDList(nodeIDs), time.Now().Add(-onlineWindow),
)
@@ -299,6 +301,7 @@ func (cache *overlaycache) Reliable(ctx context.Context, criteria *overlay.NodeC
SELECT id FROM nodes
WHERE disqualified IS NULL
AND suspended IS NULL
AND exit_finished_at IS NULL
AND last_contact_success > ?
`), time.Now().Add(-criteria.OnlineWindow))
if err != nil {
@@ -407,6 +410,10 @@ func (cache *overlaycache) BatchUpdateStats(ctx context.Context, updateRequests
if dbNode.Disqualified != nil {
continue
}
// do not update reputation if node has gracefully exited
if dbNode.ExitFinishedAt != nil {
continue
}
updateNodeStats := cache.populateUpdateNodeStats(dbNode, updateReq)
@@ -471,6 +478,10 @@ func (cache *overlaycache) UpdateStats(ctx context.Context, updateReq *overlay.U
if dbNode.Disqualified != nil {
return nil
}
// do not update reputation if node has gracefully exited
if dbNode.ExitFinishedAt != nil {
return nil
}
updateFields := cache.populateUpdateFields(dbNode, updateReq)
@@ -549,6 +560,10 @@ func (cache *overlaycache) UpdateUptime(ctx context.Context, nodeID storj.NodeID
if dbNode.Disqualified != nil {
return nil
}
// do not update reputation if node has gracefully exited
if dbNode.ExitFinishedAt != nil {
return nil
}
updateFields := dbx.Node_Update_Fields{}
totalUptimeCount := dbNode.TotalUptimeCount