7fb86617fc
Use the 'AS OF SYSTEM TIME' CockroachDB clause for the Graceful Exit (a.k.a. GE) queries that count and delete the GE queue items of nodes which have already exited the network.

Split the subquery used for deleting all the transfer queue items of exited nodes when CRDB is used and batch the queries, because CRDB struggles to execute them in a single query, unlike Postgres.

The new test added in this commit to verify the CRDB batch logic for deleting all the transfer queue items of exited nodes revealed that the Enqueue method also has to run in batches when CRDB is used; otherwise CRDB returns the error "driver: bad connection" when a large number of items is passed to be enqueued. This error didn't happen with the current test implementation; it occurred with an initial one that created a large number of exited nodes and transfer queue items for those nodes.

Change-Id: I6a099cdbc515a240596bc93141fea3182c2e50a9
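Below is a minimal, hypothetical sketch (not the actual satellitedb implementation) of the CockroachDB-specific flow this commit describes: read the nodes that have already finished their exit with an 'AS OF SYSTEM TIME' query, then delete their transfer queue items in bounded batches instead of one large DELETE driven by a subquery. The table and column names (nodes, exit_finished_at, graceful_exit_transfer_queue), the '-10s' staleness interval, and the helper name are assumptions for illustration only.

// Sketch only: illustrates the CRDB path described in the commit message.
// Table/column names and the staleness interval are assumptions.
package gracefulexitdb

import (
	"context"
	"database/sql"
	"fmt"
)

// deleteFinishedTransferQueueItemsCRDB is a hypothetical helper. It reads the
// IDs of nodes that already finished graceful exit using AS OF SYSTEM TIME
// (a slightly stale, low-contention read) and then deletes their queue items
// in batches, since CRDB struggles with a single large DELETE.
func deleteFinishedTransferQueueItemsCRDB(ctx context.Context, db *sql.DB, batchSize int) error {
	rows, err := db.QueryContext(ctx,
		`SELECT id FROM nodes AS OF SYSTEM TIME '-10s'
		 WHERE exit_finished_at IS NOT NULL`,
	)
	if err != nil {
		return err
	}
	defer func() { _ = rows.Close() }()

	var exited [][]byte
	for rows.Next() {
		var id []byte
		if err := rows.Scan(&id); err != nil {
			return err
		}
		exited = append(exited, id)
	}
	if err := rows.Err(); err != nil {
		return err
	}

	// Delete each exited node's items in chunks so no single statement has to
	// touch an unbounded number of rows.
	deleteQuery := fmt.Sprintf(
		`DELETE FROM graceful_exit_transfer_queue WHERE node_id = $1 LIMIT %d`,
		batchSize,
	)
	for _, id := range exited {
		for {
			res, err := db.ExecContext(ctx, deleteQuery, id)
			if err != nil {
				return err
			}
			deleted, err := res.RowsAffected()
			if err != nil {
				return err
			}
			if deleted < int64(batchSize) {
				break
			}
		}
	}
	return nil
}

The same batching concern applies to Enqueue: the benchmark in the test file below passes an explicit batch size so that a large set of transfer queue items is inserted in several statements rather than one oversized statement that can make CRDB drop the connection ("driver: bad connection").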
277 lines
8.6 KiB
Go
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package gracefulexit_test

import (
	"context"
	"testing"
	"time"

	"github.com/stretchr/testify/require"
	"go.uber.org/zap"

	"storj.io/common/memory"
	"storj.io/common/storj"
	"storj.io/common/testcontext"
	"storj.io/common/testrand"
	"storj.io/storj/private/testplanet"
	"storj.io/storj/satellite"
	"storj.io/storj/satellite/gracefulexit"
	"storj.io/storj/satellite/metabase"
	"storj.io/storj/satellite/overlay"
	"storj.io/storj/satellite/satellitedb/satellitedbtest"
)

func TestChore(t *testing.T) {
	var maximumInactiveTimeFrame = time.Second * 1
	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 8,
		UplinkCount:      1,
		Reconfigure: testplanet.Reconfigure{
			Satellite: testplanet.Combine(
				func(log *zap.Logger, index int, config *satellite.Config) {
					config.GracefulExit.MaxInactiveTimeFrame = maximumInactiveTimeFrame
				},
				testplanet.ReconfigureRS(4, 6, 8, 8),
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		uplinkPeer := planet.Uplinks[0]
		satellite := planet.Satellites[0]
		exitingNode := planet.StorageNodes[1]

		project, err := uplinkPeer.GetProject(ctx, satellite)
		require.NoError(t, err)
		defer func() { require.NoError(t, project.Close()) }()

		satellite.GracefulExit.Chore.Loop.Pause()

		err = uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path1", testrand.Bytes(5*memory.KiB))
		require.NoError(t, err)

		err = uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path2", testrand.Bytes(5*memory.KiB))
		require.NoError(t, err)

		info, err := project.BeginUpload(ctx, "testbucket", "test/path3", nil)
		require.NoError(t, err)

		upload, err := project.UploadPart(ctx, "testbucket", "test/path3", info.UploadID, 1)
		require.NoError(t, err)

		_, err = upload.Write(testrand.Bytes(5 * memory.KiB))
		require.NoError(t, err)
		require.NoError(t, upload.Commit())

		exitStatusRequest := overlay.ExitStatusRequest{
			NodeID:          exitingNode.ID(),
			ExitInitiatedAt: time.Now(),
		}

		_, err = satellite.Overlay.DB.UpdateExitStatus(ctx, &exitStatusRequest)
		require.NoError(t, err)

		exitingNodes, err := satellite.Overlay.DB.GetExitingNodes(ctx)
		require.NoError(t, err)
		nodeIDs := make(storj.NodeIDList, 0, len(exitingNodes))
		for _, exitingNode := range exitingNodes {
			if exitingNode.ExitLoopCompletedAt == nil {
				nodeIDs = append(nodeIDs, exitingNode.NodeID)
			}
		}
		require.Len(t, nodeIDs, 1)

		satellite.GracefulExit.Chore.Loop.TriggerWait()

		incompleteTransfers, err := satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 20, 0)
		require.NoError(t, err)
		require.Len(t, incompleteTransfers, 3)
		for _, incomplete := range incompleteTransfers {
			require.True(t, incomplete.DurabilityRatio > 0)
			require.NotNil(t, incomplete.RootPieceID)
		}

		// test the other nodes don't have anything to transfer
		for _, node := range planet.StorageNodes {
			if node.ID() == exitingNode.ID() {
				continue
			}
			incompleteTransfers, err := satellite.DB.GracefulExit().GetIncomplete(ctx, node.ID(), 20, 0)
			require.NoError(t, err)
			require.Len(t, incompleteTransfers, 0)
		}

		exitingNodes, err = satellite.Overlay.DB.GetExitingNodes(ctx)
		require.NoError(t, err)
		nodeIDs = make(storj.NodeIDList, 0, len(exitingNodes))
		for _, exitingNode := range exitingNodes {
			if exitingNode.ExitLoopCompletedAt == nil {
				nodeIDs = append(nodeIDs, exitingNode.NodeID)
			}
		}
		require.Len(t, nodeIDs, 0)

		satellite.GracefulExit.Chore.Loop.Pause()
		err = satellite.DB.GracefulExit().IncrementProgress(ctx, exitingNode.ID(), 0, 0, 0)
		require.NoError(t, err)

		incompleteTransfers, err = satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 20, 0)
		require.NoError(t, err)
		require.Len(t, incompleteTransfers, 3)

		// node should fail graceful exit if it has been inactive for maximum inactive time frame since last activity
		time.Sleep(maximumInactiveTimeFrame + time.Second*1)
		satellite.GracefulExit.Chore.Loop.TriggerWait()

		exitStatus, err := satellite.Overlay.DB.GetExitStatus(ctx, exitingNode.ID())
		require.NoError(t, err)
		require.False(t, exitStatus.ExitSuccess)
		require.NotNil(t, exitStatus.ExitFinishedAt)

		incompleteTransfers, err = satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 20, 0)
		require.NoError(t, err)
		require.Len(t, incompleteTransfers, 0)

	})
}
func TestDurabilityRatio(t *testing.T) {
|
|
const (
|
|
maximumInactiveTimeFrame = time.Second * 1
|
|
successThreshold = 4
|
|
)
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1,
|
|
StorageNodeCount: 4,
|
|
UplinkCount: 1,
|
|
Reconfigure: testplanet.Reconfigure{
|
|
Satellite: testplanet.Combine(
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
|
config.GracefulExit.MaxInactiveTimeFrame = maximumInactiveTimeFrame
|
|
},
|
|
testplanet.ReconfigureRS(2, 3, successThreshold, 4),
|
|
),
|
|
},
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
uplinkPeer := planet.Uplinks[0]
|
|
satellite := planet.Satellites[0]
|
|
nodeToRemove := planet.StorageNodes[0]
|
|
exitingNode := planet.StorageNodes[1]
|
|
|
|
project, err := uplinkPeer.GetProject(ctx, satellite)
|
|
require.NoError(t, err)
|
|
defer func() { require.NoError(t, project.Close()) }()
|
|
satellite.GracefulExit.Chore.Loop.Pause()
|
|
|
|
err = uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path1", testrand.Bytes(5*memory.KiB))
|
|
require.NoError(t, err)
|
|
|
|
info, err := project.BeginUpload(ctx, "testbucket", "test/path2", nil)
|
|
require.NoError(t, err)
|
|
|
|
upload, err := project.UploadPart(ctx, "testbucket", "test/path2", info.UploadID, 1)
|
|
require.NoError(t, err)
|
|
|
|
_, err = upload.Write(testrand.Bytes(5 * memory.KiB))
|
|
require.NoError(t, err)
|
|
require.NoError(t, upload.Commit())
|
|
|
|
exitStatusRequest := overlay.ExitStatusRequest{
|
|
NodeID: exitingNode.ID(),
|
|
ExitInitiatedAt: time.Now(),
|
|
}
|
|
|
|
_, err = satellite.Overlay.DB.UpdateExitStatus(ctx, &exitStatusRequest)
|
|
require.NoError(t, err)
|
|
|
|
exitingNodes, err := satellite.Overlay.DB.GetExitingNodes(ctx)
|
|
require.NoError(t, err)
|
|
nodeIDs := make(storj.NodeIDList, 0, len(exitingNodes))
|
|
for _, exitingNode := range exitingNodes {
|
|
if exitingNode.ExitLoopCompletedAt == nil {
|
|
nodeIDs = append(nodeIDs, exitingNode.NodeID)
|
|
}
|
|
}
|
|
require.Len(t, nodeIDs, 1)
|
|
|
|
// retrieve remote segment
|
|
segments, err := satellite.Metainfo.Metabase.TestingAllSegments(ctx)
|
|
require.NoError(t, err)
|
|
require.Len(t, segments, 2)
|
|
|
|
for _, segment := range segments {
|
|
remotePieces := segment.Pieces
|
|
var newPieces metabase.Pieces = make(metabase.Pieces, len(remotePieces)-1)
|
|
idx := 0
|
|
for _, p := range remotePieces {
|
|
if p.StorageNode != nodeToRemove.ID() {
|
|
newPieces[idx] = p
|
|
idx++
|
|
}
|
|
}
|
|
err = satellite.Metainfo.Metabase.UpdateSegmentPieces(ctx, metabase.UpdateSegmentPieces{
|
|
StreamID: segment.StreamID,
|
|
Position: segment.Position,
|
|
|
|
OldPieces: segment.Pieces,
|
|
NewPieces: newPieces,
|
|
NewRedundancy: segment.Redundancy,
|
|
})
|
|
require.NoError(t, err)
|
|
}
|
|
|
|
satellite.GracefulExit.Chore.Loop.TriggerWait()
|
|
|
|
incompleteTransfers, err := satellite.DB.GracefulExit().GetIncomplete(ctx, exitingNode.ID(), 20, 0)
|
|
require.NoError(t, err)
|
|
require.Len(t, incompleteTransfers, 2)
|
|
for _, incomplete := range incompleteTransfers {
|
|
require.Equal(t, float64(successThreshold-1)/float64(successThreshold), incomplete.DurabilityRatio)
|
|
require.NotNil(t, incomplete.RootPieceID)
|
|
}
|
|
})
|
|
}
|
|
|
|

func BenchmarkChore(b *testing.B) {
	satellitedbtest.Bench(b, func(b *testing.B, db satellite.DB) {
		gracefulexitdb := db.GracefulExit()
		ctx := context.Background()

		b.Run("BatchUpdateStats-100", func(b *testing.B) {
			batch(ctx, b, gracefulexitdb, 100)
		})
		if !testing.Short() {
			b.Run("BatchUpdateStats-250", func(b *testing.B) {
				batch(ctx, b, gracefulexitdb, 250)
			})
			b.Run("BatchUpdateStats-500", func(b *testing.B) {
				batch(ctx, b, gracefulexitdb, 500)
			})
			b.Run("BatchUpdateStats-1000", func(b *testing.B) {
				batch(ctx, b, gracefulexitdb, 1000)
			})
			b.Run("BatchUpdateStats-5000", func(b *testing.B) {
				batch(ctx, b, gracefulexitdb, 5000)
			})
		}
	})
}
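
// batch creates `size` random transfer queue items for each benchmark
// iteration and enqueues them through the graceful exit DB, calling Enqueue
// with an explicit batch size of 1000.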
func batch(ctx context.Context, b *testing.B, db gracefulexit.DB, size int) {
	for i := 0; i < b.N; i++ {
		var transferQueueItems []gracefulexit.TransferQueueItem
		for j := 0; j < size; j++ {
			item := gracefulexit.TransferQueueItem{
				NodeID:          testrand.NodeID(),
				Key:             testrand.Bytes(memory.B * 256),
				PieceNum:        0,
				DurabilityRatio: 1.0,
			}
			transferQueueItems = append(transferQueueItems, item)
		}
		batchSize := 1000
		err := db.Enqueue(ctx, transferQueueItems, batchSize)
		require.NoError(b, err)
	}
}