storj/satellite/overlay/selection_test.go
Egon Elbre cb781d66c7 satellite/overlay: optimize FindStorageNodes
Reduce the number of fields returned from the query.

Benchmark results in `satellite/overlay`:

benchstat before.txt after2.txt
name                               old time/op  new time/op  delta
SelectStorageNodes-32              7.85ms ± 1%  6.27ms ± 1%  -20.18%  (p=0.002 n=10+4)
SelectNewStorageNodes-32           8.21ms ± 1%  6.61ms ± 0%  -19.53%  (p=0.002 n=10+4)
SelectStorageNodesExclusion-32     17.2ms ± 1%  15.9ms ± 1%   -7.55%  (p=0.002 n=10+4)
SelectNewStorageNodesExclusion-32  17.8ms ± 2%  16.1ms ± 0%   -9.38%  (p=0.002 n=10+4)
FindStorageNodes-32                48.4ms ± 1%  45.1ms ± 0%   -6.69%  (p=0.002 n=10+4)
FindStorageNodesExclusion-32       79.2ms ± 1%  76.1ms ± 1%   -3.89%  (p=0.002 n=10+4)

Benchmark results from `satellite/overlay` after making them parallel:

benchstat before-parallel.txt after2-parallel.txt
name                               old time/op  new time/op  delta
SelectStorageNodes-32               548µs ± 1%   353µs ± 1%  -35.60%  (p=0.029 n=4+4)
SelectNewStorageNodes-32            562µs ± 0%   368µs ± 0%  -34.51%  (p=0.029 n=4+4)
SelectStorageNodesExclusion-32     1.02ms ± 1%  0.84ms ± 0%  -18.08%  (p=0.029 n=4+4)
SelectNewStorageNodesExclusion-32  1.03ms ± 1%  0.86ms ± 2%  -16.22%  (p=0.029 n=4+4)
FindStorageNodes-32                3.11ms ± 0%  2.79ms ± 1%  -10.27%  (p=0.029 n=4+4)
FindStorageNodesExclusion-32       4.75ms ± 0%  4.43ms ± 1%   -6.56%  (p=0.029 n=4+4)

Change-Id: I1d85e2764eb270f4c2b1998303ccfc1179d65b26
2020-03-30 18:36:23 +03:00
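
The second benchstat table above comes from running the same selection benchmarks in parallel. A minimal sketch of that conversion, using testing.B.RunParallel — `selectNodes` is a hypothetical stand-in for the overlay cache query, not the actual code in `satellite/overlay`:

// Hypothetical stand-in for the selection query being benchmarked.
func selectNodes() error { return nil }

// Sequential form: a single goroutine runs all b.N iterations.
func BenchmarkSelectSequential(b *testing.B) {
	for i := 0; i < b.N; i++ {
		if err := selectNodes(); err != nil {
			b.Fatal(err)
		}
	}
}

// Parallel form: RunParallel distributes the b.N iterations across
// GOMAXPROCS worker goroutines, as in the "-parallel" results above.
func BenchmarkSelectParallel(b *testing.B) {
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			if err := selectNodes(); err != nil {
				b.Error(err) // Error (not Fatal) is safe from worker goroutines
			}
		}
	})
}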

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package overlay_test

import (
	"crypto/tls"
	"crypto/x509"
	"net"
	"runtime"
	"strings"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"github.com/zeebo/errs"
	"go.uber.org/zap"

	"storj.io/common/memory"
	"storj.io/common/pb"
	"storj.io/common/rpc/rpcpeer"
	"storj.io/common/storj"
	"storj.io/common/testcontext"
	"storj.io/storj/private/testplanet"
	"storj.io/storj/satellite"
	"storj.io/storj/satellite/overlay"
)
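
// TestMinimumDiskSpace verifies that a node which checks in with less free
// disk than config.Overlay.Node.MinimumDiskSpace is not selected by
// FindStorageNodes, and becomes selectable again once it reports enough space.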
func TestMinimumDiskSpace(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 2, UplinkCount: 0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Overlay.Node.MinimumDiskSpace = 10 * memory.MB
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		node0 := planet.StorageNodes[0]
		node0.Contact.Chore.Pause(ctx)

		nodeDossier := node0.Local()
		ident := node0.Identity
		peer := rpcpeer.Peer{
			Addr: &net.TCPAddr{
				IP:   net.ParseIP(nodeDossier.Address.GetAddress()),
				Port: 5,
			},
			State: tls.ConnectionState{
				PeerCertificates: []*x509.Certificate{ident.Leaf, ident.CA},
			},
		}
		peerCtx := rpcpeer.NewContext(ctx, &peer)

		// report disk space less than minimum
		_, err := planet.Satellites[0].Contact.Endpoint.CheckIn(peerCtx, &pb.CheckInRequest{
			Address: nodeDossier.Address.GetAddress(),
			Version: &nodeDossier.Version,
			Capacity: &pb.NodeCapacity{
				FreeDisk: 9 * memory.MB.Int64(),
			},
			Operator: &nodeDossier.Operator,
		})
		require.NoError(t, err)

		// request 2 nodes, expect failure from not enough nodes
		_, err = planet.Satellites[0].Overlay.Service.FindStorageNodes(ctx, overlay.FindStorageNodesRequest{
			MinimumRequiredNodes: 2,
			RequestedCount:       2,
		})
		require.Error(t, err)
		require.True(t, overlay.ErrNotEnoughNodes.Has(err))

		// report disk space greater than minimum
		_, err = planet.Satellites[0].Contact.Endpoint.CheckIn(peerCtx, &pb.CheckInRequest{
			Address: nodeDossier.Address.GetAddress(),
			Version: &nodeDossier.Version,
			Capacity: &pb.NodeCapacity{
				FreeDisk: 11 * memory.MB.Int64(),
			},
			Operator: &nodeDossier.Operator,
		})
		require.NoError(t, err)

		// request 2 nodes, expect success
		_, err = planet.Satellites[0].Overlay.Service.FindStorageNodes(ctx, overlay.FindStorageNodesRequest{
			MinimumRequiredNodes: 2,
			RequestedCount:       2,
		})
		require.NoError(t, err)
	})
}
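
// TestOffline verifies that KnownUnreliableOrOffline returns only node IDs
// the overlay does not consider reliable and online; an unknown ID such as
// storj.NodeID{1, 2, 3, 4} is reported back by design.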
func TestOffline(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		service := satellite.Overlay.Service
		// TODO: handle cleanup

		result, err := service.KnownUnreliableOrOffline(ctx, []storj.NodeID{
			planet.StorageNodes[0].ID(),
		})
		require.NoError(t, err)
		require.Empty(t, result)

		result, err = service.KnownUnreliableOrOffline(ctx, []storj.NodeID{
			planet.StorageNodes[0].ID(),
			planet.StorageNodes[1].ID(),
			planet.StorageNodes[2].ID(),
		})
		require.NoError(t, err)
		require.Empty(t, result)

		result, err = service.KnownUnreliableOrOffline(ctx, []storj.NodeID{
			planet.StorageNodes[0].ID(),
			{1, 2, 3, 4}, // note that this succeeds by design
			planet.StorageNodes[2].ID(),
		})
		require.NoError(t, err)
		require.Len(t, result, 1)
		require.Equal(t, result[0], storj.NodeID{1, 2, 3, 4})
	})
}
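
// TestNodeSelection seeds nodes with increasing audit counts via UpdateStats
// and then runs the shared selection scenarios in testNodeSelection.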
func TestNodeSelection(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 10, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		// This sets audit counts of 0, 1, 2, 3, ... 9
		// so that we can fine-tune how many nodes are considered new or reputable
		// by modifying the audit count cutoff passed into FindStorageNodesWithPreferences
		for i, node := range planet.StorageNodes {
			for k := 0; k < i; k++ {
				_, err := satellite.DB.OverlayCache().UpdateStats(ctx, &overlay.UpdateRequest{
					NodeID:       node.ID(),
					IsUp:         true,
					AuditOutcome: overlay.AuditSuccess,
					AuditLambda:  1, AuditWeight: 1, AuditDQ: 0.5,
				})
				require.NoError(t, err)
			}
		}

		testNodeSelection(t, ctx, planet)
	})
}
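
// TestNodeSelectionWithBatch is the same scenario as TestNodeSelection,
// but seeds the audit counts through BatchUpdateStats.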
func TestNodeSelectionWithBatch(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 10, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		// This sets audit counts of 0, 1, 2, 3, ... 9
		// so that we can fine-tune how many nodes are considered new or reputable
		// by modifying the audit count cutoff passed into FindStorageNodesWithPreferences
		for i, node := range planet.StorageNodes {
			for k := 0; k < i; k++ {
				// These are done individually b/c the previous stat data is important
				_, err := satellite.DB.OverlayCache().BatchUpdateStats(ctx, []*overlay.UpdateRequest{{
					NodeID:       node.ID(),
					IsUp:         true,
					AuditOutcome: overlay.AuditSuccess,
					AuditLambda:  1, AuditWeight: 1, AuditDQ: 0.5,
				}}, 1)
				require.NoError(t, err)
			}
		}

		testNodeSelection(t, ctx, planet)
	})
}
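
// testNodeSelection runs a table of FindStorageNodesWithPreferences requests
// against nodes with audit counts 0..9, checking the selected node count and
// the expected ErrNotEnoughNodes failures.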
func testNodeSelection(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
	satellite := planet.Satellites[0]

	// ensure all storagenodes are in overlay
	for _, storageNode := range planet.StorageNodes {
		err := satellite.Overlay.Service.Put(ctx, storageNode.ID(), storageNode.Local().Node)
		assert.NoError(t, err)
	}

	type test struct {
		Preferences    overlay.NodeSelectionConfig
		ExcludeCount   int
		RequestCount   int
		ExpectedCount  int
		ShouldFailWith *errs.Class
	}

	for i, tt := range []test{
		{ // all reputable nodes, only reputable nodes requested
			Preferences:   testNodeSelectionConfig(0, 0, false),
			RequestCount:  5,
			ExpectedCount: 5,
		},
		{ // all reputable nodes, reputable and new nodes requested
			Preferences:   testNodeSelectionConfig(0, 1, false),
			RequestCount:  5,
			ExpectedCount: 5,
		},
		{ // 50-50 reputable and new nodes, not enough reputable nodes
			Preferences:    testNodeSelectionConfig(5, 0, false),
			RequestCount:   10,
			ExpectedCount:  5,
			ShouldFailWith: &overlay.ErrNotEnoughNodes,
		},
		{ // 50-50 reputable and new nodes, reputable and new nodes requested, not enough reputable nodes
			Preferences:    testNodeSelectionConfig(5, 0.2, false),
			RequestCount:   10,
			ExpectedCount:  7,
			ShouldFailWith: &overlay.ErrNotEnoughNodes,
		},
		{ // all new nodes except one, reputable and new nodes requested (happy path)
			Preferences:   testNodeSelectionConfig(9, 0.5, false),
			RequestCount:  2,
			ExpectedCount: 2,
		},
		{ // all new nodes except one, reputable and new nodes requested (not happy path)
			Preferences:    testNodeSelectionConfig(9, 0.5, false),
			RequestCount:   4,
			ExpectedCount:  3,
			ShouldFailWith: &overlay.ErrNotEnoughNodes,
		},
		{ // all new nodes, reputable and new nodes requested
			Preferences:   testNodeSelectionConfig(50, 1, false),
			RequestCount:  2,
			ExpectedCount: 2,
		},
		{ // audit threshold edge case (1)
			Preferences:   testNodeSelectionConfig(9, 0, false),
			RequestCount:  1,
			ExpectedCount: 1,
		},
		{ // excluded node ids being excluded
			Preferences:    testNodeSelectionConfig(5, 0, false),
			ExcludeCount:   7,
			RequestCount:   5,
			ExpectedCount:  3,
			ShouldFailWith: &overlay.ErrNotEnoughNodes,
		},
	} {
		t.Logf("#%2d. %+v", i, tt)
		service := planet.Satellites[0].Overlay.Service

		var excludedNodes []storj.NodeID
		for _, storageNode := range planet.StorageNodes[:tt.ExcludeCount] {
			excludedNodes = append(excludedNodes, storageNode.ID())
		}

		response, err := service.FindStorageNodesWithPreferences(ctx, overlay.FindStorageNodesRequest{
			RequestedCount: tt.RequestCount,
			ExcludedIDs:    excludedNodes,
		}, &tt.Preferences)
		t.Log(len(response), err)
		if tt.ShouldFailWith != nil {
			assert.Error(t, err)
			assert.True(t, tt.ShouldFailWith.Has(err))
		} else {
			assert.NoError(t, err)
		}
		assert.Equal(t, tt.ExpectedCount, len(response))
	}
}
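
// TestNodeSelectionGracefulExit verifies that nodes which have initiated a
// graceful exit are never returned by node selection.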
func TestNodeSelectionGracefulExit(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 10, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]

		exitingNodes := make(map[storj.NodeID]bool)

		// This sets audit counts of 0, 1, 2, 3, ... 9
		// so that we can fine-tune how many nodes are considered new or reputable
		// by modifying the audit count cutoff passed into FindStorageNodesWithPreferences
		// nodes at indices 0, 2, 4, 6, 8 are gracefully exiting
		for i, node := range planet.StorageNodes {
			for k := 0; k < i; k++ {
				_, err := satellite.DB.OverlayCache().UpdateStats(ctx, &overlay.UpdateRequest{
					NodeID:       node.ID(),
					IsUp:         true,
					AuditOutcome: overlay.AuditSuccess,
					AuditLambda:  1, AuditWeight: 1, AuditDQ: 0.5,
				})
				require.NoError(t, err)
			}

			// make half the nodes gracefully exiting
			if i%2 == 0 {
				_, err := satellite.DB.OverlayCache().UpdateExitStatus(ctx, &overlay.ExitStatusRequest{
					NodeID:          node.ID(),
					ExitInitiatedAt: time.Now(),
				})
				require.NoError(t, err)
				exitingNodes[node.ID()] = true
			}
		}

		type test struct {
			Preferences    overlay.NodeSelectionConfig
			ExcludeCount   int
			RequestCount   int
			ExpectedCount  int
			ShouldFailWith *errs.Class
		}

		for i, tt := range []test{
			{ // reputable and new nodes, happy path
				Preferences:   testNodeSelectionConfig(5, 0.5, false),
				RequestCount:  5,
				ExpectedCount: 5,
			},
			{ // all reputable nodes, happy path
				Preferences:   testNodeSelectionConfig(0, 1, false),
				RequestCount:  5,
				ExpectedCount: 5,
			},
			{ // all new nodes, happy path
				Preferences:   testNodeSelectionConfig(50, 1, false),
				RequestCount:  5,
				ExpectedCount: 5,
			},
			{ // reputable and new nodes, requested too many
				Preferences:    testNodeSelectionConfig(5, 0.5, false),
				RequestCount:   10,
				ExpectedCount:  5,
				ShouldFailWith: &overlay.ErrNotEnoughNodes,
			},
			{ // all reputable nodes, requested too many
				Preferences:    testNodeSelectionConfig(0, 1, false),
				RequestCount:   10,
				ExpectedCount:  5,
				ShouldFailWith: &overlay.ErrNotEnoughNodes,
			},
			{ // all new nodes, requested too many
				Preferences:    testNodeSelectionConfig(50, 1, false),
				RequestCount:   10,
				ExpectedCount:  5,
				ShouldFailWith: &overlay.ErrNotEnoughNodes,
			},
		} {
			t.Logf("#%2d. %+v", i, tt)
			response, err := satellite.Overlay.Service.FindStorageNodesWithPreferences(ctx, overlay.FindStorageNodesRequest{
				RequestedCount: tt.RequestCount,
			}, &tt.Preferences)
			t.Log(len(response), err)
			if tt.ShouldFailWith != nil {
				assert.Error(t, err)
				assert.True(t, tt.ShouldFailWith.Has(err))
			} else {
				assert.NoError(t, err)
			}
			assert.Equal(t, tt.ExpectedCount, len(response))

			// expect no exiting nodes in selection
			for _, node := range response {
				assert.False(t, exitingNodes[node.ID])
			}
		}
	})
}
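
// TestFindStorageNodesDistinctNetworks verifies that, with DistinctIP
// enabled, selection returns nodes from distinct networks and honors
// excluded node IDs.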
func TestFindStorageNodesDistinctNetworks(t *testing.T) {
	if runtime.GOOS == "darwin" {
		t.Skip("Test does not work with macOS")
	}

	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 5, UplinkCount: 1,
		Reconfigure: testplanet.Reconfigure{
			// will create 3 storage nodes with same IP; 2 will have unique
			UniqueIPCount: 2,
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Overlay.Node.DistinctIP = true
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]

		// select one of the nodes that shares an IP with others to exclude
		var excludedNodes storj.NodeIDList
		addrCounts := make(map[string]int)
		var excludedNodeAddr string
		for _, node := range planet.StorageNodes {
			addrNoPort := strings.Split(node.Addr(), ":")[0]
			if addrCounts[addrNoPort] > 0 && len(excludedNodes) == 0 {
				excludedNodes = append(excludedNodes, node.ID())
				break
			}
			addrCounts[addrNoPort]++
		}
		require.Len(t, excludedNodes, 1)

		res, err := satellite.Overlay.Service.Get(ctx, excludedNodes[0])
		require.NoError(t, err)
		excludedNodeAddr = res.LastIPPort

		req := overlay.FindStorageNodesRequest{
			MinimumRequiredNodes: 2,
			RequestedCount:       2,
			ExcludedIDs:          excludedNodes,
		}
		nodes, err := satellite.Overlay.Service.FindStorageNodes(ctx, req)
		require.NoError(t, err)
		require.Len(t, nodes, 2)
		require.NotEqual(t, nodes[0].LastIPPort, nodes[1].LastIPPort)
		require.NotEqual(t, nodes[0].LastIPPort, excludedNodeAddr)
		require.NotEqual(t, nodes[1].LastIPPort, excludedNodeAddr)

		req = overlay.FindStorageNodesRequest{
			MinimumRequiredNodes: 3,
			RequestedCount:       3,
			ExcludedIDs:          excludedNodes,
		}
		_, err = satellite.Overlay.Service.FindStorageNodes(ctx, req)
		require.Error(t, err)
	})
}
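
// TestSelectNewStorageNodesExcludedIPs verifies that new-node selection
// (NewNodeFraction = 1) also returns distinct IPs and honors exclusions.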
func TestSelectNewStorageNodesExcludedIPs(t *testing.T) {
	if runtime.GOOS == "darwin" {
		t.Skip("Test does not work with macOS")
	}

	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
		Reconfigure: testplanet.Reconfigure{
			// will create 2 storage nodes with same IP; 2 will have unique
			UniqueIPCount: 2,
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Overlay.Node.DistinctIP = true
				config.Overlay.Node.NewNodeFraction = 1
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]

		// select one of the nodes that shares an IP with others to exclude
		var excludedNodes storj.NodeIDList
		addrCounts := make(map[string]int)
		var excludedNodeAddr string
		for _, node := range planet.StorageNodes {
			addrNoPort := strings.Split(node.Addr(), ":")[0]
			if addrCounts[addrNoPort] > 0 {
				excludedNodes = append(excludedNodes, node.ID())
				break
			}
			addrCounts[addrNoPort]++
		}
		require.Len(t, excludedNodes, 1)

		res, err := satellite.Overlay.Service.Get(ctx, excludedNodes[0])
		require.NoError(t, err)
		excludedNodeAddr = res.LastIPPort

		req := overlay.FindStorageNodesRequest{
			MinimumRequiredNodes: 2,
			RequestedCount:       2,
			ExcludedIDs:          excludedNodes,
		}
		nodes, err := satellite.Overlay.Service.FindStorageNodes(ctx, req)
		require.NoError(t, err)
		require.Len(t, nodes, 2)
		require.NotEqual(t, nodes[0].LastIPPort, nodes[1].LastIPPort)
		require.NotEqual(t, nodes[0].LastIPPort, excludedNodeAddr)
		require.NotEqual(t, nodes[1].LastIPPort, excludedNodeAddr)
	})
}
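
// TestDistinctIPs makes nodes[8] and nodes[9] reputable via UpdateStats and
// then runs the shared distinct-IP scenarios in testDistinctIPs.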
func TestDistinctIPs(t *testing.T) {
	if runtime.GOOS == "darwin" {
		t.Skip("Test does not work with macOS")
	}

	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 10, UplinkCount: 1,
		Reconfigure: testplanet.Reconfigure{
			UniqueIPCount: 3,
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		// This sets a reputable audit count for nodes[8] and nodes[9].
		for i := 9; i > 7; i-- {
			_, err := satellite.DB.OverlayCache().UpdateStats(ctx, &overlay.UpdateRequest{
				NodeID:       planet.StorageNodes[i].ID(),
				IsUp:         true,
				AuditOutcome: overlay.AuditSuccess,
				AuditLambda:  1,
				AuditWeight:  1,
				AuditDQ:      0.5,
			})
			assert.NoError(t, err)
		}
		testDistinctIPs(t, ctx, planet)
	})
}
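
// TestDistinctIPsWithBatch is the same scenario as TestDistinctIPs, but
// seeds the audit counts through BatchUpdateStats.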
func TestDistinctIPsWithBatch(t *testing.T) {
	if runtime.GOOS == "darwin" {
		t.Skip("Test does not work with macOS")
	}

	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 10, UplinkCount: 1,
		Reconfigure: testplanet.Reconfigure{
			UniqueIPCount: 3,
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		// This sets a reputable audit count for nodes[8] and nodes[9].
		for i := 9; i > 7; i-- {
			// These are done individually b/c the previous stat data is important
			_, err := satellite.DB.OverlayCache().BatchUpdateStats(ctx, []*overlay.UpdateRequest{{
				NodeID:       planet.StorageNodes[i].ID(),
				IsUp:         true,
				AuditOutcome: overlay.AuditSuccess,
				AuditLambda:  1,
				AuditWeight:  1,
				AuditDQ:      0.5,
			}}, 1)
			assert.NoError(t, err)
		}
		testDistinctIPs(t, ctx, planet)
	})
}
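
// testDistinctIPs requests nodes with DistinctIP both on and off, asserting
// that the returned IPs are unique whenever DistinctIP is enabled.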
func testDistinctIPs(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
	satellite := planet.Satellites[0]
	service := satellite.Overlay.Service

	tests := []struct {
		duplicateCount int
		requestCount   int
		preferences    overlay.NodeSelectionConfig
		shouldFailWith *errs.Class
	}{
		{ // test only distinct IPs with half new nodes
			requestCount: 4,
			preferences:  testNodeSelectionConfig(1, 0.5, true),
		},
		{ // test not enough distinct IPs
			requestCount:   7,
			preferences:    testNodeSelectionConfig(0, 0, true),
			shouldFailWith: &overlay.ErrNotEnoughNodes,
		},
		{ // test distinct flag false allows duplicates
			duplicateCount: 10,
			requestCount:   5,
			preferences:    testNodeSelectionConfig(0, 0.5, false),
		},
	}

	for _, tt := range tests {
		response, err := service.FindStorageNodesWithPreferences(ctx, overlay.FindStorageNodesRequest{
			RequestedCount: tt.requestCount,
		}, &tt.preferences)
		if tt.shouldFailWith != nil {
			assert.Error(t, err)
			assert.True(t, tt.shouldFailWith.Has(err))
			continue
		}
		require.NoError(t, err)

		// assert all IPs are unique
		if tt.preferences.DistinctIP {
			ips := make(map[string]bool)
			for _, n := range response {
				assert.False(t, ips[n.LastIPPort])
				ips[n.LastIPPort] = true
			}
		}

		assert.Equal(t, tt.requestCount, len(response))
	}
}
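
// TestAddrtoNetwork_Conversion checks that ResolveIPAndNetwork truncates an
// IPv4 address to its /24 network and an IPv6 address to its /64 network.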
func TestAddrtoNetwork_Conversion(t *testing.T) {
	ctx := testcontext.New(t)
	defer ctx.Cleanup()

	ip := "8.8.8.8:28967"
	resolvedIPPort, network, err := overlay.ResolveIPAndNetwork(ctx, ip)
	require.NoError(t, err)
	require.Equal(t, "8.8.8.0", network)
	require.Equal(t, ip, resolvedIPPort)

	ipv6 := "[fc00::1:200]:28967"
	resolvedIPPort, network, err = overlay.ResolveIPAndNetwork(ctx, ipv6)
	require.NoError(t, err)
	require.Equal(t, "fc00::", network)
	require.Equal(t, ipv6, resolvedIPPort)
}