storj/satellite/overlay/db_test.go
Cameron Ayer a0c5da6643 satellite/satellitedb: in stray nodes DQ, don't DQ nodes where last_contact_success = '0001-01-01 00:00:00+00'
When nodes check in for the very first time, if the satellite can't ping
them back, they are inserted into the nodes table with
last_contact_success of '0001-01-01 00:00:00+00'. If the stray nodes
chore runs before the node can fix their problem, they are DQd.

Solution: when DQing stray nodes, don't DQ where last_contact_success =
'0001-01-01 00:00:00+00'::timestamptz

Change-Id: I477a02d5ef85b2c930ed6b7d99a4d1995169bca8
2021-04-22 10:13:13 -04:00

291 lines
10 KiB
Go

// Copyright (C) 2020 Storj Labs, Inc.
// See LICENSE for copying information.
package overlay_test
import (
"fmt"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/zap"
"storj.io/common/pb"
"storj.io/common/testcontext"
"storj.io/storj/private/testplanet"
"storj.io/storj/private/teststorj"
"storj.io/storj/satellite"
"storj.io/storj/satellite/overlay"
"storj.io/storj/storagenode"
)
// TestDQNodesLastSeenBefore exercises the overlay DB's DQNodesLastSeenBefore and checks that:
//   - the limit argument caps how many nodes are disqualified per call,
//   - a node whose graceful exit has finished is not disqualified,
//   - nodes that are already disqualified keep their original disqualification time.
func TestDQNodesLastSeenBefore(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 3,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		// The test calls DQNodesLastSeenBefore directly, so keep the chore loop paused.
		planet.Satellites[0].Overlay.DQStrayNodes.Loop.Pause()

		node1 := planet.StorageNodes[0]
		node2 := planet.StorageNodes[1]
		node3 := planet.StorageNodes[2]
		// NOTE(review): presumably paused so that node check-ins do not change
		// contact data while the test runs — confirm.
		node1.Contact.Chore.Pause(ctx)
		node2.Contact.Chore.Pause(ctx)
		node3.Contact.Chore.Pause(ctx)

		cache := planet.Satellites[0].Overlay.DB

		// initial check that node1 is not disqualified
		n1Info, err := cache.Get(ctx, node1.ID())
		require.NoError(t, err)
		require.Nil(t, n1Info.Disqualified)

		// initial check that node2 is not disqualified
		n2Info, err := cache.Get(ctx, node2.ID())
		require.NoError(t, err)
		require.Nil(t, n2Info.Disqualified)

		// gracefully exit node3
		req := &overlay.ExitStatusRequest{
			NodeID:              node3.ID(),
			ExitInitiatedAt:     time.Now(),
			ExitLoopCompletedAt: time.Now(),
			ExitFinishedAt:      time.Now(),
		}
		dossier, err := cache.UpdateExitStatus(ctx, req)
		require.NoError(t, err)
		require.NotNil(t, dossier.ExitStatus.ExitFinishedAt)

		// check that limit works, set limit = 1
		// run twice to DQ nodes 1 and 2
		for i := 0; i < 2; i++ {
			n, err := cache.DQNodesLastSeenBefore(ctx, time.Now(), 1)
			require.NoError(t, err)
			// testify takes (expected, actual); keep that order so failure output is accurate.
			require.Equal(t, 1, n)
		}

		n1Info, err = cache.Get(ctx, node1.ID())
		require.NoError(t, err)
		require.NotNil(t, n1Info.Disqualified)
		n1DQTime := n1Info.Disqualified

		n2Info, err = cache.Get(ctx, node2.ID())
		require.NoError(t, err)
		require.NotNil(t, n2Info.Disqualified)
		n2DQTime := n2Info.Disqualified

		// there should be no more nodes for DQ
		// use higher limit to double check that DQ times
		// for nodes 1 and 2 have not changed
		n, err := cache.DQNodesLastSeenBefore(ctx, time.Now(), 100)
		require.NoError(t, err)
		require.Equal(t, 0, n)

		// double check that node 3 is not DQ
		n3Info, err := cache.Get(ctx, node3.ID())
		require.NoError(t, err)
		require.Nil(t, n3Info.Disqualified)

		// double check dq times not updated; the time captured from the dossier's
		// Disqualified field is compared against Reputation.Disqualified.
		n1Info, err = cache.Get(ctx, node1.ID())
		require.NoError(t, err)
		require.NotNil(t, n1Info.Disqualified)
		require.Equal(t, n1DQTime, n1Info.Reputation.Disqualified)

		n2Info, err = cache.Get(ctx, node2.ID())
		require.NoError(t, err)
		require.NotNil(t, n2Info.Disqualified)
		require.Equal(t, n2DQTime, n2Info.Reputation.Disqualified)
	})
}
// TestDQStrayNodesFailedPingback covers new nodes that fail the satellite's initial pingback.
// Such a node is stored with last_contact_success = '0001-01-01 00:00:00+00', while the DQ stray
// nodes chore DQs nodes where last_contact_success < now() - 30d. The test checks in a node with
// IsUp = false, triggers the chore, and asserts the node is still not disqualified.
func TestDQStrayNodesFailedPingback(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 0,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		sys := planet.Satellites[0]
		// Keep the chore idle until it is triggered explicitly below.
		sys.Overlay.DQStrayNodes.Loop.Pause()

		overlayDB := sys.DB.OverlayCache()
		nodeID := teststorj.NodeIDFromString("test")

		// requireUntouched asserts that the node has a zero last-contact-success
		// time and carries no disqualification timestamp.
		requireUntouched := func() {
			dossier, err := overlayDB.Get(ctx, nodeID)
			require.NoError(t, err)
			require.Equal(t, time.Time{}, dossier.Reputation.LastContactSuccess.UTC())
			require.Nil(t, dossier.Reputation.Disqualified)
		}

		// Check in with IsUp = false, i.e. the pingback did not succeed.
		info := overlay.NodeCheckInInfo{
			NodeID: nodeID,
			IsUp:   false,
			Address: &pb.NodeAddress{
				Address: "1.2.3.4",
			},
			Version: &pb.NodeVersion{
				Version:    "v0.0.0",
				CommitHash: "",
				Timestamp:  time.Time{},
				Release:    false,
			},
		}
		err := overlayDB.UpdateCheckIn(ctx, info, time.Now(), sys.Config.Overlay.Node)
		require.NoError(t, err)

		// State right after the failed check-in.
		requireUntouched()

		// Run one iteration of the stray-nodes chore and make sure nothing changed.
		sys.Overlay.DQStrayNodes.Loop.TriggerWait()
		requireUntouched()
	})
}
// TestUpdateStats checks vetting through UpdateStats with AuditCount configured to 2:
// the node is unvetted after one audit, vetted after the second, and a third audit
// leaves the VettedAt timestamp unchanged.
func TestUpdateStats(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 1,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Overlay.Node.AuditCount = 2
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		storageNode := planet.StorageNodes[0]
		storageNode.Contact.Chore.Pause(ctx)

		overlayDB := planet.Satellites[0].DB.OverlayCache()

		req := &overlay.UpdateRequest{
			NodeID:                   storageNode.ID(),
			AuditOutcome:             overlay.AuditOffline,
			AuditsRequiredForVetting: planet.Satellites[0].Config.Overlay.Node.AuditCount,
			AuditHistory:             testAuditHistoryConfig(),
		}

		// First audit: below the vetting threshold.
		firstStats, err := overlayDB.UpdateStats(ctx, req, time.Now())
		require.NoError(t, err)
		assert.Nil(t, firstStats.VettedAt)
		assert.EqualValues(t, 1, firstStats.AuditCount)

		// Second audit: threshold reached, node becomes vetted.
		req.NodeID = storageNode.ID()
		req.AuditOutcome = overlay.AuditOffline
		vettedStats, err := overlayDB.UpdateStats(ctx, req, time.Now())
		require.NoError(t, err)
		assert.NotNil(t, vettedStats.VettedAt)
		assert.EqualValues(t, 2, vettedStats.AuditCount)

		// Third audit: vetted_at must keep the value from the second audit.
		req.NodeID = storageNode.ID()
		req.AuditOutcome = overlay.AuditSuccess
		laterStats, err := overlayDB.UpdateStats(ctx, req, time.Now())
		require.NoError(t, err)
		assert.NotNil(t, laterStats.VettedAt)
		assert.Equal(t, vettedStats.VettedAt, laterStats.VettedAt)
		assert.EqualValues(t, 3, laterStats.AuditCount)
	})
}
// TestBatchUpdateStats is the batch counterpart of the vetting check: with AuditCount
// configured to 2, two nodes are updated through BatchUpdateStats three times. Both are
// unvetted after the first batch, vetted after the second, and keep the same VettedAt
// after the third.
func TestBatchUpdateStats(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 2,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Overlay.Node.AuditCount = 2
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		nodeA := planet.StorageNodes[0]
		nodeB := planet.StorageNodes[1]
		nodeA.Contact.Chore.Pause(ctx)
		nodeB.Contact.Chore.Pause(ctx)

		overlayDB := planet.Satellites[0].DB.OverlayCache()
		required := planet.Satellites[0].Config.Overlay.Node.AuditCount
		batchSize := 2

		// Batch 1: one audit each, so neither node is vetted yet.
		batch := []*overlay.UpdateRequest{
			{NodeID: nodeA.ID(), AuditOutcome: overlay.AuditOffline, AuditsRequiredForVetting: required, AuditHistory: testAuditHistoryConfig()},
			{NodeID: nodeB.ID(), AuditOutcome: overlay.AuditSuccess, AuditsRequiredForVetting: required, AuditHistory: testAuditHistoryConfig()},
		}
		failed, err := overlayDB.BatchUpdateStats(ctx, batch, batchSize, time.Now())
		require.NoError(t, err)
		assert.Len(t, failed, 0)

		infoA, err := overlayDB.Get(ctx, nodeA.ID())
		require.NoError(t, err)
		assert.Nil(t, infoA.Reputation.VettedAt)
		assert.EqualValues(t, 1, infoA.Reputation.AuditCount)

		infoB, err := overlayDB.Get(ctx, nodeB.ID())
		require.NoError(t, err)
		assert.Nil(t, infoB.Reputation.VettedAt)
		assert.EqualValues(t, 1, infoB.Reputation.AuditCount)

		// Batch 2: second audit each, both nodes become vetted.
		batch = []*overlay.UpdateRequest{
			{NodeID: nodeA.ID(), AuditOutcome: overlay.AuditOffline, AuditsRequiredForVetting: required, AuditHistory: testAuditHistoryConfig()},
			{NodeID: nodeB.ID(), AuditOutcome: overlay.AuditFailure, AuditsRequiredForVetting: required, AuditHistory: testAuditHistoryConfig()},
		}
		failed, err = overlayDB.BatchUpdateStats(ctx, batch, batchSize, time.Now())
		require.NoError(t, err)
		assert.Len(t, failed, 0)

		infoA, err = overlayDB.Get(ctx, nodeA.ID())
		require.NoError(t, err)
		assert.NotNil(t, infoA.Reputation.VettedAt)
		assert.EqualValues(t, 2, infoA.Reputation.AuditCount)

		infoB, err = overlayDB.Get(ctx, nodeB.ID())
		require.NoError(t, err)
		assert.NotNil(t, infoB.Reputation.VettedAt)
		assert.EqualValues(t, 2, infoB.Reputation.AuditCount)

		// Batch 3: a further audit must not overwrite the vetted_at timestamps.
		batch = []*overlay.UpdateRequest{
			{NodeID: nodeA.ID(), AuditOutcome: overlay.AuditSuccess, AuditsRequiredForVetting: required, AuditHistory: testAuditHistoryConfig()},
			{NodeID: nodeB.ID(), AuditOutcome: overlay.AuditSuccess, AuditsRequiredForVetting: required, AuditHistory: testAuditHistoryConfig()},
		}
		failed, err = overlayDB.BatchUpdateStats(ctx, batch, batchSize, time.Now())
		require.NoError(t, err)
		assert.Len(t, failed, 0)

		laterA, err := overlayDB.Get(ctx, nodeA.ID())
		require.NoError(t, err)
		assert.NotNil(t, laterA.Reputation.VettedAt)
		assert.Equal(t, infoA.Reputation.VettedAt, laterA.Reputation.VettedAt)
		assert.EqualValues(t, 3, laterA.Reputation.AuditCount)

		laterB, err := overlayDB.Get(ctx, nodeB.ID())
		require.NoError(t, err)
		assert.NotNil(t, laterB.Reputation.VettedAt)
		assert.Equal(t, infoB.Reputation.VettedAt, laterB.Reputation.VettedAt)
		assert.EqualValues(t, 3, laterB.Reputation.AuditCount)
	})
}
// TestOperatorConfig gives each storage node a distinct wallet and wallet-features list
// and checks that the operator email, wallet and wallet features returned by the
// satellite's overlay cache match the node's own configuration.
func TestOperatorConfig(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 2, Reconfigure: testplanet.Reconfigure{
			StorageNode: func(index int, config *storagenode.Config) {
				// The node index is embedded in both values so that each node's config is distinct.
				config.Operator.Wallet = fmt.Sprintf("0x%d123456789012345678901234567890123456789", index)
				config.Operator.WalletFeatures = []string{fmt.Sprintf("test_%d", index)}
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		nodes := []*testplanet.StorageNode{planet.StorageNodes[0], planet.StorageNodes[1]}
		for _, sn := range nodes {
			sn.Contact.Chore.Pause(ctx)
		}

		overlayDB := planet.Satellites[0].DB.OverlayCache()

		for _, sn := range nodes {
			dossier, err := overlayDB.Get(ctx, sn.ID())
			require.NoError(t, err)

			require.Equal(t, sn.Config.Operator.Email, dossier.Operator.Email)
			require.Equal(t, sn.Config.Operator.Wallet, dossier.Operator.Wallet)
			require.Equal(t, []string(sn.Config.Operator.WalletFeatures), dossier.Operator.WalletFeatures)
		}
	})
}