1de8a695e8
We had a bug in the stray nodes chore where nodes that had not been seen in several months were not being DQd. We figured out that this was happening because we were using two queries: the first to grab nodes where last_contact_success < some cutoff, the second to DQ them unless last_contact_success == '0001-01-01 00:00:00+00'. The problem is that if all of the nodes returned from the first query had a last_contact_success of '0001-01-01 00:00:00+00', we would pass them to the second query, which would not DQ them. The stray nodes DQ loop would then end, since it found fewer nodes to DQ than the limit.

The fix: add "WHERE last_contact_success != '0001-01-01 00:00:00+00'::timestamptz" to the selection query.

Change-Id: I4e60de90b68d8745d641b4467c2b23e0e56f7dff
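In rough terms, the selection query changes like the sketch below. This is illustrative only: the table name, projected column, and parameters are assumptions reconstructed from the message above, not the satellite's actual SQL; only the added exclusion clause is quoted from the change.

	// hypothetical shape of the stray nodes selection query after the fix;
	// names here are illustrative, not the overlay DB's real schema
	const selectStrayNodesForDQ = `
		SELECT id FROM nodes
		WHERE last_contact_success < $1
		  AND last_contact_success != '0001-01-01 00:00:00+00'::timestamptz
		LIMIT $2
	`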
128 lines
3.9 KiB
Go
// Copyright (C) 2020 Storj Labs, Inc.
// See LICENSE for copying information.

package straynodes_test

import (
	"testing"
	"time"

	"github.com/stretchr/testify/require"
	"go.uber.org/zap"

	"storj.io/common/pb"
	"storj.io/common/testcontext"
	"storj.io/storj/private/testplanet"
	"storj.io/storj/satellite"
	"storj.io/storj/satellite/overlay"
)

func TestDQStrayNodes(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 2,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.StrayNodes.MaxDurationWithoutContact = 24 * time.Hour
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		strayNode := planet.StorageNodes[0]
		liveNode := planet.StorageNodes[1]
		sat := planet.Satellites[0]
		strayNode.Contact.Chore.Pause(ctx)
		sat.Overlay.DQStrayNodes.Loop.Pause()

		cache := planet.Satellites[0].Overlay.DB

		strayInfo, err := cache.Get(ctx, strayNode.ID())
		require.NoError(t, err)
		require.Nil(t, strayInfo.Disqualified)

		checkInInfo := overlay.NodeCheckInInfo{
			NodeID: strayNode.ID(),
			IsUp:   true,
			Address: &pb.NodeAddress{
				Address: "1.2.3.4",
			},
			Version: &pb.NodeVersion{
				Version:    "v0.0.0",
				CommitHash: "",
				Timestamp:  time.Time{},
				Release:    false,
			},
		}

		// set strayNode last_contact_success to 48 hours ago
		require.NoError(t, sat.Overlay.DB.UpdateCheckIn(ctx, checkInInfo, time.Now().Add(-48*time.Hour), sat.Config.Overlay.Node))

		// run the chore once: strayNode is past the 24h cutoff and should be
		// DQd; liveNode has checked in recently and should be untouched
		sat.Overlay.DQStrayNodes.Loop.TriggerWait()

		strayInfo, err = cache.Get(ctx, strayNode.ID())
		require.NoError(t, err)
		require.NotNil(t, strayInfo.Disqualified)

		liveInfo, err := cache.Get(ctx, liveNode.ID())
		require.NoError(t, err)
		require.Nil(t, liveInfo.Disqualified)
	})
}

// We had a bug in the stray nodes chore where nodes that had not been seen
// in several months were not being DQd. We figured out that this was
// happening because we were using two queries: the first to grab
// nodes where last_contact_success < some cutoff, the second to DQ them
// unless last_contact_success == '0001-01-01 00:00:00+00'. The problem
// is that if all of the nodes returned from the first query had a
// last_contact_success of '0001-01-01 00:00:00+00', we would pass them to
// the second query, which would not DQ them. This would result in the stray
// nodes DQ loop ending with no DQs. This test consistently failed until the
// fix was implemented.
func TestNodesWithNoLastContactSuccessDoNotBlockDQOfOtherNodes(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 2,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.StrayNodes.MaxDurationWithoutContact = 24 * time.Hour
				config.StrayNodes.Limit = 1
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		node1 := planet.StorageNodes[0]
		node2 := planet.StorageNodes[1]
		sat := planet.Satellites[0]
		node1.Contact.Chore.Pause(ctx)
		node2.Contact.Chore.Pause(ctx)
		sat.Overlay.DQStrayNodes.Loop.Pause()

		cache := planet.Satellites[0].Overlay.DB

		checkInInfo := overlay.NodeCheckInInfo{
			NodeID: node1.ID(),
			IsUp:   true,
			Address: &pb.NodeAddress{
				Address: "1.2.3.4",
			},
			Version: &pb.NodeVersion{
				Version:    "v0.0.0",
				CommitHash: "",
				Timestamp:  time.Time{},
				Release:    false,
			},
		}

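		// node1 checks in with time.Time{}, which is stored as the sentinel
		// '0001-01-01 00:00:00+00' last_contact_success described above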
		require.NoError(t, sat.Overlay.DB.UpdateCheckIn(ctx, checkInInfo, time.Time{}, sat.Config.Overlay.Node))

		// node2's last_contact_success is a real timestamp, 48 hours ago
		checkInInfo.NodeID = node2.ID()
		require.NoError(t, sat.Overlay.DB.UpdateCheckIn(ctx, checkInInfo, time.Now().Add(-48*time.Hour), sat.Config.Overlay.Node))

		// with Limit = 1 and the fix in place, node1's sentinel timestamp does
		// not block the loop from reaching and DQing node2
		sat.Overlay.DQStrayNodes.Loop.TriggerWait()

		n1Info, err := cache.Get(ctx, node1.ID())
		require.NoError(t, err)
		require.Nil(t, n1Info.Disqualified)

		n2Info, err := cache.Get(ctx, node2.ID())
		require.NoError(t, err)
		require.NotNil(t, n2Info.Disqualified)
	})
}