satellite/{overlay,satellitedb}: fix stray nodes DQ bug
We had a bug in the stray nodes chore where nodes who had not been seen in several months were not being DQd. We figured out that this was happening because we were using two queries: The first to grab nodes where last_contact_success < some cutoff, the second to DQ them unless last_contact_success == '0001-01-01 00:00:00+00'. The problem is that if all of the nodes returned from the first query had last_contact_success of '0001-01-01 00:00:00+00', we would pass them to the second query which would not DQ them. This would result in the stray nodes DQ loop ending since we found a number of nodes to DQ less than the limit. The fix: add the "WHERE last_contact_success != '0001-01-01 00:00:00+00'::timestamptz" to the selection query. Change-Id: I4e60de90b68d8745d641b4467c2b23e0e56f7dff
This commit is contained in:
parent
774ae017e3
commit
1de8a695e8
@ -66,3 +66,62 @@ func TestDQStrayNodes(t *testing.T) {
|
||||
require.Nil(t, liveInfo.Disqualified)
|
||||
})
|
||||
}
|
||||
|
||||
// We had a bug in the stray nodes chore where nodes who had not been seen
|
||||
// in several months were not being DQd. We figured out that this was
|
||||
// happening because we were using two queries: The first to grab
|
||||
// nodes where last_contact_success < some cutoff, the second to DQ them
|
||||
// unless last_contact_success == '0001-01-01 00:00:00+00'. The problem
|
||||
// is that if all of the nodes returned from the first query had
|
||||
// last_contact_success of '0001-01-01 00:00:00+00', we would pass them to
|
||||
// the second query which would not DQ them. This would result in the stray
|
||||
// nodes DQ loop ending with no DQs. This test consistently failed until the fix
|
||||
// was implemented.
|
||||
func TestNodesWithNoLastContactSuccessDoNotBlockDQOfOtherNodes(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1, StorageNodeCount: 2,
|
||||
Reconfigure: testplanet.Reconfigure{
|
||||
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.StrayNodes.MaxDurationWithoutContact = 24 * time.Hour
|
||||
config.StrayNodes.Limit = 1
|
||||
},
|
||||
},
|
||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||
node1 := planet.StorageNodes[0]
|
||||
node2 := planet.StorageNodes[1]
|
||||
sat := planet.Satellites[0]
|
||||
node1.Contact.Chore.Pause(ctx)
|
||||
node2.Contact.Chore.Pause(ctx)
|
||||
sat.Overlay.DQStrayNodes.Loop.Pause()
|
||||
|
||||
cache := planet.Satellites[0].Overlay.DB
|
||||
|
||||
checkInInfo := overlay.NodeCheckInInfo{
|
||||
NodeID: node1.ID(),
|
||||
IsUp: true,
|
||||
Address: &pb.NodeAddress{
|
||||
Address: "1.2.3.4",
|
||||
},
|
||||
Version: &pb.NodeVersion{
|
||||
Version: "v0.0.0",
|
||||
CommitHash: "",
|
||||
Timestamp: time.Time{},
|
||||
Release: false,
|
||||
},
|
||||
}
|
||||
|
||||
require.NoError(t, sat.Overlay.DB.UpdateCheckIn(ctx, checkInInfo, time.Time{}, sat.Config.Overlay.Node))
|
||||
checkInInfo.NodeID = node2.ID()
|
||||
require.NoError(t, sat.Overlay.DB.UpdateCheckIn(ctx, checkInInfo, time.Now().Add(-48*time.Hour), sat.Config.Overlay.Node))
|
||||
|
||||
sat.Overlay.DQStrayNodes.Loop.TriggerWait()
|
||||
|
||||
n1Info, err := cache.Get(ctx, node1.ID())
|
||||
require.NoError(t, err)
|
||||
require.Nil(t, n1Info.Disqualified)
|
||||
|
||||
n2Info, err := cache.Get(ctx, node2.ID())
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, n2Info.Disqualified)
|
||||
})
|
||||
}
|
||||
|
@ -1037,6 +1037,7 @@ func (cache *overlaycache) getNodesForDQLastSeenBefore(ctx context.Context, cuto
|
||||
WHERE last_contact_success < $1
|
||||
AND disqualified is NULL
|
||||
AND exit_finished_at is NULL
|
||||
AND last_contact_success != '0001-01-01 00:00:00+00'::timestamptz
|
||||
LIMIT $2
|
||||
`), cutoff, limit)
|
||||
if err != nil {
|
||||
|
Loading…
Reference in New Issue
Block a user