satellite/{overlay,satellitedb}: fix stray nodes DQ bug

We had a bug in the stray nodes chore where nodes who had not been seen
in several months were not being DQd. We figured out that this was
happening because we were using two queries: The first to grab
nodes where last_contact_success < some cutoff, the second to DQ them
unless last_contact_success == '0001-01-01 00:00:00+00'. The problem
is that if all of the nodes returned from the first query had
last_contact_success of '0001-01-01 00:00:00+00', we would pass them to
the second query which would not DQ them. This would result in the stray
nodes DQ loop ending early, since the number of nodes actually DQd was less
than the limit.

The fix: add the condition "last_contact_success != '0001-01-01
00:00:00+00'::timestamptz" to the WHERE clause of the selection query.

Change-Id: I4e60de90b68d8745d641b4467c2b23e0e56f7dff
This commit is contained in:
Cameron Ayer 2021-10-28 14:59:04 -04:00 committed by paul cannon
parent 774ae017e3
commit 1de8a695e8
2 changed files with 60 additions and 0 deletions

View File

@ -66,3 +66,62 @@ func TestDQStrayNodes(t *testing.T) {
require.Nil(t, liveInfo.Disqualified)
})
}
// We had a bug in the stray nodes chore where nodes who had not been seen
// in several months were not being DQd. We figured out that this was
// happening because we were using two queries: The first to grab
// nodes where last_contact_success < some cutoff, the second to DQ them
// unless last_contact_success == '0001-01-01 00:00:00+00'. The problem
// is that if all of the nodes returned from the first query had
// last_contact_success of '0001-01-01 00:00:00+00', we would pass them to
// the second query which would not DQ them. This would result in the stray
// nodes DQ loop ending with no DQs. This test consistently failed until the fix
// was implemented.
func TestNodesWithNoLastContactSuccessDoNotBlockDQOfOtherNodes(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 2,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				// A limit of 1 means a single non-disqualifiable candidate is
				// enough to fill a whole batch of the stray-node selection.
				config.StrayNodes.MaxDurationWithoutContact = 24 * time.Hour
				config.StrayNodes.Limit = 1
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		sat := planet.Satellites[0]
		neverSeenNode, strayNode := planet.StorageNodes[0], planet.StorageNodes[1]

		// Stop the nodes from checking in on their own, and keep the chore
		// idle until the test triggers it explicitly.
		for _, node := range planet.StorageNodes {
			node.Contact.Chore.Pause(ctx)
		}
		sat.Overlay.DQStrayNodes.Loop.Pause()

		overlayDB := sat.Overlay.DB

		// First node: checked in with the zero timestamp.
		neverSeenCheckIn := overlay.NodeCheckInInfo{
			NodeID:  neverSeenNode.ID(),
			IsUp:    true,
			Address: &pb.NodeAddress{Address: "1.2.3.4"},
			// Remaining version fields are left at their zero values.
			Version: &pb.NodeVersion{Version: "v0.0.0"},
		}
		err := overlayDB.UpdateCheckIn(ctx, neverSeenCheckIn, time.Time{}, sat.Config.Overlay.Node)
		require.NoError(t, err)

		// Second node: same check-in data, but timestamped 48h ago, which is
		// beyond the 24h MaxDurationWithoutContact configured above.
		strayCheckIn := neverSeenCheckIn
		strayCheckIn.NodeID = strayNode.ID()
		err = overlayDB.UpdateCheckIn(ctx, strayCheckIn, time.Now().Add(-48*time.Hour), sat.Config.Overlay.Node)
		require.NoError(t, err)

		sat.Overlay.DQStrayNodes.Loop.TriggerWait()

		// The node checked in with the zero timestamp must not be disqualified...
		neverSeenInfo, err := overlayDB.Get(ctx, neverSeenNode.ID())
		require.NoError(t, err)
		require.Nil(t, neverSeenInfo.Disqualified)

		// ...while the node last seen 48h ago must be.
		strayInfo, err := overlayDB.Get(ctx, strayNode.ID())
		require.NoError(t, err)
		require.NotNil(t, strayInfo.Disqualified)
	})
}

View File

@ -1037,6 +1037,7 @@ func (cache *overlaycache) getNodesForDQLastSeenBefore(ctx context.Context, cuto
WHERE last_contact_success < $1
AND disqualified is NULL
AND exit_finished_at is NULL
AND last_contact_success != '0001-01-01 00:00:00+00'::timestamptz
LIMIT $2
`), cutoff, limit)
if err != nil {