1de8a695e8
We had a bug in the stray nodes chore where nodes that had not been seen in several months were not being DQd. We figured out that this was happening because we were using two queries: the first to grab nodes where last_contact_success < some cutoff, the second to DQ them unless last_contact_success == '0001-01-01 00:00:00+00'. The problem is that if all of the nodes returned from the first query had a last_contact_success of '0001-01-01 00:00:00+00', we would pass them to the second query, which would not DQ them. The stray nodes DQ loop would then end, since it found fewer nodes to DQ than the limit.

The fix: add "WHERE last_contact_success != '0001-01-01 00:00:00+00'::timestamptz" to the selection query.

Change-Id: I4e60de90b68d8745d641b4467c2b23e0e56f7dff
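In rough terms, the selection query changes like the sketch below. This is illustrative only: the table name, projected column, and parameters are assumptions reconstructed from the message above, not the satellite's actual SQL; only the added exclusion clause is quoted from the change.

	// hypothetical shape of the stray nodes selection query after the fix;
	// names here are illustrative, not the overlay DB's real schema
	const selectStrayNodesForDQ = `
		SELECT id FROM nodes
		WHERE last_contact_success < $1
		  AND last_contact_success != '0001-01-01 00:00:00+00'::timestamptz
		LIMIT $2
	`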
128 lines
3.9 KiB
Go
// Copyright (C) 2020 Storj Labs, Inc.
// See LICENSE for copying information.

package straynodes_test

import (
	"testing"
	"time"

	"github.com/stretchr/testify/require"
	"go.uber.org/zap"

	"storj.io/common/pb"
	"storj.io/common/testcontext"
	"storj.io/storj/private/testplanet"
	"storj.io/storj/satellite"
	"storj.io/storj/satellite/overlay"
)

func TestDQStrayNodes(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 2,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.StrayNodes.MaxDurationWithoutContact = 24 * time.Hour
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		strayNode := planet.StorageNodes[0]
		liveNode := planet.StorageNodes[1]
		sat := planet.Satellites[0]
		strayNode.Contact.Chore.Pause(ctx)
		sat.Overlay.DQStrayNodes.Loop.Pause()

		cache := planet.Satellites[0].Overlay.DB

		strayInfo, err := cache.Get(ctx, strayNode.ID())
		require.NoError(t, err)
		require.Nil(t, strayInfo.Disqualified)

		checkInInfo := overlay.NodeCheckInInfo{
			NodeID: strayNode.ID(),
			IsUp:   true,
			Address: &pb.NodeAddress{
				Address: "1.2.3.4",
			},
			Version: &pb.NodeVersion{
				Version:    "v0.0.0",
				CommitHash: "",
				Timestamp:  time.Time{},
				Release:    false,
			},
		}

		// set strayNode last_contact_success to 48 hours ago
		require.NoError(t, sat.Overlay.DB.UpdateCheckIn(ctx, checkInInfo, time.Now().Add(-48*time.Hour), sat.Config.Overlay.Node))

		// run the chore once: strayNode is past the 24h cutoff and should be
		// DQd; liveNode has checked in recently and should be untouched
		sat.Overlay.DQStrayNodes.Loop.TriggerWait()

		strayInfo, err = cache.Get(ctx, strayNode.ID())
		require.NoError(t, err)
		require.NotNil(t, strayInfo.Disqualified)

		liveInfo, err := cache.Get(ctx, liveNode.ID())
		require.NoError(t, err)
		require.Nil(t, liveInfo.Disqualified)
	})
}

// We had a bug in the stray nodes chore where nodes that had not been seen
// in several months were not being DQd. We figured out that this was
// happening because we were using two queries: the first to grab
// nodes where last_contact_success < some cutoff, the second to DQ them
// unless last_contact_success == '0001-01-01 00:00:00+00'. The problem
// is that if all of the nodes returned from the first query had a
// last_contact_success of '0001-01-01 00:00:00+00', we would pass them to
// the second query, which would not DQ them. This would result in the stray
// nodes DQ loop ending with no DQs. This test consistently failed until the
// fix was implemented.
func TestNodesWithNoLastContactSuccessDoNotBlockDQOfOtherNodes(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 2,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.StrayNodes.MaxDurationWithoutContact = 24 * time.Hour
				config.StrayNodes.Limit = 1
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		node1 := planet.StorageNodes[0]
		node2 := planet.StorageNodes[1]
		sat := planet.Satellites[0]
		node1.Contact.Chore.Pause(ctx)
		node2.Contact.Chore.Pause(ctx)
		sat.Overlay.DQStrayNodes.Loop.Pause()

		cache := planet.Satellites[0].Overlay.DB

		checkInInfo := overlay.NodeCheckInInfo{
			NodeID: node1.ID(),
			IsUp:   true,
			Address: &pb.NodeAddress{
				Address: "1.2.3.4",
			},
			Version: &pb.NodeVersion{
				Version:    "v0.0.0",
				CommitHash: "",
				Timestamp:  time.Time{},
				Release:    false,
			},
		}

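		// node1 checks in with time.Time{}, which is stored as the sentinel
		// '0001-01-01 00:00:00+00' last_contact_success described above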
		require.NoError(t, sat.Overlay.DB.UpdateCheckIn(ctx, checkInInfo, time.Time{}, sat.Config.Overlay.Node))

		// node2's last_contact_success is a real timestamp, 48 hours ago
		checkInInfo.NodeID = node2.ID()
		require.NoError(t, sat.Overlay.DB.UpdateCheckIn(ctx, checkInInfo, time.Now().Add(-48*time.Hour), sat.Config.Overlay.Node))

		// with Limit = 1 and the fix in place, node1's sentinel timestamp does
		// not block the loop from reaching and DQing node2
		sat.Overlay.DQStrayNodes.Loop.TriggerWait()

		n1Info, err := cache.Get(ctx, node1.ID())
		require.NoError(t, err)
		require.Nil(t, n1Info.Disqualified)

		n2Info, err := cache.Get(ctx, node2.ID())
		require.NoError(t, err)
		require.NotNil(t, n2Info.Disqualified)
	})
}