storj/satellite/downtime/estimation_chore_test.go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package downtime_test

import (
	"testing"
	"time"

	"github.com/stretchr/testify/require"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"

	"storj.io/common/testcontext"
	"storj.io/storj/private/testplanet"
	"storj.io/storj/satellite"
	"storj.io/storj/satellite/downtime"
)

// TestEstimationChoreBasic tests the basic functionality of the downtime estimation chore:
// 1. A node that had one failed ping, and one successful ping >1s later, has no recorded downtime.
// 2. A node that had one failed ping, and another failed ping >1s later, has recorded downtime
//    (the chore truncates to whole seconds, so the test asserts a value greater than zero).
func TestEstimationChoreBasic(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Downtime.EstimationBatchSize = 1
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		node := planet.StorageNodes[0]
		// named sat (not satellite) so the imported satellite package is not shadowed
		sat := planet.Satellites[0]

		// pause the node's contact chore and the satellite's estimation loop;
		// the estimation loop is driven explicitly below via TriggerWait
		node.Contact.Chore.Pause(ctx)
		sat.DowntimeTracking.EstimationChore.Loop.Pause()

		{ // test estimation chore updates uptime correctly for an online node
			// mark node as failing an uptime check so the estimation chore picks it up
			_, err := sat.DB.OverlayCache().UpdateUptime(ctx, node.ID(), false)
			require.NoError(t, err)
			oldNode, err := sat.DB.OverlayCache().Get(ctx, node.ID())
			require.NoError(t, err)
			require.True(t, oldNode.Reputation.LastContactSuccess.Before(oldNode.Reputation.LastContactFailure))

			// run estimation chore
			time.Sleep(1 * time.Second) // wait for 1s because estimation chore truncates offline duration to seconds
			sat.DowntimeTracking.EstimationChore.Loop.TriggerWait()

			// get offline time for node, expect it to be 0 since node was online when chore pinged it
			now := time.Now()
			offlineTime, err := sat.DB.DowntimeTracking().GetOfflineTime(ctx, node.ID(), now.Add(-5*time.Hour), now)
			require.NoError(t, err)
			// EqualValues(expected, actual) reports the recorded value on failure
			require.EqualValues(t, 0, offlineTime)

			// expect node last contact success was updated
			newNode, err := sat.DB.OverlayCache().Get(ctx, node.ID())
			require.NoError(t, err)
			require.Equal(t, oldNode.Reputation.LastContactFailure, newNode.Reputation.LastContactFailure)
			require.True(t, oldNode.Reputation.LastContactSuccess.Before(newNode.Reputation.LastContactSuccess))
			require.True(t, newNode.Reputation.LastContactFailure.Before(newNode.Reputation.LastContactSuccess))
		}

		{ // test estimation chore correctly aggregates offline time
			// mark node as failing an uptime check so the estimation chore picks it up
			_, err := sat.DB.OverlayCache().UpdateUptime(ctx, node.ID(), false)
			require.NoError(t, err)
			oldNode, err := sat.DB.OverlayCache().Get(ctx, node.ID())
			require.NoError(t, err)
			require.True(t, oldNode.Reputation.LastContactSuccess.Before(oldNode.Reputation.LastContactFailure))

			// close the node service so the ping back will fail
			err = node.Server.Close()
			require.NoError(t, err)

			// run estimation chore
			time.Sleep(1 * time.Second) // wait for 1s because estimation chore truncates offline duration to seconds
			sat.DowntimeTracking.EstimationChore.Loop.TriggerWait()

			// get offline time for node, expect it to be greater than 0 since node has been offline for at least 1s
			now := time.Now()
			offlineTime, err := sat.DB.DowntimeTracking().GetOfflineTime(ctx, node.ID(), now.Add(-5*time.Hour), now)
			require.NoError(t, err)
			require.True(t, offlineTime > 0, "expected recorded offline time > 0, got %v", offlineTime)

			// expect node last contact failure was updated
			newNode, err := sat.DB.OverlayCache().Get(ctx, node.ID())
			require.NoError(t, err)
			require.Equal(t, oldNode.Reputation.LastContactSuccess, newNode.Reputation.LastContactSuccess)
			require.True(t, oldNode.Reputation.LastContactFailure.Before(newNode.Reputation.LastContactFailure))
		}
	})
}

// TestEstimationChoreSatelliteDowntime tests the situation where downtime is estimated when the satellite was started after the last failed ping.
// If a storage node has a failed ping, then another ping fails later, the estimation chore will normally take the difference between these pings and record that as the downtime.
// If the satellite was started between the old failed ping and the new failed ping, we do not want to risk including satellite downtime in our calculation - no downtime should be recorded in this case.
func TestEstimationChoreSatelliteDowntime(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Downtime.EstimationBatchSize = 1
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		node := planet.StorageNodes[0]
		// named sat (not satellite) so the imported satellite package is not shadowed
		sat := planet.Satellites[0]

		// pause the node's contact chore and the satellite's built-in estimation loop;
		// this test runs its own estimation chore instead
		node.Contact.Chore.Pause(ctx)
		sat.DowntimeTracking.EstimationChore.Loop.Pause()

		// mark node as failing an uptime check so the estimation chore picks it up
		_, err := sat.DB.OverlayCache().UpdateUptime(ctx, node.ID(), false)
		require.NoError(t, err)
		oldNode, err := sat.DB.OverlayCache().Get(ctx, node.ID())
		require.NoError(t, err)
		require.True(t, oldNode.Reputation.LastContactSuccess.Before(oldNode.Reputation.LastContactFailure))

		// close the node service so the ping back will fail
		err = node.Server.Close()
		require.NoError(t, err)

		// create new estimation chore that starts after the node's last contacted time
		newEstimationChore := downtime.NewEstimationChore(
			sat.Log,
			downtime.Config{
				EstimationInterval:  1 * time.Second,
				EstimationBatchSize: 10,
			},
			sat.Overlay.Service,
			sat.DowntimeTracking.Service,
			sat.DB.DowntimeTracking(),
		)

		time.Sleep(1 * time.Second) // wait for 1s because estimation chore truncates offline duration to seconds

		var group errgroup.Group
		group.Go(func() error {
			return newEstimationChore.Run(ctx)
		})
		defer func() {
			// Close the chore and join its goroutine before asserting, so a failed
			// assertion on the Close error cannot skip the wait and leave the Run
			// goroutine unjoined. Local error variables keep the deferred cleanup
			// from overwriting the test body's err.
			closeErr := newEstimationChore.Close()
			runErr := group.Wait()
			require.NoError(t, closeErr)
			require.NoError(t, runErr)
		}()

		newEstimationChore.Loop.TriggerWait()

		// since the estimation chore was started after the last ping, the node's offline time should be 0
		now := time.Now()
		offlineTime, err := sat.DB.DowntimeTracking().GetOfflineTime(ctx, node.ID(), now.Add(-5*time.Hour), now)
		require.NoError(t, err)
		// expected value goes first so a failure is reported with the right labels
		require.EqualValues(t, 0, offlineTime)

		// expect node last contact failure was updated
		newNode, err := sat.DB.OverlayCache().Get(ctx, node.ID())
		require.NoError(t, err)
		require.Equal(t, oldNode.Reputation.LastContactSuccess, newNode.Reputation.LastContactSuccess)
		require.True(t, oldNode.Reputation.LastContactFailure.Before(newNode.Reputation.LastContactFailure))
	})
}
satellite/downtime: new chore estimates downtime Adds EstimationChore to the downtime package, which is an independent chore that finds offline nodes given a configurable limit, then uptime checks those nodes, and sets a last contact success or failure given a response. For failed nodes, the chore updates the amount of downtime the node has been offline in the DowntimeTracking table. Design doc section: https://github.com/storj/storj/blob/master/docs/blueprints/storage-node-downtime-tracking.md#estimating-offline-time Jira: https://storjlabs.atlassian.net/browse/V3-2545 Change-Id: I60af95803930bf9b33232b248bb20cca6f0e0b5f 2020-01-07 21:34:48 +00:00			`// Copyright (C) 2019 Storj Labs, Inc.`
			`// See LICENSE for copying information.`

			`package downtime_test`

			`import (`
			`"testing"`
			`"time"`

			`"github.com/stretchr/testify/require"`
			`"go.uber.org/zap"`
satellite/downtime: update detection and estimation downtime chores for more trustworthy downtime tracking Detection chore: Do not update downtime at all from the detection chore. We only want to include downtime between two explicitly failed ping attempts (the duration between last contact success and the first failed ping is no longer included in downtime calculation) Estimation chore: If the satellite started after the last failed ping for a node, do not include offline time since the last failed ping time - only estimate based on two failed pings with no satellite downtime in between. This protects us from including satellite downtime in our storagenode downtime calculations. Change-Id: I1fddc9f7255a7023e02474255d70c64faae75b8a 2020-02-10 20:55:35 +00:00			`"golang.org/x/sync/errgroup"`
satellite/satellitedb/satellitedbtest: pass ctx as an argument ctx is created in most tests, instead pass in as argument to reduce code duplication. Change-Id: I466c51c008392001129c8b007c9d6b3619935ac4 2020-01-19 16:29:15 +00:00
satellite/downtime: new chore estimates downtime Adds EstimationChore to the downtime package, which is an independent chore that finds offline nodes given a configurable limit, then uptime checks those nodes, and sets a last contact success or failure given a response. For failed nodes, the chore updates the amount of downtime the node has been offline in the DowntimeTracking table. Design doc section: https://github.com/storj/storj/blob/master/docs/blueprints/storage-node-downtime-tracking.md#estimating-offline-time Jira: https://storjlabs.atlassian.net/browse/V3-2545 Change-Id: I60af95803930bf9b33232b248bb20cca6f0e0b5f 2020-01-07 21:34:48 +00:00			`"storj.io/common/testcontext"`
			`"storj.io/storj/private/testplanet"`
			`"storj.io/storj/satellite"`
satellite/downtime: update detection and estimation downtime chores for more trustworthy downtime tracking Detection chore: Do not update downtime at all from the detection chore. We only want to include downtime between two explicitly failed ping attempts (the duration between last contact success and the first failed ping is no longer included in downtime calculation) Estimation chore: If the satellite started after the last failed ping for a node, do not include offline time since the last failed ping time - only estimate based on two failed pings with no satellite downtime in between. This protects us from including satellite downtime in our storagenode downtime calculations. Change-Id: I1fddc9f7255a7023e02474255d70c64faae75b8a 2020-02-10 20:55:35 +00:00			`"storj.io/storj/satellite/downtime"`
satellite/downtime: new chore estimates downtime Adds EstimationChore to the downtime package, which is an independent chore that finds offline nodes given a configurable limit, then uptime checks those nodes, and sets a last contact success or failure given a response. For failed nodes, the chore updates the amount of downtime the node has been offline in the DowntimeTracking table. Design doc section: https://github.com/storj/storj/blob/master/docs/blueprints/storage-node-downtime-tracking.md#estimating-offline-time Jira: https://storjlabs.atlassian.net/browse/V3-2545 Change-Id: I60af95803930bf9b33232b248bb20cca6f0e0b5f 2020-01-07 21:34:48 +00:00			`)`

satellite/downtime: update detection and estimation downtime chores for more trustworthy downtime tracking Detection chore: Do not update downtime at all from the detection chore. We only want to include downtime between two explicitly failed ping attempts (the duration between last contact success and the first failed ping is no longer included in downtime calculation) Estimation chore: If the satellite started after the last failed ping for a node, do not include offline time since the last failed ping time - only estimate based on two failed pings with no satellite downtime in between. This protects us from including satellite downtime in our storagenode downtime calculations. Change-Id: I1fddc9f7255a7023e02474255d70c64faae75b8a 2020-02-10 20:55:35 +00:00			`// TestEstimationChoreBasic tests the basic functionality of the downtime estimation chore:`
			`// 1. Test that when a node that had one failed ping, and one successful ping >1s later does not have recorded downtime`
			`// 2. Test that when a node that had one failed ping, and another failed ping >1s later has at least 1s of recorded downtime`
			`func TestEstimationChoreBasic(t *testing.T) {`
satellite/downtime: new chore estimates downtime Adds EstimationChore to the downtime package, which is an independent chore that finds offline nodes given a configurable limit, then uptime checks those nodes, and sets a last contact success or failure given a response. For failed nodes, the chore updates the amount of downtime the node has been offline in the DowntimeTracking table. Design doc section: https://github.com/storj/storj/blob/master/docs/blueprints/storage-node-downtime-tracking.md#estimating-offline-time Jira: https://storjlabs.atlassian.net/browse/V3-2545 Change-Id: I60af95803930bf9b33232b248bb20cca6f0e0b5f 2020-01-07 21:34:48 +00:00			`testplanet.Run(t, testplanet.Config{`
			`SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,`
			`Reconfigure: testplanet.Reconfigure{`
			`Satellite: func(log *zap.Logger, index int, config *satellite.Config) {`
			`config.Downtime.EstimationBatchSize = 1`
			`},`
			`},`
			`}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {`
			`node := planet.StorageNodes[0]`
			`satellite := planet.Satellites[0]`
{storagenode/contact, private/testplanet}: remove ErrFailureToStart and panic in testplanet.Start Change-Id: I252e8c9407400af7bda95a7657c8154660c3c801 2020-02-19 18:32:53 +00:00			`node.Contact.Chore.Pause(ctx)`
satellite/downtime: new chore estimates downtime Adds EstimationChore to the downtime package, which is an independent chore that finds offline nodes given a configurable limit, then uptime checks those nodes, and sets a last contact success or failure given a response. For failed nodes, the chore updates the amount of downtime the node has been offline in the DowntimeTracking table. Design doc section: https://github.com/storj/storj/blob/master/docs/blueprints/storage-node-downtime-tracking.md#estimating-offline-time Jira: https://storjlabs.atlassian.net/browse/V3-2545 Change-Id: I60af95803930bf9b33232b248bb20cca6f0e0b5f 2020-01-07 21:34:48 +00:00			`satellite.DowntimeTracking.EstimationChore.Loop.Pause()`
			`{ // test estimation chore updates uptime correctly for an online node`
			`// mark node as failing an uptime check so the estimation chore picks it up`
			`_, err := satellite.DB.OverlayCache().UpdateUptime(ctx, node.ID(), false)`
			`require.NoError(t, err)`
			`oldNode, err := satellite.DB.OverlayCache().Get(ctx, node.ID())`
			`require.NoError(t, err)`
			`require.True(t, oldNode.Reputation.LastContactSuccess.Before(oldNode.Reputation.LastContactFailure))`
			`// run estimation chore`
			`time.Sleep(1 * time.Second) // wait for 1s because estimation chore truncates offline duration to seconds`
			`satellite.DowntimeTracking.EstimationChore.Loop.TriggerWait()`
			`// get offline time for node, expect it to be 0 since node was online when chore pinged it`
			`downtime, err := satellite.DB.DowntimeTracking().GetOfflineTime(ctx, node.ID(), time.Now().Add(-5*time.Hour), time.Now())`
			`require.NoError(t, err)`
			`require.True(t, downtime == 0)`
			`// expect node last contact success was updated`
			`newNode, err := satellite.DB.OverlayCache().Get(ctx, node.ID())`
			`require.NoError(t, err)`
			`require.Equal(t, oldNode.Reputation.LastContactFailure, newNode.Reputation.LastContactFailure)`
			`require.True(t, oldNode.Reputation.LastContactSuccess.Before(newNode.Reputation.LastContactSuccess))`
			`require.True(t, newNode.Reputation.LastContactFailure.Before(newNode.Reputation.LastContactSuccess))`
			`}`
			`{ // test estimation chore correctly aggregates offline time`
			`// mark node as failing an uptime check so the estimation chore picks it up`
			`_, err := satellite.DB.OverlayCache().UpdateUptime(ctx, node.ID(), false)`
			`require.NoError(t, err)`
			`oldNode, err := satellite.DB.OverlayCache().Get(ctx, node.ID())`
			`require.NoError(t, err)`
			`require.True(t, oldNode.Reputation.LastContactSuccess.Before(oldNode.Reputation.LastContactFailure))`
			`// close the node service so the ping back will fail`
			`err = node.Server.Close()`
			`require.NoError(t, err)`
			`// run estimation chore`
			`time.Sleep(1 * time.Second) // wait for 1s because estimation chore truncates offline duration to seconds`
			`satellite.DowntimeTracking.EstimationChore.Loop.TriggerWait()`
			`// get offline time for node, expect it to be greater than 0 since node has been offline for at least 1s`
			`downtime, err := satellite.DB.DowntimeTracking().GetOfflineTime(ctx, node.ID(), time.Now().Add(-5*time.Hour), time.Now())`
			`require.NoError(t, err)`
			`require.True(t, downtime > 0)`
			`// expect node last contact failure was updated`
			`newNode, err := satellite.DB.OverlayCache().Get(ctx, node.ID())`
			`require.NoError(t, err)`
			`require.Equal(t, oldNode.Reputation.LastContactSuccess, newNode.Reputation.LastContactSuccess)`
			`require.True(t, oldNode.Reputation.LastContactFailure.Before(newNode.Reputation.LastContactFailure))`
			`}`
			`})`
			`}`
satellite/downtime: update detection and estimation downtime chores for more trustworthy downtime tracking Detection chore: Do not update downtime at all from the detection chore. We only want to include downtime between two explicitly failed ping attempts (the duration between last contact success and the first failed ping is no longer included in downtime calculation) Estimation chore: If the satellite started after the last failed ping for a node, do not include offline time since the last failed ping time - only estimate based on two failed pings with no satellite downtime in between. This protects us from including satellite downtime in our storagenode downtime calculations. Change-Id: I1fddc9f7255a7023e02474255d70c64faae75b8a 2020-02-10 20:55:35 +00:00
			`// TestEstimationChoreSatelliteDowntime tests the situation where downtime is estimated when the satellite was started after the last failed ping`
			`// If a storage node has a failed ping, then another ping fails later, the estimation chore will normally take the difference between these pings and record that as the downtime.`
			`// If the satellite was started between the old failed ping and the new failed ping, we do not want to risk including satellite downtime in our calculation - no downtime should be recorded in this case.`
			`func TestEstimationChoreSatelliteDowntime(t *testing.T) {`
			`testplanet.Run(t, testplanet.Config{`
			`SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 0,`
			`Reconfigure: testplanet.Reconfigure{`
			`Satellite: func(log *zap.Logger, index int, config *satellite.Config) {`
			`config.Downtime.EstimationBatchSize = 1`
			`},`
			`},`
			`}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {`
			`node := planet.StorageNodes[0]`
			`satellite := planet.Satellites[0]`
{storagenode/contact, private/testplanet}: remove ErrFailureToStart and panic in testplanet.Start Change-Id: I252e8c9407400af7bda95a7657c8154660c3c801 2020-02-19 18:32:53 +00:00			`node.Contact.Chore.Pause(ctx)`
satellite/downtime: update detection and estimation downtime chores for more trustworthy downtime tracking Detection chore: Do not update downtime at all from the detection chore. We only want to include downtime between two explicitly failed ping attempts (the duration between last contact success and the first failed ping is no longer included in downtime calculation) Estimation chore: If the satellite started after the last failed ping for a node, do not include offline time since the last failed ping time - only estimate based on two failed pings with no satellite downtime in between. This protects us from including satellite downtime in our storagenode downtime calculations. Change-Id: I1fddc9f7255a7023e02474255d70c64faae75b8a 2020-02-10 20:55:35 +00:00			`satellite.DowntimeTracking.EstimationChore.Loop.Pause()`

			`// mark node as failing an uptime check so the estimation chore picks it up`
			`_, err := satellite.DB.OverlayCache().UpdateUptime(ctx, node.ID(), false)`
			`require.NoError(t, err)`
			`oldNode, err := satellite.DB.OverlayCache().Get(ctx, node.ID())`
			`require.NoError(t, err)`
			`require.True(t, oldNode.Reputation.LastContactSuccess.Before(oldNode.Reputation.LastContactFailure))`
			`// close the node service so the ping back will fail`
			`err = node.Server.Close()`
			`require.NoError(t, err)`

			`// create new estimation chore that starts after the node's last contacted time`
			`newEstimationChore := downtime.NewEstimationChore(`
			`satellite.Log,`
			`downtime.Config{`
			`EstimationInterval: 1 * time.Second,`
			`EstimationBatchSize: 10,`
			`},`
			`satellite.Overlay.Service,`
			`satellite.DowntimeTracking.Service,`
			`satellite.DB.DowntimeTracking(),`
			`)`

			`time.Sleep(1 * time.Second) // wait for 1s because estimation chore truncates offline duration to seconds`

			`var group errgroup.Group`
			`group.Go(func() error {`
			`return newEstimationChore.Run(ctx)`
			`})`
			`defer func() {`
			`err = newEstimationChore.Close()`
			`require.NoError(t, err)`
			`err = group.Wait()`
			`require.NoError(t, err)`
			`}()`

			`newEstimationChore.Loop.TriggerWait()`
			`// since the estimation chore was started after the last ping, the node's offline time should be 0`
			`downtime, err := satellite.DB.DowntimeTracking().GetOfflineTime(ctx, node.ID(), time.Now().Add(-5*time.Hour), time.Now())`
			`require.NoError(t, err)`
			`require.EqualValues(t, downtime, 0)`

			`// expect node last contact failure was updated`
			`newNode, err := satellite.DB.OverlayCache().Get(ctx, node.ID())`
			`require.NoError(t, err)`
			`require.Equal(t, oldNode.Reputation.LastContactSuccess, newNode.Reputation.LastContactSuccess)`
			`require.True(t, oldNode.Reputation.LastContactFailure.Before(newNode.Reputation.LastContactFailure))`
			`})`
			`}`