2020-01-07 21:34:48 +00:00
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package downtime_test
import (
"testing"
"time"
"github.com/stretchr/testify/require"
"go.uber.org/zap"
2020-02-10 20:55:35 +00:00
"golang.org/x/sync/errgroup"
2020-01-19 16:29:15 +00:00
2020-01-07 21:34:48 +00:00
"storj.io/common/testcontext"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite"
2020-02-10 20:55:35 +00:00
"storj.io/storj/satellite/downtime"
2020-04-13 19:08:04 +01:00
"storj.io/storj/satellite/overlay"
2020-01-07 21:34:48 +00:00
)
2020-02-10 20:55:35 +00:00
// TestEstimationChoreBasic tests the basic functionality of the downtime estimation chore:
2020-07-16 16:27:24 +01:00
//
// 1. Test that when a node that had one failed ping, and one successful ping >1s later does not have recorded downtime.
// 2. Test that when a node that had one failed ping, and another failed ping >1s later has at least 1s of recorded downtime.
2020-02-10 20:55:35 +00:00
func TestEstimationChoreBasic ( t * testing . T ) {
2020-01-07 21:34:48 +00:00
testplanet . Run ( t , testplanet . Config {
2020-04-13 19:08:04 +01:00
SatelliteCount : 1 , StorageNodeCount : 2 , UplinkCount : 0 ,
2020-01-07 21:34:48 +00:00
Reconfigure : testplanet . Reconfigure {
Satellite : func ( log * zap . Logger , index int , config * satellite . Config ) {
2020-04-13 19:08:04 +01:00
config . Downtime . EstimationBatchSize = 2
2020-01-07 21:34:48 +00:00
} ,
} ,
} , func ( t * testing . T , ctx * testcontext . Context , planet * testplanet . Planet ) {
satellite := planet . Satellites [ 0 ]
satellite . DowntimeTracking . EstimationChore . Loop . Pause ( )
2020-04-13 19:08:04 +01:00
{ // test last_contact_success is updated for nodes where last_contact_failure > last_contact_success, but node is online
var oldNodes [ ] * overlay . NodeDossier
for _ , node := range planet . StorageNodes {
node . Contact . Chore . Pause ( ctx )
// mark node as failing an uptime check so the estimation chore picks it up
_ , err := satellite . DB . OverlayCache ( ) . UpdateUptime ( ctx , node . ID ( ) , false )
require . NoError ( t , err )
oldNode , err := satellite . DB . OverlayCache ( ) . Get ( ctx , node . ID ( ) )
require . NoError ( t , err )
require . True ( t , oldNode . Reputation . LastContactSuccess . Before ( oldNode . Reputation . LastContactFailure ) )
oldNodes = append ( oldNodes , oldNode )
}
2020-01-07 21:34:48 +00:00
// run estimation chore
time . Sleep ( 1 * time . Second ) // wait for 1s because estimation chore truncates offline duration to seconds
satellite . DowntimeTracking . EstimationChore . Loop . TriggerWait ( )
2020-04-13 19:08:04 +01:00
for i , node := range planet . StorageNodes {
// get offline time for node, expect it to be 0 since node was online when chore pinged it
downtime , err := satellite . DB . DowntimeTracking ( ) . GetOfflineTime ( ctx , node . ID ( ) , time . Now ( ) . Add ( - 5 * time . Hour ) , time . Now ( ) )
require . NoError ( t , err )
require . True ( t , downtime == 0 )
// expect node last contact success was updated
newNode , err := satellite . DB . OverlayCache ( ) . Get ( ctx , node . ID ( ) )
require . NoError ( t , err )
require . Equal ( t , oldNodes [ i ] . Reputation . LastContactFailure , newNode . Reputation . LastContactFailure )
require . True ( t , oldNodes [ i ] . Reputation . LastContactSuccess . Before ( newNode . Reputation . LastContactSuccess ) )
require . True ( t , newNode . Reputation . LastContactFailure . Before ( newNode . Reputation . LastContactSuccess ) )
}
2020-01-07 21:34:48 +00:00
}
2020-04-13 19:08:04 +01:00
{ // test last_contact_failure is updated and downtime is recorded for nodes where last_contact_failure > last_contact_success and node is offline
var oldNodes [ ] * overlay . NodeDossier
for _ , node := range planet . StorageNodes {
// mark node as failing an uptime check so the estimation chore picks it up
_ , err := satellite . DB . OverlayCache ( ) . UpdateUptime ( ctx , node . ID ( ) , false )
require . NoError ( t , err )
oldNode , err := satellite . DB . OverlayCache ( ) . Get ( ctx , node . ID ( ) )
require . NoError ( t , err )
require . True ( t , oldNode . Reputation . LastContactSuccess . Before ( oldNode . Reputation . LastContactFailure ) )
// close the node service so the ping back will fail
err = node . Server . Close ( )
require . NoError ( t , err )
oldNodes = append ( oldNodes , oldNode )
}
2020-01-07 21:34:48 +00:00
// run estimation chore
time . Sleep ( 1 * time . Second ) // wait for 1s because estimation chore truncates offline duration to seconds
satellite . DowntimeTracking . EstimationChore . Loop . TriggerWait ( )
2020-04-13 19:08:04 +01:00
for i , node := range planet . StorageNodes {
// get offline time for node, expect it to be greater than 0 since node has been offline for at least 1s
downtime , err := satellite . DB . DowntimeTracking ( ) . GetOfflineTime ( ctx , node . ID ( ) , time . Now ( ) . Add ( - 5 * time . Hour ) , time . Now ( ) )
require . NoError ( t , err )
require . True ( t , downtime > 0 )
// expect node last contact failure was updated
newNode , err := satellite . DB . OverlayCache ( ) . Get ( ctx , node . ID ( ) )
require . NoError ( t , err )
require . Equal ( t , oldNodes [ i ] . Reputation . LastContactSuccess , newNode . Reputation . LastContactSuccess )
require . True ( t , oldNodes [ i ] . Reputation . LastContactFailure . Before ( newNode . Reputation . LastContactFailure ) )
}
2020-01-07 21:34:48 +00:00
}
} )
}
2020-02-10 20:55:35 +00:00
// TestEstimationChoreSatelliteDowntime tests the situation where downtime is estimated when the satellite was started after the last failed ping
// If a storage node has a failed ping, then another ping fails later, the estimation chore will normally take the difference between these pings and record that as the downtime.
// If the satellite was started between the old failed ping and the new failed ping, we do not want to risk including satellite downtime in our calculation - no downtime should be recorded in this case.
func TestEstimationChoreSatelliteDowntime ( t * testing . T ) {
testplanet . Run ( t , testplanet . Config {
SatelliteCount : 1 , StorageNodeCount : 1 , UplinkCount : 0 ,
Reconfigure : testplanet . Reconfigure {
Satellite : func ( log * zap . Logger , index int , config * satellite . Config ) {
config . Downtime . EstimationBatchSize = 1
} ,
} ,
} , func ( t * testing . T , ctx * testcontext . Context , planet * testplanet . Planet ) {
node := planet . StorageNodes [ 0 ]
satellite := planet . Satellites [ 0 ]
2020-02-19 18:32:53 +00:00
node . Contact . Chore . Pause ( ctx )
2020-02-10 20:55:35 +00:00
satellite . DowntimeTracking . EstimationChore . Loop . Pause ( )
// mark node as failing an uptime check so the estimation chore picks it up
_ , err := satellite . DB . OverlayCache ( ) . UpdateUptime ( ctx , node . ID ( ) , false )
require . NoError ( t , err )
oldNode , err := satellite . DB . OverlayCache ( ) . Get ( ctx , node . ID ( ) )
require . NoError ( t , err )
require . True ( t , oldNode . Reputation . LastContactSuccess . Before ( oldNode . Reputation . LastContactFailure ) )
// close the node service so the ping back will fail
err = node . Server . Close ( )
require . NoError ( t , err )
// create new estimation chore that starts after the node's last contacted time
newEstimationChore := downtime . NewEstimationChore (
satellite . Log ,
downtime . Config {
2020-04-13 19:08:04 +01:00
EstimationInterval : 1 * time . Second ,
EstimationBatchSize : 10 ,
EstimationConcurrencyLimit : 10 ,
2020-02-10 20:55:35 +00:00
} ,
satellite . Overlay . Service ,
satellite . DowntimeTracking . Service ,
satellite . DB . DowntimeTracking ( ) ,
)
time . Sleep ( 1 * time . Second ) // wait for 1s because estimation chore truncates offline duration to seconds
var group errgroup . Group
group . Go ( func ( ) error {
return newEstimationChore . Run ( ctx )
} )
defer func ( ) {
err = newEstimationChore . Close ( )
require . NoError ( t , err )
err = group . Wait ( )
require . NoError ( t , err )
} ( )
newEstimationChore . Loop . TriggerWait ( )
// since the estimation chore was started after the last ping, the node's offline time should be 0
downtime , err := satellite . DB . DowntimeTracking ( ) . GetOfflineTime ( ctx , node . ID ( ) , time . Now ( ) . Add ( - 5 * time . Hour ) , time . Now ( ) )
require . NoError ( t , err )
require . EqualValues ( t , downtime , 0 )
// expect node last contact failure was updated
newNode , err := satellite . DB . OverlayCache ( ) . Get ( ctx , node . ID ( ) )
require . NoError ( t , err )
require . Equal ( t , oldNode . Reputation . LastContactSuccess , newNode . Reputation . LastContactSuccess )
require . True ( t , oldNode . Reputation . LastContactFailure . Before ( newNode . Reputation . LastContactFailure ) )
} )
}