2019-04-08 18:33:47 +01:00
|
|
|
// Copyright (C) 2019 Storj Labs, Inc.
|
|
|
|
// See LICENSE for copying information.
|
|
|
|
|
2019-07-28 10:15:34 +01:00
|
|
|
package repair_test
|
2019-04-08 18:33:47 +01:00
|
|
|
|
|
|
|
import (
|
2019-07-11 23:44:47 +01:00
|
|
|
"context"
|
2019-09-06 20:20:36 +01:00
|
|
|
"io"
|
2019-07-11 23:44:47 +01:00
|
|
|
"math"
|
2019-04-08 18:33:47 +01:00
|
|
|
"testing"
|
2019-11-15 22:43:06 +00:00
|
|
|
"time"
|
2019-04-08 18:33:47 +01:00
|
|
|
|
|
|
|
"github.com/stretchr/testify/require"
|
2019-07-10 03:36:09 +01:00
|
|
|
"go.uber.org/zap"
|
2019-04-08 18:33:47 +01:00
|
|
|
|
2019-12-27 11:48:47 +00:00
|
|
|
"storj.io/common/memory"
|
|
|
|
"storj.io/common/pb"
|
|
|
|
"storj.io/common/storj"
|
|
|
|
"storj.io/common/testcontext"
|
|
|
|
"storj.io/common/testrand"
|
2019-11-14 19:46:15 +00:00
|
|
|
"storj.io/storj/private/testplanet"
|
2019-07-01 16:34:42 +01:00
|
|
|
"storj.io/storj/satellite"
|
2019-07-28 06:55:36 +01:00
|
|
|
"storj.io/storj/satellite/overlay"
|
2019-09-06 20:20:36 +01:00
|
|
|
"storj.io/storj/storage"
|
|
|
|
"storj.io/storj/storagenode"
|
2019-04-08 18:33:47 +01:00
|
|
|
)
|
|
|
|
|
2019-07-01 16:34:42 +01:00
|
|
|
// TestDataRepair does the following:
|
2019-07-11 23:44:47 +01:00
|
|
|
// - Uploads test data
|
|
|
|
// - Kills some nodes and disqualifies 1
|
|
|
|
// - Triggers data repair, which repairs the data from the remaining nodes to
|
|
|
|
// the numbers of nodes determined by the upload repair max threshold
|
|
|
|
// - Shuts down several nodes, but keeping up a number equal to the minim
|
|
|
|
// threshold
|
|
|
|
// - Downloads the data from those left nodes and check that it's the same than
|
|
|
|
// the uploaded one
|
2019-04-08 18:33:47 +01:00
|
|
|
func TestDataRepair(t *testing.T) {
|
2020-01-21 10:38:41 +00:00
|
|
|
const (
|
|
|
|
RepairMaxExcessRateOptimalThreshold = 0.05
|
|
|
|
minThreshold = 3
|
|
|
|
successThreshold = 7
|
|
|
|
)
|
2019-07-11 23:44:47 +01:00
|
|
|
|
2019-04-08 18:33:47 +01:00
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
2019-07-11 23:44:47 +01:00
|
|
|
StorageNodeCount: 14,
|
2019-04-08 18:33:47 +01:00
|
|
|
UplinkCount: 1,
|
2019-07-10 03:36:09 +01:00
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
2019-08-20 15:46:39 +01:00
|
|
|
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
2020-01-21 10:38:41 +00:00
|
|
|
|
|
|
|
config.Metainfo.RS.MinThreshold = minThreshold
|
|
|
|
config.Metainfo.RS.RepairThreshold = 5
|
|
|
|
config.Metainfo.RS.SuccessThreshold = successThreshold
|
|
|
|
config.Metainfo.RS.TotalThreshold = 9
|
2019-07-10 03:36:09 +01:00
|
|
|
},
|
|
|
|
},
|
2019-04-08 18:33:47 +01:00
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
2019-11-15 22:43:06 +00:00
|
|
|
|
2019-04-08 18:33:47 +01:00
|
|
|
// first, upload some remote data
|
2019-07-22 20:10:04 +01:00
|
|
|
uplinkPeer := planet.Uplinks[0]
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite := planet.Satellites[0]
|
2019-07-01 16:15:45 +01:00
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
2019-04-08 18:33:47 +01:00
|
|
|
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2019-04-08 18:33:47 +01:00
|
|
|
|
2020-01-21 10:38:41 +00:00
|
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-04-08 18:33:47 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-09-12 18:16:39 +01:00
|
|
|
pointer, path := getRemoteSegment(t, ctx, satellite)
|
2019-04-08 18:33:47 +01:00
|
|
|
|
|
|
|
// calculate how many storagenodes to kill
|
|
|
|
redundancy := pointer.GetRemote().GetRedundancy()
|
|
|
|
minReq := redundancy.GetMinReq()
|
2019-07-11 23:44:47 +01:00
|
|
|
remotePieces := pointer.GetRemote().GetRemotePieces()
|
2019-04-08 18:33:47 +01:00
|
|
|
numPieces := len(remotePieces)
|
2019-07-01 16:34:42 +01:00
|
|
|
// disqualify one storage node
|
|
|
|
toDisqualify := 1
|
2019-09-06 20:20:36 +01:00
|
|
|
toKill := numPieces - toDisqualify - int(minReq)
|
2019-07-01 16:34:42 +01:00
|
|
|
require.True(t, toKill >= 1)
|
2019-07-11 23:44:47 +01:00
|
|
|
maxNumRepairedPieces := int(
|
|
|
|
math.Ceil(
|
2019-08-20 15:46:39 +01:00
|
|
|
float64(successThreshold) * (1 + RepairMaxExcessRateOptimalThreshold),
|
2019-07-11 23:44:47 +01:00
|
|
|
),
|
|
|
|
)
|
|
|
|
numStorageNodes := len(planet.StorageNodes)
|
|
|
|
// Ensure that there are enough storage nodes to upload repaired segments
|
|
|
|
require.Falsef(t,
|
|
|
|
(numStorageNodes-toKill-toDisqualify) < maxNumRepairedPieces,
|
|
|
|
"there is not enough available nodes for repairing: need= %d, have= %d",
|
2019-10-02 13:58:37 +01:00
|
|
|
maxNumRepairedPieces, numStorageNodes-toKill-toDisqualify,
|
2019-07-11 23:44:47 +01:00
|
|
|
)
|
2019-04-08 18:33:47 +01:00
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
nodesToKill := make(map[storj.NodeID]bool)
|
2019-07-01 16:34:42 +01:00
|
|
|
nodesToDisqualify := make(map[storj.NodeID]bool)
|
2019-04-08 18:33:47 +01:00
|
|
|
nodesToKeepAlive := make(map[storj.NodeID]bool)
|
|
|
|
|
2019-07-01 16:34:42 +01:00
|
|
|
var numDisqualified int
|
2019-04-08 18:33:47 +01:00
|
|
|
for i, piece := range remotePieces {
|
|
|
|
if i >= toKill {
|
2019-07-01 16:34:42 +01:00
|
|
|
if numDisqualified < toDisqualify {
|
|
|
|
nodesToDisqualify[piece.NodeId] = true
|
|
|
|
numDisqualified++
|
|
|
|
}
|
2019-04-08 18:33:47 +01:00
|
|
|
nodesToKeepAlive[piece.NodeId] = true
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
nodesToKill[piece.NodeId] = true
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, node := range planet.StorageNodes {
|
2019-07-01 16:34:42 +01:00
|
|
|
if nodesToDisqualify[node.ID()] {
|
2020-01-03 00:00:18 +00:00
|
|
|
err := satellite.DB.OverlayCache().DisqualifyNode(ctx, node.ID())
|
|
|
|
require.NoError(t, err)
|
2019-07-01 16:34:42 +01:00
|
|
|
continue
|
|
|
|
}
|
2019-04-08 18:33:47 +01:00
|
|
|
if nodesToKill[node.ID()] {
|
2019-11-15 22:43:06 +00:00
|
|
|
stopNodeByID(t, ctx, planet, node.ID())
|
2019-04-08 18:33:47 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Checker.Loop.Restart()
|
|
|
|
satellite.Repair.Checker.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Limiter.Wait()
|
2019-04-08 18:33:47 +01:00
|
|
|
|
2019-07-11 23:44:47 +01:00
|
|
|
// repaired segment should not contain any piece in the killed and DQ nodes
|
2019-09-12 18:16:39 +01:00
|
|
|
metainfoService := satellite.Metainfo.Service
|
2019-07-22 20:10:04 +01:00
|
|
|
pointer, err = metainfoService.Get(ctx, path)
|
2019-07-11 23:44:47 +01:00
|
|
|
require.NoError(t, err)
|
2019-04-08 18:33:47 +01:00
|
|
|
|
2019-07-11 23:44:47 +01:00
|
|
|
nodesToKillForMinThreshold := len(remotePieces) - minThreshold
|
|
|
|
remotePieces = pointer.GetRemote().GetRemotePieces()
|
|
|
|
for _, piece := range remotePieces {
|
|
|
|
require.NotContains(t, nodesToKill, piece.NodeId, "there shouldn't be pieces in killed nodes")
|
|
|
|
require.NotContains(t, nodesToDisqualify, piece.NodeId, "there shouldn't be pieces in DQ nodes")
|
|
|
|
|
2019-09-16 17:11:12 +01:00
|
|
|
require.Nil(t, piece.Hash, "piece hashes should be set to nil")
|
|
|
|
|
2019-07-11 23:44:47 +01:00
|
|
|
// Kill the original nodes which were kept alive to ensure that we can
|
|
|
|
// download from the new nodes that the repaired pieces have been uploaded
|
|
|
|
if _, ok := nodesToKeepAlive[piece.NodeId]; ok && nodesToKillForMinThreshold > 0 {
|
|
|
|
stopNodeByID(t, ctx, planet, piece.NodeId)
|
|
|
|
nodesToKillForMinThreshold--
|
2019-04-08 18:33:47 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
// we should be able to download data without any of the original nodes
|
2019-09-12 18:16:39 +01:00
|
|
|
newData, err := uplinkPeer.Download(ctx, satellite, "testbucket", "test/path")
|
2019-07-01 16:15:45 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, newData, testData)
|
2019-07-01 16:34:42 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2019-09-16 18:13:24 +01:00
|
|
|
// TestCorruptDataRepair_Failed does the following:
|
2019-09-06 20:20:36 +01:00
|
|
|
// - Uploads test data
|
|
|
|
// - Kills all but the minimum number of nodes carrying the uploaded segment
|
|
|
|
// - On one of the remaining nodes, corrupt the piece data being stored by that node
|
|
|
|
// - Triggers data repair, which attempts to repair the data from the remaining nodes to
|
|
|
|
// the numbers of nodes determined by the upload repair max threshold
|
|
|
|
// - Expects that the repair failed and the pointer was not updated
|
2019-09-16 18:13:24 +01:00
|
|
|
func TestCorruptDataRepair_Failed(t *testing.T) {
|
2019-09-06 20:20:36 +01:00
|
|
|
const RepairMaxExcessRateOptimalThreshold = 0.05
|
|
|
|
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 14,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
|
|
|
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
2020-01-21 10:38:41 +00:00
|
|
|
|
|
|
|
config.Metainfo.RS.MinThreshold = 3
|
|
|
|
config.Metainfo.RS.RepairThreshold = 5
|
|
|
|
config.Metainfo.RS.SuccessThreshold = 7
|
|
|
|
config.Metainfo.RS.TotalThreshold = 9
|
2019-09-06 20:20:36 +01:00
|
|
|
},
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite := planet.Satellites[0]
|
2019-09-06 20:20:36 +01:00
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2019-09-06 20:20:36 +01:00
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-09-06 20:20:36 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-09-12 18:16:39 +01:00
|
|
|
pointer, path := getRemoteSegment(t, ctx, satellite)
|
2019-09-06 20:20:36 +01:00
|
|
|
|
|
|
|
// calculate how many storagenodes to kill
|
|
|
|
redundancy := pointer.GetRemote().GetRedundancy()
|
|
|
|
minReq := redundancy.GetMinReq()
|
|
|
|
remotePieces := pointer.GetRemote().GetRemotePieces()
|
|
|
|
numPieces := len(remotePieces)
|
|
|
|
toKill := numPieces - int(minReq)
|
|
|
|
require.True(t, toKill >= 1)
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
nodesToKill := make(map[storj.NodeID]bool)
|
|
|
|
originalNodes := make(map[storj.NodeID]bool)
|
|
|
|
|
|
|
|
var corruptedNode *storagenode.Peer
|
|
|
|
var corruptedNodeID storj.NodeID
|
2019-09-16 18:13:24 +01:00
|
|
|
var corruptedPieceID storj.PieceID
|
2019-09-06 20:20:36 +01:00
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
|
|
|
originalNodes[piece.NodeId] = true
|
|
|
|
if i >= toKill {
|
|
|
|
// this means the node will be kept alive for repair
|
|
|
|
// choose a node and pieceID to corrupt so repair fails
|
2019-09-16 18:13:24 +01:00
|
|
|
if corruptedNodeID.IsZero() || corruptedPieceID.IsZero() {
|
2019-09-06 20:20:36 +01:00
|
|
|
corruptedNodeID = piece.NodeId
|
2019-09-16 18:13:24 +01:00
|
|
|
corruptedPieceID = pointer.GetRemote().RootPieceId.Derive(corruptedNodeID, piece.PieceNum)
|
2019-09-06 20:20:36 +01:00
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
nodesToKill[piece.NodeId] = true
|
|
|
|
}
|
|
|
|
require.NotNil(t, corruptedNodeID)
|
2019-09-16 18:13:24 +01:00
|
|
|
require.NotNil(t, corruptedPieceID)
|
2019-09-06 20:20:36 +01:00
|
|
|
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if node.ID() == corruptedNodeID {
|
|
|
|
corruptedNode = node
|
|
|
|
}
|
|
|
|
if nodesToKill[node.ID()] {
|
2019-11-15 22:43:06 +00:00
|
|
|
stopNodeByID(t, ctx, planet, node.ID())
|
2019-09-06 20:20:36 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
require.NotNil(t, corruptedNode)
|
|
|
|
|
2019-09-13 17:21:20 +01:00
|
|
|
overlay := planet.Satellites[0].Overlay.Service
|
|
|
|
node, err := overlay.Get(ctx, corruptedNodeID)
|
|
|
|
require.NoError(t, err)
|
|
|
|
corruptedNodeReputation := node.Reputation
|
|
|
|
|
2019-09-16 18:13:24 +01:00
|
|
|
corruptPieceData(ctx, t, planet, corruptedNode, corruptedPieceID)
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2019-09-16 18:13:24 +01:00
|
|
|
satellite.Repair.Checker.Loop.Restart()
|
|
|
|
satellite.Repair.Checker.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Limiter.Wait()
|
|
|
|
|
|
|
|
// repair should update audit status as fail
|
|
|
|
node, err = overlay.Get(ctx, corruptedNodeID)
|
2019-09-06 20:20:36 +01:00
|
|
|
require.NoError(t, err)
|
2019-09-16 18:13:24 +01:00
|
|
|
require.Equal(t, corruptedNodeReputation.AuditCount+1, node.Reputation.AuditCount)
|
|
|
|
require.True(t, corruptedNodeReputation.AuditReputationBeta < node.Reputation.AuditReputationBeta)
|
|
|
|
require.True(t, corruptedNodeReputation.AuditReputationAlpha >= node.Reputation.AuditReputationAlpha)
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2019-09-16 18:13:24 +01:00
|
|
|
// repair should fail, so segment should contain all the original nodes
|
|
|
|
metainfoService := satellite.Metainfo.Service
|
|
|
|
pointer, err = metainfoService.Get(ctx, path)
|
2019-09-06 20:20:36 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-09-16 18:13:24 +01:00
|
|
|
remotePieces = pointer.GetRemote().GetRemotePieces()
|
|
|
|
for _, piece := range remotePieces {
|
|
|
|
require.Contains(t, originalNodes, piece.NodeId, "there should be no new nodes in pointer")
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// TestCorruptDataRepair does the following:
|
|
|
|
// - Uploads test data
|
|
|
|
// - Kills some nodes carrying the uploaded segment but keep it above minimum requirement
|
|
|
|
// - On one of the remaining nodes, corrupt the piece data being stored by that node
|
|
|
|
// - Triggers data repair, which attempts to repair the data from the remaining nodes to
|
|
|
|
// the numbers of nodes determined by the upload repair max threshold
|
|
|
|
// - Expects that the repair succeed and the pointer should not contain the corrupted piece
|
|
|
|
func TestCorruptDataRepair_Succeed(t *testing.T) {
|
|
|
|
const RepairMaxExcessRateOptimalThreshold = 0.05
|
|
|
|
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 14,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
|
|
|
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
2020-01-21 10:38:41 +00:00
|
|
|
|
|
|
|
config.Metainfo.RS.MinThreshold = 3
|
|
|
|
config.Metainfo.RS.RepairThreshold = 5
|
|
|
|
config.Metainfo.RS.SuccessThreshold = 7
|
|
|
|
config.Metainfo.RS.TotalThreshold = 9
|
2019-09-16 18:13:24 +01:00
|
|
|
},
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
|
|
|
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-09-06 20:20:36 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-09-16 18:13:24 +01:00
|
|
|
pointer, path := getRemoteSegment(t, ctx, satellite)
|
|
|
|
|
|
|
|
// calculate how many storagenodes to kill
|
|
|
|
redundancy := pointer.GetRemote().GetRedundancy()
|
|
|
|
remotePieces := pointer.GetRemote().GetRemotePieces()
|
|
|
|
numPieces := len(remotePieces)
|
|
|
|
toKill := numPieces - int(redundancy.RepairThreshold)
|
|
|
|
require.True(t, toKill >= 1)
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
nodesToKill := make(map[storj.NodeID]bool)
|
|
|
|
originalNodes := make(map[storj.NodeID]bool)
|
|
|
|
|
|
|
|
var corruptedNode *storagenode.Peer
|
|
|
|
var corruptedNodeID storj.NodeID
|
|
|
|
var corruptedPieceID storj.PieceID
|
|
|
|
var corruptedPiece *pb.RemotePiece
|
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
|
|
|
originalNodes[piece.NodeId] = true
|
|
|
|
if i >= toKill {
|
|
|
|
// this means the node will be kept alive for repair
|
|
|
|
// choose a node and pieceID to corrupt so repair fails
|
|
|
|
if corruptedNodeID.IsZero() || corruptedPieceID.IsZero() {
|
|
|
|
corruptedNodeID = piece.NodeId
|
|
|
|
corruptedPieceID = pointer.GetRemote().RootPieceId.Derive(corruptedNodeID, piece.PieceNum)
|
|
|
|
corruptedPiece = piece
|
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
nodesToKill[piece.NodeId] = true
|
|
|
|
}
|
|
|
|
require.NotNil(t, corruptedNodeID)
|
|
|
|
require.NotNil(t, corruptedPieceID)
|
2019-10-02 13:58:37 +01:00
|
|
|
require.NotNil(t, corruptedPiece)
|
2019-09-16 18:13:24 +01:00
|
|
|
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if node.ID() == corruptedNodeID {
|
|
|
|
corruptedNode = node
|
|
|
|
}
|
|
|
|
if nodesToKill[node.ID()] {
|
2019-11-15 22:43:06 +00:00
|
|
|
stopNodeByID(t, ctx, planet, node.ID())
|
2019-09-16 18:13:24 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
require.NotNil(t, corruptedNode)
|
|
|
|
|
|
|
|
corruptPieceData(ctx, t, planet, corruptedNode, corruptedPieceID)
|
|
|
|
|
|
|
|
overlay := planet.Satellites[0].Overlay.Service
|
|
|
|
node, err := overlay.Get(ctx, corruptedNodeID)
|
2019-09-06 20:20:36 +01:00
|
|
|
require.NoError(t, err)
|
2019-09-16 18:13:24 +01:00
|
|
|
corruptedNodeReputation := node.Reputation
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Checker.Loop.Restart()
|
|
|
|
satellite.Repair.Checker.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Limiter.Wait()
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2019-09-13 17:21:20 +01:00
|
|
|
// repair should update audit status as fail
|
|
|
|
node, err = overlay.Get(ctx, corruptedNodeID)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, corruptedNodeReputation.AuditCount+1, node.Reputation.AuditCount)
|
|
|
|
require.True(t, corruptedNodeReputation.AuditReputationBeta < node.Reputation.AuditReputationBeta)
|
|
|
|
require.True(t, corruptedNodeReputation.AuditReputationAlpha >= node.Reputation.AuditReputationAlpha)
|
|
|
|
|
2019-09-16 18:13:24 +01:00
|
|
|
// get the new pointer
|
2019-09-12 18:16:39 +01:00
|
|
|
metainfoService := satellite.Metainfo.Service
|
2019-09-06 20:20:36 +01:00
|
|
|
pointer, err = metainfoService.Get(ctx, path)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
remotePieces = pointer.GetRemote().GetRemotePieces()
|
|
|
|
for _, piece := range remotePieces {
|
2019-09-16 18:13:24 +01:00
|
|
|
require.NotEqual(t, piece.PieceNum, corruptedPiece.PieceNum, "there should be no corrupted piece in pointer")
|
2019-09-06 20:20:36 +01:00
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2019-10-15 04:39:28 +01:00
|
|
|
// TestRemoveDeletedSegmentFromQueue
|
|
|
|
// - Upload tests data to 7 nodes
|
|
|
|
// - Kill nodes so that repair threshold > online nodes > minimum threshold
|
|
|
|
// - Call checker to add segment to the repair queue
|
|
|
|
// - Delete segment from the satellite database
|
|
|
|
// - Run the repairer
|
|
|
|
// - Verify segment is no longer in the repair queue
|
|
|
|
func TestRemoveDeletedSegmentFromQueue(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 10,
|
|
|
|
UplinkCount: 1,
|
2020-01-21 10:38:41 +00:00
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 5, 7, 7),
|
|
|
|
},
|
2019-10-15 04:39:28 +01:00
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
// first, upload some remote data
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Stop()
|
|
|
|
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-10-15 04:39:28 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
pointer, _ := getRemoteSegment(t, ctx, satellite)
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
nodesToDQ := make(map[storj.NodeID]bool)
|
|
|
|
|
|
|
|
// Kill 3 nodes so that pointer has 4 left (less than repair threshold)
|
|
|
|
toKill := 3
|
|
|
|
|
|
|
|
remotePieces := pointer.GetRemote().GetRemotePieces()
|
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
|
|
|
if i >= toKill {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
nodesToDQ[piece.NodeId] = true
|
|
|
|
}
|
|
|
|
|
|
|
|
for nodeID := range nodesToDQ {
|
2020-01-03 00:00:18 +00:00
|
|
|
err := satellite.DB.OverlayCache().DisqualifyNode(ctx, nodeID)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-10-15 04:39:28 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// trigger checker to add segment to repair queue
|
|
|
|
satellite.Repair.Checker.Loop.Restart()
|
|
|
|
satellite.Repair.Checker.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
|
|
|
|
// Delete segment from the satellite database
|
2020-02-10 12:18:18 +00:00
|
|
|
err = uplinkPeer.DeleteObject(ctx, satellite, "testbucket", "test/path")
|
2019-10-15 04:39:28 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
// Verify that the segment is on the repair queue
|
|
|
|
count, err := satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, count, 1)
|
|
|
|
|
|
|
|
// Run the repairer
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Limiter.Wait()
|
|
|
|
|
|
|
|
// Verify that the segment was removed
|
|
|
|
count, err = satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, count, 0)
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2019-07-30 16:38:25 +01:00
|
|
|
// TestRemoveIrreparableSegmentFromQueue
|
|
|
|
// - Upload tests data to 7 nodes
|
|
|
|
// - Kill nodes so that repair threshold > online nodes > minimum threshold
|
|
|
|
// - Call checker to add segment to the repair queue
|
|
|
|
// - Kill nodes so that online nodes < minimum threshold
|
|
|
|
// - Run the repairer
|
|
|
|
// - Verify segment is no longer in the repair queue and segment should be the same
|
|
|
|
func TestRemoveIrreparableSegmentFromQueue(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 10,
|
|
|
|
UplinkCount: 1,
|
2020-01-21 10:38:41 +00:00
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 5, 7, 7),
|
|
|
|
},
|
2019-07-30 16:38:25 +01:00
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
// first, upload some remote data
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite := planet.Satellites[0]
|
2019-07-30 16:38:25 +01:00
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Audit.Worker.Loop.Stop()
|
2019-07-30 16:38:25 +01:00
|
|
|
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2019-07-30 16:38:25 +01:00
|
|
|
|
|
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-07-30 16:38:25 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-09-12 18:16:39 +01:00
|
|
|
pointer, _ := getRemoteSegment(t, ctx, satellite)
|
2019-07-30 16:38:25 +01:00
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
nodesToDQ := make(map[storj.NodeID]bool)
|
|
|
|
|
|
|
|
// Kill 3 nodes so that pointer has 4 left (less than repair threshold)
|
|
|
|
toKill := 3
|
|
|
|
|
|
|
|
remotePieces := pointer.GetRemote().GetRemotePieces()
|
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
|
|
|
if i >= toKill {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
nodesToDQ[piece.NodeId] = true
|
|
|
|
}
|
|
|
|
|
|
|
|
for nodeID := range nodesToDQ {
|
2020-01-03 00:00:18 +00:00
|
|
|
err := satellite.DB.OverlayCache().DisqualifyNode(ctx, nodeID)
|
|
|
|
require.NoError(t, err)
|
2019-07-30 16:38:25 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// trigger checker to add segment to repair queue
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Checker.Loop.Restart()
|
|
|
|
satellite.Repair.Checker.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
2019-07-30 16:38:25 +01:00
|
|
|
|
|
|
|
// Kill nodes so that online nodes < minimum threshold
|
|
|
|
// This will make the segment irreparable
|
|
|
|
for _, piece := range remotePieces {
|
2020-01-03 00:00:18 +00:00
|
|
|
err := satellite.DB.OverlayCache().DisqualifyNode(ctx, piece.NodeId)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-07-30 16:38:25 +01:00
|
|
|
}
|
|
|
|
|
2019-10-15 04:39:28 +01:00
|
|
|
// Verify that the segment is on the repair queue
|
2019-09-12 18:16:39 +01:00
|
|
|
count, err := satellite.DB.RepairQueue().Count(ctx)
|
2019-07-30 16:38:25 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, count, 1)
|
|
|
|
|
|
|
|
// Run the repairer
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Limiter.Wait()
|
2019-07-30 16:38:25 +01:00
|
|
|
|
2019-10-15 04:39:28 +01:00
|
|
|
// Verify that the segment was removed
|
2019-09-12 18:16:39 +01:00
|
|
|
count, err = satellite.DB.RepairQueue().Count(ctx)
|
2019-07-30 16:38:25 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, count, 0)
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2019-07-01 16:34:42 +01:00
|
|
|
// TestRepairMultipleDisqualified does the following:
|
|
|
|
// - Uploads test data to 7 nodes
|
|
|
|
// - Disqualifies 3 nodes
|
|
|
|
// - Triggers data repair, which repairs the data from the remaining 4 nodes to additional 3 new nodes
|
|
|
|
// - Shuts down the 4 nodes from which the data was repaired
|
|
|
|
// - Now we have just the 3 new nodes to which the data was repaired
|
|
|
|
// - Downloads the data from these 3 nodes (succeeds because 3 nodes are enough for download)
|
|
|
|
func TestRepairMultipleDisqualified(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 12,
|
|
|
|
UplinkCount: 1,
|
2020-01-21 10:38:41 +00:00
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 5, 7, 7),
|
|
|
|
},
|
2019-07-01 16:34:42 +01:00
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
// first, upload some remote data
|
2019-07-22 20:10:04 +01:00
|
|
|
uplinkPeer := planet.Uplinks[0]
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite := planet.Satellites[0]
|
2019-07-01 16:34:42 +01:00
|
|
|
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2019-07-01 16:34:42 +01:00
|
|
|
|
2019-07-11 23:44:47 +01:00
|
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
2019-07-01 16:34:42 +01:00
|
|
|
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-07-01 16:34:42 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
// get a remote segment from metainfo
|
2019-09-12 18:16:39 +01:00
|
|
|
metainfo := satellite.Metainfo.Service
|
2019-09-25 22:30:41 +01:00
|
|
|
listResponse, _, err := metainfo.List(ctx, "", "", true, 0, 0)
|
2019-07-01 16:34:42 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
var path string
|
|
|
|
var pointer *pb.Pointer
|
|
|
|
for _, v := range listResponse {
|
|
|
|
path = v.GetPath()
|
|
|
|
pointer, err = metainfo.Get(ctx, path)
|
|
|
|
require.NoError(t, err)
|
|
|
|
if pointer.GetType() == pb.Pointer_REMOTE {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// calculate how many storagenodes to disqualify
|
|
|
|
numStorageNodes := len(planet.StorageNodes)
|
|
|
|
redundancy := pointer.GetRemote().GetRedundancy()
|
|
|
|
remotePieces := pointer.GetRemote().GetRemotePieces()
|
|
|
|
minReq := redundancy.GetMinReq()
|
|
|
|
numPieces := len(remotePieces)
|
|
|
|
toDisqualify := numPieces - (int(minReq + 1))
|
|
|
|
// we should have enough storage nodes to repair on
|
|
|
|
require.True(t, (numStorageNodes-toDisqualify) >= numPieces)
|
|
|
|
|
|
|
|
// disqualify nodes and track lost pieces
|
|
|
|
nodesToDisqualify := make(map[storj.NodeID]bool)
|
|
|
|
nodesToKeepAlive := make(map[storj.NodeID]bool)
|
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
|
|
|
if i >= toDisqualify {
|
|
|
|
nodesToKeepAlive[piece.NodeId] = true
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
nodesToDisqualify[piece.NodeId] = true
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if nodesToDisqualify[node.ID()] {
|
2020-01-03 00:00:18 +00:00
|
|
|
err := satellite.DB.OverlayCache().DisqualifyNode(ctx, node.ID())
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-07-01 16:34:42 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-09-12 18:16:39 +01:00
|
|
|
err = satellite.Repair.Checker.RefreshReliabilityCache(ctx)
|
2019-07-08 23:04:35 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Checker.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Limiter.Wait()
|
2019-07-01 16:34:42 +01:00
|
|
|
|
|
|
|
// kill nodes kept alive to ensure repair worked
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if nodesToKeepAlive[node.ID()] {
|
2019-11-15 22:43:06 +00:00
|
|
|
stopNodeByID(t, ctx, planet, node.ID())
|
2019-07-01 16:34:42 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// we should be able to download data without any of the original nodes
|
2019-09-12 18:16:39 +01:00
|
|
|
newData, err := uplinkPeer.Download(ctx, satellite, "testbucket", "test/path")
|
2019-07-01 16:34:42 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, newData, testData)
|
|
|
|
|
|
|
|
// updated pointer should not contain any of the disqualified nodes
|
|
|
|
pointer, err = metainfo.Get(ctx, path)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
remotePieces = pointer.GetRemote().GetRemotePieces()
|
|
|
|
for _, piece := range remotePieces {
|
|
|
|
require.False(t, nodesToDisqualify[piece.NodeId])
|
2019-04-08 18:33:47 +01:00
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
2019-07-01 16:34:42 +01:00
|
|
|
|
2019-10-02 13:58:37 +01:00
|
|
|
// TestDataRepairOverride_HigherLimit does the following:
|
|
|
|
// - Uploads test data
|
|
|
|
// - Kills nodes to fall to the Repair Override Value of the checker but stays above the original Repair Threshold
|
|
|
|
// - Triggers data repair, which attempts to repair the data from the remaining nodes to
|
|
|
|
// the numbers of nodes determined by the upload repair max threshold
|
|
|
|
func TestDataRepairOverride_HigherLimit(t *testing.T) {
|
|
|
|
const repairOverride = 6
|
|
|
|
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 14,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
|
|
|
config.Checker.RepairOverride = repairOverride
|
2020-01-21 10:38:41 +00:00
|
|
|
|
|
|
|
config.Metainfo.RS.MinThreshold = 3
|
|
|
|
config.Metainfo.RS.RepairThreshold = 4
|
|
|
|
config.Metainfo.RS.SuccessThreshold = 9
|
|
|
|
config.Metainfo.RS.TotalThreshold = 9
|
2019-10-02 13:58:37 +01:00
|
|
|
},
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
|
|
|
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-10-02 13:58:37 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
pointer, path := getRemoteSegment(t, ctx, satellite)
|
|
|
|
|
|
|
|
// calculate how many storagenodes to kill
|
|
|
|
// kill one nodes less than repair threshold to ensure we dont hit it.
|
|
|
|
remotePieces := pointer.GetRemote().GetRemotePieces()
|
|
|
|
numPieces := len(remotePieces)
|
|
|
|
toKill := numPieces - repairOverride
|
|
|
|
require.True(t, toKill >= 1)
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
nodesToKill := make(map[storj.NodeID]bool)
|
|
|
|
originalNodes := make(map[storj.NodeID]bool)
|
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
|
|
|
originalNodes[piece.NodeId] = true
|
|
|
|
if i >= toKill {
|
|
|
|
// this means the node will be kept alive for repair
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
nodesToKill[piece.NodeId] = true
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if nodesToKill[node.ID()] {
|
2019-11-15 22:43:06 +00:00
|
|
|
stopNodeByID(t, ctx, planet, node.ID())
|
2019-10-02 13:58:37 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
satellite.Repair.Checker.Loop.Restart()
|
|
|
|
satellite.Repair.Checker.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Limiter.Wait()
|
|
|
|
|
|
|
|
// repair should have been done, due to the override
|
|
|
|
metainfoService := satellite.Metainfo.Service
|
|
|
|
pointer, err = metainfoService.Get(ctx, path)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
// pointer should have the success count of pieces
|
|
|
|
remotePieces = pointer.GetRemote().GetRemotePieces()
|
|
|
|
require.Equal(t, int(pointer.Remote.Redundancy.SuccessThreshold), len(remotePieces))
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// TestDataRepairOverride_LowerLimit does the following:
|
|
|
|
// - Uploads test data
|
|
|
|
// - Kills nodes to fall to the Repair Threshold of the checker that should not trigger repair any longer
|
|
|
|
// - Starts Checker and Repairer and ensures this is the case.
|
|
|
|
// - Kills more nodes to fall to the Override Value to trigger repair
|
|
|
|
// - Triggers data repair, which attempts to repair the data from the remaining nodes to
|
|
|
|
// the numbers of nodes determined by the upload repair max threshold
|
|
|
|
func TestDataRepairOverride_LowerLimit(t *testing.T) {
|
|
|
|
const repairOverride = 4
|
|
|
|
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 14,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
|
|
|
config.Checker.RepairOverride = repairOverride
|
2020-01-21 10:38:41 +00:00
|
|
|
|
|
|
|
config.Metainfo.RS.MinThreshold = 3
|
|
|
|
config.Metainfo.RS.RepairThreshold = 6
|
|
|
|
config.Metainfo.RS.SuccessThreshold = 9
|
|
|
|
config.Metainfo.RS.TotalThreshold = 9
|
2019-10-02 13:58:37 +01:00
|
|
|
},
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
|
|
|
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-10-02 13:58:37 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
pointer, path := getRemoteSegment(t, ctx, satellite)
|
|
|
|
|
|
|
|
// calculate how many storagenodes to kill
|
|
|
|
// to hit the repair threshold
|
|
|
|
remotePieces := pointer.GetRemote().GetRemotePieces()
|
|
|
|
repairThreshold := int(pointer.GetRemote().Redundancy.RepairThreshold)
|
|
|
|
numPieces := len(remotePieces)
|
|
|
|
toKill := numPieces - repairThreshold
|
|
|
|
require.True(t, toKill >= 1)
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
nodesToKill := make(map[storj.NodeID]bool)
|
|
|
|
originalNodes := make(map[storj.NodeID]bool)
|
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
|
|
|
originalNodes[piece.NodeId] = true
|
|
|
|
if i >= toKill {
|
|
|
|
// this means the node will be kept alive for repair
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
nodesToKill[piece.NodeId] = true
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if nodesToKill[node.ID()] {
|
2019-11-15 22:43:06 +00:00
|
|
|
stopNodeByID(t, ctx, planet, node.ID())
|
2019-10-02 13:58:37 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
satellite.Repair.Checker.Loop.Restart()
|
|
|
|
satellite.Repair.Checker.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Limiter.Wait()
|
|
|
|
|
|
|
|
// Increase offline count by the difference to trigger repair
|
|
|
|
toKill += repairThreshold - repairOverride
|
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
|
|
|
originalNodes[piece.NodeId] = true
|
|
|
|
if i >= toKill {
|
|
|
|
// this means the node will be kept alive for repair
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
nodesToKill[piece.NodeId] = true
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if nodesToKill[node.ID()] {
|
|
|
|
err = planet.StopPeer(node)
|
|
|
|
require.NoError(t, err)
|
|
|
|
_, err = satellite.Overlay.Service.UpdateUptime(ctx, node.ID(), false)
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
satellite.Repair.Checker.Loop.Restart()
|
|
|
|
satellite.Repair.Checker.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Limiter.Wait()
|
|
|
|
|
|
|
|
// repair should have been done, due to the override
|
|
|
|
metainfoService := satellite.Metainfo.Service
|
|
|
|
pointer, err = metainfoService.Get(ctx, path)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
// pointer should have the success count of pieces
|
|
|
|
remotePieces = pointer.GetRemote().GetRemotePieces()
|
|
|
|
require.Equal(t, int(pointer.Remote.Redundancy.SuccessThreshold), len(remotePieces))
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2019-07-11 23:44:47 +01:00
|
|
|
// TestDataRepairUploadLimits does the following:
|
|
|
|
// - Uploads test data to nodes
|
|
|
|
// - Get one segment of that data to check in which nodes its pieces are stored
|
|
|
|
// - Kills as many nodes as needed which store such segment pieces
|
|
|
|
// - Triggers data repair
|
|
|
|
// - Verify that the number of pieces which repaired has uploaded don't overpass
|
|
|
|
// the established limit (success threshold + % of excess)
|
|
|
|
func TestDataRepairUploadLimit(t *testing.T) {
|
2020-01-21 10:38:41 +00:00
|
|
|
const (
|
|
|
|
RepairMaxExcessRateOptimalThreshold = 0.05
|
|
|
|
repairThreshold = 5
|
|
|
|
successThreshold = 7
|
|
|
|
maxThreshold = 9
|
|
|
|
)
|
2019-07-11 23:44:47 +01:00
|
|
|
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 13,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
2019-08-20 15:46:39 +01:00
|
|
|
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
2020-01-21 10:38:41 +00:00
|
|
|
|
|
|
|
config.Metainfo.RS.MinThreshold = 3
|
|
|
|
config.Metainfo.RS.RepairThreshold = repairThreshold
|
|
|
|
config.Metainfo.RS.SuccessThreshold = successThreshold
|
|
|
|
config.Metainfo.RS.TotalThreshold = maxThreshold
|
2019-07-11 23:44:47 +01:00
|
|
|
},
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
2019-07-11 23:44:47 +01:00
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var (
|
|
|
|
maxRepairUploadThreshold = int(
|
|
|
|
math.Ceil(
|
2019-08-20 15:46:39 +01:00
|
|
|
float64(successThreshold) * (1 + RepairMaxExcessRateOptimalThreshold),
|
2019-07-11 23:44:47 +01:00
|
|
|
),
|
|
|
|
)
|
|
|
|
ul = planet.Uplinks[0]
|
|
|
|
testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
)
|
|
|
|
|
2020-01-21 10:38:41 +00:00
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-07-11 23:44:47 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
pointer, path := getRemoteSegment(t, ctx, satellite)
|
|
|
|
originalPieces := pointer.GetRemote().GetRemotePieces()
|
|
|
|
require.True(t, len(originalPieces) <= maxThreshold)
|
|
|
|
|
|
|
|
{ // Check that there is enough nodes in the network which don't contain
|
|
|
|
// pieces of the segment for being able to repair the lost pieces
|
|
|
|
availableNumNodes := len(planet.StorageNodes) - len(originalPieces)
|
|
|
|
neededNodesForRepair := maxRepairUploadThreshold - repairThreshold
|
|
|
|
require.Truef(t,
|
|
|
|
availableNumNodes >= neededNodesForRepair,
|
|
|
|
"Not enough remaining nodes in the network for repairing the pieces: have= %d, need= %d",
|
|
|
|
availableNumNodes, neededNodesForRepair,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
|
|
|
originalStorageNodes := make(map[storj.NodeID]struct{})
|
|
|
|
for _, p := range originalPieces {
|
|
|
|
originalStorageNodes[p.NodeId] = struct{}{}
|
|
|
|
}
|
|
|
|
|
|
|
|
killedNodes := make(map[storj.NodeID]struct{})
|
|
|
|
{ // Register nodes of the network which don't have pieces for the segment
|
|
|
|
// to be injured and ill nodes which have pieces of the segment in order
|
|
|
|
// to injure it
|
|
|
|
numNodesToKill := len(originalPieces) - repairThreshold
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if _, ok := originalStorageNodes[node.ID()]; !ok {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(killedNodes) < numNodesToKill {
|
2019-11-15 22:43:06 +00:00
|
|
|
stopNodeByID(t, ctx, planet, node.ID())
|
2019-07-11 23:44:47 +01:00
|
|
|
|
|
|
|
killedNodes[node.ID()] = struct{}{}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
satellite.Repair.Checker.Loop.Restart()
|
|
|
|
satellite.Repair.Checker.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Checker.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.Limiter.Wait()
|
|
|
|
|
|
|
|
// Get the pointer after repair to check the nodes where the pieces are
|
|
|
|
// stored
|
|
|
|
pointer, err = satellite.Metainfo.Service.Get(ctx, path)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
// Check that repair has uploaded missed pieces to an expected number of
|
|
|
|
// nodes
|
|
|
|
afterRepairPieces := pointer.GetRemote().GetRemotePieces()
|
|
|
|
require.Falsef(t,
|
|
|
|
len(afterRepairPieces) > maxRepairUploadThreshold,
|
|
|
|
"Repaired pieces cannot be over max repair upload threshold. maxRepairUploadThreshold= %d, have= %d",
|
|
|
|
maxRepairUploadThreshold, len(afterRepairPieces),
|
|
|
|
)
|
|
|
|
require.Falsef(t,
|
|
|
|
len(afterRepairPieces) < successThreshold,
|
|
|
|
"Repaired pieces shouldn't be under success threshold. successThreshold= %d, have= %d",
|
|
|
|
successThreshold, len(afterRepairPieces),
|
|
|
|
)
|
|
|
|
|
|
|
|
// Check that after repair, the segment doesn't have more pieces on the
|
|
|
|
// killed nodes
|
|
|
|
for _, p := range afterRepairPieces {
|
|
|
|
require.NotContains(t, killedNodes, p.NodeId, "there shouldn't be pieces in killed nodes")
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// getRemoteSegment returns a remote pointer its path from satellite.
|
|
|
|
// nolint:golint
|
|
|
|
func getRemoteSegment(
|
2019-09-17 21:14:49 +01:00
|
|
|
t *testing.T, ctx context.Context, satellite *testplanet.SatelliteSystem,
|
2019-07-11 23:44:47 +01:00
|
|
|
) (_ *pb.Pointer, path string) {
|
|
|
|
t.Helper()
|
|
|
|
|
|
|
|
// get a remote segment from metainfo
|
|
|
|
metainfo := satellite.Metainfo.Service
|
2019-09-25 22:30:41 +01:00
|
|
|
listResponse, _, err := metainfo.List(ctx, "", "", true, 0, 0)
|
2019-07-11 23:44:47 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
for _, v := range listResponse {
|
|
|
|
path := v.GetPath()
|
|
|
|
pointer, err := metainfo.Get(ctx, path)
|
|
|
|
require.NoError(t, err)
|
|
|
|
if pointer.GetType() == pb.Pointer_REMOTE {
|
|
|
|
return pointer, path
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
t.Fatal("satellite doesn't have any remote segment")
|
|
|
|
return nil, ""
|
|
|
|
}
|
|
|
|
|
|
|
|
// nolint:golint
|
|
|
|
func stopNodeByID(t *testing.T, ctx context.Context, planet *testplanet.Planet, nodeID storj.NodeID) {
|
|
|
|
t.Helper()
|
|
|
|
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if node.ID() == nodeID {
|
|
|
|
err := planet.StopPeer(node)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-10-02 13:58:37 +01:00
|
|
|
for _, satellite := range planet.Satellites {
|
2019-11-15 22:43:06 +00:00
|
|
|
err = satellite.Overlay.Service.UpdateCheckIn(ctx, overlay.NodeCheckInInfo{
|
|
|
|
NodeID: node.ID(),
|
|
|
|
Address: &pb.NodeAddress{
|
|
|
|
Address: node.Addr(),
|
|
|
|
},
|
|
|
|
IsUp: true,
|
|
|
|
Version: &pb.NodeVersion{
|
|
|
|
Version: "v0.0.0",
|
|
|
|
CommitHash: "",
|
|
|
|
Timestamp: time.Time{},
|
|
|
|
Release: false,
|
|
|
|
},
|
|
|
|
}, time.Now().UTC().Add(-4*time.Hour))
|
2019-07-11 23:44:47 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-09-16 18:13:24 +01:00
|
|
|
|
|
|
|
// corruptPieceData manipulates piece data on a storage node.
|
|
|
|
func corruptPieceData(ctx context.Context, t *testing.T, planet *testplanet.Planet, corruptedNode *storagenode.Peer, corruptedPieceID storj.PieceID) {
|
|
|
|
t.Helper()
|
|
|
|
|
|
|
|
blobRef := storage.BlobRef{
|
|
|
|
Namespace: planet.Satellites[0].ID().Bytes(),
|
|
|
|
Key: corruptedPieceID.Bytes(),
|
|
|
|
}
|
|
|
|
|
|
|
|
// get currently stored piece data from storagenode
|
|
|
|
reader, err := corruptedNode.Storage2.BlobsCache.Open(ctx, blobRef)
|
|
|
|
require.NoError(t, err)
|
|
|
|
pieceSize, err := reader.Size()
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.True(t, pieceSize > 0)
|
|
|
|
pieceData := make([]byte, pieceSize)
|
|
|
|
n, err := io.ReadFull(reader, pieceData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.EqualValues(t, n, pieceSize)
|
|
|
|
|
|
|
|
// delete piece data
|
|
|
|
err = corruptedNode.Storage2.BlobsCache.Delete(ctx, blobRef)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
// corrupt piece data (not PieceHeader) and write back to storagenode
|
|
|
|
// this means repair downloading should fail during piece hash verification
|
|
|
|
pieceData[pieceSize-1]++ // if we don't do this, this test should fail
|
|
|
|
writer, err := corruptedNode.Storage2.BlobsCache.Create(ctx, blobRef, pieceSize)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
n, err = writer.Write(pieceData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.EqualValues(t, n, pieceSize)
|
|
|
|
|
|
|
|
err = writer.Commit(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|