// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package repair_test

import (
	"context"
	"crypto/tls"
	"fmt"
	"io"
	"math"
	"net"
	"testing"
	"time"

	"github.com/stretchr/testify/require"
	"github.com/zeebo/errs"
	"go.uber.org/zap"
	"go.uber.org/zap/zaptest"

	"storj.io/common/memory"
	"storj.io/common/pb"
	"storj.io/common/rpc"
	"storj.io/common/signing"
	"storj.io/common/storj"
	"storj.io/common/testcontext"
	"storj.io/common/testrand"
	"storj.io/common/uuid"
	"storj.io/storj/private/testplanet"
	"storj.io/storj/satellite"
	"storj.io/storj/satellite/accounting"
	"storj.io/storj/satellite/metabase"
	"storj.io/storj/satellite/overlay"
	"storj.io/storj/satellite/repair/checker"
	"storj.io/storj/satellite/repair/queue"
	"storj.io/storj/satellite/repair/repairer"
	"storj.io/storj/satellite/reputation"
	"storj.io/storj/storagenode"
	"storj.io/storj/storagenode/blobstore"
	"storj.io/storj/storagenode/blobstore/testblobs"
	"storj.io/uplink/private/eestream"
	"storj.io/uplink/private/piecestore"
)

// TestDataRepair does the following:
//   - Uploads test data
//   - Kills some nodes and disqualifies 1
//   - Triggers data repair, which repairs the data from the remaining nodes to
//     the number of nodes determined by the upload repair max threshold
//   - Shuts down several nodes, but keeps up a number equal to the minimum
//     threshold
//   - Downloads the data from the remaining nodes and checks that it's the same as the uploaded data.
func TestDataRepairInMemoryBlake(t *testing.T) {
	testDataRepair(t, true, pb.PieceHashAlgorithm_BLAKE3)
}

func TestDataRepairToDiskSHA256(t *testing.T) {
	testDataRepair(t, false, pb.PieceHashAlgorithm_SHA256)
}

func testDataRepair(t *testing.T, inMemoryRepair bool, hashAlgo pb.PieceHashAlgorithm) {
	const (
		RepairMaxExcessRateOptimalThreshold = 0.05
		minThreshold                        = 3
		successThreshold                    = 7
	)

	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 14,
		UplinkCount:      1,
		Reconfigure: testplanet.Reconfigure{
			Satellite: testplanet.Combine(
				func(log *zap.Logger, index int, config *satellite.Config) {
					config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
					config.Repairer.InMemoryRepair = inMemoryRepair
				},
				testplanet.ReconfigureRS(minThreshold, 5, successThreshold, 9),
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		// first, upload some remote data
		uplinkPeer := planet.Uplinks[0]
		satellite := planet.Satellites[0]
		// stop audit to prevent possible interactions i.e. repair timeout problems
		satellite.Audit.Worker.Loop.Pause()

		satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
		satellite.Repair.Repairer.Loop.Pause()

		for _, storageNode := range planet.StorageNodes {
			storageNode.Storage2.Orders.Sender.Pause()
		}

		testData := testrand.Bytes(8 * memory.KiB)

		err := uplinkPeer.Upload(piecestore.WithPieceHashAlgo(ctx, hashAlgo), satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")

		// calculate how many storagenodes to kill
		redundancy := segment.Redundancy
		minReq := redundancy.RequiredShares
		remotePieces := segment.Pieces
		numPieces := len(remotePieces)
		// disqualify one storage node
		toDisqualify := 1
		toKill := numPieces - toDisqualify - int(minReq)
		require.True(t, toKill >= 1)
		maxNumRepairedPieces := int(
			math.Ceil(
				float64(successThreshold) * (1 + RepairMaxExcessRateOptimalThreshold),
			),
		)
		numStorageNodes := len(planet.StorageNodes)
		// Ensure that there are enough storage nodes to upload repaired segments
		require.Falsef(t,
			(numStorageNodes-toKill-toDisqualify) < maxNumRepairedPieces,
			"there are not enough available nodes for repairing: need=%d, have=%d",
			maxNumRepairedPieces, numStorageNodes-toKill-toDisqualify,
		)

		// kill nodes and track lost pieces
		nodesToKill := make(map[storj.NodeID]bool)
		nodesToDisqualify := make(map[storj.NodeID]bool)
		nodesToKeepAlive := make(map[storj.NodeID]bool)

		var numDisqualified int
		for i, piece := range remotePieces {
			if i >= toKill {
				if numDisqualified < toDisqualify {
					nodesToDisqualify[piece.StorageNode] = true
					numDisqualified++
				}
				nodesToKeepAlive[piece.StorageNode] = true
				continue
			}
			nodesToKill[piece.StorageNode] = true
		}

		for _, node := range planet.StorageNodes {
			if nodesToDisqualify[node.ID()] {
				_, err := satellite.DB.OverlayCache().DisqualifyNode(ctx, node.ID(), time.Now(), overlay.DisqualificationReasonUnknown)
				require.NoError(t, err)
				continue
			}
			if nodesToKill[node.ID()] {
				require.NoError(t, planet.StopNodeAndUpdate(ctx, node))
			}
		}

		_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
		require.NoError(t, err)
		satellite.Repair.Repairer.Loop.Restart()
		satellite.Repair.Repairer.Loop.TriggerWait()
		satellite.Repair.Repairer.Loop.Pause()
		satellite.Repair.Repairer.WaitForPendingRepairs()

		// repaired segment should not contain any piece in the killed and DQ nodes
		segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")

		nodesToKillForMinThreshold := len(remotePieces) - minThreshold
		remotePieces = segmentAfter.Pieces
		for _, piece := range remotePieces {
			require.NotContains(t, nodesToKill, piece.StorageNode, "there shouldn't be pieces in killed nodes")
			require.NotContains(t, nodesToDisqualify, piece.StorageNode, "there shouldn't be pieces in DQ nodes")

			// Kill the original nodes which were kept alive to ensure that we can
			// download from the new nodes to which the repaired pieces have been uploaded
			if _, ok := nodesToKeepAlive[piece.StorageNode]; ok && nodesToKillForMinThreshold > 0 {
				require.NoError(t, planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode)))
				nodesToKillForMinThreshold--
			}
		}

		{
			// test that, while repairing, order limits without a specified bucket are counted correctly
			// for storage node repair bandwidth usage and the storage nodes will be paid for that

			require.NoError(t, planet.WaitForStorageNodeEndpoints(ctx))
			for _, storageNode := range planet.StorageNodes {
				storageNode.Storage2.Orders.SendOrders(ctx, time.Now().Add(24*time.Hour))
			}
			repairSettled := make(map[storj.NodeID]uint64)
			err = satellite.DB.StoragenodeAccounting().GetBandwidthSince(ctx, time.Time{}, func(c context.Context, sbr *accounting.StoragenodeBandwidthRollup) error {
				if sbr.Action == uint(pb.PieceAction_GET_REPAIR) {
					repairSettled[sbr.NodeID] += sbr.Settled
				}
				return nil
			})
			require.NoError(t, err)
			require.Equal(t, minThreshold, len(repairSettled))

			for _, value := range repairSettled {
				// TODO verify node ids
				require.NotZero(t, value)
			}
		}

		// we should be able to download data without any of the original nodes
		newData, err := uplinkPeer.Download(ctx, satellite, "testbucket", "test/path")
		require.NoError(t, err)
		require.Equal(t, newData, testData)
	})
}
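
// Note on the helper used above: getRemoteSegment is a test helper defined
// elsewhere in this package that looks up the single remote segment of the
// uploaded object in the satellite's metabase. A minimal sketch of what such
// a helper is assumed to do (the exact metabase call is an assumption):
//
//	segments, err := satellite.Metabase.DB.TestingAllSegments(ctx)
//	require.NoError(t, err)
//	require.Len(t, segments, 1)
//	require.NotEmpty(t, segments[0].Pieces)
//	return segments[0], segments[0].Position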

// TestDataRepairPendingObject does the following:
//   - Starts a new multipart upload with one part of test data. Does not complete the multipart upload.
//   - Kills some nodes and disqualifies 1
//   - Triggers data repair, which repairs the data from the remaining nodes to
//     the number of nodes determined by the upload repair max threshold
//   - Shuts down several nodes, but keeps up a number equal to the minimum
//     threshold
//   - Completes the multipart upload.
//   - Downloads the data from the remaining nodes and checks that it's the same as the uploaded data.
func TestDataRepairPendingObject(t *testing.T) {
	const (
		RepairMaxExcessRateOptimalThreshold = 0.05
		minThreshold                        = 3
		successThreshold                    = 7
	)

	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 14,
		UplinkCount:      1,
		Reconfigure: testplanet.Reconfigure{
			Satellite: testplanet.Combine(
				func(log *zap.Logger, index int, config *satellite.Config) {
					config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
					config.Repairer.InMemoryRepair = true
				},
				testplanet.ReconfigureRS(minThreshold, 5, successThreshold, 9),
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {

		// first, start a new multipart upload and upload one part with some remote data
		uplinkPeer := planet.Uplinks[0]
		satellite := planet.Satellites[0]
		// stop audit to prevent possible interactions i.e. repair timeout problems
		satellite.Audit.Worker.Loop.Pause()

		satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
		satellite.Repair.Repairer.Loop.Pause()

		testData := testrand.Bytes(8 * memory.KiB)

		project, err := planet.Uplinks[0].OpenProject(ctx, planet.Satellites[0])
		require.NoError(t, err)
		defer ctx.Check(project.Close)

		_, err = project.EnsureBucket(ctx, "testbucket")
		require.NoError(t, err)

		// upload pending object
		info, err := project.BeginUpload(ctx, "testbucket", "test/path", nil)
		require.NoError(t, err)
		upload, err := project.UploadPart(ctx, "testbucket", "test/path", info.UploadID, 7)
		require.NoError(t, err)
		_, err = upload.Write(testData)
		require.NoError(t, err)
		require.NoError(t, upload.Commit())

		segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")

		// calculate how many storagenodes to kill
		redundancy := segment.Redundancy
		minReq := redundancy.RequiredShares
		remotePieces := segment.Pieces
		numPieces := len(remotePieces)
		// disqualify one storage node
		toDisqualify := 1
		toKill := numPieces - toDisqualify - int(minReq)
		require.True(t, toKill >= 1)
		maxNumRepairedPieces := int(
			math.Ceil(
				float64(successThreshold) * (1 + RepairMaxExcessRateOptimalThreshold),
			),
		)
		numStorageNodes := len(planet.StorageNodes)
		// Ensure that there are enough storage nodes to upload repaired segments
		require.Falsef(t,
			(numStorageNodes-toKill-toDisqualify) < maxNumRepairedPieces,
			"there are not enough available nodes for repairing: need=%d, have=%d",
			maxNumRepairedPieces, numStorageNodes-toKill-toDisqualify,
		)

		// kill nodes and track lost pieces
		nodesToKill := make(map[storj.NodeID]bool)
		nodesToDisqualify := make(map[storj.NodeID]bool)
		nodesToKeepAlive := make(map[storj.NodeID]bool)

		var numDisqualified int
		for i, piece := range remotePieces {
			if i >= toKill {
				if numDisqualified < toDisqualify {
					nodesToDisqualify[piece.StorageNode] = true
					numDisqualified++
				}
				nodesToKeepAlive[piece.StorageNode] = true
				continue
			}
			nodesToKill[piece.StorageNode] = true
		}

		for _, node := range planet.StorageNodes {
			if nodesToDisqualify[node.ID()] {
				_, err := satellite.DB.OverlayCache().DisqualifyNode(ctx, node.ID(), time.Now(), overlay.DisqualificationReasonUnknown)
				require.NoError(t, err)
				continue
			}
			if nodesToKill[node.ID()] {
				require.NoError(t, planet.StopNodeAndUpdate(ctx, node))
			}
		}

		_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
		require.NoError(t, err)
		satellite.Repair.Repairer.Loop.Restart()
		satellite.Repair.Repairer.Loop.TriggerWait()
		satellite.Repair.Repairer.Loop.Pause()
		satellite.Repair.Repairer.WaitForPendingRepairs()

		// repaired segment should not contain any piece in the killed and DQ nodes
		segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")

		nodesToKillForMinThreshold := len(remotePieces) - minThreshold
		remotePieces = segmentAfter.Pieces
		for _, piece := range remotePieces {
			require.NotContains(t, nodesToKill, piece.StorageNode, "there shouldn't be pieces in killed nodes")
			require.NotContains(t, nodesToDisqualify, piece.StorageNode, "there shouldn't be pieces in DQ nodes")

			// Kill the original nodes which were kept alive to ensure that we can
			// download from the new nodes to which the repaired pieces have been uploaded
			if _, ok := nodesToKeepAlive[piece.StorageNode]; ok && nodesToKillForMinThreshold > 0 {
				require.NoError(t, planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode)))
				nodesToKillForMinThreshold--
			}
		}

		// complete the pending multipart upload
		_, err = project.CommitUpload(ctx, "testbucket", "test/path", info.UploadID, nil)
		require.NoError(t, err)

		// we should be able to download data without any of the original nodes
		newData, err := uplinkPeer.Download(ctx, satellite, "testbucket", "test/path")
		require.NoError(t, err)
		require.Equal(t, newData, testData)
	})
}
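
// The test above relies on parts committed via UploadPart already being stored
// as regular remote segments while the multipart object itself is still
// pending; that is why the checker and repairer can process the segment before
// CommitUpload finalizes the object.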

// TestMinRequiredDataRepair does the following:
//   - Uploads test data
//   - Kills all but the minimum number of nodes carrying the uploaded segment
//   - Triggers data repair, which attempts to repair the data from the remaining nodes to
//     the number of nodes determined by the upload repair max threshold
//   - Expects that the repair succeeds and that reputation info is updated for all remaining nodes.
func TestMinRequiredDataRepair(t *testing.T) {
	const RepairMaxExcessRateOptimalThreshold = 0.05

	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 15,
		UplinkCount:      1,
		Reconfigure: testplanet.Reconfigure{
			Satellite: testplanet.Combine(
				func(log *zap.Logger, index int, config *satellite.Config) {
					config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
					config.Repairer.InMemoryRepair = true
					config.Repairer.ReputationUpdateEnabled = true
					config.Reputation.InitialAlpha = 1
					config.Reputation.InitialBeta = 0.01
					config.Reputation.AuditLambda = 0.95
				},
				testplanet.ReconfigureRS(4, 4, 9, 9),
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		uplinkPeer := planet.Uplinks[0]
		satellite := planet.Satellites[0]
		// stop audit to prevent possible interactions i.e. repair timeout problems
		satellite.Audit.Worker.Loop.Pause()

		satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
		satellite.Repair.Repairer.Loop.Pause()

		var testData = testrand.Bytes(8 * memory.KiB)
		// first, upload some remote data
		err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
		require.Equal(t, 9, len(segment.Pieces))
		require.Equal(t, 4, int(segment.Redundancy.RequiredShares))
		toKill := 5

		// kill nodes and track lost pieces
		var availableNodes storj.NodeIDList
		var killedNodes storj.NodeIDList

		for i, piece := range segment.Pieces {
			if i >= toKill {
				availableNodes = append(availableNodes, piece.StorageNode)
				continue
			}

			err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
			require.NoError(t, err)
			killedNodes = append(killedNodes, piece.StorageNode)
		}
		require.Equal(t, 4, len(availableNodes))

		// Here we use a different reputation service from the one the
		// repairer is reporting to. To get correct results in a short
		// amount of time, we have to flush all cached node info using
		// TestFlushAllNodeInfo(), below.
		reputationService := planet.Satellites[0].Reputation.Service

		nodesReputation := make(map[storj.NodeID]reputation.Info)
		for _, nodeID := range availableNodes {
			info, err := reputationService.Get(ctx, nodeID)
			require.NoError(t, err)
			nodesReputation[nodeID] = *info
		}

		// trigger checker with ranged loop to add segment to repair queue
		_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
		require.NoError(t, err)
		satellite.Repair.Repairer.Loop.Restart()
		satellite.Repair.Repairer.Loop.TriggerWait()
		satellite.Repair.Repairer.Loop.Pause()
		satellite.Repair.Repairer.WaitForPendingRepairs()
		err = satellite.Repairer.Reputation.TestFlushAllNodeInfo(ctx)
		require.NoError(t, err)
		err = reputationService.TestFlushAllNodeInfo(ctx)
		require.NoError(t, err)

		for _, nodeID := range availableNodes {
			info, err := reputationService.Get(ctx, nodeID)
			require.NoError(t, err)

			infoBefore := nodesReputation[nodeID]
			require.Equal(t, infoBefore.TotalAuditCount+1, info.TotalAuditCount)
			require.Equal(t, infoBefore.AuditSuccessCount+1, info.AuditSuccessCount)
			require.Greater(t, reputationRatio(*info), reputationRatio(infoBefore))
		}

		// repair succeeded, so the segment should not contain any killed node
		segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
		for _, piece := range segmentAfter.Pieces {
			require.NotContains(t, killedNodes, piece.StorageNode, "there should be no killed nodes in pointer")
		}
	})
}
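
// reputationRatio, used in the assertions above, is a helper defined elsewhere
// in this package that reduces a reputation.Info to its audit score. A sketch
// of the assumed definition:
//
//	func reputationRatio(info reputation.Info) float64 {
//		return info.AuditReputationAlpha / (info.AuditReputationAlpha + info.AuditReputationBeta)
//	}
//
// With InitialAlpha=1, InitialBeta=0.01 and AuditLambda=0.95 configured above,
// every successful repair download grows alpha relative to beta, so the ratio
// strictly increases for nodes whose pieces were fetched successfully.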

// TestFailedDataRepair does the following:
//   - Uploads test data
//   - Kills some nodes carrying the uploaded segment but keeps it above the minimum requirement
//   - On one of the remaining nodes, returns an unknown error while downloading the piece
//   - Stops one of the remaining nodes, for it to be offline during repair
//   - Triggers data repair, which attempts to repair the data from the remaining nodes to
//     the number of nodes determined by the upload repair max threshold
//   - Expects that the repair fails, the pointer is not updated, and reputation
//     info stays unchanged for all remaining nodes.
func TestFailedDataRepair(t *testing.T) {
	const RepairMaxExcessRateOptimalThreshold = 0.05

	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 15,
		UplinkCount:      1,
		Reconfigure: testplanet.Reconfigure{
			StorageNodeDB: func(index int, db storagenode.DB, log *zap.Logger) (storagenode.DB, error) {
				return testblobs.NewBadDB(log.Named("baddb"), db), nil
			},
			Satellite: testplanet.Combine(
				func(log *zap.Logger, index int, config *satellite.Config) {
					config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
					config.Repairer.InMemoryRepair = true
					config.Repairer.ReputationUpdateEnabled = true
					config.Reputation.InitialAlpha = 1
					config.Reputation.InitialBeta = 0.01
					config.Reputation.AuditLambda = 0.95
				},
				testplanet.ReconfigureRS(4, 5, 9, 9),
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		uplinkPeer := planet.Uplinks[0]
		satellite := planet.Satellites[0]
		// stop audit to prevent possible interactions i.e. repair timeout problems
		satellite.Audit.Worker.Loop.Pause()

		satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
		satellite.Repair.Repairer.Loop.Pause()

		var testData = testrand.Bytes(8 * memory.KiB)
		// first, upload some remote data
		err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
		require.Equal(t, 9, len(segment.Pieces))
		require.Equal(t, 4, int(segment.Redundancy.RequiredShares))
		toKill := 4

		// kill nodes and track lost pieces
		var availablePieces metabase.Pieces
		var originalNodes storj.NodeIDList

		for i, piece := range segment.Pieces {
			originalNodes = append(originalNodes, piece.StorageNode)
			if i >= toKill {
				availablePieces = append(availablePieces, piece)
				continue
			}

			err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
			require.NoError(t, err)
		}
		require.Equal(t, 5, len(availablePieces))

		// choose first piece for shutting down node, for it to always be in the first limiter batch
		offlinePiece := availablePieces[0]
		// choose last piece for bad node, for it to always be in the last limiter batch
		unknownPiece := availablePieces[4]

		// stop offline node
		offlineNode := planet.FindNode(offlinePiece.StorageNode)
		require.NotNil(t, offlineNode)
		require.NoError(t, planet.StopPeer(offlineNode))

		// set unknown error for download from bad node
		badNode := planet.FindNode(unknownPiece.StorageNode)
		require.NotNil(t, badNode)
		badNodeDB := badNode.DB.(*testblobs.BadDB)
		badNodeDB.SetError(errs.New("unknown error"))

		reputationService := satellite.Repairer.Reputation

		nodesReputation := make(map[storj.NodeID]reputation.Info)
		for _, piece := range availablePieces {
			info, err := reputationService.Get(ctx, piece.StorageNode)
			require.NoError(t, err)
			nodesReputation[piece.StorageNode] = *info
		}

		satellite.Repair.Repairer.TestingSetMinFailures(2) // expecting one erroring node, one offline node
		_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
		require.NoError(t, err)
		satellite.Repair.Repairer.Loop.Restart()
		satellite.Repair.Repairer.Loop.TriggerWait()
		satellite.Repair.Repairer.Loop.Pause()
		satellite.Repair.Repairer.WaitForPendingRepairs()

		nodesReputationAfter := make(map[storj.NodeID]reputation.Info)
		for _, piece := range availablePieces {
			info, err := reputationService.Get(ctx, piece.StorageNode)
			require.NoError(t, err)
			nodesReputationAfter[piece.StorageNode] = *info
		}

		// repair shouldn't update audit status
		for _, piece := range availablePieces {
			successfulNodeReputation := nodesReputation[piece.StorageNode]
			successfulNodeReputationAfter := nodesReputationAfter[piece.StorageNode]
			require.Equal(t, successfulNodeReputation.TotalAuditCount, successfulNodeReputationAfter.TotalAuditCount)
			require.Equal(t, successfulNodeReputation.AuditSuccessCount, successfulNodeReputationAfter.AuditSuccessCount)
			require.Equal(t, successfulNodeReputation.AuditReputationAlpha, successfulNodeReputationAfter.AuditReputationAlpha)
			require.Equal(t, successfulNodeReputation.AuditReputationBeta, successfulNodeReputationAfter.AuditReputationBeta)
		}

		// repair should fail, so segment should contain all the original nodes
		segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
		for _, piece := range segmentAfter.Pieces {
			require.Contains(t, originalNodes, piece.StorageNode, "there should be no new nodes in pointer")
		}
	})
}
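
// Why the repair above is expected to fail: with testplanet.ReconfigureRS(4, 5, 9, 9)
// the segment needs at least 4 pieces to be reconstructed. Killing 4 of the 9
// nodes leaves 5 pieces, and of those one node is offline and one returns an
// unknown error, so only 3 pieces are retrievable, which is below the required
// minimum. The repairer therefore gives up without modifying the segment or
// the nodes' audit reputation.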

// TestOfflineNodeDataRepair does the following:
//   - Uploads test data
//   - Kills some nodes carrying the uploaded segment but keeps it above the minimum requirement
//   - Stops one of the remaining nodes, for it to be offline during repair
//   - Triggers data repair, which attempts to repair the data from the remaining nodes to
//     the number of nodes determined by the upload repair max threshold
//   - Expects that the repair succeeds and the pointer still contains the offline piece.
//     Reputation info is updated for all remaining nodes.
func TestOfflineNodeDataRepair(t *testing.T) {
	const RepairMaxExcessRateOptimalThreshold = 0.05

	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 15,
		UplinkCount:      1,
		Reconfigure: testplanet.Reconfigure{
			Satellite: testplanet.Combine(
				func(log *zap.Logger, index int, config *satellite.Config) {
					config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
					config.Repairer.InMemoryRepair = true
					config.Repairer.ReputationUpdateEnabled = true
					config.Reputation.InitialAlpha = 1
					config.Reputation.InitialBeta = 0.01
					config.Reputation.AuditLambda = 0.95
				},
				testplanet.ReconfigureRS(3, 4, 9, 9),
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		uplinkPeer := planet.Uplinks[0]
		satellite := planet.Satellites[0]
		// stop audit to prevent possible interactions i.e. repair timeout problems
		satellite.Audit.Worker.Loop.Pause()

		satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
		satellite.Repair.Repairer.Loop.Pause()

		var testData = testrand.Bytes(8 * memory.KiB)
		// first, upload some remote data
		err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
		require.Equal(t, 9, len(segment.Pieces))
		require.Equal(t, 3, int(segment.Redundancy.RequiredShares))
		toKill := 5

		// kill nodes and track lost pieces
		var availablePieces metabase.Pieces
		var killedNodes storj.NodeIDList

		for i, piece := range segment.Pieces {
			if i >= toKill {
				availablePieces = append(availablePieces, piece)
				continue
			}

			err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
			require.NoError(t, err)
			killedNodes = append(killedNodes, piece.StorageNode)
		}
		require.Equal(t, 4, len(availablePieces))
		require.Equal(t, 5, len(killedNodes))

		// choose first piece for shutting down node, for it to always be in the first limiter batch
		offlinePiece := availablePieces[0]

		// stop offline node
		offlineNode := planet.FindNode(offlinePiece.StorageNode)
		require.NotNil(t, offlineNode)
		require.NoError(t, planet.StopPeer(offlineNode))

		reputationService := satellite.Repairer.Reputation

		nodesReputation := make(map[storj.NodeID]reputation.Info)
		for _, piece := range availablePieces {
			info, err := reputationService.Get(ctx, piece.StorageNode)
			require.NoError(t, err)
			nodesReputation[piece.StorageNode] = *info
		}

		satellite.Repair.Repairer.TestingSetMinFailures(1) // expect one offline node
		// trigger checker with ranged loop to add segment to repair queue
		_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
		require.NoError(t, err)
		satellite.Repair.Repairer.Loop.Restart()
		satellite.Repair.Repairer.Loop.TriggerWait()
		satellite.Repair.Repairer.Loop.Pause()
		satellite.Repair.Repairer.WaitForPendingRepairs()

		nodesReputationAfter := make(map[storj.NodeID]reputation.Info)
		for _, piece := range availablePieces {
			info, err := reputationService.Get(ctx, piece.StorageNode)
			require.NoError(t, err)
			nodesReputationAfter[piece.StorageNode] = *info
		}

		// repair should update audit status
		for _, piece := range availablePieces[1:] {
			successfulNodeReputation := nodesReputation[piece.StorageNode]
			successfulNodeReputationAfter := nodesReputationAfter[piece.StorageNode]
			require.Equal(t, successfulNodeReputation.TotalAuditCount+1, successfulNodeReputationAfter.TotalAuditCount)
			require.Equal(t, successfulNodeReputation.AuditSuccessCount+1, successfulNodeReputationAfter.AuditSuccessCount)
			require.Greater(t, reputationRatio(successfulNodeReputationAfter), reputationRatio(successfulNodeReputation))
		}

		offlineNodeReputation := nodesReputation[offlinePiece.StorageNode]
		offlineNodeReputationAfter := nodesReputationAfter[offlinePiece.StorageNode]
		require.Equal(t, offlineNodeReputation.TotalAuditCount+1, offlineNodeReputationAfter.TotalAuditCount)
		require.Equal(t, int32(0), offlineNodeReputationAfter.AuditHistory.Windows[0].OnlineCount)

		// repair succeeded, so the segment should not contain any killed node;
		// the offline node's piece should still exist
		segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
		require.Contains(t, segmentAfter.Pieces, offlinePiece, "offline piece should still be in segment")
		for _, piece := range segmentAfter.Pieces {
			require.NotContains(t, killedNodes, piece.StorageNode, "there should be no killed nodes in pointer")
		}
	})
}
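
// In the offline-node case above the repair still succeeds: 4 pieces remain
// and only 3 are required, so losing the offline node's piece during download
// is tolerable. The offline node is recorded as audited but not online
// (TotalAuditCount is incremented while the audit history window shows an
// OnlineCount of 0), and its piece is intentionally left in the segment.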

// TestUnknownErrorDataRepair does the following:
//   - Uploads test data
//   - Kills some nodes carrying the uploaded segment but keeps it above the minimum requirement
//   - On one of the remaining nodes, returns an unknown error while downloading the piece
//   - Triggers data repair, which attempts to repair the data from the remaining nodes to
//     the number of nodes determined by the upload repair max threshold
//   - Expects that the repair succeeds and the pointer still contains the unknown-error piece.
//     Reputation info is updated for all remaining nodes.
func TestUnknownErrorDataRepair(t *testing.T) {
	const RepairMaxExcessRateOptimalThreshold = 0.05

	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 15,
		UplinkCount:      1,
		Reconfigure: testplanet.Reconfigure{
			StorageNodeDB: func(index int, db storagenode.DB, log *zap.Logger) (storagenode.DB, error) {
				return testblobs.NewBadDB(log.Named("baddb"), db), nil
			},
			Satellite: testplanet.Combine(
				func(log *zap.Logger, index int, config *satellite.Config) {
					config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
					config.Repairer.InMemoryRepair = true
					config.Repairer.ReputationUpdateEnabled = true
					config.Reputation.InitialAlpha = 1
					config.Reputation.InitialBeta = 0.01
					config.Reputation.AuditLambda = 0.95
				},
				testplanet.ReconfigureRS(3, 4, 9, 9),
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		uplinkPeer := planet.Uplinks[0]
		satellite := planet.Satellites[0]
		// stop audit to prevent possible interactions i.e. repair timeout problems
		satellite.Audit.Worker.Loop.Pause()

		satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
		satellite.Repair.Repairer.Loop.Pause()

		var testData = testrand.Bytes(8 * memory.KiB)
		// first, upload some remote data
		err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
		require.Equal(t, 9, len(segment.Pieces))
		require.Equal(t, 3, int(segment.Redundancy.RequiredShares))
		toKill := 5

		// kill nodes and track lost pieces
		var availablePieces metabase.Pieces
		var killedNodes storj.NodeIDList

		for i, piece := range segment.Pieces {
			if i >= toKill {
				availablePieces = append(availablePieces, piece)
				continue
			}

			err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
			require.NoError(t, err)
			killedNodes = append(killedNodes, piece.StorageNode)
		}
		require.Equal(t, 4, len(availablePieces))
		require.Equal(t, 5, len(killedNodes))

		// choose first piece for the bad node, for it to always be in the first limiter batch
		unknownPiece := availablePieces[0]

		// set unknown error for download from bad node
		badNode := planet.FindNode(unknownPiece.StorageNode)
		require.NotNil(t, badNode)
		badNodeDB := badNode.DB.(*testblobs.BadDB)
		badNodeDB.SetError(errs.New("unknown error"))

		reputationService := satellite.Repairer.Reputation

		nodesReputation := make(map[storj.NodeID]reputation.Info)
		for _, piece := range availablePieces {
			info, err := reputationService.Get(ctx, piece.StorageNode)
			require.NoError(t, err)
			nodesReputation[piece.StorageNode] = *info
		}

		satellite.Repair.Repairer.TestingSetMinFailures(1) // expecting one bad node
		_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
		require.NoError(t, err)
		satellite.Repair.Repairer.Loop.Restart()
		satellite.Repair.Repairer.Loop.TriggerWait()
		satellite.Repair.Repairer.Loop.Pause()
		satellite.Repair.Repairer.WaitForPendingRepairs()

		nodesReputationAfter := make(map[storj.NodeID]reputation.Info)
		for _, piece := range availablePieces {
			info, err := reputationService.Get(ctx, piece.StorageNode)
			require.NoError(t, err)
			nodesReputationAfter[piece.StorageNode] = *info
		}

		// repair should update audit status
		for _, piece := range availablePieces[1:] {
			successfulNodeReputation := nodesReputation[piece.StorageNode]
			successfulNodeReputationAfter := nodesReputationAfter[piece.StorageNode]
			require.Equal(t, successfulNodeReputation.TotalAuditCount+1, successfulNodeReputationAfter.TotalAuditCount)
			require.Equal(t, successfulNodeReputation.AuditSuccessCount+1, successfulNodeReputationAfter.AuditSuccessCount)
			require.Greater(t, reputationRatio(successfulNodeReputationAfter), reputationRatio(successfulNodeReputation))
		}

		badNodeReputation := nodesReputation[unknownPiece.StorageNode]
		badNodeReputationAfter := nodesReputationAfter[unknownPiece.StorageNode]
		require.Equal(t, badNodeReputation.TotalAuditCount+1, badNodeReputationAfter.TotalAuditCount)
		require.Less(t, badNodeReputation.UnknownAuditReputationBeta, badNodeReputationAfter.UnknownAuditReputationBeta)
		require.GreaterOrEqual(t, badNodeReputation.UnknownAuditReputationAlpha, badNodeReputationAfter.UnknownAuditReputationAlpha)

		// repair succeeded, so the segment should not contain any killed node;
		// the unknown-error node's piece should still exist
		segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
		require.Contains(t, segmentAfter.Pieces, unknownPiece, "unknown piece should still be in segment")
		for _, piece := range segmentAfter.Pieces {
			require.NotContains(t, killedNodes, piece.StorageNode, "there should be no killed nodes in pointer")
		}
	})
}
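
// Unknown errors are tracked separately from hard audit failures: the
// assertions above expect the bad node's UnknownAuditReputation alpha/beta
// pair to worsen (beta grows, alpha does not), while its main audit score and
// its piece in the segment are left alone. Only confirmed audit failures move
// the main alpha/beta score toward disqualification.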

// TestMissingPieceDataRepair_Succeed does the following:
//   - Uploads test data
//   - Kills some nodes carrying the uploaded segment but keeps it above the minimum requirement
//   - On one of the remaining nodes, deletes the piece data being stored by that node
//   - Triggers data repair, which attempts to repair the data from the remaining nodes to
//     the number of nodes determined by the upload repair max threshold
//   - Expects that the repair succeeds and the pointer no longer contains the missing piece.
//     Reputation info is updated for all remaining nodes.
func TestMissingPieceDataRepair_Succeed(t *testing.T) {
	const RepairMaxExcessRateOptimalThreshold = 0.05

	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 15,
		UplinkCount:      1,
		Reconfigure: testplanet.Reconfigure{
			Satellite: testplanet.Combine(
				func(log *zap.Logger, index int, config *satellite.Config) {
					config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
					config.Repairer.InMemoryRepair = true
					config.Repairer.ReputationUpdateEnabled = true
					config.Reputation.InitialAlpha = 1
					config.Reputation.InitialBeta = 0.01
					config.Reputation.AuditLambda = 0.95
				},
				testplanet.ReconfigureRS(3, 4, 9, 9),
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		uplinkPeer := planet.Uplinks[0]
		satellite := planet.Satellites[0]
		// stop audit to prevent possible interactions i.e. repair timeout problems
		satellite.Audit.Worker.Loop.Pause()

		satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
		satellite.Repair.Repairer.Loop.Pause()

		var testData = testrand.Bytes(8 * memory.KiB)
		// first, upload some remote data
		err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
		require.Equal(t, 9, len(segment.Pieces))
		require.Equal(t, 3, int(segment.Redundancy.RequiredShares))
		toKill := 5

		// kill nodes and track lost pieces
		var availablePieces metabase.Pieces

		for i, piece := range segment.Pieces {
			if i >= toKill {
				availablePieces = append(availablePieces, piece)
				continue
			}

			err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
			require.NoError(t, err)
		}
		require.Equal(t, 4, len(availablePieces))

		// choose first piece for deletion, for it to always be in the first limiter batch
		missingPiece := availablePieces[0]

		// delete piece
		missingPieceNode := planet.FindNode(missingPiece.StorageNode)
		require.NotNil(t, missingPieceNode)
		pieceID := segment.RootPieceID.Derive(missingPiece.StorageNode, int32(missingPiece.Number))
		err = missingPieceNode.Storage2.Store.Delete(ctx, satellite.ID(), pieceID)
		require.NoError(t, err)

		reputationService := satellite.Repairer.Reputation

		nodesReputation := make(map[storj.NodeID]reputation.Info)
		for _, piece := range availablePieces {
			info, err := reputationService.Get(ctx, piece.StorageNode)
			require.NoError(t, err)
			nodesReputation[piece.StorageNode] = *info
		}

		satellite.Repair.Repairer.TestingSetMinFailures(1) // expect one node to have a missing piece
		// trigger checker with ranged loop to add segment to repair queue
		_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
		require.NoError(t, err)
		satellite.Repair.Repairer.Loop.Restart()
		satellite.Repair.Repairer.Loop.TriggerWait()
		satellite.Repair.Repairer.Loop.Pause()
		satellite.Repair.Repairer.WaitForPendingRepairs()

		nodesReputationAfter := make(map[storj.NodeID]reputation.Info)
		for _, piece := range availablePieces {
			info, err := reputationService.Get(ctx, piece.StorageNode)
			require.NoError(t, err)
			nodesReputationAfter[piece.StorageNode] = *info
		}

		// repair should update audit status
		for _, piece := range availablePieces[1:] {
			successfulNodeReputation := nodesReputation[piece.StorageNode]
			successfulNodeReputationAfter := nodesReputationAfter[piece.StorageNode]
			require.Equal(t, successfulNodeReputation.TotalAuditCount+1, successfulNodeReputationAfter.TotalAuditCount)
			require.Equal(t, successfulNodeReputation.AuditSuccessCount+1, successfulNodeReputationAfter.AuditSuccessCount)
			require.Greater(t, reputationRatio(successfulNodeReputationAfter), reputationRatio(successfulNodeReputation))
		}

		missingPieceNodeReputation := nodesReputation[missingPiece.StorageNode]
		missingPieceNodeReputationAfter := nodesReputationAfter[missingPiece.StorageNode]
		require.Equal(t, missingPieceNodeReputation.TotalAuditCount+1, missingPieceNodeReputationAfter.TotalAuditCount)
		require.Less(t, reputationRatio(missingPieceNodeReputationAfter), reputationRatio(missingPieceNodeReputation))

		// repair succeeded, so segment should not contain missing piece
		segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
		for _, piece := range segmentAfter.Pieces {
			require.NotEqual(t, piece.Number, missingPiece.Number, "there should be no missing piece in pointer")
		}
	})
}
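
// A missing piece, unlike an offline node, counts as a failed audit for that
// node: the assertions above expect its TotalAuditCount to grow while its
// reputationRatio drops, and the repaired segment no longer references the
// piece number that could not be fetched.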

// TestMissingPieceDataRepair does the following:
//   - Uploads test data
//   - Kills all but the minimum number of nodes carrying the uploaded segment
//   - On one of the remaining nodes, deletes the piece data being stored by that node
//   - Triggers data repair, which attempts to repair the data from the remaining nodes to
//     the number of nodes determined by the upload repair max threshold
//   - Expects that the repair fails and the pointer is not updated.
//     Reputation info is updated only for the node missing the piece.
func TestMissingPieceDataRepair(t *testing.T) {
	const RepairMaxExcessRateOptimalThreshold = 0.05

	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 15,
		UplinkCount:      1,
		Reconfigure: testplanet.Reconfigure{
			Satellite: testplanet.Combine(
				func(log *zap.Logger, index int, config *satellite.Config) {
					config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
					config.Repairer.InMemoryRepair = true
					config.Repairer.ReputationUpdateEnabled = true
					config.Reputation.InitialAlpha = 1
					config.Reputation.AuditLambda = 0.95
				},
				testplanet.ReconfigureRS(4, 4, 9, 9),
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		uplinkPeer := planet.Uplinks[0]
		satellite := planet.Satellites[0]
		// stop audit to prevent possible interactions i.e. repair timeout problems
		satellite.Audit.Worker.Loop.Pause()

		satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
		satellite.Repair.Repairer.Loop.Pause()

		var testData = testrand.Bytes(8 * memory.KiB)
		// first, upload some remote data
		err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
		require.Equal(t, 9, len(segment.Pieces))
		require.Equal(t, 4, int(segment.Redundancy.RequiredShares))
		toKill := 5

		// kill nodes and track lost pieces
		originalNodes := make(map[storj.NodeID]bool)
		var availablePieces metabase.Pieces

		for i, piece := range segment.Pieces {
			originalNodes[piece.StorageNode] = true
			if i >= toKill {
				availablePieces = append(availablePieces, piece)
				continue
			}

			err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
			require.NoError(t, err)
		}
		require.Equal(t, 4, len(availablePieces))

		missingPiece := availablePieces[0]

		// delete piece
		missingPieceNode := planet.FindNode(missingPiece.StorageNode)
		require.NotNil(t, missingPieceNode)
		pieceID := segment.RootPieceID.Derive(missingPiece.StorageNode, int32(missingPiece.Number))
		err = missingPieceNode.Storage2.Store.Delete(ctx, satellite.ID(), pieceID)
		require.NoError(t, err)

		reputationService := satellite.Repairer.Reputation

		nodesReputation := make(map[storj.NodeID]reputation.Info)
		for _, piece := range availablePieces {
			info, err := reputationService.Get(ctx, piece.StorageNode)
			require.NoError(t, err)
			nodesReputation[piece.StorageNode] = *info
		}

		var successful []repairer.PieceFetchResult
		satellite.Repairer.SegmentRepairer.OnTestingPiecesReportHook = func(pieces repairer.FetchResultReport) {
			successful = pieces.Successful
		}

		satellite.Repair.Repairer.TestingSetMinFailures(1) // expect one missing piece
		_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
		require.NoError(t, err)
		satellite.Repair.Repairer.Loop.Restart()
		satellite.Repair.Repairer.Loop.TriggerWait()
		satellite.Repair.Repairer.Loop.Pause()
		satellite.Repair.Repairer.WaitForPendingRepairs()

		nodesReputationAfter := make(map[storj.NodeID]reputation.Info)
		for _, piece := range availablePieces {
			info, err := reputationService.Get(ctx, piece.StorageNode)
			require.NoError(t, err)
			nodesReputationAfter[piece.StorageNode] = *info
		}

		// repair shouldn't update audit status
		for _, result := range successful {
			successfulNodeReputation := nodesReputation[result.Piece.StorageNode]
			successfulNodeReputationAfter := nodesReputationAfter[result.Piece.StorageNode]
			require.Equal(t, successfulNodeReputation.TotalAuditCount, successfulNodeReputationAfter.TotalAuditCount)
			require.Equal(t, successfulNodeReputation.AuditSuccessCount, successfulNodeReputationAfter.AuditSuccessCount)
			require.Equal(t, successfulNodeReputation.AuditReputationAlpha, successfulNodeReputationAfter.AuditReputationAlpha)
			require.Equal(t, successfulNodeReputation.AuditReputationBeta, successfulNodeReputationAfter.AuditReputationBeta)
		}

		// repair should fail, so segment should contain all the original nodes
		segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
		for _, piece := range segmentAfter.Pieces {
			require.Contains(t, originalNodes, piece.StorageNode, "there should be no new nodes in pointer")
		}
	})
}
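
// Here the failure path mirrors TestFailedDataRepair: with
// testplanet.ReconfigureRS(4, 4, 9, 9) only 4 pieces survive the kills, and
// deleting one piece leaves 3 retrievable, below the minimum of 4, so the
// repair aborts. The OnTestingPiecesReportHook captures which fetches the
// repairer considered successful, letting the test verify that even those
// nodes saw no reputation change from the aborted repair.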

// TestCorruptDataRepair_Succeed does the following:
//   - Uploads test data using different hash algorithms (Blake3 and SHA256)
//   - Kills some nodes carrying the uploaded segment but keeps it above the minimum requirement
//   - On one of the remaining nodes, corrupts the piece data being stored by that node
//   - Triggers data repair, which attempts to repair the data from the remaining nodes to
//     the number of nodes determined by the upload repair max threshold
//   - Expects that the repair succeeds and the pointer no longer contains the corrupted piece.
//     Reputation info is updated for all remaining nodes.
|
2022-03-29 11:42:21 +01:00
|
|
|
func TestCorruptDataRepair_Succeed(t *testing.T) {
|
2019-09-06 20:20:36 +01:00
|
|
|
const RepairMaxExcessRateOptimalThreshold = 0.05
|
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
for _, tt := range []struct {
|
|
|
|
name string
|
|
|
|
hashAlgo pb.PieceHashAlgorithm
|
|
|
|
}{
|
|
|
|
{
|
|
|
|
name: "BLAKE3",
|
|
|
|
hashAlgo: pb.PieceHashAlgorithm_BLAKE3,
|
2019-09-06 20:20:36 +01:00
|
|
|
},
|
2023-05-10 10:23:38 +01:00
|
|
|
{
|
|
|
|
name: "SHA256",
|
|
|
|
hashAlgo: pb.PieceHashAlgorithm_SHA256,
|
|
|
|
},
|
|
|
|
} {
|
|
|
|
tt := tt
|
|
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 15,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.Combine(
|
|
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
|
|
|
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
|
|
|
config.Repairer.InMemoryRepair = true
|
|
|
|
config.Repairer.ReputationUpdateEnabled = true
|
|
|
|
config.Reputation.InitialAlpha = 1
|
|
|
|
config.Reputation.AuditLambda = 0.95
|
|
|
|
},
|
|
|
|
testplanet.ReconfigureRS(3, 4, 9, 9),
|
|
|
|
),
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
|
|
|
|
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
|
|
|
err := uplinkPeer.Upload(piecestore.WithPieceHashAlgo(ctx, tt.hashAlgo), satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
require.Equal(t, 9, len(segment.Pieces))
|
|
|
|
require.Equal(t, 3, int(segment.Redundancy.RequiredShares))
|
|
|
|
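// kill 5 of the 9 nodes; 4 pieces remain, which with an RS minimum of 3 is still enough to repair even if one of them turns out to be corrupted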
toKill := 5
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
// kill nodes and track lost pieces
|
|
|
|
var availablePieces metabase.Pieces
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
for i, piece := range segment.Pieces {
|
|
|
|
if i >= toKill {
|
|
|
|
availablePieces = append(availablePieces, piece)
|
|
|
|
continue
|
|
|
|
}
|
2020-05-07 09:23:40 +01:00
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
require.Equal(t, 4, len(availablePieces))
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
// choose first piece for corruption, for it to always be in the first limiter batch
|
|
|
|
corruptedPiece := availablePieces[0]
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
// corrupt piece data
|
|
|
|
corruptedNode := planet.FindNode(corruptedPiece.StorageNode)
|
|
|
|
require.NotNil(t, corruptedNode)
|
|
|
|
corruptedPieceID := segment.RootPieceID.Derive(corruptedPiece.StorageNode, int32(corruptedPiece.Number))
|
|
|
|
corruptPieceData(ctx, t, planet, corruptedNode, corruptedPieceID)
|
2019-09-13 17:21:20 +01:00
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
reputationService := satellite.Repairer.Reputation
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
nodesReputation := make(map[storj.NodeID]reputation.Info)
|
|
|
|
for _, piece := range availablePieces {
|
|
|
|
info, err := reputationService.Get(ctx, piece.StorageNode)
|
|
|
|
require.NoError(t, err)
|
|
|
|
nodesReputation[piece.StorageNode] = *info
|
|
|
|
}
|
2019-09-16 18:13:24 +01:00
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
satellite.Repair.Repairer.TestingSetMinFailures(1) // expect one node with bad data
|
|
|
|
// trigger checker with ranged loop to add segment to repair queue
|
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
|
|
|
|
|
|
|
nodesReputationAfter := make(map[storj.NodeID]reputation.Info)
|
|
|
|
for _, piece := range availablePieces {
|
|
|
|
info, err := reputationService.Get(ctx, piece.StorageNode)
|
|
|
|
require.NoError(t, err)
|
|
|
|
nodesReputationAfter[piece.StorageNode] = *info
|
|
|
|
}
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
// repair should update audit status
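// reputationRatio is a small helper defined elsewhere in this file; it is assumed here to return AuditReputationAlpha / (AuditReputationAlpha + AuditReputationBeta)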
|
|
|
|
for _, piece := range availablePieces[1:] {
|
|
|
|
successfulNodeReputation := nodesReputation[piece.StorageNode]
|
|
|
|
successfulNodeReputationAfter := nodesReputationAfter[piece.StorageNode]
|
|
|
|
require.Equal(t, successfulNodeReputation.TotalAuditCount+1, successfulNodeReputationAfter.TotalAuditCount)
|
|
|
|
require.Equal(t, successfulNodeReputation.AuditSuccessCount+1, successfulNodeReputationAfter.AuditSuccessCount)
|
|
|
|
require.GreaterOrEqual(t, reputationRatio(successfulNodeReputationAfter), reputationRatio(successfulNodeReputation))
|
|
|
|
}
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
corruptedNodeReputation := nodesReputation[corruptedPiece.StorageNode]
|
|
|
|
corruptedNodeReputationAfter := nodesReputationAfter[corruptedPiece.StorageNode]
|
|
|
|
require.Equal(t, corruptedNodeReputation.TotalAuditCount+1, corruptedNodeReputationAfter.TotalAuditCount)
|
|
|
|
require.Less(t, reputationRatio(corruptedNodeReputationAfter), reputationRatio(corruptedNodeReputation))
|
2021-08-03 14:21:27 +01:00
|
|
|
|
2023-05-10 10:23:38 +01:00
|
|
|
// repair succeeded, so segment should not contain corrupted piece
|
|
|
|
segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
for _, piece := range segmentAfter.Pieces {
|
|
|
|
require.NotEqual(t, piece.Number, corruptedPiece.Number, "there should be no corrupted piece in pointer")
|
|
|
|
}
|
|
|
|
})
|
|
|
|
})
|
|
|
|
}
|
2019-09-16 18:13:24 +01:00
|
|
|
}
|
|
|
|
|
2021-08-03 14:21:27 +01:00
|
|
|
// TestCorruptDataRepair_Failed does the following:
|
2022-08-10 16:35:58 +01:00
|
|
|
// - Uploads test data
|
|
|
|
// - Kills all but the minimum number of nodes carrying the uploaded segment
|
|
|
|
// - On one of the remaining nodes, corrupt the piece data being stored by that node
|
|
|
|
// - Triggers data repair, which attempts to repair the data from the remaining nodes to
|
|
|
|
// the number of nodes determined by the upload repair max threshold
|
|
|
|
// - Expects that the repair fails and the pointer is not updated.
|
|
|
|
// Reputation info is expected to be updated for the corrupted node.
|
2022-03-29 11:42:21 +01:00
|
|
|
func TestCorruptDataRepair_Failed(t *testing.T) {
|
2019-09-16 18:13:24 +01:00
|
|
|
const RepairMaxExcessRateOptimalThreshold = 0.05
|
|
|
|
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
2021-08-03 14:21:27 +01:00
|
|
|
StorageNodeCount: 15,
|
2019-09-16 18:13:24 +01:00
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
2020-10-27 17:34:59 +00:00
|
|
|
Satellite: testplanet.Combine(
|
|
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
|
|
|
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
2022-03-29 11:42:21 +01:00
|
|
|
config.Repairer.InMemoryRepair = true
|
2022-11-24 13:02:08 +00:00
|
|
|
config.Repairer.ReputationUpdateEnabled = true
|
2022-08-11 15:17:12 +01:00
|
|
|
config.Reputation.InitialAlpha = 1
|
|
|
|
config.Reputation.AuditLambda = 0.95
|
2020-10-27 17:34:59 +00:00
|
|
|
},
|
2021-08-03 14:21:27 +01:00
|
|
|
testplanet.ReconfigureRS(4, 4, 9, 9),
|
2020-10-27 17:34:59 +00:00
|
|
|
),
|
2019-09-16 18:13:24 +01:00
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2019-09-16 18:13:24 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-09-06 20:20:36 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2021-05-14 16:05:42 +01:00
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
2021-08-03 14:21:27 +01:00
|
|
|
require.Equal(t, 9, len(segment.Pieces))
|
|
|
|
require.Equal(t, 4, int(segment.Redundancy.RequiredShares))
|
|
|
|
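// kill 5 of the 9 nodes; the 4 remaining pieces equal the RS minimum of 4, so losing the corrupted piece leaves too few good pieces and the repair must fail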
toKill := 5
|
2019-09-16 18:13:24 +01:00
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
originalNodes := make(map[storj.NodeID]bool)
|
2021-08-03 14:21:27 +01:00
|
|
|
var availablePieces metabase.Pieces
|
2019-09-16 18:13:24 +01:00
|
|
|
|
2021-08-03 14:21:27 +01:00
|
|
|
for i, piece := range segment.Pieces {
|
2020-12-14 14:29:48 +00:00
|
|
|
originalNodes[piece.StorageNode] = true
|
2019-09-16 18:13:24 +01:00
|
|
|
if i >= toKill {
|
2021-08-03 14:21:27 +01:00
|
|
|
availablePieces = append(availablePieces, piece)
|
2019-09-16 18:13:24 +01:00
|
|
|
continue
|
|
|
|
}
|
2020-05-07 09:23:40 +01:00
|
|
|
|
2020-12-14 14:29:48 +00:00
|
|
|
err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
|
2020-05-07 09:23:40 +01:00
|
|
|
require.NoError(t, err)
|
2019-09-16 18:13:24 +01:00
|
|
|
}
|
2021-08-03 14:21:27 +01:00
|
|
|
require.Equal(t, 4, len(availablePieces))
|
2019-09-16 18:13:24 +01:00
|
|
|
|
2021-08-03 14:21:27 +01:00
|
|
|
corruptedPiece := availablePieces[0]
|
2019-09-16 18:13:24 +01:00
|
|
|
|
2021-08-03 14:21:27 +01:00
|
|
|
// corrupt piece data
|
|
|
|
corruptedNode := planet.FindNode(corruptedPiece.StorageNode)
|
|
|
|
require.NotNil(t, corruptedNode)
|
|
|
|
corruptedPieceID := segment.RootPieceID.Derive(corruptedPiece.StorageNode, int32(corruptedPiece.Number))
|
2019-09-16 18:13:24 +01:00
|
|
|
corruptPieceData(ctx, t, planet, corruptedNode, corruptedPieceID)
|
|
|
|
|
2022-05-07 20:04:12 +01:00
|
|
|
reputationService := satellite.Repairer.Reputation
|
2021-08-03 14:21:27 +01:00
|
|
|
|
2021-10-04 15:18:41 +01:00
|
|
|
nodesReputation := make(map[storj.NodeID]reputation.Info)
|
|
|
|
for _, piece := range availablePieces {
|
|
|
|
info, err := reputationService.Get(ctx, piece.StorageNode)
|
|
|
|
require.NoError(t, err)
|
|
|
|
nodesReputation[piece.StorageNode] = *info
|
|
|
|
}
|
|
|
|
|
2022-09-19 22:16:48 +01:00
|
|
|
var successful []repairer.PieceFetchResult
|
2022-09-19 22:13:43 +01:00
|
|
|
satellite.Repairer.SegmentRepairer.OnTestingPiecesReportHook = func(report repairer.FetchResultReport) {
|
|
|
|
successful = report.Successful
|
2021-10-04 15:18:41 +01:00
|
|
|
}
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2023-03-31 18:52:27 +01:00
|
|
|
satellite.Repair.Repairer.TestingSetMinFailures(1) // expect one corrupted piece
|
2023-04-25 09:40:22 +01:00
|
|
|
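// trigger checker with ranged loop to add segment to repair queue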
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2020-01-08 18:33:15 +00:00
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
2019-09-06 20:20:36 +01:00
|
|
|
|
2021-10-04 15:18:41 +01:00
|
|
|
nodesReputationAfter := make(map[storj.NodeID]reputation.Info)
|
|
|
|
for _, piece := range availablePieces {
|
|
|
|
info, err := reputationService.Get(ctx, piece.StorageNode)
|
|
|
|
require.NoError(t, err)
|
|
|
|
nodesReputationAfter[piece.StorageNode] = *info
|
|
|
|
}
|
2019-09-13 17:21:20 +01:00
|
|
|
|
2021-11-12 21:04:30 +00:00
|
|
|
// repair shouldn't update audit status
|
2022-09-19 22:13:43 +01:00
|
|
|
for _, result := range successful {
|
2022-09-19 22:16:48 +01:00
|
|
|
successfulNodeReputation := nodesReputation[result.Piece.StorageNode]
|
|
|
|
successfulNodeReputationAfter := nodesReputationAfter[result.Piece.StorageNode]
|
2021-11-12 21:04:30 +00:00
|
|
|
require.Equal(t, successfulNodeReputation.TotalAuditCount, successfulNodeReputationAfter.TotalAuditCount)
|
|
|
|
require.Equal(t, successfulNodeReputation.AuditSuccessCount, successfulNodeReputationAfter.AuditSuccessCount)
|
|
|
|
require.Equal(t, successfulNodeReputation.AuditReputationAlpha, successfulNodeReputationAfter.AuditReputationAlpha)
|
|
|
|
require.Equal(t, successfulNodeReputation.AuditReputationBeta, successfulNodeReputationAfter.AuditReputationBeta)
|
2021-10-04 15:18:41 +01:00
|
|
|
}
|
|
|
|
|
2021-08-03 14:21:27 +01:00
|
|
|
// repair should fail, so segment should contain all the original nodes
|
|
|
|
segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
for _, piece := range segmentAfter.Pieces {
|
|
|
|
require.Contains(t, originalNodes, piece.StorageNode, "there should be no new nodes in pointer")
|
2019-09-06 20:20:36 +01:00
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2021-10-15 23:38:53 +01:00
|
|
|
// TestRepairExpiredSegment
|
2020-04-15 20:20:16 +01:00
|
|
|
// - Upload test data with an expiration time to 7 nodes
|
|
|
|
// - Kill nodes so that repair threshold > online nodes > minimum threshold
|
|
|
|
// - Call checker to add segment to the repair queue
|
|
|
|
// - Advance the repairer's clock past the segment's expiration
|
|
|
|
// - Run the repairer
|
2021-10-15 23:38:53 +01:00
|
|
|
// - Verify segment is no longer in the repair queue.
|
2021-02-01 07:51:47 +00:00
|
|
|
func TestRepairExpiredSegment(t *testing.T) {
|
2020-04-15 20:20:16 +01:00
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 10,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 5, 7, 7),
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
// first, upload some remote data
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Stop()
|
2023-04-24 11:07:16 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2020-04-15 20:20:16 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
|
2020-12-18 08:49:31 +00:00
|
|
|
err := uplinkPeer.UploadWithExpiration(ctx, satellite, "testbucket", "test/path", testData, time.Now().Add(1*time.Hour))
|
2020-04-15 20:20:16 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2021-05-14 16:05:42 +01:00
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
2020-04-15 20:20:16 +01:00
|
|
|
|
|
|
|
// disqualify nodes and track them
|
|
|
|
nodesToDQ := make(map[storj.NodeID]bool)
|
|
|
|
|
|
|
|
// Disqualify 3 nodes so that the segment has 4 healthy pieces left (less than repair threshold)
|
|
|
|
toKill := 3
|
|
|
|
|
2020-12-14 14:29:48 +00:00
|
|
|
remotePieces := segment.Pieces
|
2020-04-15 20:20:16 +01:00
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
|
|
|
if i >= toKill {
|
|
|
|
continue
|
|
|
|
}
|
2020-12-14 14:29:48 +00:00
|
|
|
nodesToDQ[piece.StorageNode] = true
|
2020-04-15 20:20:16 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
for nodeID := range nodesToDQ {
|
2022-10-11 17:13:29 +01:00
|
|
|
_, err := satellite.DB.OverlayCache().DisqualifyNode(ctx, nodeID, time.Now(), overlay.DisqualificationReasonUnknown)
|
2020-04-15 20:20:16 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
// trigger checker with ranged loop to add segment to repair queue
|
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2020-04-15 20:20:16 +01:00
|
|
|
|
|
|
|
// Verify that the segment is on the repair queue
|
|
|
|
count, err := satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
2020-12-18 08:49:31 +00:00
|
|
|
require.Equal(t, 1, count)
|
|
|
|
|
|
|
|
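// move the repairer's clock two hours ahead, past the object's one-hour expiration, so the segment is treated as expired when the repair runs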
satellite.Repair.Repairer.SetNow(func() time.Time {
|
|
|
|
return time.Now().Add(2 * time.Hour)
|
|
|
|
})
|
2020-04-15 20:20:16 +01:00
|
|
|
|
|
|
|
// Run the repairer
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
|
|
|
|
2022-11-11 23:11:40 +00:00
|
|
|
// Verify that the segment is no longer in the queue
|
2020-04-15 20:20:16 +01:00
|
|
|
count, err = satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
2021-10-15 23:38:53 +01:00
|
|
|
require.Equal(t, 0, count)
|
2020-04-15 20:20:16 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2019-10-15 04:39:28 +01:00
|
|
|
// TestRemoveDeletedSegmentFromQueue
|
|
|
|
// - Upload test data to 7 nodes
|
|
|
|
// - Kill nodes so that repair threshold > online nodes > minimum threshold
|
|
|
|
// - Call checker to add segment to the repair queue
|
|
|
|
// - Delete segment from the satellite database
|
|
|
|
// - Run the repairer
|
2020-07-16 15:18:02 +01:00
|
|
|
// - Verify segment is no longer in the repair queue.
|
2019-10-15 04:39:28 +01:00
|
|
|
func TestRemoveDeletedSegmentFromQueue(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 10,
|
|
|
|
UplinkCount: 1,
|
2020-01-21 10:38:41 +00:00
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 5, 7, 7),
|
|
|
|
},
|
2019-10-15 04:39:28 +01:00
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
// first, upload some remote data
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Stop()
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2019-10-15 04:39:28 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-10-15 04:39:28 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2021-05-14 16:05:42 +01:00
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
2019-10-15 04:39:28 +01:00
|
|
|
|
|
|
|
// disqualify nodes and track them
|
|
|
|
nodesToDQ := make(map[storj.NodeID]bool)
|
|
|
|
|
|
|
|
// Disqualify 3 nodes so that the segment has 4 healthy pieces left (less than repair threshold)
|
|
|
|
toKill := 3
|
|
|
|
|
2020-12-14 14:29:48 +00:00
|
|
|
remotePieces := segment.Pieces
|
2019-10-15 04:39:28 +01:00
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
|
|
|
if i >= toKill {
|
|
|
|
continue
|
|
|
|
}
|
2020-12-14 14:29:48 +00:00
|
|
|
nodesToDQ[piece.StorageNode] = true
|
2019-10-15 04:39:28 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
for nodeID := range nodesToDQ {
|
2022-10-11 17:13:29 +01:00
|
|
|
_, err := satellite.DB.OverlayCache().DisqualifyNode(ctx, nodeID, time.Now(), overlay.DisqualificationReasonUnknown)
|
2020-01-03 00:00:18 +00:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-10-15 04:39:28 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// trigger checker to add segment to repair queue
|
2023-04-25 09:40:22 +01:00
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2019-10-15 04:39:28 +01:00
|
|
|
|
|
|
|
// Delete segment from the satellite database
|
2020-02-10 12:18:18 +00:00
|
|
|
err = uplinkPeer.DeleteObject(ctx, satellite, "testbucket", "test/path")
|
2019-10-15 04:39:28 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2021-08-03 14:21:27 +01:00
|
|
|
// Verify that the segment is on the repair queue
|
|
|
|
count, err := satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 1, count)
|
|
|
|
|
|
|
|
// Run the repairer
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
|
|
|
|
|
|
|
// Verify that the segment was removed
|
|
|
|
count, err = satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 0, count)
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// TestSegmentDeletedDuringRepair
|
|
|
|
// - Upload test data to 6 nodes
|
|
|
|
// - Kill nodes so that repair threshold > online nodes >= minimum threshold
|
|
|
|
// - Call checker to add segment to the repair queue
|
|
|
|
// - Delete segment from the satellite database when repair is in progress.
|
|
|
|
// - Run the repairer
|
|
|
|
// - Verify segment is no longer in the repair queue.
|
|
|
|
// - Verify no audit has been recorded.
|
|
|
|
func TestSegmentDeletedDuringRepair(t *testing.T) {
|
|
|
|
const RepairMaxExcessRateOptimalThreshold = 0.05
|
|
|
|
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 10,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.Combine(
|
|
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
|
|
|
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
|
|
|
config.Repairer.InMemoryRepair = true
|
|
|
|
},
|
|
|
|
testplanet.ReconfigureRS(3, 4, 6, 6),
|
|
|
|
),
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2021-08-03 14:21:27 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
require.Equal(t, 6, len(segment.Pieces))
|
|
|
|
require.Equal(t, 3, int(segment.Redundancy.RequiredShares))
|
|
|
|
toKill := 3
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
var availableNodes storj.NodeIDList
|
|
|
|
|
|
|
|
for i, piece := range segment.Pieces {
|
|
|
|
if i >= toKill {
|
|
|
|
availableNodes = append(availableNodes, piece.StorageNode)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
require.Equal(t, 3, len(availableNodes))
|
|
|
|
|
|
|
|
// trigger checker to add segment to repair queue
|
2023-04-25 09:40:22 +01:00
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2021-08-03 14:21:27 +01:00
|
|
|
|
|
|
|
count, err := satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 1, count)
|
|
|
|
|
|
|
|
// delete segment
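// the hook below runs while the repair is in progress (when the repairer checks whether the segment was altered), so deleting the object here simulates a delete racing with an ongoing repair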
|
|
|
|
satellite.Repairer.SegmentRepairer.OnTestingCheckSegmentAlteredHook = func() {
|
|
|
|
err = uplinkPeer.DeleteObject(ctx, satellite, "testbucket", "test/path")
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
|
|
|
|
|
|
|
// Verify that the segment was removed
|
|
|
|
count, err = satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 0, count)
|
|
|
|
|
|
|
|
// Verify that no audit has been recorded for participated nodes.
|
|
|
|
reputationService := satellite.Reputation.Service
|
|
|
|
|
|
|
|
for _, nodeID := range availableNodes {
|
|
|
|
info, err := reputationService.Get(ctx, nodeID)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, int64(0), info.TotalAuditCount)
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// TestSegmentModifiedDuringRepair
|
|
|
|
// - Upload test data to 6 nodes
|
|
|
|
// - Kill nodes so that repair threshold > online nodes >= minimum threshold
|
|
|
|
// - Call checker to add segment to the repair queue
|
|
|
|
// - Modify segment when repair is in progress.
|
|
|
|
// - Run the repairer
|
|
|
|
// - Verify segment is no longer in the repair queue.
|
|
|
|
// - Verify no audit has been recorded.
|
|
|
|
func TestSegmentModifiedDuringRepair(t *testing.T) {
|
|
|
|
const RepairMaxExcessRateOptimalThreshold = 0.05
|
|
|
|
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 10,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.Combine(
|
|
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
|
|
|
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
|
|
|
config.Repairer.InMemoryRepair = true
|
|
|
|
},
|
|
|
|
testplanet.ReconfigureRS(3, 4, 6, 6),
|
|
|
|
),
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2021-08-03 14:21:27 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
require.Equal(t, 6, len(segment.Pieces))
|
|
|
|
require.Equal(t, 3, int(segment.Redundancy.RequiredShares))
|
|
|
|
toKill := 3
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
var availableNodes storj.NodeIDList
|
|
|
|
|
|
|
|
for i, piece := range segment.Pieces {
|
|
|
|
if i >= toKill {
|
|
|
|
availableNodes = append(availableNodes, piece.StorageNode)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
require.Equal(t, 3, len(availableNodes))
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
// trigger checker with ranged loop to add segment to repair queue
|
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2021-08-03 14:21:27 +01:00
|
|
|
|
2019-10-15 04:39:28 +01:00
|
|
|
count, err := satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
2021-08-03 14:21:27 +01:00
|
|
|
require.Equal(t, 1, count)
|
|
|
|
|
|
|
|
// modify segment while repair is in progress
|
|
|
|
satellite.Repairer.SegmentRepairer.OnTestingCheckSegmentAlteredHook = func() {
|
|
|
|
// remove one piece from the segment so that checkIfSegmentAltered fails
|
|
|
|
err = satellite.Metabase.DB.UpdateSegmentPieces(ctx, metabase.UpdateSegmentPieces{
|
|
|
|
StreamID: segment.StreamID,
|
|
|
|
Position: segment.Position,
|
|
|
|
OldPieces: segment.Pieces,
|
|
|
|
NewPieces: append([]metabase.Piece{segment.Pieces[0]}, segment.Pieces[2:]...),
|
|
|
|
NewRedundancy: segment.Redundancy,
|
|
|
|
})
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
2019-10-15 04:39:28 +01:00
|
|
|
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2020-01-08 18:33:15 +00:00
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
2019-10-15 04:39:28 +01:00
|
|
|
|
|
|
|
// Verify that the segment was removed
|
|
|
|
count, err = satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
2021-08-03 14:21:27 +01:00
|
|
|
require.Equal(t, 0, count)
|
|
|
|
|
|
|
|
// Verify that no audit has been recorded for participated nodes.
|
|
|
|
reputationService := satellite.Reputation.Service
|
|
|
|
|
|
|
|
for _, nodeID := range availableNodes {
|
|
|
|
info, err := reputationService.Get(ctx, nodeID)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, int64(0), info.TotalAuditCount)
|
|
|
|
}
|
2019-10-15 04:39:28 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2020-02-24 20:13:12 +00:00
|
|
|
// TestIrreparableSegmentAccordingToOverlay
|
2019-07-30 16:38:25 +01:00
|
|
|
// - Upload test data to 7 nodes
|
2020-02-24 20:13:12 +00:00
|
|
|
// - Disqualify nodes so that repair threshold > online nodes > minimum threshold
|
2019-07-30 16:38:25 +01:00
|
|
|
// - Call checker to add segment to the repair queue
|
2020-02-24 20:13:12 +00:00
|
|
|
// - Disqualify nodes so that online nodes < minimum threshold
|
2019-07-30 16:38:25 +01:00
|
|
|
// - Run the repairer
|
2021-06-15 22:45:31 +01:00
|
|
|
// - Verify segment is still in the repair queue.
|
2020-02-24 20:13:12 +00:00
|
|
|
func TestIrreparableSegmentAccordingToOverlay(t *testing.T) {
|
2019-07-30 16:38:25 +01:00
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 10,
|
|
|
|
UplinkCount: 1,
|
2020-01-21 10:38:41 +00:00
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 5, 7, 7),
|
|
|
|
},
|
2019-07-30 16:38:25 +01:00
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
// first, upload some remote data
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite := planet.Satellites[0]
|
2019-07-30 16:38:25 +01:00
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Audit.Worker.Loop.Stop()
|
2019-07-30 16:38:25 +01:00
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2019-07-30 16:38:25 +01:00
|
|
|
|
|
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-07-30 16:38:25 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2021-06-15 22:45:31 +01:00
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
2019-07-30 16:38:25 +01:00
|
|
|
|
2020-02-24 20:13:12 +00:00
|
|
|
// dq 3 nodes so that pointer has 4 left (less than repair threshold)
|
|
|
|
toDQ := 3
|
2020-12-14 14:29:48 +00:00
|
|
|
remotePieces := segment.Pieces
|
2019-07-30 16:38:25 +01:00
|
|
|
|
2020-02-24 20:13:12 +00:00
|
|
|
for i := 0; i < toDQ; i++ {
|
2022-10-11 17:13:29 +01:00
|
|
|
_, err := satellite.DB.OverlayCache().DisqualifyNode(ctx, remotePieces[i].StorageNode, time.Now(), overlay.DisqualificationReasonUnknown)
|
2020-01-03 00:00:18 +00:00
|
|
|
require.NoError(t, err)
|
2019-07-30 16:38:25 +01:00
|
|
|
}
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
// trigger checker with ranged loop to add segment to repair queue
|
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2019-07-30 16:38:25 +01:00
|
|
|
|
2020-02-24 20:13:12 +00:00
|
|
|
// Disqualify nodes so that online nodes < minimum threshold
|
2019-07-30 16:38:25 +01:00
|
|
|
// This will make the segment irreparable
|
|
|
|
for _, piece := range remotePieces {
|
2022-10-11 17:13:29 +01:00
|
|
|
_, err := satellite.DB.OverlayCache().DisqualifyNode(ctx, piece.StorageNode, time.Now(), overlay.DisqualificationReasonUnknown)
|
2020-01-03 00:00:18 +00:00
|
|
|
require.NoError(t, err)
|
2019-07-30 16:38:25 +01:00
|
|
|
}
|
|
|
|
|
2019-10-15 04:39:28 +01:00
|
|
|
// Verify that the segment is on the repair queue
|
2019-09-12 18:16:39 +01:00
|
|
|
count, err := satellite.DB.RepairQueue().Count(ctx)
|
2019-07-30 16:38:25 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 1, count)
|
|
|
|
|
|
|
|
// Run the repairer
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2020-01-08 18:33:15 +00:00
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
2019-07-30 16:38:25 +01:00
|
|
|
|
2021-06-15 22:45:31 +01:00
|
|
|
// Verify that the irreparable segment is still in repair queue
|
2019-09-12 18:16:39 +01:00
|
|
|
count, err = satellite.DB.RepairQueue().Count(ctx)
|
2019-07-30 16:38:25 +01:00
|
|
|
require.NoError(t, err)
|
2021-06-15 22:45:31 +01:00
|
|
|
require.Equal(t, 1, count)
|
2020-02-24 20:13:12 +00:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// TestIrreparableSegmentNodesOffline
|
|
|
|
// - Upload test data to 7 nodes
|
|
|
|
// - Disqualify nodes so that repair threshold > online nodes > minimum threshold
|
|
|
|
// - Call checker to add segment to the repair queue
|
|
|
|
// - Kill (as opposed to disqualifying) nodes so that online nodes < minimum threshold
|
|
|
|
// - Run the repairer
|
2021-06-15 22:45:31 +01:00
|
|
|
// - Verify segment is still in the repair queue.
|
2020-02-24 20:13:12 +00:00
|
|
|
func TestIrreparableSegmentNodesOffline(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 10,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 5, 7, 7),
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
// first, upload some remote data
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Stop()
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2020-02-24 20:13:12 +00:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2021-06-15 22:45:31 +01:00
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, uplinkPeer.Projects[0].ID, "testbucket")
|
2020-02-24 20:13:12 +00:00
|
|
|
|
|
|
|
// kill 3 nodes and mark them as offline so that pointer has 4 left from overlay
|
|
|
|
// perspective (less than repair threshold)
|
|
|
|
toMarkOffline := 3
|
2020-12-14 14:29:48 +00:00
|
|
|
remotePieces := segment.Pieces
|
2020-02-24 20:13:12 +00:00
|
|
|
|
2020-05-07 09:23:40 +01:00
|
|
|
for _, piece := range remotePieces[:toMarkOffline] {
|
2020-12-14 14:29:48 +00:00
|
|
|
node := planet.FindNode(piece.StorageNode)
|
2020-05-07 09:23:40 +01:00
|
|
|
|
|
|
|
err := planet.StopNodeAndUpdate(ctx, node)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2020-02-24 20:13:12 +00:00
|
|
|
err = updateNodeCheckIn(ctx, satellite.DB.OverlayCache(), node, false, time.Now().Add(-24*time.Hour))
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
// trigger checker with ranged loop to add segment to repair queue
|
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2020-02-24 20:13:12 +00:00
|
|
|
|
|
|
|
// Verify that the segment is on the repair queue
|
|
|
|
count, err := satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 1, count)
|
|
|
|
|
|
|
|
// Kill 2 extra nodes so that the number of available pieces is less than the minimum
|
2020-05-07 09:23:40 +01:00
|
|
|
for _, piece := range remotePieces[toMarkOffline : toMarkOffline+2] {
|
2020-12-14 14:29:48 +00:00
|
|
|
err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
|
2020-05-07 09:23:40 +01:00
|
|
|
require.NoError(t, err)
|
2020-02-24 20:13:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Mark nodes as online again so that online nodes > minimum threshold
|
|
|
|
// This will make the repair worker attempt to download the pieces
|
2020-05-07 09:23:40 +01:00
|
|
|
for _, piece := range remotePieces[:toMarkOffline] {
|
2020-12-14 14:29:48 +00:00
|
|
|
node := planet.FindNode(piece.StorageNode)
|
2020-02-24 20:13:12 +00:00
|
|
|
err := updateNodeCheckIn(ctx, satellite.DB.OverlayCache(), node, true, time.Now())
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Run the repairer
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
|
|
|
|
2021-06-15 22:45:31 +01:00
|
|
|
// Verify that the irreparable segment is still in repair queue
|
2020-02-24 20:13:12 +00:00
|
|
|
count, err = satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
2021-06-15 22:45:31 +01:00
|
|
|
require.Equal(t, 1, count)
|
2019-07-30 16:38:25 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2021-06-15 22:45:31 +01:00
|
|
|
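// updateNodeCheckIn simulates a check-in for the given storage node in the overlay cache, so tests can mark a node as up or down without running the node itself.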
func updateNodeCheckIn(ctx context.Context, overlayDB overlay.DB, node *testplanet.StorageNode, isUp bool, timestamp time.Time) error {
|
|
|
|
local := node.Contact.Service.Local()
|
|
|
|
checkInInfo := overlay.NodeCheckInInfo{
|
|
|
|
NodeID: node.ID(),
|
|
|
|
Address: &pb.NodeAddress{
|
|
|
|
Address: local.Address,
|
|
|
|
},
|
|
|
|
LastIPPort: local.Address,
|
2022-12-13 20:40:15 +00:00
|
|
|
LastNet: local.Address,
|
2021-06-15 22:45:31 +01:00
|
|
|
IsUp: isUp,
|
|
|
|
Operator: &local.Operator,
|
|
|
|
Capacity: &local.Capacity,
|
|
|
|
Version: &local.Version,
|
|
|
|
}
|
|
|
|
return overlayDB.UpdateCheckIn(ctx, checkInInfo, timestamp, overlay.NodeSelectionConfig{})
|
|
|
|
}
|
|
|
|
|
2020-03-11 21:11:46 +00:00
|
|
|
// TestRepairMultipleDisqualifiedAndSuspended does the following:
|
2019-07-01 16:34:42 +01:00
|
|
|
// - Uploads test data to 7 nodes
|
2020-03-11 21:11:46 +00:00
|
|
|
// - Disqualifies 2 nodes and suspends 1 node
|
2019-07-01 16:34:42 +01:00
|
|
|
// - Triggers data repair, which repairs the data from the remaining 4 nodes to additional 3 new nodes
|
|
|
|
// - Shuts down the 4 nodes from which the data was repaired
|
|
|
|
// - Now we have just the 3 new nodes to which the data was repaired
|
|
|
|
// - Downloads the data from these 3 nodes (succeeds because 3 nodes are enough for download)
|
2020-07-16 15:18:02 +01:00
|
|
|
// - Expect newly repaired pointer does not contain the disqualified or suspended nodes.
|
2022-03-29 11:42:21 +01:00
|
|
|
func TestRepairMultipleDisqualifiedAndSuspended(t *testing.T) {
|
2019-07-01 16:34:42 +01:00
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 12,
|
|
|
|
UplinkCount: 1,
|
2020-01-21 10:38:41 +00:00
|
|
|
Reconfigure: testplanet.Reconfigure{
|
2020-10-27 17:34:59 +00:00
|
|
|
Satellite: testplanet.Combine(
|
|
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
2022-03-29 11:42:21 +01:00
|
|
|
config.Repairer.InMemoryRepair = true
|
2020-10-27 17:34:59 +00:00
|
|
|
},
|
|
|
|
testplanet.ReconfigureRS(3, 5, 7, 7),
|
|
|
|
),
|
2020-01-21 10:38:41 +00:00
|
|
|
},
|
2019-07-01 16:34:42 +01:00
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
// first, upload some remote data
|
2019-07-22 20:10:04 +01:00
|
|
|
uplinkPeer := planet.Uplinks[0]
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite := planet.Satellites[0]
|
2019-07-01 16:34:42 +01:00
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2019-07-01 16:34:42 +01:00
|
|
|
|
2019-07-11 23:44:47 +01:00
|
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
2019-07-01 16:34:42 +01:00
|
|
|
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-07-01 16:34:42 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
// get a remote segment from metainfo
|
2021-09-07 09:15:47 +01:00
|
|
|
segments, err := satellite.Metabase.DB.TestingAllSegments(ctx)
|
2019-07-01 16:34:42 +01:00
|
|
|
require.NoError(t, err)
|
2020-12-14 14:29:48 +00:00
|
|
|
require.Len(t, segments, 1)
|
|
|
|
require.False(t, segments[0].Inline())
|
2019-07-01 16:34:42 +01:00
|
|
|
|
|
|
|
// calculate how many storagenodes to disqualify
|
|
|
|
numStorageNodes := len(planet.StorageNodes)
|
2020-12-14 14:29:48 +00:00
|
|
|
remotePieces := segments[0].Pieces
|
2019-07-01 16:34:42 +01:00
|
|
|
numPieces := len(remotePieces)
|
2020-03-11 21:11:46 +00:00
|
|
|
// sanity check
|
|
|
|
require.EqualValues(t, 7, numPieces)
|
|
|
|
toDisqualify := 2
|
|
|
|
toSuspend := 1
|
2019-07-01 16:34:42 +01:00
|
|
|
// we should have enough storage nodes to repair on
|
2020-03-11 21:11:46 +00:00
|
|
|
require.True(t, (numStorageNodes-toDisqualify-toSuspend) >= numPieces)
|
2019-07-01 16:34:42 +01:00
|
|
|
|
|
|
|
// disqualify nodes and track lost pieces
|
|
|
|
nodesToDisqualify := make(map[storj.NodeID]bool)
|
2020-03-11 21:11:46 +00:00
|
|
|
nodesToSuspend := make(map[storj.NodeID]bool)
|
2019-07-01 16:34:42 +01:00
|
|
|
nodesToKeepAlive := make(map[storj.NodeID]bool)
|
|
|
|
|
2020-03-11 21:11:46 +00:00
|
|
|
// disqualify and suspend nodes
|
|
|
|
for i := 0; i < toDisqualify; i++ {
|
2020-12-14 14:29:48 +00:00
|
|
|
nodesToDisqualify[remotePieces[i].StorageNode] = true
|
2022-10-11 17:13:29 +01:00
|
|
|
_, err := satellite.DB.OverlayCache().DisqualifyNode(ctx, remotePieces[i].StorageNode, time.Now(), overlay.DisqualificationReasonUnknown)
|
2020-03-11 21:11:46 +00:00
|
|
|
require.NoError(t, err)
|
2019-07-01 16:34:42 +01:00
|
|
|
}
|
2020-03-11 21:11:46 +00:00
|
|
|
for i := toDisqualify; i < toDisqualify+toSuspend; i++ {
|
2020-12-14 14:29:48 +00:00
|
|
|
nodesToSuspend[remotePieces[i].StorageNode] = true
|
2021-07-15 15:14:13 +01:00
|
|
|
err := satellite.DB.OverlayCache().TestSuspendNodeUnknownAudit(ctx, remotePieces[i].StorageNode, time.Now())
|
2020-03-11 21:11:46 +00:00
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
for i := toDisqualify + toSuspend; i < len(remotePieces); i++ {
|
2020-12-14 14:29:48 +00:00
|
|
|
nodesToKeepAlive[remotePieces[i].StorageNode] = true
|
2019-07-01 16:34:42 +01:00
|
|
|
}
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
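// refresh the checker's reliability cache so it sees the freshly disqualified and suspended nodes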
err = satellite.RangedLoop.Repair.Observer.RefreshReliabilityCache(ctx)
|
2019-07-08 23:04:35 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
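// trigger checker with ranged loop to add segment to repair queue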
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
2020-01-08 18:33:15 +00:00
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
2019-07-01 16:34:42 +01:00
|
|
|
|
|
|
|
// kill nodes kept alive to ensure repair worked
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if nodesToKeepAlive[node.ID()] {
|
2020-05-07 09:23:40 +01:00
|
|
|
err := planet.StopNodeAndUpdate(ctx, node)
|
|
|
|
require.NoError(t, err)
|
2019-07-01 16:34:42 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// we should be able to download data without any of the original nodes
|
2019-09-12 18:16:39 +01:00
|
|
|
newData, err := uplinkPeer.Download(ctx, satellite, "testbucket", "test/path")
|
2019-07-01 16:34:42 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, testData, newData)
|
|
|
|
|
2021-09-07 09:15:47 +01:00
|
|
|
segments, err = satellite.Metabase.DB.TestingAllSegments(ctx)
|
2019-07-01 16:34:42 +01:00
|
|
|
require.NoError(t, err)
|
2020-12-14 14:29:48 +00:00
|
|
|
require.Len(t, segments, 1)
|
2019-07-01 16:34:42 +01:00
|
|
|
|
2020-12-14 14:29:48 +00:00
|
|
|
remotePieces = segments[0].Pieces
|
2019-07-01 16:34:42 +01:00
|
|
|
for _, piece := range remotePieces {
|
2020-12-14 14:29:48 +00:00
|
|
|
require.False(t, nodesToDisqualify[piece.StorageNode])
|
|
|
|
require.False(t, nodesToSuspend[piece.StorageNode])
|
2019-04-08 18:33:47 +01:00
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
2019-07-01 16:34:42 +01:00
|
|
|
|
2019-10-02 13:58:37 +01:00
|
|
|
// TestDataRepairOverride_HigherLimit does the following:
|
2020-12-05 16:01:42 +00:00
|
|
|
// - Uploads test data
|
|
|
|
// - Kills nodes to fall to the Repair Override Value of the checker but stays above the original Repair Threshold
|
|
|
|
// - Triggers data repair, which attempts to repair the data from the remaining nodes to
|
2022-08-10 16:35:58 +01:00
|
|
|
// the number of nodes determined by the upload repair max threshold
|
2022-03-29 11:42:21 +01:00
|
|
|
func TestDataRepairOverride_HigherLimit(t *testing.T) {
|
2019-10-02 13:58:37 +01:00
|
|
|
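// the checker is configured below to use this override (6) instead of the RS repair threshold (4) when deciding whether the segment needs repair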
const repairOverride = 6
|
|
|
|
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 14,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
2020-10-27 17:34:59 +00:00
|
|
|
Satellite: testplanet.Combine(
|
|
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
2022-03-29 11:42:21 +01:00
|
|
|
config.Repairer.InMemoryRepair = true
|
2020-10-27 18:26:46 +00:00
|
|
|
config.Checker.RepairOverrides = checker.RepairOverrides{
|
|
|
|
List: []checker.RepairOverride{
|
|
|
|
{Min: 3, Success: 9, Total: 9, Override: repairOverride},
|
|
|
|
},
|
|
|
|
}
|
2020-10-27 17:34:59 +00:00
|
|
|
},
|
|
|
|
testplanet.ReconfigureRS(3, 4, 9, 9),
|
|
|
|
),
|
2019-10-02 13:58:37 +01:00
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2019-10-02 13:58:37 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-10-02 13:58:37 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2021-05-14 16:05:42 +01:00
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, uplinkPeer.Projects[0].ID, "testbucket")
|
2019-10-02 13:58:37 +01:00
|
|
|
|
|
|
|
// calculate how many storagenodes to kill
|
|
|
|
// kill enough nodes to drop to the repair override value while staying above the repair threshold, so only the override can trigger repair
|
2020-12-14 14:29:48 +00:00
|
|
|
remotePieces := segment.Pieces
|
2019-10-02 13:58:37 +01:00
|
|
|
numPieces := len(remotePieces)
|
|
|
|
toKill := numPieces - repairOverride
|
|
|
|
require.True(t, toKill >= 1)
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
nodesToKill := make(map[storj.NodeID]bool)
|
|
|
|
originalNodes := make(map[storj.NodeID]bool)
|
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
2020-12-14 14:29:48 +00:00
|
|
|
originalNodes[piece.StorageNode] = true
|
2019-10-02 13:58:37 +01:00
|
|
|
if i >= toKill {
|
|
|
|
// this means the node will be kept alive for repair
|
|
|
|
continue
|
|
|
|
}
|
2020-12-14 14:29:48 +00:00
|
|
|
nodesToKill[piece.StorageNode] = true
|
2019-10-02 13:58:37 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if nodesToKill[node.ID()] {
|
2020-05-07 09:23:40 +01:00
|
|
|
err := planet.StopNodeAndUpdate(ctx, node)
|
|
|
|
require.NoError(t, err)
|
2019-10-02 13:58:37 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
// trigger checker with ranged loop to add segment to repair queue
|
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2019-10-02 13:58:37 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2020-01-08 18:33:15 +00:00
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
2019-10-02 13:58:37 +01:00
|
|
|
|
|
|
|
// repair should have been done, due to the override
|
2021-05-14 16:05:42 +01:00
|
|
|
segment, _ = getRemoteSegment(ctx, t, satellite, uplinkPeer.Projects[0].ID, "testbucket")
|
2019-10-02 13:58:37 +01:00
|
|
|
|
|
|
|
// pointer should have the success count of pieces
|
2020-12-14 14:29:48 +00:00
|
|
|
remotePieces = segment.Pieces
|
|
|
|
require.Equal(t, int(segment.Redundancy.OptimalShares), len(remotePieces))
|
2019-10-02 13:58:37 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// TestDataRepairOverride_LowerLimit does the following:
|
2020-12-05 16:01:42 +00:00
|
|
|
// - Uploads test data
|
|
|
|
// - Kills nodes to fall to the checker's Repair Threshold, which (because of the lower override) should no longer trigger repair
|
|
|
|
// - Starts Checker and Repairer and ensures this is the case.
|
|
|
|
// - Kills more nodes to fall to the Override Value to trigger repair
|
|
|
|
// - Triggers data repair, which attempts to repair the data from the remaining nodes to
|
2022-08-10 16:35:58 +01:00
|
|
|
// the number of nodes determined by the upload repair max threshold
|
2022-03-29 11:42:21 +01:00
|
|
|
func TestDataRepairOverride_LowerLimit(t *testing.T) {
|
2019-10-02 13:58:37 +01:00
|
|
|
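// the checker is configured below to use this override (4) instead of the RS repair threshold (6), so repair is only queued once the number of healthy pieces drops to 4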
const repairOverride = 4
|
|
|
|
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 14,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
2020-10-27 17:34:59 +00:00
|
|
|
Satellite: testplanet.Combine(
|
|
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
2022-03-29 11:42:21 +01:00
|
|
|
config.Repairer.InMemoryRepair = true
|
2020-10-27 18:26:46 +00:00
|
|
|
config.Checker.RepairOverrides = checker.RepairOverrides{
|
|
|
|
List: []checker.RepairOverride{
|
|
|
|
{Min: 3, Success: 9, Total: 9, Override: repairOverride},
|
|
|
|
},
|
|
|
|
}
|
2020-10-27 17:34:59 +00:00
|
|
|
},
|
|
|
|
testplanet.ReconfigureRS(3, 6, 9, 9),
|
|
|
|
),
|
2019-10-02 13:58:37 +01:00
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2019-10-02 13:58:37 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
2020-01-21 10:38:41 +00:00
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-10-02 13:58:37 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2021-05-14 16:05:42 +01:00
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, uplinkPeer.Projects[0].ID, "testbucket")
|
2019-10-02 13:58:37 +01:00
|
|
|
|
|
|
|
// calculate how many storagenodes to kill
|
|
|
|
// to hit the repair threshold
|
2020-12-14 14:29:48 +00:00
|
|
|
remotePieces := segment.Pieces
|
|
|
|
repairThreshold := int(segment.Redundancy.RepairShares)
|
2019-10-02 13:58:37 +01:00
|
|
|
numPieces := len(remotePieces)
|
|
|
|
toKill := numPieces - repairThreshold
|
|
|
|
require.True(t, toKill >= 1)
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
nodesToKill := make(map[storj.NodeID]bool)
|
|
|
|
originalNodes := make(map[storj.NodeID]bool)
|
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
2020-12-14 14:29:48 +00:00
|
|
|
originalNodes[piece.StorageNode] = true
|
2019-10-02 13:58:37 +01:00
|
|
|
if i >= toKill {
|
|
|
|
// this means the node will be kept alive for repair
|
|
|
|
continue
|
|
|
|
}
|
2020-12-14 14:29:48 +00:00
|
|
|
nodesToKill[piece.StorageNode] = true
|
2019-10-02 13:58:37 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if nodesToKill[node.ID()] {
|
2020-05-07 09:23:40 +01:00
|
|
|
err := planet.StopNodeAndUpdate(ctx, node)
|
|
|
|
require.NoError(t, err)
|
2019-10-02 13:58:37 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
// trigger checker with ranged loop to add segment to repair queue
|
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-10-02 13:58:37 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2020-01-08 18:33:15 +00:00
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
2019-10-02 13:58:37 +01:00
|
|
|
|
|
|
|
// Increase offline count by the difference to trigger repair
|
|
|
|
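// repairThreshold (6) - repairOverride (4) = 2 more nodes to kill, leaving 4 healthy pieces so the override finally triggers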
toKill += repairThreshold - repairOverride
|
|
|
|
|
|
|
|
for i, piece := range remotePieces {
|
2020-12-14 14:29:48 +00:00
|
|
|
originalNodes[piece.StorageNode] = true
|
2019-10-02 13:58:37 +01:00
|
|
|
if i >= toKill {
|
|
|
|
// this means the node will be kept alive for repair
|
|
|
|
continue
|
|
|
|
}
|
2020-12-14 14:29:48 +00:00
|
|
|
nodesToKill[piece.StorageNode] = true
|
2019-10-02 13:58:37 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if nodesToKill[node.ID()] {
|
2020-05-07 09:23:40 +01:00
|
|
|
err = planet.StopNodeAndUpdate(ctx, node)
|
2019-10-02 13:58:37 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
// trigger checker with ranged loop to add segment to repair queue
|
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2019-10-02 13:58:37 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2020-01-08 18:33:15 +00:00
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
2019-10-02 13:58:37 +01:00
|
|
|
|
|
|
|
// repair should have been done, due to the override
|
2021-05-14 16:05:42 +01:00
|
|
|
segment, _ = getRemoteSegment(ctx, t, satellite, uplinkPeer.Projects[0].ID, "testbucket")
|
2019-10-02 13:58:37 +01:00
|
|
|
|
|
|
|
// pointer should have the success count of pieces
|
2020-12-14 14:29:48 +00:00
|
|
|
remotePieces = segment.Pieces
|
|
|
|
require.Equal(t, int(segment.Redundancy.OptimalShares), len(remotePieces))
|
2019-10-02 13:58:37 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2019-07-11 23:44:47 +01:00
|
|
|
// TestDataRepairUploadLimits does the following:
|
2020-12-05 16:01:42 +00:00
|
|
|
// - Uploads test data to nodes
|
|
|
|
// - Get one segment of that data to check in which nodes its pieces are stored
|
|
|
|
// - Kills as many nodes as needed which store such segment pieces
|
|
|
|
// - Triggers data repair
|
|
|
|
// - Verify that the number of pieces the repair has uploaded doesn't exceed
|
|
|
|
// the established limit (success threshold + % of excess)
|
2022-03-29 11:42:21 +01:00
|
|
|
func TestDataRepairUploadLimit(t *testing.T) {
|
2020-01-21 10:38:41 +00:00
|
|
|
const (
|
|
|
|
RepairMaxExcessRateOptimalThreshold = 0.05
|
|
|
|
repairThreshold = 5
|
|
|
|
successThreshold = 7
|
|
|
|
maxThreshold = 9
|
|
|
|
)
|
2019-07-11 23:44:47 +01:00
|
|
|
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 13,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
2020-10-27 17:34:59 +00:00
|
|
|
Satellite: testplanet.Combine(
|
|
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
|
|
|
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
|
2022-03-29 11:42:21 +01:00
|
|
|
config.Repairer.InMemoryRepair = true
|
2020-10-27 17:34:59 +00:00
|
|
|
},
|
|
|
|
testplanet.ReconfigureRS(3, repairThreshold, successThreshold, maxThreshold),
|
|
|
|
),
|
2019-07-11 23:44:47 +01:00
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
2019-09-12 18:16:39 +01:00
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2019-07-11 23:44:47 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var (
|
|
|
|
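// with successThreshold = 7 and a 5% excess rate this evaluates to ceil(7 * 1.05) = 8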
maxRepairUploadThreshold = int(
|
|
|
|
math.Ceil(
|
2019-08-20 15:46:39 +01:00
|
|
|
float64(successThreshold) * (1 + RepairMaxExcessRateOptimalThreshold),
|
2019-07-11 23:44:47 +01:00
|
|
|
),
|
|
|
|
)
|
|
|
|
ul = planet.Uplinks[0]
|
|
|
|
testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
)
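// With successThreshold = 7 and RepairMaxExcessRateOptimalThreshold = 0.05,
// maxRepairUploadThreshold evaluates to ceil(7 * 1.05) = 8.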
|
|
|
|
|
2020-01-21 10:38:41 +00:00
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
2019-07-11 23:44:47 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2021-05-14 16:05:42 +01:00
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, ul.Projects[0].ID, "testbucket")
|
2020-12-14 14:29:48 +00:00
|
|
|
|
|
|
|
originalPieces := segment.Pieces
|
2019-07-11 23:44:47 +01:00
|
|
|
require.True(t, len(originalPieces) <= maxThreshold)
|
|
|
|
|
|
|
|
{ // Check that there are enough nodes in the network which don't contain
|
|
|
|
// pieces of the segment to be able to repair the lost pieces
|
|
|
|
availableNumNodes := len(planet.StorageNodes) - len(originalPieces)
|
|
|
|
neededNodesForRepair := maxRepairUploadThreshold - repairThreshold
|
|
|
|
require.Truef(t,
|
|
|
|
availableNumNodes >= neededNodesForRepair,
|
|
|
|
"Not enough remaining nodes in the network for repairing the pieces: have= %d, need= %d",
|
|
|
|
availableNumNodes, neededNodesForRepair,
|
|
|
|
)
|
|
|
|
}
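// In this configuration the check always holds: with 13 storage nodes and at most
// maxThreshold = 9 original pieces there are at least 4 spare nodes, while at most
// maxRepairUploadThreshold - repairThreshold = 8 - 5 = 3 new nodes are needed.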
|
|
|
|
|
|
|
|
originalStorageNodes := make(map[storj.NodeID]struct{})
|
|
|
|
for _, p := range originalPieces {
|
2020-12-14 14:29:48 +00:00
|
|
|
originalStorageNodes[p.StorageNode] = struct{}{}
|
2019-07-11 23:44:47 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
killedNodes := make(map[storj.NodeID]struct{})
|
|
|
|
{ // Kill nodes which hold pieces of the segment until only repairThreshold
|
|
|
|
// pieces remain, so that the segment becomes injured and the checker
|
|
|
|
// queues it for repair
|
|
|
|
numNodesToKill := len(originalPieces) - repairThreshold
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if _, ok := originalStorageNodes[node.ID()]; !ok {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(killedNodes) < numNodesToKill {
|
2020-05-07 09:23:40 +01:00
|
|
|
err := planet.StopNodeAndUpdate(ctx, node)
|
|
|
|
require.NoError(t, err)
|
2019-07-11 23:44:47 +01:00
|
|
|
|
|
|
|
killedNodes[node.ID()] = struct{}{}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
// trigger checker with ranged loop to add segment to repair queue
|
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2019-07-11 23:44:47 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
2020-01-08 18:33:15 +00:00
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
2019-07-11 23:44:47 +01:00
|
|
|
|
|
|
|
// Get the segment after repair to check the nodes where the pieces are
|
|
|
|
// stored
|
2021-05-14 16:05:42 +01:00
|
|
|
segment, _ = getRemoteSegment(ctx, t, satellite, ul.Projects[0].ID, "testbucket")
|
2019-07-11 23:44:47 +01:00
|
|
|
|
|
|
|
// Check that repair has uploaded missed pieces to an expected number of
|
|
|
|
// nodes
|
2020-12-14 14:29:48 +00:00
|
|
|
afterRepairPieces := segment.Pieces
|
2019-07-11 23:44:47 +01:00
|
|
|
require.Falsef(t,
|
|
|
|
len(afterRepairPieces) > maxRepairUploadThreshold,
|
|
|
|
"Repaired pieces cannot be over max repair upload threshold. maxRepairUploadThreshold= %d, have= %d",
|
|
|
|
maxRepairUploadThreshold, len(afterRepairPieces),
|
|
|
|
)
|
|
|
|
require.Falsef(t,
|
|
|
|
len(afterRepairPieces) < successThreshold,
|
|
|
|
"Repaired pieces shouldn't be under success threshold. successThreshold= %d, have= %d",
|
|
|
|
successThreshold, len(afterRepairPieces),
|
|
|
|
)
|
|
|
|
|
|
|
|
// Check that after repair, the segment doesn't have more pieces on the
|
|
|
|
// killed nodes
|
|
|
|
for _, p := range afterRepairPieces {
|
2020-12-14 14:29:48 +00:00
|
|
|
require.NotContains(t, killedNodes, p.StorageNode, "there shouldn't be pieces in killed nodes")
|
2019-07-11 23:44:47 +01:00
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2020-04-23 20:46:16 +01:00
|
|
|
// TestRepairGracefullyExited does the following:
|
|
|
|
// - Uploads test data to 7 nodes
|
|
|
|
// - Sets 3 nodes as gracefully exited
|
|
|
|
// - Triggers data repair, which repairs the data from the remaining 4 nodes to additional 3 new nodes
|
|
|
|
// - Shuts down the 4 nodes from which the data was repaired
|
|
|
|
// - Now we have just the 3 new nodes to which the data was repaired
|
|
|
|
// - Downloads the data from these 3 nodes (succeeds because 3 nodes are enough for download)
|
2020-07-16 15:18:02 +01:00
|
|
|
// - Expects the newly repaired segment not to contain the gracefully exited nodes.
|
2022-03-29 11:42:21 +01:00
|
|
|
func TestRepairGracefullyExited(t *testing.T) {
|
2020-04-23 20:46:16 +01:00
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 12,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
2020-10-27 17:34:59 +00:00
|
|
|
Satellite: testplanet.Combine(
|
|
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
2022-03-29 11:42:21 +01:00
|
|
|
config.Repairer.InMemoryRepair = true
|
2020-10-27 17:34:59 +00:00
|
|
|
},
|
|
|
|
testplanet.ReconfigureRS(3, 5, 7, 7),
|
|
|
|
),
|
2020-04-23 20:46:16 +01:00
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
// first, upload some remote data
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2020-04-23 20:46:16 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2021-05-14 16:05:42 +01:00
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
2020-04-23 20:46:16 +01:00
|
|
|
|
|
|
|
numStorageNodes := len(planet.StorageNodes)
|
2020-12-14 17:33:03 +00:00
|
|
|
remotePieces := segment.Pieces
|
2020-04-23 20:46:16 +01:00
|
|
|
numPieces := len(remotePieces)
|
|
|
|
// sanity check
|
|
|
|
require.EqualValues(t, numPieces, 7)
|
|
|
|
toExit := 3
|
|
|
|
// we should have enough storage nodes to repair on
|
|
|
|
require.True(t, (numStorageNodes-toExit) >= numPieces)
|
|
|
|
|
|
|
|
// gracefully exit nodes and track lost pieces
|
|
|
|
nodesToExit := make(map[storj.NodeID]bool)
|
|
|
|
nodesToKeepAlive := make(map[storj.NodeID]bool)
|
|
|
|
|
|
|
|
// exit nodes
|
|
|
|
for i := 0; i < toExit; i++ {
|
2020-12-14 17:33:03 +00:00
|
|
|
nodesToExit[remotePieces[i].StorageNode] = true
|
2020-04-23 20:46:16 +01:00
|
|
|
req := &overlay.ExitStatusRequest{
|
2020-12-14 17:33:03 +00:00
|
|
|
NodeID: remotePieces[i].StorageNode,
|
2020-04-23 20:46:16 +01:00
|
|
|
ExitInitiatedAt: time.Now(),
|
|
|
|
ExitLoopCompletedAt: time.Now(),
|
|
|
|
ExitFinishedAt: time.Now(),
|
|
|
|
}
|
|
|
|
_, err := satellite.DB.OverlayCache().UpdateExitStatus(ctx, req)
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
for i := toExit; i < len(remotePieces); i++ {
|
2020-12-14 17:33:03 +00:00
|
|
|
nodesToKeepAlive[remotePieces[i].StorageNode] = true
|
2020-04-23 20:46:16 +01:00
|
|
|
}
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
err = satellite.RangedLoop.Repair.Observer.RefreshReliabilityCache(ctx)
|
2020-04-23 20:46:16 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2020-04-23 20:46:16 +01:00
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
|
|
|
|
|
|
|
// kill nodes kept alive to ensure repair worked
|
|
|
|
for _, node := range planet.StorageNodes {
|
|
|
|
if nodesToKeepAlive[node.ID()] {
|
2020-05-07 09:23:40 +01:00
|
|
|
require.NoError(t, planet.StopNodeAndUpdate(ctx, node))
|
2020-04-23 20:46:16 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// we should be able to download data without any of the original nodes
|
|
|
|
newData, err := uplinkPeer.Download(ctx, satellite, "testbucket", "test/path")
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, newData, testData)
|
|
|
|
|
|
|
|
// updated segment should not contain any of the gracefully exited nodes
|
2021-05-14 16:05:42 +01:00
|
|
|
segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
2020-04-23 20:46:16 +01:00
|
|
|
|
2020-12-14 17:33:03 +00:00
|
|
|
remotePieces = segmentAfter.Pieces
|
2020-04-23 20:46:16 +01:00
|
|
|
for _, piece := range remotePieces {
|
2020-12-14 17:33:03 +00:00
|
|
|
require.False(t, nodesToExit[piece.StorageNode])
|
2020-04-23 20:46:16 +01:00
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2019-07-11 23:44:47 +01:00
|
|
|
// getRemoteSegment returns the remote segment and its encoded segment key from the satellite.
|
2022-10-11 12:47:02 +01:00
|
|
|
//
|
|
|
|
//nolint:golint
|
2019-07-11 23:44:47 +01:00
|
|
|
func getRemoteSegment(
|
2021-05-14 16:05:42 +01:00
|
|
|
ctx context.Context, t *testing.T, satellite *testplanet.Satellite, projectID uuid.UUID, bucketName string,
|
2020-12-14 14:29:48 +00:00
|
|
|
) (_ metabase.Segment, key metabase.SegmentKey) {
|
2019-07-11 23:44:47 +01:00
|
|
|
t.Helper()
|
|
|
|
|
2021-09-07 09:15:47 +01:00
|
|
|
objects, err := satellite.Metabase.DB.TestingAllObjects(ctx)
|
2019-07-11 23:44:47 +01:00
|
|
|
require.NoError(t, err)
|
2020-12-14 14:29:48 +00:00
|
|
|
require.Len(t, objects, 1)
|
2019-07-11 23:44:47 +01:00
|
|
|
|
2021-09-07 09:15:47 +01:00
|
|
|
segments, err := satellite.Metabase.DB.TestingAllSegments(ctx)
|
2020-12-14 14:29:48 +00:00
|
|
|
require.NoError(t, err)
|
|
|
|
require.Len(t, segments, 1)
|
|
|
|
require.False(t, segments[0].Inline())
|
|
|
|
|
|
|
|
return segments[0], metabase.SegmentLocation{
|
|
|
|
ProjectID: projectID,
|
|
|
|
BucketName: bucketName,
|
|
|
|
ObjectKey: objects[0].ObjectKey,
|
|
|
|
Position: segments[0].Position,
|
|
|
|
}.Encode()
|
2019-07-11 23:44:47 +01:00
|
|
|
}
|
|
|
|
|
2019-09-16 18:13:24 +01:00
|
|
|
// corruptPieceData manipulates piece data on a storage node.
|
2020-03-27 14:46:40 +00:00
|
|
|
func corruptPieceData(ctx context.Context, t *testing.T, planet *testplanet.Planet, corruptedNode *testplanet.StorageNode, corruptedPieceID storj.PieceID) {
|
2019-09-16 18:13:24 +01:00
|
|
|
t.Helper()
|
|
|
|
|
2023-04-05 18:03:06 +01:00
|
|
|
blobRef := blobstore.BlobRef{
|
2019-09-16 18:13:24 +01:00
|
|
|
Namespace: planet.Satellites[0].ID().Bytes(),
|
|
|
|
Key: corruptedPieceID.Bytes(),
|
|
|
|
}
|
|
|
|
|
|
|
|
// get currently stored piece data from storagenode
|
|
|
|
reader, err := corruptedNode.Storage2.BlobsCache.Open(ctx, blobRef)
|
|
|
|
require.NoError(t, err)
|
|
|
|
pieceSize, err := reader.Size()
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.True(t, pieceSize > 0)
|
|
|
|
pieceData := make([]byte, pieceSize)
|
|
|
|
n, err := io.ReadFull(reader, pieceData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.EqualValues(t, n, pieceSize)
|
|
|
|
|
|
|
|
// delete piece data
|
|
|
|
err = corruptedNode.Storage2.BlobsCache.Delete(ctx, blobRef)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
// corrupt piece data (not PieceHeader) and write back to storagenode
|
|
|
|
// this means repair downloading should fail during piece hash verification
|
|
|
|
pieceData[pieceSize-1]++ // flip the last byte; without this corruption the piece would stay valid and the test would fail
|
|
|
|
writer, err := corruptedNode.Storage2.BlobsCache.Create(ctx, blobRef, pieceSize)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
n, err = writer.Write(pieceData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.EqualValues(t, n, pieceSize)
|
|
|
|
|
|
|
|
err = writer.Commit(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
2021-07-13 14:52:37 +01:00
|
|
|
|
|
|
|
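// mockConnector wraps a real rpc.Connector, recording every address it is asked
// to dial and optionally redirecting dials via dialInstead, so tests can observe
// whether the repairer used cached IP:port values or fell back to DNS lookups.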
type mockConnector struct {
|
|
|
|
realConnector rpc.Connector
|
|
|
|
addressesDialed []string
|
|
|
|
dialInstead map[string]string
|
|
|
|
}
|
|
|
|
|
|
|
|
func (m *mockConnector) DialContext(ctx context.Context, tlsConfig *tls.Config, address string) (rpc.ConnectorConn, error) {
|
|
|
|
m.addressesDialed = append(m.addressesDialed, address)
|
|
|
|
replacement := m.dialInstead[address]
|
|
|
|
if replacement == "" {
|
|
|
|
// allow numeric IP addresses through; return errors for unexpected DNS lookups
|
|
|
|
host, _, err := net.SplitHostPort(address)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if net.ParseIP(host) == nil {
|
|
|
|
return nil, &net.DNSError{
|
|
|
|
Err: "unexpected lookup",
|
|
|
|
Name: address,
|
|
|
|
Server: "a.totally.real.dns.server.i.promise",
|
|
|
|
IsNotFound: true,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
replacement = address
|
|
|
|
}
|
|
|
|
return m.realConnector.DialContext(ctx, tlsConfig, replacement)
|
|
|
|
}
|
|
|
|
|
|
|
|
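// ecRepairerWithMockConnector builds an ECRepairer that dials through the given
// mockConnector while reusing the satellite's TLS options, identity, and repairer
// configuration.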
func ecRepairerWithMockConnector(t testing.TB, sat *testplanet.Satellite, mock *mockConnector) *repairer.ECRepairer {
|
|
|
|
tlsOptions := sat.Dialer.TLSOptions
|
|
|
|
newDialer := rpc.NewDefaultDialer(tlsOptions)
|
|
|
|
mock.realConnector = newDialer.Connector
|
|
|
|
newDialer.Connector = mock
|
|
|
|
|
|
|
|
ec := repairer.NewECRepairer(
|
|
|
|
zaptest.NewLogger(t).Named("a-special-repairer"),
|
|
|
|
newDialer,
|
|
|
|
signing.SigneeFromPeerIdentity(sat.Identity.PeerIdentity()),
|
|
|
|
sat.Config.Repairer.DownloadTimeout,
|
|
|
|
sat.Config.Repairer.InMemoryRepair,
|
|
|
|
)
|
|
|
|
return ec
|
|
|
|
}
|
|
|
|
|
2021-08-03 14:21:27 +01:00
|
|
|
func TestECRepairerGet(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 6,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 3, 6, 6),
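// 3 required, 3 repair, 6 success, 6 total shares: uploads should land on all
// 6 nodes, and any 3 pieces suffice to reconstruct the data.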
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2021-08-03 14:21:27 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
|
|
|
|
ecRepairer := satellite.Repairer.EcRepairer
|
|
|
|
|
|
|
|
redundancy, err := eestream.NewRedundancyStrategyFromStorj(segment.Redundancy)
|
|
|
|
require.NoError(t, err)
|
2023-05-22 13:35:23 +01:00
|
|
|
getOrderLimits, getPrivateKey, cachedIPsAndPorts, err := satellite.Orders.Service.CreateGetRepairOrderLimits(ctx, segment, segment.Pieces)
|
2021-08-03 14:21:27 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2023-03-31 18:52:27 +01:00
|
|
|
_, piecesReport, err := ecRepairer.Get(ctx, getOrderLimits, cachedIPsAndPorts, getPrivateKey, redundancy, int64(segment.EncryptedSize))
|
2021-08-03 14:21:27 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 0, len(piecesReport.Offline))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Failed))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Contained))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Unknown))
|
|
|
|
require.Equal(t, int(segment.Redundancy.RequiredShares), len(piecesReport.Successful))
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestECRepairerGetCorrupted(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 6,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 3, 6, 6),
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2021-08-03 14:21:27 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
require.Equal(t, 6, len(segment.Pieces))
|
|
|
|
require.Equal(t, 3, int(segment.Redundancy.RequiredShares))
|
|
|
|
toKill := 2
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
var corruptedPiece metabase.Piece
|
|
|
|
for i, piece := range segment.Pieces {
|
|
|
|
if i >= toKill {
|
|
|
|
// this means the node will be kept alive for repair
|
|
|
|
// choose piece to corrupt
|
|
|
|
if corruptedPiece.StorageNode.IsZero() {
|
|
|
|
corruptedPiece = piece
|
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
require.False(t, corruptedPiece.StorageNode.IsZero())
|
|
|
|
|
|
|
|
// corrupted node
|
|
|
|
corruptedNode := planet.FindNode(corruptedPiece.StorageNode)
|
|
|
|
require.NotNil(t, corruptedNode)
|
|
|
|
pieceID := segment.RootPieceID.Derive(corruptedPiece.StorageNode, int32(corruptedPiece.Number))
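// the per-node piece ID is derived deterministically from the segment's root
// piece ID, the node ID, and the piece number, so it can be recomputed here
// without asking the storage node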
|
|
|
|
corruptPieceData(ctx, t, planet, corruptedNode, pieceID)
|
|
|
|
|
|
|
|
ecRepairer := satellite.Repairer.EcRepairer
|
|
|
|
|
|
|
|
redundancy, err := eestream.NewRedundancyStrategyFromStorj(segment.Redundancy)
|
|
|
|
require.NoError(t, err)
|
2023-05-22 13:35:23 +01:00
|
|
|
getOrderLimits, getPrivateKey, cachedIPsAndPorts, err := satellite.Orders.Service.CreateGetRepairOrderLimits(ctx, segment, segment.Pieces)
|
2021-08-03 14:21:27 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2023-03-31 18:52:27 +01:00
|
|
|
ecRepairer.TestingSetMinFailures(1)
|
|
|
|
_, piecesReport, err := ecRepairer.Get(ctx, getOrderLimits, cachedIPsAndPorts, getPrivateKey, redundancy, int64(segment.EncryptedSize))
|
2021-08-03 14:21:27 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 0, len(piecesReport.Offline))
|
|
|
|
require.Equal(t, 1, len(piecesReport.Failed))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Contained))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Unknown))
|
|
|
|
require.Equal(t, int(segment.Redundancy.RequiredShares), len(piecesReport.Successful))
|
2022-09-19 22:16:48 +01:00
|
|
|
require.Equal(t, corruptedPiece, piecesReport.Failed[0].Piece)
|
2021-08-03 14:21:27 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestECRepairerGetMissingPiece(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 6,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 3, 6, 6),
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2021-08-03 14:21:27 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
require.Equal(t, 6, len(segment.Pieces))
|
|
|
|
require.Equal(t, 3, int(segment.Redundancy.RequiredShares))
|
|
|
|
toKill := 2
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
var missingPiece metabase.Piece
|
|
|
|
for i, piece := range segment.Pieces {
|
|
|
|
if i >= toKill {
|
|
|
|
// this means the node will be kept alive for repair
|
|
|
|
// choose a piece for deletion
|
|
|
|
if missingPiece.StorageNode.IsZero() {
|
|
|
|
missingPiece = piece
|
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
require.False(t, missingPiece.StorageNode.IsZero())
|
|
|
|
|
|
|
|
// delete piece
|
|
|
|
node := planet.FindNode(missingPiece.StorageNode)
|
|
|
|
require.NotNil(t, node)
|
|
|
|
pieceID := segment.RootPieceID.Derive(missingPiece.StorageNode, int32(missingPiece.Number))
|
|
|
|
err = node.Storage2.Store.Delete(ctx, satellite.ID(), pieceID)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
ecRepairer := satellite.Repairer.EcRepairer
|
|
|
|
|
|
|
|
redundancy, err := eestream.NewRedundancyStrategyFromStorj(segment.Redundancy)
|
|
|
|
require.NoError(t, err)
|
2023-05-22 13:35:23 +01:00
|
|
|
getOrderLimits, getPrivateKey, cachedIPsAndPorts, err := satellite.Orders.Service.CreateGetRepairOrderLimits(ctx, segment, segment.Pieces)
|
2021-08-03 14:21:27 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2023-03-31 18:52:27 +01:00
|
|
|
ecRepairer.TestingSetMinFailures(1)
|
|
|
|
_, piecesReport, err := ecRepairer.Get(ctx, getOrderLimits, cachedIPsAndPorts, getPrivateKey, redundancy, int64(segment.EncryptedSize))
|
2021-08-03 14:21:27 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 0, len(piecesReport.Offline))
|
|
|
|
require.Equal(t, 1, len(piecesReport.Failed))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Contained))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Unknown))
|
|
|
|
require.Equal(t, int(segment.Redundancy.RequiredShares), len(piecesReport.Successful))
|
2022-09-19 22:16:48 +01:00
|
|
|
require.Equal(t, missingPiece, piecesReport.Failed[0].Piece)
|
2021-08-03 14:21:27 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestECRepairerGetOffline(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 6,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 3, 6, 6),
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2021-08-03 14:21:27 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
require.Equal(t, 6, len(segment.Pieces))
|
|
|
|
require.Equal(t, 3, int(segment.Redundancy.RequiredShares))
|
|
|
|
toKill := 2
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
var offlinePiece metabase.Piece
|
|
|
|
for i, piece := range segment.Pieces {
|
|
|
|
if i >= toKill {
|
|
|
|
// choose a node and pieceID to shut down
|
|
|
|
if offlinePiece.StorageNode.IsZero() {
|
|
|
|
offlinePiece = piece
|
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
require.False(t, offlinePiece.StorageNode.IsZero())
|
|
|
|
|
|
|
|
// shutdown node
|
|
|
|
offlineNode := planet.FindNode(offlinePiece.StorageNode)
|
|
|
|
require.NotNil(t, offlineNode)
|
|
|
|
require.NoError(t, planet.StopPeer(offlineNode))
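// StopPeer stops the node without updating the satellite's state (compare
// StopNodeAndUpdate above), so the repairer still dials it and should classify
// the piece as offline rather than failed.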
|
|
|
|
|
|
|
|
ecRepairer := satellite.Repairer.EcRepairer
|
|
|
|
|
|
|
|
redundancy, err := eestream.NewRedundancyStrategyFromStorj(segment.Redundancy)
|
|
|
|
require.NoError(t, err)
|
2023-05-22 13:35:23 +01:00
|
|
|
getOrderLimits, getPrivateKey, cachedIPsAndPorts, err := satellite.Orders.Service.CreateGetRepairOrderLimits(ctx, segment, segment.Pieces)
|
2021-08-03 14:21:27 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2023-03-31 18:52:27 +01:00
|
|
|
ecRepairer.TestingSetMinFailures(1)
|
|
|
|
_, piecesReport, err := ecRepairer.Get(ctx, getOrderLimits, cachedIPsAndPorts, getPrivateKey, redundancy, int64(segment.EncryptedSize))
|
2021-08-03 14:21:27 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 1, len(piecesReport.Offline))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Failed))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Contained))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Unknown))
|
|
|
|
require.Equal(t, int(segment.Redundancy.RequiredShares), len(piecesReport.Successful))
|
2022-09-19 22:16:48 +01:00
|
|
|
require.Equal(t, offlinePiece, piecesReport.Offline[0].Piece)
|
2021-08-03 14:21:27 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestECRepairerGetUnknown(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 6,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
StorageNodeDB: func(index int, db storagenode.DB, log *zap.Logger) (storagenode.DB, error) {
|
|
|
|
return testblobs.NewBadDB(log.Named("baddb"), db), nil
|
|
|
|
},
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 3, 6, 6),
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2021-08-03 14:21:27 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
require.Equal(t, 6, len(segment.Pieces))
|
|
|
|
require.Equal(t, 3, int(segment.Redundancy.RequiredShares))
|
|
|
|
toKill := 2
|
|
|
|
|
|
|
|
// kill nodes and track lost pieces
|
|
|
|
var unknownPiece metabase.Piece
|
|
|
|
for i, piece := range segment.Pieces {
|
|
|
|
if i >= toKill {
|
|
|
|
// choose a node to return unknown error
|
|
|
|
if unknownPiece.StorageNode.IsZero() {
|
|
|
|
unknownPiece = piece
|
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
require.False(t, unknownPiece.StorageNode.IsZero())
|
|
|
|
|
|
|
|
// set unknown error for download from bad node
|
|
|
|
badNode := planet.FindNode(unknownPiece.StorageNode)
|
|
|
|
require.NotNil(t, badNode)
|
|
|
|
badNodeDB := badNode.DB.(*testblobs.BadDB)
|
|
|
|
badNodeDB.SetError(errs.New("unknown error"))
|
|
|
|
|
|
|
|
ecRepairer := satellite.Repairer.EcRepairer
|
|
|
|
|
|
|
|
redundancy, err := eestream.NewRedundancyStrategyFromStorj(segment.Redundancy)
|
|
|
|
require.NoError(t, err)
|
2023-05-22 13:35:23 +01:00
|
|
|
getOrderLimits, getPrivateKey, cachedIPsAndPorts, err := satellite.Orders.Service.CreateGetRepairOrderLimits(ctx, segment, segment.Pieces)
|
2021-08-03 14:21:27 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2023-03-31 18:52:27 +01:00
|
|
|
ecRepairer.TestingSetMinFailures(1)
|
|
|
|
_, piecesReport, err := ecRepairer.Get(ctx, getOrderLimits, cachedIPsAndPorts, getPrivateKey, redundancy, int64(segment.EncryptedSize))
|
2021-08-03 14:21:27 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 0, len(piecesReport.Offline))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Failed))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Contained))
|
|
|
|
require.Equal(t, 1, len(piecesReport.Unknown))
|
|
|
|
require.Equal(t, int(segment.Redundancy.RequiredShares), len(piecesReport.Successful))
|
2022-09-19 22:16:48 +01:00
|
|
|
require.Equal(t, unknownPiece, piecesReport.Unknown[0].Piece)
|
2021-08-03 14:21:27 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestECRepairerGetFailure(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 6,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
StorageNodeDB: func(index int, db storagenode.DB, log *zap.Logger) (storagenode.DB, error) {
|
|
|
|
return testblobs.NewBadDB(log.Named("baddb"), db), nil
|
|
|
|
},
|
|
|
|
Satellite: testplanet.ReconfigureRS(3, 3, 6, 6),
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2021-08-03 14:21:27 +01:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
require.Equal(t, 6, len(segment.Pieces))
|
|
|
|
require.Equal(t, 3, int(segment.Redundancy.RequiredShares))
|
|
|
|
|
|
|
|
// number of storage nodes to kill
|
|
|
|
toKill := 2
|
|
|
|
|
|
|
|
var onlinePieces metabase.Pieces
|
|
|
|
for i, piece := range segment.Pieces {
|
|
|
|
if i >= toKill {
|
|
|
|
onlinePieces = append(onlinePieces, piece)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
err := planet.StopNodeAndUpdate(ctx, planet.FindNode(piece.StorageNode))
|
|
|
|
require.NoError(t, err)
|
|
|
|
}
|
|
|
|
require.Equal(t, 4, len(onlinePieces))
|
|
|
|
|
|
|
|
successfulPiece := onlinePieces[0]
|
|
|
|
offlinePiece := onlinePieces[1]
|
|
|
|
unknownPiece := onlinePieces[2]
|
|
|
|
corruptedPiece := onlinePieces[3]
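// With RequiredShares = 3, a single healthy piece plus one offline, one
// unknown-error, and one corrupted piece is not enough to download, so the Get
// call below is expected to fail while still classifying each piece correctly.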
|
|
|
|
|
|
|
|
// stop offline node
|
|
|
|
offlineNode := planet.FindNode(offlinePiece.StorageNode)
|
|
|
|
require.NotNil(t, offlineNode)
|
|
|
|
require.NoError(t, planet.StopPeer(offlineNode))
|
|
|
|
|
|
|
|
// set unknown error for download from bad node
|
|
|
|
badNode := planet.FindNode(unknownPiece.StorageNode)
|
|
|
|
require.NotNil(t, badNode)
|
|
|
|
badNodeDB := badNode.DB.(*testblobs.BadDB)
|
|
|
|
badNodeDB.SetError(errs.New("unknown error"))
|
|
|
|
|
|
|
|
// corrupt data for corrupted node
|
|
|
|
corruptedNode := planet.FindNode(corruptedPiece.StorageNode)
|
|
|
|
require.NotNil(t, corruptedNode)
|
|
|
|
corruptedPieceID := segment.RootPieceID.Derive(corruptedPiece.StorageNode, int32(corruptedPiece.Number))
|
|
|
|
require.NotNil(t, corruptedPieceID)
|
|
|
|
corruptPieceData(ctx, t, planet, corruptedNode, corruptedPieceID)
|
|
|
|
|
|
|
|
ecRepairer := satellite.Repairer.EcRepairer
|
|
|
|
|
|
|
|
redundancy, err := eestream.NewRedundancyStrategyFromStorj(segment.Redundancy)
|
|
|
|
require.NoError(t, err)
|
2023-05-22 13:35:23 +01:00
|
|
|
getOrderLimits, getPrivateKey, cachedIPsAndPorts, err := satellite.Orders.Service.CreateGetRepairOrderLimits(ctx, segment, segment.Pieces)
|
2021-08-03 14:21:27 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2023-03-31 18:52:27 +01:00
|
|
|
_, piecesReport, err := ecRepairer.Get(ctx, getOrderLimits, cachedIPsAndPorts, getPrivateKey, redundancy, int64(segment.EncryptedSize))
|
2021-08-03 14:21:27 +01:00
|
|
|
require.Error(t, err)
|
|
|
|
require.Equal(t, 1, len(piecesReport.Offline))
|
|
|
|
require.Equal(t, 1, len(piecesReport.Failed))
|
|
|
|
require.Equal(t, 0, len(piecesReport.Contained))
|
|
|
|
require.Equal(t, 1, len(piecesReport.Unknown))
|
|
|
|
require.Equal(t, 1, len(piecesReport.Successful))
|
2022-09-19 22:16:48 +01:00
|
|
|
require.Equal(t, offlinePiece, piecesReport.Offline[0].Piece)
|
|
|
|
require.Equal(t, corruptedPiece, piecesReport.Failed[0].Piece)
|
|
|
|
require.Equal(t, unknownPiece, piecesReport.Unknown[0].Piece)
|
|
|
|
require.Equal(t, successfulPiece, piecesReport.Successful[0].Piece)
|
2021-08-03 14:21:27 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2021-07-13 14:52:37 +01:00
|
|
|
func TestECRepairerGetDoesNameLookupIfNecessary(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
|
|
|
|
testSatellite := planet.Satellites[0]
|
|
|
|
audits := testSatellite.Audit
|
|
|
|
|
|
|
|
audits.Worker.Loop.Pause()
|
2023-04-24 11:07:16 +01:00
|
|
|
testSatellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2021-07-13 14:52:37 +01:00
|
|
|
|
|
|
|
ul := planet.Uplinks[0]
|
|
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
|
|
|
|
err := ul.Upload(ctx, testSatellite, "test.bucket", "some//path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2023-04-24 11:07:16 +01:00
|
|
|
// trigger audit
|
|
|
|
_, err = testSatellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2022-11-11 23:11:40 +00:00
|
|
|
queue := audits.VerifyQueue
|
|
|
|
queueSegment, err := queue.Next(ctx)
|
2021-07-13 14:52:37 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2021-09-07 09:15:47 +01:00
|
|
|
segment, err := testSatellite.Metabase.DB.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
|
2021-07-13 14:52:37 +01:00
|
|
|
StreamID: queueSegment.StreamID,
|
|
|
|
Position: queueSegment.Position,
|
|
|
|
})
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.True(t, len(segment.Pieces) > 1)
|
|
|
|
|
2023-05-22 13:35:23 +01:00
|
|
|
limits, privateKey, cachedNodesInfo, err := testSatellite.Orders.Service.CreateGetRepairOrderLimits(ctx, segment, segment.Pieces)
|
2021-07-13 14:52:37 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
for i, l := range limits {
|
|
|
|
if l == nil {
|
|
|
|
continue
|
|
|
|
}
|
2021-11-08 20:51:04 +00:00
|
|
|
info := cachedNodesInfo[l.Limit.StorageNodeId]
|
|
|
|
info.LastIPPort = fmt.Sprintf("garbageXXX#:%d", i)
|
|
|
|
cachedNodesInfo[l.Limit.StorageNodeId] = info
|
2021-07-13 14:52:37 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
mock := &mockConnector{}
|
|
|
|
ec := ecRepairerWithMockConnector(t, testSatellite, mock)
|
|
|
|
|
|
|
|
redundancy, err := eestream.NewRedundancyStrategyFromStorj(segment.Redundancy)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2023-03-31 18:52:27 +01:00
|
|
|
readCloser, pieces, err := ec.Get(ctx, limits, cachedNodesInfo, privateKey, redundancy, int64(segment.EncryptedSize))
|
2021-07-13 14:52:37 +01:00
|
|
|
require.NoError(t, err)
|
2021-08-03 14:21:27 +01:00
|
|
|
require.Len(t, pieces.Failed, 0)
|
2021-07-13 14:52:37 +01:00
|
|
|
require.NotNil(t, readCloser)
|
|
|
|
|
|
|
|
// repair will only download the minimum required number of pieces
|
|
|
|
minReq := redundancy.RequiredCount()
|
|
|
|
var numDialed int
|
2021-11-08 20:51:04 +00:00
|
|
|
for _, info := range cachedNodesInfo {
|
2021-07-13 14:52:37 +01:00
|
|
|
for _, dialed := range mock.addressesDialed {
|
2021-11-08 20:51:04 +00:00
|
|
|
if dialed == info.LastIPPort {
|
2021-07-13 14:52:37 +01:00
|
|
|
numDialed++
|
|
|
|
if numDialed == minReq {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if numDialed == minReq {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
require.True(t, numDialed == minReq)
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
func TestECRepairerGetPrefersCachedIPPort(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
|
|
|
|
testSatellite := planet.Satellites[0]
|
|
|
|
audits := testSatellite.Audit
|
|
|
|
|
|
|
|
audits.Worker.Loop.Pause()
|
2023-04-24 11:07:16 +01:00
|
|
|
testSatellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2021-07-13 14:52:37 +01:00
|
|
|
|
|
|
|
ul := planet.Uplinks[0]
|
|
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
|
|
|
|
err := ul.Upload(ctx, testSatellite, "test.bucket", "some//path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2023-04-24 11:07:16 +01:00
|
|
|
// trigger audit
|
|
|
|
_, err = testSatellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2022-11-11 23:11:40 +00:00
|
|
|
queue := audits.VerifyQueue
|
|
|
|
queueSegment, err := queue.Next(ctx)
|
2021-07-13 14:52:37 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
2021-09-07 09:15:47 +01:00
|
|
|
segment, err := testSatellite.Metabase.DB.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
|
2021-07-13 14:52:37 +01:00
|
|
|
StreamID: queueSegment.StreamID,
|
|
|
|
Position: queueSegment.Position,
|
|
|
|
})
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.True(t, len(segment.Pieces) > 1)
|
|
|
|
|
2023-05-22 13:35:23 +01:00
|
|
|
limits, privateKey, cachedNodesInfo, err := testSatellite.Orders.Service.CreateGetRepairOrderLimits(ctx, segment, segment.Pieces)
|
2021-07-13 14:52:37 +01:00
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
// make it so that when the cached IP is dialed, we dial the "right" address,
|
|
|
|
// but when the "right" address is dialed (meaning it came from the OrderLimit,
|
|
|
|
// we dial something else!
|
|
|
|
mock := &mockConnector{
|
|
|
|
dialInstead: make(map[string]string),
|
|
|
|
}
|
|
|
|
var realAddresses []string
|
|
|
|
for i, l := range limits {
|
|
|
|
if l == nil {
|
|
|
|
continue
|
|
|
|
}
|
2021-11-08 20:51:04 +00:00
|
|
|
|
|
|
|
info := cachedNodesInfo[l.Limit.StorageNodeId]
|
|
|
|
info.LastIPPort = fmt.Sprintf("garbageXXX#:%d", i)
|
|
|
|
cachedNodesInfo[l.Limit.StorageNodeId] = info
|
2021-07-13 14:52:37 +01:00
|
|
|
|
|
|
|
address := l.StorageNodeAddress.Address
|
2021-11-08 20:51:04 +00:00
|
|
|
mock.dialInstead[info.LastIPPort] = address
|
2021-07-13 14:52:37 +01:00
|
|
|
mock.dialInstead[address] = "utter.failure?!*"
|
|
|
|
|
|
|
|
realAddresses = append(realAddresses, address)
|
|
|
|
}
|
|
|
|
|
|
|
|
ec := ecRepairerWithMockConnector(t, testSatellite, mock)
|
|
|
|
|
|
|
|
redundancy, err := eestream.NewRedundancyStrategyFromStorj(segment.Redundancy)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2023-03-31 18:52:27 +01:00
|
|
|
readCloser, pieces, err := ec.Get(ctx, limits, cachedNodesInfo, privateKey, redundancy, int64(segment.EncryptedSize))
|
2021-07-13 14:52:37 +01:00
|
|
|
require.NoError(t, err)
|
2021-08-03 14:21:27 +01:00
|
|
|
require.Len(t, pieces.Failed, 0)
|
2021-07-13 14:52:37 +01:00
|
|
|
require.NotNil(t, readCloser)
|
|
|
|
// repair will only download the minimum required number of pieces.
|
|
|
|
minReq := redundancy.RequiredCount()
|
|
|
|
var numDialed int
|
2021-11-08 20:51:04 +00:00
|
|
|
for _, info := range cachedNodesInfo {
|
2021-07-13 14:52:37 +01:00
|
|
|
for _, dialed := range mock.addressesDialed {
|
2021-11-08 20:51:04 +00:00
|
|
|
if dialed == info.LastIPPort {
|
2021-07-13 14:52:37 +01:00
|
|
|
numDialed++
|
|
|
|
if numDialed == minReq {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if numDialed == minReq {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
require.True(t, numDialed == minReq)
|
|
|
|
// and that the right address was never dialed directly
|
|
|
|
for _, realAddress := range realAddresses {
	require.NotContains(t, mock.addressesDialed, realAddress)
}
|
|
|
|
})
|
|
|
|
}
|
2022-03-03 00:23:11 +00:00
|
|
|
|
|
|
|
func TestSegmentInExcludedCountriesRepair(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
2022-04-13 12:23:21 +01:00
|
|
|
StorageNodeCount: 7,
|
2022-03-03 00:23:11 +00:00
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.Combine(
|
|
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
|
|
|
config.Repairer.InMemoryRepair = true
|
|
|
|
},
|
2022-04-13 12:23:21 +01:00
|
|
|
testplanet.ReconfigureRS(2, 3, 4, 5),
|
2022-03-03 00:23:11 +00:00
|
|
|
testplanet.RepairExcludedCountryCodes([]string{"FR", "BE"}),
|
|
|
|
),
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2022-03-03 00:23:11 +00:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
|
|
|
|
remotePieces := segment.Pieces
|
2022-04-13 12:23:21 +01:00
|
|
|
require.GreaterOrEqual(t, len(segment.Pieces), int(segment.Redundancy.RequiredShares))
|
2022-03-03 00:23:11 +00:00
|
|
|
|
2022-04-13 12:23:21 +01:00
|
|
|
err = planet.Satellites[0].Overlay.Service.TestNodeCountryCode(ctx, remotePieces[1].StorageNode, "FR")
|
|
|
|
require.NoError(t, err)
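// "FR" is one of the RepairExcludedCountryCodes configured above, so the repairer
// should avoid placing new pieces on this node while keeping the piece it already holds.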
|
|
|
|
nodeInExcluded := remotePieces[1].StorageNode
|
|
|
|
// take one more piece offline so that the checker will queue the segment for repair
|
|
|
|
err = planet.StopNodeAndUpdate(ctx, planet.FindNode(remotePieces[2].StorageNode))
|
|
|
|
require.NoError(t, err)
|
2022-03-03 00:23:11 +00:00
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
// trigger checker with ranged loop to add segment to repair queue
|
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2022-03-03 00:23:11 +00:00
|
|
|
|
|
|
|
count, err := satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 1, count)
|
|
|
|
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
|
|
|
|
|
|
|
// Verify that the segment was removed
|
|
|
|
count, err = satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Zero(t, count)
|
|
|
|
|
|
|
|
// Verify the segment has been repaired
|
|
|
|
segmentAfterRepair, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
require.NotEqual(t, segment.Pieces, segmentAfterRepair.Pieces)
|
2022-04-13 12:23:21 +01:00
|
|
|
require.GreaterOrEqual(t, len(segmentAfterRepair.Pieces), int(segmentAfterRepair.Redundancy.OptimalShares))
|
|
|
|
|
|
|
|
// check excluded area node still exists
|
|
|
|
var found bool
|
|
|
|
for _, p := range segmentAfterRepair.Pieces {
|
|
|
|
if p.StorageNode == nodeInExcluded {
|
|
|
|
found = true
|
|
|
|
break
|
2022-03-03 00:23:11 +00:00
|
|
|
}
|
|
|
|
}
|
2022-04-13 12:23:21 +01:00
|
|
|
require.True(t, found, fmt.Sprintf("node %s not in segment, but should be\n", nodeInExcluded.String()))
|
2022-03-03 00:23:11 +00:00
|
|
|
nodesInPointer := make(map[storj.NodeID]bool)
|
|
|
|
for _, n := range segmentAfterRepair.Pieces {
|
|
|
|
// check for duplicates
|
|
|
|
_, ok := nodesInPointer[n.StorageNode]
|
|
|
|
require.False(t, ok)
|
|
|
|
nodesInPointer[n.StorageNode] = true
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2022-04-13 12:23:21 +01:00
|
|
|
// TestSegmentInExcludedCountriesRepairIrreparable does the following:
// - 7 storage nodes
|
|
|
|
// - pieces uploaded to 4 or 5 nodes
|
|
|
|
// - mark one node holding a piece as being in an excluded country
|
|
|
|
// - put one other node holding a piece offline
|
|
|
|
// - run the checker and check the segment is in the repair queue
|
|
|
|
// - run the repairer
|
|
|
|
// - check the segment has been repaired and that:
|
2022-08-10 16:35:58 +01:00
|
|
|
// - piece in the excluded country is still there
|
|
|
|
// - piece held by offline node is not
|
|
|
|
// - there are no duplicate pieces
|
2022-03-03 00:23:11 +00:00
|
|
|
func TestSegmentInExcludedCountriesRepairIrreparable(t *testing.T) {
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
2022-04-13 12:23:21 +01:00
|
|
|
StorageNodeCount: 7,
|
2022-03-03 00:23:11 +00:00
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
|
|
|
Satellite: testplanet.Combine(
|
|
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
|
|
|
config.Repairer.InMemoryRepair = true
|
|
|
|
},
|
2022-04-13 12:23:21 +01:00
|
|
|
testplanet.ReconfigureRS(2, 3, 4, 5),
|
2022-03-03 00:23:11 +00:00
|
|
|
testplanet.RepairExcludedCountryCodes([]string{"FR", "BE"}),
|
|
|
|
),
|
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2022-03-03 00:23:11 +00:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
|
|
|
|
remotePieces := segment.Pieces
|
2022-04-13 12:23:21 +01:00
|
|
|
require.GreaterOrEqual(t, len(remotePieces), int(segment.Redundancy.OptimalShares))
|
2022-03-03 00:23:11 +00:00
|
|
|
|
2022-04-13 12:23:21 +01:00
|
|
|
err = planet.Satellites[0].Overlay.Service.TestNodeCountryCode(ctx, remotePieces[1].StorageNode, "FR")
|
|
|
|
require.NoError(t, err)
|
|
|
|
nodeInExcluded := remotePieces[1].StorageNode
|
|
|
|
offlineNode := remotePieces[2].StorageNode
|
|
|
|
// make one unhealthy
|
|
|
|
err = planet.StopNodeAndUpdate(ctx, planet.FindNode(offlineNode))
|
|
|
|
require.NoError(t, err)
|
2022-03-03 00:23:11 +00:00
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
// trigger checker with ranged loop to add segment to repair queue
|
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
2022-03-03 00:23:11 +00:00
|
|
|
|
|
|
|
count, err := satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, 1, count)
|
|
|
|
|
|
|
|
satellite.Repair.Repairer.Loop.Restart()
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
|
|
|
|
|
|
|
// Verify that the segment was removed
|
|
|
|
count, err = satellite.DB.RepairQueue().Count(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Zero(t, count)
|
|
|
|
|
|
|
|
// Verify the segment has been repaired
|
|
|
|
segmentAfterRepair, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
|
|
|
|
require.NotEqual(t, segment.Pieces, segmentAfterRepair.Pieces)
|
2022-04-13 12:23:21 +01:00
|
|
|
require.GreaterOrEqual(t, len(segmentAfterRepair.Pieces), int(segment.Redundancy.OptimalShares))
|
|
|
|
|
|
|
|
// check node in excluded area still exists
|
|
|
|
var nodeInExcludedAreaFound bool
|
|
|
|
var offlineNodeFound bool
|
|
|
|
for _, p := range segmentAfterRepair.Pieces {
|
|
|
|
if p.StorageNode == nodeInExcluded {
|
|
|
|
nodeInExcludedAreaFound = true
|
|
|
|
}
|
|
|
|
if p.StorageNode == offlineNode {
|
|
|
|
offlineNodeFound = true
|
2022-03-03 00:23:11 +00:00
|
|
|
}
|
|
|
|
}
|
2022-04-13 12:23:21 +01:00
|
|
|
require.True(t, nodeInExcludedAreaFound, fmt.Sprintf("node %s not in segment, but should be\n", nodeInExcluded.String()))
|
|
|
|
require.False(t, offlineNodeFound, fmt.Sprintf("node %s in segment, but should not be\n", offlineNode.String()))
|
|
|
|
|
2022-03-03 00:23:11 +00:00
|
|
|
nodesInPointer := make(map[storj.NodeID]bool)
|
|
|
|
for _, n := range segmentAfterRepair.Pieces {
|
|
|
|
// check for duplicates
|
|
|
|
_, ok := nodesInPointer[n.StorageNode]
|
|
|
|
require.False(t, ok)
|
|
|
|
nodesInPointer[n.StorageNode] = true
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
2022-08-11 15:17:12 +01:00
|
|
|
|
|
|
|
func reputationRatio(info reputation.Info) float64 {
|
|
|
|
return info.AuditReputationAlpha / (info.AuditReputationAlpha + info.AuditReputationBeta)
|
|
|
|
}
|
2022-12-13 20:40:15 +00:00
|
|
|
|
|
|
|
func TestRepairClumpedPieces(t *testing.T) {
|
|
|
|
// Test that if nodes change IPs such that multiple pieces of a segment
|
|
|
|
// reside in the same network, that segment will be considered unhealthy
|
|
|
|
// by the repair checker and it will be repaired by the repair worker.
|
|
|
|
testplanet.Run(t, testplanet.Config{
|
|
|
|
SatelliteCount: 1,
|
|
|
|
StorageNodeCount: 6,
|
|
|
|
UplinkCount: 1,
|
|
|
|
Reconfigure: testplanet.Reconfigure{
|
2023-05-18 19:47:23 +01:00
|
|
|
Satellite: testplanet.Combine(
|
|
|
|
testplanet.ReconfigureRS(2, 3, 4, 4),
|
|
|
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
|
|
|
config.Checker.DoDeclumping = true
|
|
|
|
config.Repairer.DoDeclumping = true
|
|
|
|
},
|
|
|
|
),
|
2022-12-13 20:40:15 +00:00
|
|
|
},
|
|
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
uplinkPeer := planet.Uplinks[0]
|
|
|
|
satellite := planet.Satellites[0]
|
|
|
|
// stop audit to prevent possible interactions i.e. repair timeout problems
|
|
|
|
satellite.Audit.Worker.Loop.Pause()
|
|
|
|
|
2023-04-25 09:40:22 +01:00
|
|
|
satellite.RangedLoop.RangedLoop.Service.Loop.Stop()
|
2022-12-13 20:40:15 +00:00
|
|
|
satellite.Repair.Repairer.Loop.Pause()
|
|
|
|
|
|
|
|
var testData = testrand.Bytes(8 * memory.KiB)
|
|
|
|
// first, upload some remote data
|
|
|
|
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
segment, _ := getRemoteSegment(ctx, t, satellite, uplinkPeer.Projects[0].ID, "testbucket")
|
|
|
|
remotePiecesBefore := segment.Pieces
|
|
|
|
|
|
|
|
// the segment should be ignored by the repair checker for now
|
2023-04-25 09:40:22 +01:00
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2022-12-13 20:40:15 +00:00
|
|
|
injuredSegment, err := satellite.DB.RepairQueue().Select(ctx)
|
|
|
|
require.Error(t, err)
|
|
|
|
if !queue.ErrEmpty.Has(err) {
|
|
|
|
require.FailNow(t, "Should get ErrEmptyQueue, but got", err)
|
|
|
|
}
|
|
|
|
require.Nil(t, injuredSegment)
|
|
|
|
|
|
|
|
// pieces list has not changed
|
|
|
|
segment, _ = getRemoteSegment(ctx, t, satellite, uplinkPeer.Projects[0].ID, "testbucket")
|
|
|
|
remotePiecesAfter := segment.Pieces
|
|
|
|
require.Equal(t, remotePiecesBefore, remotePiecesAfter)
|
|
|
|
|
|
|
|
// now move the network of one storage node holding a piece, so that it's the same as another
|
|
|
|
node0 := planet.FindNode(remotePiecesAfter[0].StorageNode)
|
|
|
|
node1 := planet.FindNode(remotePiecesAfter[1].StorageNode)
|
|
|
|
|
|
|
|
local := node0.Contact.Service.Local()
|
|
|
|
checkInInfo := overlay.NodeCheckInInfo{
|
|
|
|
NodeID: node0.ID(),
|
|
|
|
Address: &pb.NodeAddress{Address: local.Address},
|
|
|
|
LastIPPort: local.Address,
|
|
|
|
LastNet: node1.Contact.Service.Local().Address,
|
|
|
|
IsUp: true,
|
|
|
|
Operator: &local.Operator,
|
|
|
|
Capacity: &local.Capacity,
|
|
|
|
Version: &local.Version,
|
|
|
|
}
|
|
|
|
err = satellite.DB.OverlayCache().UpdateCheckIn(ctx, checkInInfo, time.Now().UTC(), overlay.NodeSelectionConfig{})
|
|
|
|
require.NoError(t, err)
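// node0 now reports node1's address as its last_net, so with declumping enabled
// the checker sees two pieces of this segment on the same network and should
// flag the segment for repair.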
|
2023-05-18 22:34:31 +01:00
|
|
|
err = satellite.RangedLoop.Overlay.Service.DownloadSelectionCache.Refresh(ctx)
|
2023-05-16 15:08:52 +01:00
|
|
|
require.NoError(t, err)
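// Unlike the upload selection cache, the download selection cache contains every
// node that is not disqualified, gracefully exited, or offline, so refreshing it
// makes node0's new last_net visible to the checker.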
|
2022-12-13 20:40:15 +00:00
|
|
|
|
|
|
|
// running repair checker again should put the segment into the repair queue
|
2023-04-25 09:40:22 +01:00
|
|
|
_, err = satellite.RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
2022-12-13 20:40:15 +00:00
|
|
|
// and subsequently running the repair worker should pull that off the queue and repair it
|
|
|
|
satellite.Repair.Repairer.Loop.TriggerWait()
|
|
|
|
satellite.Repair.Repairer.WaitForPendingRepairs()
|
|
|
|
|
|
|
|
// confirm that the segment now has exactly one piece on (node0 or node1)
|
|
|
|
// and still has the right number of pieces.
|
|
|
|
segment, _ = getRemoteSegment(ctx, t, satellite, uplinkPeer.Projects[0].ID, "testbucket")
|
|
|
|
require.Len(t, segment.Pieces, 4)
|
|
|
|
foundOnFirstNetwork := 0
|
|
|
|
for _, piece := range segment.Pieces {
|
|
|
|
if piece.StorageNode.Compare(node0.ID()) == 0 || piece.StorageNode.Compare(node1.ID()) == 0 {
|
|
|
|
foundOnFirstNetwork++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
require.Equalf(t, 1, foundOnFirstNetwork,
|
|
|
|
"%v should only include one of %s or %s", segment.Pieces, node0.ID(), node1.ID())
|
|
|
|
})
|
|
|
|
}
|