8f2dca8437
* Disabled discovery service by changing from Stop() to Pause() Paused to solve race condition. If discovery is running, it may mark a node "up" after it has been manually marked "down" in this test. * Extend to the repair timeout Fixes intermittent test failures when repairs were taking more than 2 seconds. * Re-enabled test. Disabled discovery service by changing from Stop() to Pause() * Changed back to Stop. * Revert "Changed back to Stop." This reverts commit 46d410e72dfae63e0c44915be42784cc9a7b5abf. * re-enabling TestIdentifyInjuredSegments * Changed Pause to Stop. Commented on timeout change * testing... * temporarily skipping audit tests * changing back to discover Stop for testing via jenkins * Revert "changing back to discover Stop for testing via jenkins" This reverts commit 6aa8558b11a0053c30e0c8b2dbf0d6c0cb34ee6c. * Changing back to Stop(). Depends on PR 2137 * Revert "temporarily skipping audit tests" This reverts commit 1940ed9b315d663a0eb6c95521780cbcb48cb121. * Removed reference to Graveyard since it's been removed
132 lines
3.9 KiB
Go
132 lines
3.9 KiB
Go
// Copyright (C) 2019 Storj Labs, Inc.
|
|
// See LICENSE for copying information.
|
|
|
|
package segments_test
|
|
|
|
import (
|
|
"math/rand"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
|
|
"storj.io/storj/internal/memory"
|
|
"storj.io/storj/internal/testcontext"
|
|
"storj.io/storj/internal/testplanet"
|
|
"storj.io/storj/pkg/pb"
|
|
ecclient "storj.io/storj/pkg/storage/ec"
|
|
"storj.io/storj/pkg/storage/segments"
|
|
"storj.io/storj/pkg/storj"
|
|
"storj.io/storj/uplink"
|
|
)
|
|
|
|
func TestSegmentStoreRepair(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 6, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
// first, upload some remote data
|
|
ul := planet.Uplinks[0]
|
|
satellite := planet.Satellites[0]
|
|
|
|
satellite.Repair.Checker.Loop.Stop()
|
|
// stop discovery service so that we do not get a race condition when we delete nodes from overlay cache
|
|
satellite.Discovery.Service.Discovery.Stop()
|
|
satellite.Discovery.Service.Refresh.Stop()
|
|
|
|
testData := make([]byte, 1*memory.MiB)
|
|
_, err := rand.Read(testData)
|
|
require.NoError(t, err)
|
|
|
|
err = ul.UploadWithConfig(ctx, satellite, &uplink.RSConfig{
|
|
MinThreshold: 2,
|
|
RepairThreshold: 3,
|
|
SuccessThreshold: 4,
|
|
MaxThreshold: 4,
|
|
}, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
// get a remote segment from metainfo
|
|
metainfo := satellite.Metainfo.Service
|
|
listResponse, _, err := metainfo.List(ctx, "", "", "", true, 0, 0)
|
|
require.NoError(t, err)
|
|
|
|
var path string
|
|
var pointer *pb.Pointer
|
|
for _, v := range listResponse {
|
|
path = v.GetPath()
|
|
pointer, err = metainfo.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
if pointer.GetType() == pb.Pointer_REMOTE {
|
|
break
|
|
}
|
|
}
|
|
|
|
// calculate how many storagenodes to kill
|
|
numStorageNodes := len(planet.StorageNodes)
|
|
redundancy := pointer.GetRemote().GetRedundancy()
|
|
remotePieces := pointer.GetRemote().GetRemotePieces()
|
|
minReq := redundancy.GetMinReq()
|
|
numPieces := len(remotePieces)
|
|
toKill := numPieces - int(minReq)
|
|
// we should have enough storage nodes to repair on
|
|
assert.True(t, (numStorageNodes-toKill) >= numPieces)
|
|
|
|
// kill nodes and track lost pieces
|
|
var lostPieces []int32
|
|
nodesToKill := make(map[storj.NodeID]bool)
|
|
nodesToKeepAlive := make(map[storj.NodeID]bool)
|
|
for i, piece := range remotePieces {
|
|
if i >= toKill {
|
|
nodesToKeepAlive[piece.NodeId] = true
|
|
continue
|
|
}
|
|
nodesToKill[piece.NodeId] = true
|
|
lostPieces = append(lostPieces, piece.GetPieceNum())
|
|
}
|
|
for _, node := range planet.StorageNodes {
|
|
if nodesToKill[node.ID()] {
|
|
err = planet.StopPeer(node)
|
|
require.NoError(t, err)
|
|
_, err = satellite.Overlay.Service.UpdateUptime(ctx, node.ID(), false)
|
|
require.NoError(t, err)
|
|
}
|
|
}
|
|
|
|
// repair segment
|
|
os := satellite.Orders.Service
|
|
oc := satellite.Overlay.Service
|
|
ec := ecclient.NewClient(satellite.Transport, 0)
|
|
repairer := segments.NewSegmentRepairer(metainfo, os, oc, ec, satellite.Identity, time.Minute)
|
|
assert.NotNil(t, repairer)
|
|
|
|
err = repairer.Repair(ctx, path)
|
|
assert.NoError(t, err)
|
|
|
|
// kill one of the nodes kept alive to ensure repair worked
|
|
for _, node := range planet.StorageNodes {
|
|
if nodesToKeepAlive[node.ID()] {
|
|
err = planet.StopPeer(node)
|
|
require.NoError(t, err)
|
|
_, err = satellite.Overlay.Service.UpdateUptime(ctx, node.ID(), false)
|
|
require.NoError(t, err)
|
|
break
|
|
}
|
|
}
|
|
|
|
// we should be able to download data without any of the original nodes
|
|
newData, err := ul.Download(ctx, satellite, "testbucket", "test/path")
|
|
assert.NoError(t, err)
|
|
assert.Equal(t, newData, testData)
|
|
|
|
// updated pointer should not contain any of the killed nodes
|
|
pointer, err = metainfo.Get(ctx, path)
|
|
assert.NoError(t, err)
|
|
|
|
remotePieces = pointer.GetRemote().GetRemotePieces()
|
|
for _, piece := range remotePieces {
|
|
assert.False(t, nodesToKill[piece.NodeId])
|
|
}
|
|
})
|
|
}
|