storj/satellite/repair/checker/checker_test.go

337 lines
10 KiB
Go
Raw Normal View History

2019-01-24 20:15:10 +00:00
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
Satellite Peer (#1034) * add satellite peer * Add overlay * reorganize kademlia * add RunRefresh * add refresh to storagenode.Peer * add discovery * add agreements and metainfo * rename * add datarepair checker * add repair * add todo notes for audit * add testing interface * add into testplanet * fixes * fix compilation errors * fix compilation errors * make testplanet run * remove audit refrences * ensure that audit tests run * dev * checker tests compilable * fix discovery * fix compilation * fix * fix * dev * fix * disable auth * fixes * revert go.mod/sum * fix linter errors * fix * fix copyright * Add address param for SN dashboard (#1076) * Rename storj-sdk to storj-sim (#1078) * Storagenode logs and config improvements (#1075) * Add more info to SN logs * remove config-dir from user config * add output where config was stored * add message for successful connection * fix linter * remove storage.path from user config * resolve config path * move success message to info * log improvements * Remove captplanet (#1070) * pkg/server: include production cert (#1082) Change-Id: Ie8e6fe78550be83c3bd797db7a1e58d37c684792 * Generate Payments Report (#1079) * memory.Size: autoformat sizes based on value entropy (#1081) * Jj/bytes (#1085) * run tally and rollup * sets dev default tally and rollup intervals * nonessential storj-sim edits (#1086) * Closing context doesn't stop storage node (#1084) * Print when cancelled * Close properly * Don't log nil * Don't print error when closing dashboard * Fix panic in inspector if ping fails (#1088) * Consolidate identity management to identity cli commands (#1083) * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. * fixes * linters * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. 
* fixes * sava backups before saving signed certs * add "-prebuilt-test-cmds" test flag * linters * prepare cli tests for travis * linter fixes * more fixes * linter gods * sp/sdk/sim * remove ca.difficulty * remove unused difficulty * return setup to its rightful place * wip travis * Revert "wip travis" This reverts commit 56834849dcf066d3cc0a4f139033fc3f6d7188ca. * typo in travis.yaml * remove tests * remove more * make it only create one identity at a time for consistency * add config-dir for consitency * add identity creation to storj-sim * add flags * simplify * fix nolint and compile * prevent overwrite and pass difficulty, concurrency, and parent creds * goimports
2019-01-18 13:54:08 +00:00
package checker_test
import (
"context"
"fmt"
"testing"
"time"
"github.com/stretchr/testify/require"
"storj.io/common/storj"
"storj.io/common/testcontext"
"storj.io/common/testrand"
"storj.io/common/uuid"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite/metabase"
)
func TestIdentifyInjuredSegments(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
checker := planet.Satellites[0].Repair.Checker
repairQueue := planet.Satellites[0].DB.RepairQueue()
checker.Loop.Pause()
planet.Satellites[0].Repair.Repairer.Loop.Pause()
rs := storj.RedundancyScheme{
RequiredShares: 2,
RepairShares: 3,
OptimalShares: 4,
TotalShares: 5,
ShareSize: 256,
}
projectID := planet.Uplinks[0].Projects[0].ID
err := planet.Uplinks[0].CreateBucket(ctx, planet.Satellites[0], "test-bucket")
require.NoError(t, err)
expectedLocation := metabase.SegmentLocation{
ProjectID: projectID,
BucketName: "test-bucket",
}
// add some valid pointers
for x := 0; x < 10; x++ {
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("a-%d", x))
insertSegment(ctx, t, planet, rs, expectedLocation, createPieces(planet, rs), time.Time{})
}
// add pointer that needs repair
expectedLocation.ObjectKey = metabase.ObjectKey("b-0")
b0StreamID := insertSegment(ctx, t, planet, rs, expectedLocation, createLostPieces(planet, rs), time.Time{})
// add pointer that is unhealthy, but is expired
expectedLocation.ObjectKey = metabase.ObjectKey("b-1")
insertSegment(ctx, t, planet, rs, expectedLocation, createLostPieces(planet, rs), time.Now().Add(-time.Hour))
// add some valid pointers
for x := 0; x < 10; x++ {
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("c-%d", x))
insertSegment(ctx, t, planet, rs, expectedLocation, createPieces(planet, rs), time.Time{})
Satellite Peer (#1034) * add satellite peer * Add overlay * reorganize kademlia * add RunRefresh * add refresh to storagenode.Peer * add discovery * add agreements and metainfo * rename * add datarepair checker * add repair * add todo notes for audit * add testing interface * add into testplanet * fixes * fix compilation errors * fix compilation errors * make testplanet run * remove audit refrences * ensure that audit tests run * dev * checker tests compilable * fix discovery * fix compilation * fix * fix * dev * fix * disable auth * fixes * revert go.mod/sum * fix linter errors * fix * fix copyright * Add address param for SN dashboard (#1076) * Rename storj-sdk to storj-sim (#1078) * Storagenode logs and config improvements (#1075) * Add more info to SN logs * remove config-dir from user config * add output where config was stored * add message for successful connection * fix linter * remove storage.path from user config * resolve config path * move success message to info * log improvements * Remove captplanet (#1070) * pkg/server: include production cert (#1082) Change-Id: Ie8e6fe78550be83c3bd797db7a1e58d37c684792 * Generate Payments Report (#1079) * memory.Size: autoformat sizes based on value entropy (#1081) * Jj/bytes (#1085) * run tally and rollup * sets dev default tally and rollup intervals * nonessential storj-sim edits (#1086) * Closing context doesn't stop storage node (#1084) * Print when cancelled * Close properly * Don't log nil * Don't print error when closing dashboard * Fix panic in inspector if ping fails (#1088) * Consolidate identity management to identity cli commands (#1083) * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. * fixes * linters * Consolidate identity management: Move identity cretaion/signing out of storagenode setup command. 
* fixes * sava backups before saving signed certs * add "-prebuilt-test-cmds" test flag * linters * prepare cli tests for travis * linter fixes * more fixes * linter gods * sp/sdk/sim * remove ca.difficulty * remove unused difficulty * return setup to its rightful place * wip travis * Revert "wip travis" This reverts commit 56834849dcf066d3cc0a4f139033fc3f6d7188ca. * typo in travis.yaml * remove tests * remove more * make it only create one identity at a time for consistency * add config-dir for consitency * add identity creation to storj-sim * add flags * simplify * fix nolint and compile * prevent overwrite and pass difficulty, concurrency, and parent creds * goimports
2019-01-18 13:54:08 +00:00
}
checker.Loop.TriggerWait()
// check that the unhealthy, non-expired segment was added to the queue
// and that the expired segment was ignored
injuredSegment, err := repairQueue.Select(ctx)
require.NoError(t, err)
err = repairQueue.Delete(ctx, injuredSegment)
require.NoError(t, err)
require.Equal(t, b0StreamID, injuredSegment.StreamID)
_, err = repairQueue.Select(ctx)
require.Error(t, err)
})
}
// TestIdentifyIrreparableSegments checks how the checker treats a segment
// whose pieces on known storage nodes (3) are fewer than the redundancy
// scheme's RequiredShares (4). Such a segment is still expected in the repair
// queue, an expired segment with the same pieces is not, and the queue entry
// is expected to be gone once the object is deleted and the checker runs again.
func TestIdentifyIrreparableSegments(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 3, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		checker := planet.Satellites[0].Repair.Checker
		// The checker is driven manually through IdentifyInjuredSegments.
		checker.Loop.Stop()

		const numberOfNodes = 10
		pieces := make(metabase.Pieces, 0, numberOfNodes)

		// use online nodes
		for i, storagenode := range planet.StorageNodes {
			pieces = append(pieces, metabase.Piece{
				Number:      uint16(i),
				StorageNode: storagenode.ID(),
			})
		}

		// simulate offline nodes: fill the remaining piece numbers with node
		// IDs that do not belong to any storage node in the planet
		for i := len(pieces); i < numberOfNodes; i++ {
			pieces = append(pieces, metabase.Piece{
				Number:      uint16(i),
				StorageNode: storj.NodeID{byte(i)},
			})
		}

		rs := storj.RedundancyScheme{
			ShareSize:      256,
			RequiredShares: 4,
			RepairShares:   8,
			OptimalShares:  9,
			TotalShares:    10,
		}

		projectID := planet.Uplinks[0].Projects[0].ID
		err := planet.Uplinks[0].CreateBucket(ctx, planet.Satellites[0], "test-bucket")
		require.NoError(t, err)

		expectedLocation := metabase.SegmentLocation{
			ProjectID:  projectID,
			BucketName: "test-bucket",
		}

		// when the number of healthy pieces is less than the minimum required
		// number of pieces in the redundancy scheme, the segment is considered
		// irreparable but will still be put into the repair queue
		expectedLocation.ObjectKey = "piece"
		insertSegment(ctx, t, planet, rs, expectedLocation, pieces, time.Time{})

		expectedLocation.ObjectKey = "piece-expired"
		insertSegment(ctx, t, planet, rs, expectedLocation, pieces, time.Now().Add(-time.Hour))

		err = checker.IdentifyInjuredSegments(ctx)
		require.NoError(t, err)

		// check that a single irreparable segment was added to the repair queue
		repairQueue := planet.Satellites[0].DB.RepairQueue()
		_, err = repairQueue.Select(ctx)
		require.NoError(t, err)
		count, err := repairQueue.Count(ctx)
		require.NoError(t, err)
		require.Equal(t, 1, count)

		// check irreparable once again but wait a second
		time.Sleep(1 * time.Second)
		err = checker.IdentifyInjuredSegments(ctx)
		require.NoError(t, err)

		// delete the non-expired object and run the checker again; the repair
		// queue is expected to be empty afterwards
		expectedLocation.ObjectKey = "piece"
		_, err = planet.Satellites[0].Metainfo.Metabase.DeleteObjectLatestVersion(ctx, metabase.DeleteObjectLatestVersion{
			ObjectLocation: expectedLocation.Object(),
		})
		require.NoError(t, err)

		err = checker.IdentifyInjuredSegments(ctx)
		require.NoError(t, err)

		count, err = repairQueue.Count(ctx)
		require.NoError(t, err)
		require.Equal(t, 0, count)
	})
}
// TestCleanRepairQueue exercises the repair queue across two checker
// iterations. While some nodes are suspended, every inserted segment is
// expected in the queue; once the nodes are unsuspended, only the segments
// created with lost pieces are expected to remain.
func TestCleanRepairQueue(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		checker := satellite.Repair.Checker
		repairQueue := satellite.DB.RepairQueue()

		// Both loops are paused; the checker runs only when triggered.
		checker.Loop.Pause()
		satellite.Repair.Repairer.Loop.Pause()

		rs := storj.RedundancyScheme{
			RequiredShares: 2,
			RepairShares:   3,
			OptimalShares:  4,
			TotalShares:    5,
			ShareSize:      256,
		}

		require.NoError(t, planet.Uplinks[0].CreateBucket(ctx, satellite, "test-bucket"))

		location := metabase.SegmentLocation{
			ProjectID:  planet.Uplinks[0].Projects[0].ID,
			BucketName: "test-bucket",
		}

		const (
			healthyCount   = 5
			unhealthyCount = 5
		)

		// segments whose pieces all live on real storage nodes
		for n := 0; n < healthyCount; n++ {
			location.ObjectKey = metabase.ObjectKey(fmt.Sprintf("healthy-%d", n))
			insertSegment(ctx, t, planet, rs, location, createPieces(planet, rs), time.Time{})
		}

		// segments created with lost pieces; remember their stream IDs
		unhealthyIDs := make(map[uuid.UUID]struct{})
		for n := 0; n < unhealthyCount; n++ {
			location.ObjectKey = metabase.ObjectKey(fmt.Sprintf("unhealthy-%d", n))
			streamID := insertSegment(ctx, t, planet, rs, location, createLostPieces(planet, rs), time.Time{})
			unhealthyIDs[streamID] = struct{}{}
		}

		// the nodes that get suspended and later unsuspended
		toggledNodes := planet.StorageNodes[rs.RequiredShares:rs.OptimalShares]

		// suspend enough nodes to make the healthy segments unhealthy
		for _, node := range toggledNodes {
			require.NoError(t, satellite.Overlay.DB.SuspendNodeUnknownAudit(ctx, node.ID(), time.Now()))
		}
		require.NoError(t, checker.RefreshReliabilityCache(ctx))

		// the queue must start out empty, otherwise the checks below could
		// pass for the wrong reason
		queued, err := repairQueue.Count(ctx)
		require.NoError(t, err)
		require.Equal(t, 0, queued)

		checker.Loop.TriggerWait()

		// every segment should now be queued, and none should have been
		// cleaned up at the end of the checker iteration
		queued, err = repairQueue.Count(ctx)
		require.NoError(t, err)
		require.Equal(t, healthyCount+unhealthyCount, queued)

		// unsuspend the nodes so the previously healthy segments are healthy again
		for _, node := range toggledNodes {
			require.NoError(t, satellite.Overlay.DB.UnsuspendNodeUnknownAudit(ctx, node.ID()))
		}
		require.NoError(t, checker.RefreshReliabilityCache(ctx))

		// The checker will not insert/update the now healthy segments causing
		// them to be removed from the queue at the end of the checker iteration
		checker.Loop.TriggerWait()

		// only unhealthy segments should remain
		queued, err = repairQueue.Count(ctx)
		require.NoError(t, err)
		require.Equal(t, unhealthyCount, queued)

		remaining, err := repairQueue.SelectN(ctx, queued)
		require.NoError(t, err)
		require.Equal(t, len(unhealthyIDs), len(remaining))

		for _, segment := range remaining {
			_, known := unhealthyIDs[segment.StreamID]
			require.True(t, known)
		}
	})
}
// createPieces returns rs.OptimalShares pieces, numbered from zero, where
// piece n is assigned to the n-th storage node of the planet.
func createPieces(planet *testplanet.Planet, rs storj.RedundancyScheme) metabase.Pieces {
	nodes := planet.StorageNodes
	result := make(metabase.Pieces, rs.OptimalShares)
	for n := 0; n < len(result); n++ {
		piece := metabase.Piece{
			Number:      uint16(n),
			StorageNode: nodes[n].Identity.ID,
		}
		result[n] = piece
	}
	return result
}
// createLostPieces returns rs.OptimalShares pieces, numbered from zero. The
// first rs.RequiredShares pieces are assigned to the planet's storage nodes in
// order; every remaining piece is assigned to the fixed node ID {0xFF}, which
// is not taken from the planet.
func createLostPieces(planet *testplanet.Planet, rs storj.RedundancyScheme) metabase.Pieces {
	all := make(metabase.Pieces, rs.OptimalShares)
	present, lost := all[:rs.RequiredShares], all[rs.RequiredShares:]

	// pieces held by real storage nodes
	for n := range present {
		present[n] = metabase.Piece{
			Number:      uint16(n),
			StorageNode: planet.StorageNodes[n].Identity.ID,
		}
	}

	// pieces pointing at the placeholder node ID; numbering continues after
	// the present pieces
	first := len(present)
	for n := range lost {
		lost[n] = metabase.Piece{
			Number:      uint16(first + n),
			StorageNode: storj.NodeID{byte(0xFF)},
		}
	}

	return all
}
// insertSegment creates and commits an object at the given location in the
// first satellite's metabase, with one committed segment that uses the given
// pieces and redundancy scheme. The object gets version 1 and a random stream
// ID, which is returned.
//
// A zero expire means the object is created without an expiration time;
// otherwise expire is used as the object's ExpiresAt.
//
// Any metabase error fails the test immediately.
func insertSegment(ctx context.Context, t *testing.T, planet *testplanet.Planet, rs storj.RedundancyScheme, location metabase.SegmentLocation, pieces metabase.Pieces, expire time.Time) uuid.UUID {
	// Report assertion failures at the calling test's line instead of here.
	t.Helper()

	var expiresAt *time.Time
	if !expire.IsZero() {
		expiresAt = &expire
	}

	metabaseDB := planet.Satellites[0].Metainfo.Metabase

	obj := metabase.ObjectStream{
		ProjectID:  location.ProjectID,
		BucketName: location.BucketName,
		ObjectKey:  location.ObjectKey,
		Version:    1,
		StreamID:   testrand.UUID(),
	}

	_, err := metabaseDB.BeginObjectExactVersion(ctx, metabase.BeginObjectExactVersion{
		ObjectStream: obj,
		Encryption: storj.EncryptionParameters{
			CipherSuite: storj.EncAESGCM,
			BlockSize:   256,
		},
		ExpiresAt: expiresAt,
	})
	require.NoError(t, err)

	// The same root piece ID must be used to begin and to commit the segment.
	rootPieceID := testrand.PieceID()
	err = metabaseDB.BeginSegment(ctx, metabase.BeginSegment{
		ObjectStream: obj,
		RootPieceID:  rootPieceID,
		Pieces:       pieces,
	})
	require.NoError(t, err)

	err = metabaseDB.CommitSegment(ctx, metabase.CommitSegment{
		ObjectStream: obj,
		RootPieceID:  rootPieceID,
		Pieces:       pieces,

		EncryptedKey:      testrand.Bytes(256),
		EncryptedKeyNonce: testrand.Bytes(256),

		PlainSize:     1,
		EncryptedSize: 1,
		Redundancy:    rs,
	})
	require.NoError(t, err)

	_, err = metabaseDB.CommitObject(ctx, metabase.CommitObject{
		ObjectStream: obj,
	})
	require.NoError(t, err)

	return obj.StreamID
}