cmd/tools/segment-verify: add verifier

Change-Id: I4cc1fbcf964c4a9a37cf80322f6f99dd956f3d7b

commit 8529a169ee (parent a3a3ffd123)
@@ -9,7 +9,9 @@ import (
 
 	"go.uber.org/zap"
 
+	"storj.io/common/storj"
 	"storj.io/common/sync2"
+	"storj.io/storj/satellite/metabase"
 )
 
 // Verify verifies a collection of segments.
@@ -25,7 +27,10 @@ func (service *Service) Verify(ctx context.Context, segments []*Segment) (err er
 		return Error.Wrap(err)
 	}
 
-	service.VerifyBatches(ctx, batches)
+	err = service.VerifyBatches(ctx, batches)
+	if err != nil {
+		return Error.Wrap(err)
+	}
 
 	retrySegments := []*Segment{}
 	for _, segment := range segments {
@@ -49,32 +54,51 @@ func (service *Service) Verify(ctx context.Context, segments []*Segment) (err er
 		return Error.Wrap(err)
 	}
 
-	service.VerifyBatches(ctx, retryBatches)
+	err = service.VerifyBatches(ctx, retryBatches)
+	if err != nil {
+		return Error.Wrap(err)
+	}
 
 	return nil
 }
 
 // VerifyBatches verifies batches.
-func (service *Service) VerifyBatches(ctx context.Context, batches []*Batch) {
+func (service *Service) VerifyBatches(ctx context.Context, batches []*Batch) error {
 	defer mon.Task()(&ctx)(nil)
 
+	// Convert NodeAliases to NodeIDs
+	aliases := make([]metabase.NodeAlias, len(batches))
+	for i, b := range batches {
+		aliases[i] = b.Alias
+	}
+	ids, err := service.metabase.ConvertAliasesToNodes(ctx, aliases)
+	if err != nil {
+		return Error.Wrap(err)
+	}
+
+	// TODO: fetch addresses for NodeID-s
 
 	var mu sync.Mutex
 
 	limiter := sync2.NewLimiter(ConcurrentRequests)
-	for _, batch := range batches {
+	for i, batch := range batches {
+		nodeID := ids[i]
 		batch := batch
 		limiter.Go(ctx, func() {
-			err := service.VerifyBatch(ctx, batch)
+			err := service.verifier.Verify(ctx, storj.NodeURL{
+				ID: nodeID, // TODO: use NodeURL
+			}, batch.Items)
 			if err != nil {
 				if ErrNodeOffline.Has(err) {
 					mu.Lock()
 					service.OfflineNodes.Add(batch.Alias)
 					mu.Unlock()
 				}
 
 				service.log.Error("verifying a batch failed", zap.Error(err))
 			}
 		})
 	}
 	limiter.Wait()
+
+	return nil
 }
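Note: the fan-out above relies on sync2.Limiter to cap how many batches are verified concurrently. Below is a minimal, self-contained sketch of that pattern (not part of this commit; the printed loop body is a stand-in for the real per-batch work):

package main

import (
	"context"
	"fmt"

	"storj.io/common/sync2"
)

func main() {
	ctx := context.Background()

	// Allow at most 2 callbacks to run at once; VerifyBatches uses
	// sync2.NewLimiter(ConcurrentRequests) in the same way.
	limiter := sync2.NewLimiter(2)
	for i := 0; i < 5; i++ {
		i := i // capture the loop variable for the closure
		limiter.Go(ctx, func() {
			fmt.Println("verifying batch", i)
		})
	}
	// Wait blocks until every started callback has returned.
	limiter.Wait()
}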
@@ -32,11 +32,15 @@ const ConcurrentRequests = 10000
 type Metabase interface {
-	ConvertNodesToAliases(ctx context.Context, nodeID []storj.NodeID) ([]metabase.NodeAlias, error)
 	ConvertAliasesToNodes(ctx context.Context, aliases []metabase.NodeAlias) ([]storj.NodeID, error)
 
 	GetSegmentByPosition(ctx context.Context, opts metabase.GetSegmentByPosition) (segment metabase.Segment, err error)
 	ListVerifySegments(ctx context.Context, opts metabase.ListVerifySegments) (result metabase.ListVerifySegmentsResult, err error)
 }
 
+// Verifier verifies a batch of segments.
+type Verifier interface {
+	Verify(ctx context.Context, target storj.NodeURL, segments []*Segment) error
+}
+
 // SegmentWriter allows writing segments to some output.
 type SegmentWriter interface {
 	Write(ctx context.Context, segments []*Segment) error
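Note: the Verifier interface added above decouples Service from the dialing NodeVerifier introduced later in this change. A hedged sketch of a stub implementation (hypothetical, not part of this commit) that could stand in for it in tests, assuming it lives in the same package so Segment and the imports are in scope:

// stubVerifier reports the same result for every batch it is given,
// which avoids dialing real nodes when exercising Service.
type stubVerifier struct{ err error }

var _ Verifier = stubVerifier{}

func (s stubVerifier) Verify(ctx context.Context, target storj.NodeURL, segments []*Segment) error {
	return s.err
}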
@@ -44,23 +48,28 @@ type SegmentWriter interface {
 
 // Service implements segment verification logic.
 type Service struct {
-	log      *zap.Logger
-	metabase Metabase
+	log *zap.Logger
 
 	notFound SegmentWriter
 	retry    SegmentWriter
 
+	metabase Metabase
+	verifier Verifier
+
 	PriorityNodes NodeAliasSet
 	OfflineNodes  NodeAliasSet
 }
 
 // NewService returns a new service for verifying segments.
-func NewService(log *zap.Logger) *Service {
+func NewService(log *zap.Logger, metabase Metabase, verifier Verifier) *Service {
 	return &Service{
 		log: log,
 
 		PriorityNodes: NodeAliasSet{},
 		OfflineNodes:  NodeAliasSet{},
+
+		metabase: metabase,
+		verifier: verifier,
 	}
 }
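Note: NewService now receives its Metabase and Verifier dependencies explicitly. A hypothetical wiring helper (illustrative only; the log, dialer, orders and metabase values are assumed to come from the surrounding tool setup, and this helper is not part of the commit):

func newSegmentVerify(log *zap.Logger, dialer rpc.Dialer, ordersService *orders.Service, mb Metabase) *Service {
	// NodeVerifier (added in this change) satisfies the Verifier interface.
	verifier := NewVerifier(log.Named("verifier"), dialer, ordersService)
	return NewService(log.Named("segment-verify"), mb, verifier)
}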
@@ -14,6 +14,6 @@ import (
 
 func TestService(t *testing.T) {
 	log := testplanet.NewLogger(t)
-	service := segmentverify.NewService(log.Named("segment-verify"))
+	service := segmentverify.NewService(log.Named("segment-verify"), nil, nil)
 	require.NotNil(t, service)
 }
@@ -5,14 +5,136 @@ package main
 
 import (
 	"context"
+	"time"
 
 	"github.com/zeebo/errs"
+	"go.uber.org/zap"
+
+	"storj.io/common/errs2"
+	"storj.io/common/rpc"
+	"storj.io/common/rpc/rpcstatus"
+	"storj.io/common/storj"
+	"storj.io/common/sync2"
+	"storj.io/storj/satellite/orders"
+	"storj.io/uplink/private/piecestore"
 )
 
-// ErrNodeOffline is returned when it was not possible to contact a node or the node was not responding.
-var ErrNodeOffline = errs.Class("node offline")
+var (
+	// ErrNodeOffline is returned when it was not possible to contact a node or the node was not responding.
+	ErrNodeOffline = errs.Class("node offline")
+)
 
-// VerifyBatch verifies a single batch.
-func (service *Service) VerifyBatch(ctx context.Context, batch *Batch) error {
-	return errs.New("todo")
+// PieceDownloadTimeout defines the duration during which a storage node must return a piece before timing out.
+const PieceDownloadTimeout = time.Millisecond * 100
+
+// OrderLimitRetryThrottle defines the duration to wait before retrying order limit creation.
+const OrderLimitRetryThrottle = time.Millisecond * 100
+
+// NodeVerifier implements segment verification by dialing nodes.
+type NodeVerifier struct {
+	log    *zap.Logger
+	dialer rpc.Dialer
+	orders *orders.Service
+}
+
+var _ Verifier = (*NodeVerifier)(nil)
+
+// NewVerifier creates a new segment verifier using the specified dialer.
+func NewVerifier(log *zap.Logger, dialer rpc.Dialer, orders *orders.Service) *NodeVerifier {
+	return &NodeVerifier{
+		log:    log,
+		dialer: dialer,
+		orders: orders,
+	}
+}
+
+// Verify a collection of segments by attempting to download a byte from each segment from the target node.
+func (service *NodeVerifier) Verify(ctx context.Context, target storj.NodeURL, segments []*Segment) error {
+	client, err := piecestore.Dial(ctx, service.dialer, target, piecestore.DefaultConfig)
+	if err != nil {
+		return ErrNodeOffline.Wrap(err)
+	}
+	defer func() { _ = client.Close() }()
+
+	for _, segment := range segments {
+		err := service.verifySegment(ctx, client, target, segment)
+		if err != nil {
+			return Error.Wrap(err)
+		}
+	}
+	return nil
+}
+
+// verifySegment tries to verify the segment by downloading a single byte from the specified segment.
+func (service *NodeVerifier) verifySegment(ctx context.Context, client *piecestore.Client, target storj.NodeURL, segment *Segment) error {
+	limit, piecePrivateKey, _, err := service.orders.CreateAuditOrderLimit(ctx, target.ID, 0, segment.RootPieceID, 1)
+	if err != nil {
+		service.log.Error("failed to create order limit",
+			zap.Stringer("retrying in", OrderLimitRetryThrottle),
+			zap.String("stream-id", segment.StreamID.String()),
+			zap.Uint64("position", segment.Position.Encode()),
+			zap.Error(err))
+
+		if !sync2.Sleep(ctx, OrderLimitRetryThrottle) {
+			return Error.Wrap(ctx.Err())
+		}
+
+		limit, piecePrivateKey, _, err = service.orders.CreateAuditOrderLimit(ctx, target.ID, 0, segment.RootPieceID, 1)
+		if err != nil {
+			return Error.Wrap(err)
+		}
+	}
+
+	timedCtx, cancel := context.WithTimeout(ctx, PieceDownloadTimeout)
+	defer cancel()
+
+	downloader, err := client.Download(timedCtx, limit.GetLimit(), piecePrivateKey, 0, 1)
+	if err != nil {
+		if errs2.IsRPC(err, rpcstatus.NotFound) {
+			service.log.Info("segment not found",
+				zap.String("stream-id", segment.StreamID.String()),
+				zap.Uint64("position", segment.Position.Encode()),
+				zap.Error(err))
+			segment.Status.MarkNotFound()
+			return nil
+		}
+
+		service.log.Error("download failed",
+			zap.String("stream-id", segment.StreamID.String()),
+			zap.Uint64("position", segment.Position.Encode()),
+			zap.Error(err))
+		return ErrNodeOffline.Wrap(err)
+	}
+
+	buf := [1]byte{}
+	_, err = downloader.Read(buf[:])
+	if err != nil {
+		if errs2.IsRPC(err, rpcstatus.NotFound) {
+			service.log.Info("segment not found",
+				zap.String("stream-id", segment.StreamID.String()),
+				zap.Uint64("position", segment.Position.Encode()),
+				zap.Error(err))
+			segment.Status.MarkNotFound()
+			return nil
+		}
+
+		service.log.Error("read failed",
+			zap.String("stream-id", segment.StreamID.String()),
+			zap.Uint64("position", segment.Position.Encode()),
+			zap.Error(err))
+		return ErrNodeOffline.Wrap(err)
+	}
+	segment.Status.MarkFound()
+
+	err = downloader.Close()
+	if err != nil {
+		// TODO: should we try reconnect in this case?
+		service.log.Error("close failed",
+			zap.String("stream-id", segment.StreamID.String()),
+			zap.Uint64("position", segment.Position.Encode()),
+			zap.Error(err))
+		return ErrNodeOffline.Wrap(err)
+	}
+
+	return nil
 }
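Note: NodeVerifier dials a node once, then downloads a single byte per segment with a 100ms timeout; rpcstatus.NotFound marks the segment as not found, while dial, download, read, or close failures surface as ErrNodeOffline. A hypothetical call site (sketch only; the node URL and segment list are assumed to be prepared by the caller):

func checkNode(ctx context.Context, verifier *NodeVerifier, node storj.NodeURL, segments []*Segment) {
	err := verifier.Verify(ctx, node, segments)
	switch {
	case err == nil:
		// each segment had Found or NotFound recorded on its Status
	case ErrNodeOffline.Has(err):
		// the node could not be dialed or did not respond in time
	default:
		// some other verification error; leave the segments for a retry
	}
}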
cmd/tools/segment-verify/verify_test.go (new file, 84 lines)
@@ -0,0 +1,84 @@
+// Copyright (C) 2022 Storj Labs, Inc.
+// See LICENSE for copying information.
+
+package main_test
+
+import (
+	"strconv"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"storj.io/common/memory"
+	"storj.io/common/testcontext"
+	"storj.io/common/testrand"
+	"storj.io/common/uuid"
+	segmentverify "storj.io/storj/cmd/tools/segment-verify"
+	"storj.io/storj/private/testplanet"
+	"storj.io/storj/satellite/metabase"
+)
+
+func TestVerifier(t *testing.T) {
+	testplanet.Run(t, testplanet.Config{
+		SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 1,
+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
+		satellite := planet.Satellites[0]
+
+		service := segmentverify.NewVerifier(planet.Log().Named("verifier"), satellite.Dialer, satellite.Orders.Service)
+
+		// upload some data
+		data := testrand.Bytes(8 * memory.KiB)
+		for _, up := range planet.Uplinks {
+			for i := 0; i < 10; i++ {
+				err := up.Upload(ctx, satellite, "bucket1", strconv.Itoa(i), data)
+				require.NoError(t, err)
+			}
+		}
+
+		result, err := satellite.Metabase.DB.ListVerifySegments(ctx, metabase.ListVerifySegments{
+			CursorStreamID: uuid.UUID{},
+			CursorPosition: metabase.SegmentPosition{},
+			Limit:          100000000,
+		})
+		require.NoError(t, err)
+
+		validSegments := []*segmentverify.Segment{}
+		for _, raw := range result.Segments {
+			validSegments = append(validSegments, &segmentverify.Segment{
+				VerifySegment: raw,
+				Status:        segmentverify.Status{Retry: 1},
+			})
+		}
+
+		// expect all segments are found on the node
+		err = service.Verify(ctx, planet.StorageNodes[0].NodeURL(), validSegments)
+		require.NoError(t, err)
+		for _, seg := range validSegments {
+			require.Equal(t, segmentverify.Status{Found: 1, NotFound: 0, Retry: 0}, seg.Status)
+		}
+
+		// segment not found
+		missingSegment := &segmentverify.Segment{
+			VerifySegment: metabase.VerifySegment{
+				StreamID:    testrand.UUID(),
+				Position:    metabase.SegmentPosition{},
+				RootPieceID: testrand.PieceID(),
+				AliasPieces: metabase.AliasPieces{{Number: 0, Alias: 1}},
+			},
+			Status: segmentverify.Status{Retry: 1},
+		}
+
+		err = service.Verify(ctx, planet.StorageNodes[0].NodeURL(), []*segmentverify.Segment{missingSegment})
+		require.NoError(t, err)
+		require.Equal(t, segmentverify.Status{Found: 0, NotFound: 1, Retry: 0}, missingSegment.Status)
+
+		// TODO: test download timeout
+
+		// node offline
+		err = planet.StopNodeAndUpdate(ctx, planet.StorageNodes[0])
+		require.NoError(t, err)
+		err = service.Verify(ctx, planet.StorageNodes[0].NodeURL(), validSegments)
+		require.Error(t, err)
+		require.True(t, segmentverify.ErrNodeOffline.Has(err))
+	})
+}