cmd/tools/segment-verify: add verifier

Change-Id: I4cc1fbcf964c4a9a37cf80322f6f99dd956f3d7b
Jennifer Johnson 2022-09-12 15:22:01 -04:00 committed by Egon Elbre
parent a3a3ffd123
commit 8529a169ee
5 changed files with 255 additions and 16 deletions


@@ -9,7 +9,9 @@ import (
"go.uber.org/zap"
"storj.io/common/storj"
"storj.io/common/sync2"
"storj.io/storj/satellite/metabase"
)
// Verify verifies a collection of segments.
@@ -25,7 +27,10 @@ func (service *Service) Verify(ctx context.Context, segments []*Segment) (err error) {
return Error.Wrap(err)
}
service.VerifyBatches(ctx, batches)
err = service.VerifyBatches(ctx, batches)
if err != nil {
return Error.Wrap(err)
}
retrySegments := []*Segment{}
for _, segment := range segments {
@@ -49,32 +54,51 @@ func (service *Service) Verify(ctx context.Context, segments []*Segment) (err error) {
return Error.Wrap(err)
}
service.VerifyBatches(ctx, retryBatches)
err = service.VerifyBatches(ctx, retryBatches)
if err != nil {
return Error.Wrap(err)
}
return nil
}
// VerifyBatches verifies batches.
func (service *Service) VerifyBatches(ctx context.Context, batches []*Batch) {
func (service *Service) VerifyBatches(ctx context.Context, batches []*Batch) error {
defer mon.Task()(&ctx)(nil)
// Convert NodeAliases to NodeIDs
aliases := make([]metabase.NodeAlias, len(batches))
for i, b := range batches {
aliases[i] = b.Alias
}
ids, err := service.metabase.ConvertAliasesToNodes(ctx, aliases)
if err != nil {
return Error.Wrap(err)
}
// TODO: fetch addresses for NodeIDs
var mu sync.Mutex
limiter := sync2.NewLimiter(ConcurrentRequests)
for _, batch := range batches {
for i, batch := range batches {
nodeID := ids[i]
batch := batch
limiter.Go(ctx, func() {
err := service.VerifyBatch(ctx, batch)
err := service.verifier.Verify(ctx, storj.NodeURL{
ID: nodeID, // TODO: use NodeURL
}, batch.Items)
if err != nil {
if ErrNodeOffline.Has(err) {
mu.Lock()
service.OfflineNodes.Add(batch.Alias)
mu.Unlock()
}
service.log.Error("verifying a batch failed", zap.Error(err))
}
})
}
limiter.Wait()
return nil
}
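
For context, a minimal sketch (not part of the diff) of the fan-out pattern VerifyBatches uses above, assuming storj.io/common/sync2: the Limiter bounds the number of concurrent goroutines and a mutex guards the shared set that collects failures. The batch type, the limit of 3, and the verify function are stand-ins rather than the real ones.

package main

import (
	"context"
	"fmt"
	"sync"

	"storj.io/common/sync2"
)

// verify is a stand-in for dialing a node and verifying its batch of segments.
func verify(batch int) error {
	if batch%2 == 0 {
		return fmt.Errorf("batch %d unreachable", batch)
	}
	return nil
}

func main() {
	ctx := context.Background()
	batches := []int{1, 2, 3, 4, 5} // stand-in for []*Batch

	var mu sync.Mutex
	offline := map[int]bool{} // stand-in for the OfflineNodes set

	limiter := sync2.NewLimiter(3) // stand-in for ConcurrentRequests
	for _, batch := range batches {
		batch := batch // capture the loop variable for the goroutine
		limiter.Go(ctx, func() {
			if err := verify(batch); err != nil {
				mu.Lock()
				offline[batch] = true
				mu.Unlock()
			}
		})
	}
	limiter.Wait()

	fmt.Println("offline:", offline)
}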


@@ -32,11 +32,15 @@ const ConcurrentRequests = 10000
type Metabase interface {
ConvertNodesToAliases(ctx context.Context, nodeID []storj.NodeID) ([]metabase.NodeAlias, error)
ConvertAliasesToNodes(ctx context.Context, aliases []metabase.NodeAlias) ([]storj.NodeID, error)
GetSegmentByPosition(ctx context.Context, opts metabase.GetSegmentByPosition) (segment metabase.Segment, err error)
ListVerifySegments(ctx context.Context, opts metabase.ListVerifySegments) (result metabase.ListVerifySegmentsResult, err error)
}
// Verifier verifies a batch of segments.
type Verifier interface {
Verify(ctx context.Context, target storj.NodeURL, segments []*Segment) error
}
// SegmentWriter allows writing segments to some output.
type SegmentWriter interface {
Write(ctx context.Context, segments []*Segment) error
@@ -44,23 +48,28 @@ type SegmentWriter interface {
// Service implements segment verification logic.
type Service struct {
log *zap.Logger
metabase Metabase
log *zap.Logger
notFound SegmentWriter
retry SegmentWriter
metabase Metabase
verifier Verifier
PriorityNodes NodeAliasSet
OfflineNodes NodeAliasSet
}
// NewService returns a new service for verifying segments.
func NewService(log *zap.Logger) *Service {
func NewService(log *zap.Logger, metabase Metabase, verifier Verifier) *Service {
return &Service{
log: log,
PriorityNodes: NodeAliasSet{},
OfflineNodes: NodeAliasSet{},
metabase: metabase,
verifier: verifier,
}
}
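
The metabase and verifier parameters give NewService seams that tests or alternative transports can fill in. A hypothetical stub satisfying the Verifier interface could look like the sketch below; stubVerifier and its err field are illustrative names only, and the sketch assumes it lives in the same package, since Segment is declared there.

// stubVerifier is a hypothetical in-memory Verifier: it reports a configured
// result for every batch instead of dialing a node.
type stubVerifier struct {
	err error
}

func (v *stubVerifier) Verify(ctx context.Context, target storj.NodeURL, segments []*Segment) error {
	return v.err
}

// wiring sketch, mirroring the constructor above:
//   service := NewService(log, metabaseDB, &stubVerifier{})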


@@ -14,6 +14,6 @@ import (
func TestService(t *testing.T) {
log := testplanet.NewLogger(t)
service := segmentverify.NewService(log.Named("segment-verify"))
service := segmentverify.NewService(log.Named("segment-verify"), nil, nil)
require.NotNil(t, service)
}


@@ -5,14 +5,136 @@ package main
import (
"context"
"time"
"github.com/zeebo/errs"
"go.uber.org/zap"
"storj.io/common/errs2"
"storj.io/common/rpc"
"storj.io/common/rpc/rpcstatus"
"storj.io/common/storj"
"storj.io/common/sync2"
"storj.io/storj/satellite/orders"
"storj.io/uplink/private/piecestore"
)
// ErrNodeOffline is returned when it was not possible to contact a node or the node was not responding.
var ErrNodeOffline = errs.Class("node offline")
var (
// ErrNodeOffline is returned when it was not possible to contact a node or the node was not responding.
ErrNodeOffline = errs.Class("node offline")
)
// VerifyBatch verifies a single batch.
func (service *Service) VerifyBatch(ctx context.Context, batch *Batch) error {
return errs.New("todo")
// PieceDownloadTimeout defines the duration during which a storage node must return a piece before timing out.
const PieceDownloadTimeout = time.Millisecond * 100
// OrderLimitRetryThrottle defines the duration to wait before retrying order limit creation.
const OrderLimitRetryThrottle = time.Millisecond * 100
// NodeVerifier implements segment verification by dialing nodes.
type NodeVerifier struct {
log *zap.Logger
dialer rpc.Dialer
orders *orders.Service
}
var _ Verifier = (*NodeVerifier)(nil)
// NewVerifier creates a new segment verifier using the specified dialer.
func NewVerifier(log *zap.Logger, dialer rpc.Dialer, orders *orders.Service) *NodeVerifier {
return &NodeVerifier{
log: log,
dialer: dialer,
orders: orders,
}
}
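
In production wiring, the dialer and orders service come from the satellite process, as the verifier test at the end of this commit does with satellite.Dialer and satellite.Orders.Service. A hedged sketch of plugging the NodeVerifier into the Service from the previous file; log, dialer, ordersService, and metabaseDB are placeholders for values supplied by the surrounding setup.

// hypothetical wiring; the dependencies are assumed to come from process setup.
verifier := NewVerifier(log.Named("verifier"), dialer, ordersService)
service := NewService(log.Named("segment-verify"), metabaseDB, verifier)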
// Verify verifies a collection of segments by attempting to download a byte from each segment on the target node.
func (service *NodeVerifier) Verify(ctx context.Context, target storj.NodeURL, segments []*Segment) error {
client, err := piecestore.Dial(ctx, service.dialer, target, piecestore.DefaultConfig)
if err != nil {
return ErrNodeOffline.Wrap(err)
}
defer func() { _ = client.Close() }()
for _, segment := range segments {
err := service.verifySegment(ctx, client, target, segment)
if err != nil {
return Error.Wrap(err)
}
}
return nil
}
// verifySegment tries to verify a single segment by downloading one byte of it from the target node.
func (service *NodeVerifier) verifySegment(ctx context.Context, client *piecestore.Client, target storj.NodeURL, segment *Segment) error {
limit, piecePrivateKey, _, err := service.orders.CreateAuditOrderLimit(ctx, target.ID, 0, segment.RootPieceID, 1)
if err != nil {
service.log.Error("failed to create order limit",
zap.Stringer("retrying in", OrderLimitRetryThrottle),
zap.String("stream-id", segment.StreamID.String()),
zap.Uint64("position", segment.Position.Encode()),
zap.Error(err))
if !sync2.Sleep(ctx, OrderLimitRetryThrottle) {
return Error.Wrap(ctx.Err())
}
limit, piecePrivateKey, _, err = service.orders.CreateAuditOrderLimit(ctx, target.ID, 0, segment.RootPieceID, 1)
if err != nil {
return Error.Wrap(err)
}
}
timedCtx, cancel := context.WithTimeout(ctx, PieceDownloadTimeout)
defer cancel()
downloader, err := client.Download(timedCtx, limit.GetLimit(), piecePrivateKey, 0, 1)
if err != nil {
if errs2.IsRPC(err, rpcstatus.NotFound) {
service.log.Info("segment not found",
zap.String("stream-id", segment.StreamID.String()),
zap.Uint64("position", segment.Position.Encode()),
zap.Error(err))
segment.Status.MarkNotFound()
return nil
}
service.log.Error("download failed",
zap.String("stream-id", segment.StreamID.String()),
zap.Uint64("position", segment.Position.Encode()),
zap.Error(err))
return ErrNodeOffline.Wrap(err)
}
buf := [1]byte{}
_, err = downloader.Read(buf[:])
if err != nil {
if errs2.IsRPC(err, rpcstatus.NotFound) {
service.log.Info("segment not found",
zap.String("stream-id", segment.StreamID.String()),
zap.Uint64("position", segment.Position.Encode()),
zap.Error(err))
segment.Status.MarkNotFound()
return nil
}
service.log.Error("read failed",
zap.String("stream-id", segment.StreamID.String()),
zap.Uint64("position", segment.Position.Encode()),
zap.Error(err))
return ErrNodeOffline.Wrap(err)
}
segment.Status.MarkFound()
err = downloader.Close()
if err != nil {
// TODO: should we try reconnect in this case?
service.log.Error("close failed",
zap.String("stream-id", segment.StreamID.String()),
zap.Uint64("position", segment.Position.Encode()),
zap.Error(err))
return ErrNodeOffline.Wrap(err)
}
return nil
}
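
The not-found versus offline classification appears twice in verifySegment, once after Download and once after Read. Purely as an illustration of that decision, here is the same check extracted into a helper; this helper is not part of the commit and uses only packages already imported above.

// classifyError is an illustrative helper: a NotFound RPC status marks the
// segment as missing and is not treated as a node failure, while any other
// error is reported as the node being unreachable.
func classifyError(segment *Segment, err error) error {
	if errs2.IsRPC(err, rpcstatus.NotFound) {
		segment.Status.MarkNotFound()
		return nil
	}
	return ErrNodeOffline.Wrap(err)
}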


@@ -0,0 +1,84 @@
// Copyright (C) 2022 Storj Labs, Inc.
// See LICENSE for copying information.
package main_test
import (
"strconv"
"testing"
"github.com/stretchr/testify/require"
"storj.io/common/memory"
"storj.io/common/testcontext"
"storj.io/common/testrand"
"storj.io/common/uuid"
segmentverify "storj.io/storj/cmd/tools/segment-verify"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite/metabase"
)
func TestVerifier(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 1,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
service := segmentverify.NewVerifier(planet.Log().Named("verifier"), satellite.Dialer, satellite.Orders.Service)
// upload some data
data := testrand.Bytes(8 * memory.KiB)
for _, up := range planet.Uplinks {
for i := 0; i < 10; i++ {
err := up.Upload(ctx, satellite, "bucket1", strconv.Itoa(i), data)
require.NoError(t, err)
}
}
result, err := satellite.Metabase.DB.ListVerifySegments(ctx, metabase.ListVerifySegments{
CursorStreamID: uuid.UUID{},
CursorPosition: metabase.SegmentPosition{},
Limit: 100000000,
})
require.NoError(t, err)
validSegments := []*segmentverify.Segment{}
for _, raw := range result.Segments {
validSegments = append(validSegments, &segmentverify.Segment{
VerifySegment: raw,
Status: segmentverify.Status{Retry: 1},
})
}
// expect all segments are found on the node
err = service.Verify(ctx, planet.StorageNodes[0].NodeURL(), validSegments)
require.NoError(t, err)
for _, seg := range validSegments {
require.Equal(t, segmentverify.Status{Found: 1, NotFound: 0, Retry: 0}, seg.Status)
}
// segment not found
missingSegment := &segmentverify.Segment{
VerifySegment: metabase.VerifySegment{
StreamID: testrand.UUID(),
Position: metabase.SegmentPosition{},
RootPieceID: testrand.PieceID(),
AliasPieces: metabase.AliasPieces{{Number: 0, Alias: 1}},
},
Status: segmentverify.Status{Retry: 1},
}
err = service.Verify(ctx, planet.StorageNodes[0].NodeURL(), []*segmentverify.Segment{missingSegment})
require.NoError(t, err)
require.Equal(t, segmentverify.Status{Found: 0, NotFound: 1, Retry: 0}, missingSegment.Status)
// TODO: test download timeout
// node offline
err = planet.StopNodeAndUpdate(ctx, planet.StorageNodes[0])
require.NoError(t, err)
err = service.Verify(ctx, planet.StorageNodes[0].NodeURL(), validSegments)
require.Error(t, err)
require.True(t, segmentverify.ErrNodeOffline.Has(err))
})
}