Audit service should download erasure shares in parallel (#2164)

Authored by Kaloyan Raev on 2019-06-11 11:00:59 +03:00; committed by GitHub.
parent 5e7fed7541
commit f0880b9b3c
2 changed files with 50 additions and 40 deletions

Changed file 1 of 2: the audit verifier.

@@ -39,6 +39,7 @@ var (
 type Share struct {
 	Error    error
 	PieceNum int
+	NodeID   storj.NodeID
 	Data     []byte
 }
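The new field is the pivot of the whole refactor: once each Share carries the identity of the node it came from, the parallel node map that DownloadShares used to return, and every nodes[pieceNum] lookup downstream, becomes redundant. A minimal, self-contained sketch of the idea, with a plain string standing in for storj.NodeID:

```go
package main

import "fmt"

// Share mirrors the updated struct: the node's identity now travels
// with the share itself (string stands in for storj.NodeID here).
type Share struct {
	Error    error
	PieceNum int
	NodeID   string
	Data     []byte
}

func main() {
	shares := map[int]Share{
		0: {PieceNum: 0, NodeID: "node-a"},
		2: {PieceNum: 2, NodeID: "node-c"},
	}
	// Callers previously needed a second map keyed by piece number to
	// recover the node ID; now it is read straight off the share.
	for _, share := range shares {
		fmt.Println(share.PieceNum, share.NodeID)
	}
}
```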
@@ -91,7 +92,7 @@ func (verifier *Verifier) Verify(ctx context.Context, stripe *Stripe, skip map[s
 		verifier.log.Debug("Verify: order limits not created for some nodes (offline)", zap.Strings("Node IDs", offlineNodes.Strings()))
 	}

-	shares, nodes, err := verifier.DownloadShares(ctx, orderLimits, stripe.Index, shareSize)
+	shares, err := verifier.DownloadShares(ctx, orderLimits, stripe.Index, shareSize)
 	if err != nil {
 		return nil, err
 	}
@@ -107,29 +108,29 @@ func (verifier *Verifier) Verify(ctx context.Context, stripe *Stripe, skip map[s
 			return err == context.DeadlineExceeded
 		}) {
 			// dial timeout
-			offlineNodes = append(offlineNodes, nodes[pieceNum])
-			verifier.log.Debug("Verify: dial timeout (offline)", zap.String("Node ID", nodes[pieceNum].String()), zap.Error(share.Error))
+			offlineNodes = append(offlineNodes, share.NodeID)
+			verifier.log.Debug("Verify: dial timeout (offline)", zap.String("Node ID", share.NodeID.String()), zap.Error(share.Error))
 			continue
 		}
 		if errs.IsFunc(share.Error, func(err error) bool {
 			return status.Code(err) == codes.Unknown
 		}) {
 			// dial failed -- offline node
-			offlineNodes = append(offlineNodes, nodes[pieceNum])
-			verifier.log.Debug("Verify: dial failed (offline)", zap.String("Node ID", nodes[pieceNum].String()), zap.Error(share.Error))
+			offlineNodes = append(offlineNodes, share.NodeID)
+			verifier.log.Debug("Verify: dial failed (offline)", zap.String("Node ID", share.NodeID.String()), zap.Error(share.Error))
 			continue
 		}
 		// unknown transport error
-		containedNodes[pieceNum] = nodes[pieceNum]
-		verifier.log.Debug("Verify: unknown transport error (contained)", zap.String("Node ID", nodes[pieceNum].String()), zap.Error(share.Error))
+		containedNodes[pieceNum] = share.NodeID
+		verifier.log.Debug("Verify: unknown transport error (contained)", zap.String("Node ID", share.NodeID.String()), zap.Error(share.Error))
 	}
 	if errs.IsFunc(share.Error, func(err error) bool {
 		return status.Code(err) == codes.NotFound
 	}) {
 		// missing share
-		failedNodes = append(failedNodes, nodes[pieceNum])
-		verifier.log.Debug("Verify: piece not found (audit failed)", zap.String("Node ID", nodes[pieceNum].String()), zap.Error(share.Error))
+		failedNodes = append(failedNodes, share.NodeID)
+		verifier.log.Debug("Verify: piece not found (audit failed)", zap.String("Node ID", share.NodeID.String()), zap.Error(share.Error))
 		continue
 	}
@@ -137,14 +138,14 @@ func (verifier *Verifier) Verify(ctx context.Context, stripe *Stripe, skip map[s
 			return status.Code(err) == codes.DeadlineExceeded
 		}) {
 			// dial successful, but download timed out
-			containedNodes[pieceNum] = nodes[pieceNum]
-			verifier.log.Debug("Verify: download timeout (contained)", zap.String("Node ID", nodes[pieceNum].String()), zap.Error(share.Error))
+			containedNodes[pieceNum] = share.NodeID
+			verifier.log.Debug("Verify: download timeout (contained)", zap.String("Node ID", share.NodeID.String()), zap.Error(share.Error))
 			continue
 		}
 		// unknown error
-		containedNodes[pieceNum] = nodes[pieceNum]
-		verifier.log.Debug("Verify: unknown error (contained)", zap.String("Node ID", nodes[pieceNum].String()), zap.Error(share.Error))
+		containedNodes[pieceNum] = share.NodeID
+		verifier.log.Debug("Verify: unknown error (contained)", zap.String("Node ID", share.NodeID.String()), zap.Error(share.Error))
 	}

 	required := int(pointer.Remote.Redundancy.GetMinReq())
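For readers skimming the hunks above: Verify buckets each share by inspecting its error, and this refactor only changes where the node ID comes from, not the bucketing itself. Below is a rough, self-contained sketch of that classification. It is a simplification: the real code also distinguishes dial-phase from download-phase errors via the transport error class, which this sketch compresses into gRPC status codes.

```go
package main

import (
	"context"
	"errors"
	"fmt"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// classify mirrors the branching in Verify: a share's error decides
// whether its node is counted offline, failed, or contained.
func classify(err error) string {
	switch {
	case errors.Is(err, context.DeadlineExceeded):
		return "offline (dial timeout)"
	case status.Code(err) == codes.Unknown:
		return "offline (dial failed)"
	case status.Code(err) == codes.NotFound:
		return "failed (piece not found)"
	case status.Code(err) == codes.DeadlineExceeded:
		return "contained (download timeout)"
	default:
		return "contained (unknown error)"
	}
}

func main() {
	for _, err := range []error{
		context.DeadlineExceeded,
		status.Error(codes.Unknown, "dial failed"),
		status.Error(codes.NotFound, "piece not found"),
		status.Error(codes.DeadlineExceeded, "download timed out"),
	} {
		fmt.Println(classify(err))
	}
}
```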
@@ -166,10 +167,10 @@ func (verifier *Verifier) Verify(ctx context.Context, stripe *Stripe, skip map[s
 	}
 	for _, pieceNum := range pieceNums {
-		failedNodes = append(failedNodes, nodes[pieceNum])
+		failedNodes = append(failedNodes, shares[pieceNum].NodeID)
 	}

-	successNodes := getSuccessNodes(ctx, nodes, failedNodes, offlineNodes, containedNodes)
+	successNodes := getSuccessNodes(ctx, shares, failedNodes, offlineNodes, containedNodes)

 	totalInPointer := len(stripe.Segment.GetRemote().GetRemotePieces())
 	numOffline := len(offlineNodes)
@@ -225,32 +226,40 @@ func (verifier *Verifier) Verify(ctx context.Context, stripe *Stripe, skip map[s
 }

 // DownloadShares downloads shares from the nodes where remote pieces are located
-func (verifier *Verifier) DownloadShares(ctx context.Context, limits []*pb.AddressedOrderLimit, stripeIndex int64, shareSize int32) (shares map[int]Share, nodes map[int]storj.NodeID, err error) {
+func (verifier *Verifier) DownloadShares(ctx context.Context, limits []*pb.AddressedOrderLimit, stripeIndex int64, shareSize int32) (shares map[int]Share, err error) {
 	defer mon.Task()(&ctx)(&err)

 	shares = make(map[int]Share, len(limits))
-	nodes = make(map[int]storj.NodeID, len(limits))
+	ch := make(chan *Share, len(limits))

 	for i, limit := range limits {
 		if limit == nil {
+			ch <- nil
 			continue
 		}

-		// TODO(kaloyan): execute in goroutine for better performance
-		share, err := verifier.GetShare(ctx, limit, stripeIndex, shareSize, i)
-		if err != nil {
-			share = Share{
-				Error:    err,
-				PieceNum: i,
-				Data:     nil,
-			}
-		}
-
-		shares[share.PieceNum] = share
-		nodes[share.PieceNum] = limit.GetLimit().StorageNodeId
+		go func(i int, limit *pb.AddressedOrderLimit) {
+			share, err := verifier.GetShare(ctx, limit, stripeIndex, shareSize, i)
+			if err != nil {
+				share = Share{
+					Error:    err,
+					PieceNum: i,
+					NodeID:   limit.GetLimit().StorageNodeId,
+					Data:     nil,
+				}
+			}
+			ch <- &share
+		}(i, limit)
 	}

-	return shares, nodes, nil
+	for range limits {
+		share := <-ch
+		if share != nil {
+			shares[share.PieceNum] = *share
+		}
+	}
+
+	return shares, nil
 }

 // Reverify reverifies the contained nodes in the stripe
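This hunk is the heart of the commit: the old TODO is resolved by fanning out one goroutine per order limit and fanning the results back in over a channel buffered to len(limits), so no sender can ever block and the collection loop performs exactly one receive per limit. Nil limits send a nil placeholder to keep the send and receive counts aligned. Below is a self-contained sketch of the same pattern, with hypothetical names (fetchShare, result) standing in for the verifier machinery:

```go
package main

import (
	"context"
	"fmt"
	"math/rand"
	"time"
)

// result stands in for audit.Share: the error is carried as a value
// instead of aborting the whole download.
type result struct {
	pieceNum int
	err      error
}

// fetchShare stands in for verifier.GetShare: a network call that can fail.
func fetchShare(ctx context.Context, pieceNum int) error {
	time.Sleep(time.Duration(rand.Intn(50)) * time.Millisecond)
	if rand.Intn(4) == 0 {
		return fmt.Errorf("piece %d: download failed", pieceNum)
	}
	return nil
}

func main() {
	ctx := context.Background()

	// Stand-ins for []*pb.AddressedOrderLimit; index 3 is deliberately nil.
	limits := make([]*struct{}, 8)
	for i := range limits {
		if i != 3 {
			limits[i] = &struct{}{}
		}
	}

	// Buffered to len(limits): every goroutine can send without blocking.
	ch := make(chan *result, len(limits))

	for i, limit := range limits {
		if limit == nil {
			ch <- nil // placeholder keeps sends equal to len(limits)
			continue
		}
		go func(i int) {
			ch <- &result{pieceNum: i, err: fetchShare(ctx, i)}
		}(i)
	}

	// Fan in: exactly one receive per limit, so the loop always terminates.
	for range limits {
		if r := <-ch; r != nil {
			fmt.Printf("piece %d: err=%v\n", r.pieceNum, r.err)
		}
	}
}
```

One consequence of the pattern is that results now arrive in completion order rather than piece order, which is why each share carries its own PieceNum and the map is keyed by it.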
@@ -440,6 +449,7 @@ func (verifier *Verifier) GetShare(ctx context.Context, limit *pb.AddressedOrder
 	return Share{
 		Error:    nil,
 		PieceNum: pieceNum,
+		NodeID:   storageNodeID,
 		Data:     buf,
 	}, nil
 }
@@ -505,7 +515,7 @@ func getOfflineNodes(pointer *pb.Pointer, limits []*pb.AddressedOrderLimit, skip
 }

 // getSuccessNodes uses the failed nodes, offline nodes and contained nodes arrays to determine which nodes passed the audit
-func getSuccessNodes(ctx context.Context, nodes map[int]storj.NodeID, failedNodes, offlineNodes storj.NodeIDList, containedNodes map[int]storj.NodeID) (successNodes storj.NodeIDList) {
+func getSuccessNodes(ctx context.Context, shares map[int]Share, failedNodes, offlineNodes storj.NodeIDList, containedNodes map[int]storj.NodeID) (successNodes storj.NodeIDList) {
 	defer mon.Task()(&ctx)(nil)

 	fails := make(map[storj.NodeID]bool)
 	for _, fail := range failedNodes {
@@ -518,9 +528,9 @@ func getSuccessNodes(ctx context.Context, nodes map[int]storj.NodeID, failedNode
 		fails[contained] = true
 	}

-	for _, node := range nodes {
-		if !fails[node] {
-			successNodes = append(successNodes, node)
+	for _, share := range shares {
+		if !fails[share.NodeID] {
+			successNodes = append(successNodes, share.NodeID)
 		}
 	}
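getSuccessNodes keeps its set-difference logic; the only change is that it now walks the shares map and reads share.NodeID instead of consuming the deleted nodes map. A compact sketch of that filtering, again with string in place of storj.NodeID (and plain slices in place of storj.NodeIDList and the contained map):

```go
package main

import "fmt"

type Share struct {
	PieceNum int
	NodeID   string
}

// successNodes returns each share's node that was not marked failed,
// offline, or contained: the filtering getSuccessNodes performs.
func successNodes(shares map[int]Share, failed, offline, contained []string) []string {
	fails := make(map[string]bool)
	for _, list := range [][]string{failed, offline, contained} {
		for _, id := range list {
			fails[id] = true
		}
	}
	var success []string
	for _, share := range shares {
		if !fails[share.NodeID] {
			success = append(success, share.NodeID)
		}
	}
	return success
}

func main() {
	shares := map[int]Share{
		0: {0, "node-a"},
		1: {1, "node-b"},
		2: {2, "node-c"},
	}
	// node-b failed its audit and node-c is contained: only node-a succeeds.
	fmt.Println(successNodes(shares, []string{"node-b"}, nil, []string{"node-c"}))
}
```

As in the real code, nodes that never received an order limit are absent from shares, so they appear neither as failed nor as successful here; Verify already counted them offline earlier.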

Changed file 2 of 2: the audit verifier tests.

@@ -68,7 +68,7 @@ func TestDownloadSharesHappyPath(t *testing.T) {
 		limits, err := planet.Satellites[0].Orders.Service.CreateAuditOrderLimits(ctx, planet.Satellites[0].Identity.PeerIdentity(), bucketID, stripe.Segment, nil)
 		require.NoError(t, err)

-		shares, _, err := verifier.DownloadShares(ctx, limits, stripe.Index, shareSize)
+		shares, err := verifier.DownloadShares(ctx, limits, stripe.Index, shareSize)
 		require.NoError(t, err)

 		for _, share := range shares {
@@ -128,11 +128,11 @@ func TestDownloadSharesOfflineNode(t *testing.T) {
 		err = stopStorageNode(ctx, planet, stoppedNodeID)
 		require.NoError(t, err)

-		shares, nodes, err := verifier.DownloadShares(ctx, limits, stripe.Index, shareSize)
+		shares, err := verifier.DownloadShares(ctx, limits, stripe.Index, shareSize)
 		require.NoError(t, err)

-		for i, share := range shares {
-			if nodes[i] == stoppedNodeID {
+		for _, share := range shares {
+			if share.NodeID == stoppedNodeID {
 				assert.True(t, transport.Error.Has(share.Error), "unexpected error: %+v", share.Error)
 				assert.False(t, errs.IsFunc(share.Error, func(err error) bool {
 					return err == context.DeadlineExceeded
@@ -194,7 +194,7 @@ func TestDownloadSharesMissingPiece(t *testing.T) {
 		limits, err := planet.Satellites[0].Orders.Service.CreateAuditOrderLimits(ctx, planet.Satellites[0].Identity.PeerIdentity(), bucketID, stripe.Segment, nil)
 		require.NoError(t, err)

-		shares, _, err := verifier.DownloadShares(ctx, limits, stripe.Index, shareSize)
+		shares, err := verifier.DownloadShares(ctx, limits, stripe.Index, shareSize)
 		require.NoError(t, err)

 		for _, share := range shares {
@@ -271,7 +271,7 @@ func TestDownloadSharesDialTimeout(t *testing.T) {
 		limits, err := planet.Satellites[0].Orders.Service.CreateAuditOrderLimits(ctx, planet.Satellites[0].Identity.PeerIdentity(), bucketID, stripe.Segment, nil)
 		require.NoError(t, err)

-		shares, _, err := verifier.DownloadShares(ctx, limits, stripe.Index, shareSize)
+		shares, err := verifier.DownloadShares(ctx, limits, stripe.Index, shareSize)
 		require.NoError(t, err)

 		for _, share := range shares {
@@ -351,7 +351,7 @@ func TestDownloadSharesDownloadTimeout(t *testing.T) {
 		limits, err := planet.Satellites[0].Orders.Service.CreateAuditOrderLimits(ctx, planet.Satellites[0].Identity.PeerIdentity(), bucketID, stripe.Segment, nil)
 		require.NoError(t, err)

-		shares, _, err := verifier.DownloadShares(ctx, limits, stripe.Index, shareSize)
+		shares, err := verifier.DownloadShares(ctx, limits, stripe.Index, shareSize)
 		require.NoError(t, err)

 		for _, share := range shares {