Repair with hashes (#2925)

* add outline for ECRepairer

* add description of process in TODO comments

* begin download/getting hash for a single piece

* verify piece hash and order limit during download

* fix download piece

* begin filling out ECRepairer.Get

* wip move ecclient.Repair to ecrepairer.Repair

* pass satellite signee into repairer

* reconstruct original stripe from pieces

* move rebuildStripe()

* calculate piece size differently, increment successful count

* fix shares slices initialization

* rename stripeData to segment

* do not pad reader in Repair()

* temp debug

* create unsafeRSScheme

* use decode reader

* rename file name to be all lowercase

* make repair downloader async

* declare condition variable inside Get method

* set downloadAndVerifyPiece's in-memory buffer to be share size

* update unusedLimits var

* address comments

* remove unnecessary comments

* move initialization of segmentRepairer to be outside of repairer service

* use ReadAll during download

* remove dots and move hashing to after validating the order limit signature

* wip test

* make sure files exactly at min threshold are repaired

* remove unused code

* use corrupt data and write back to storagenode

* only create corrupted node and piece ids once

* add comment

* address nat's comment

* fix linting and checker_test

* update comment

* add comments

* remove "copied from ecclient" comments

* add clarification comments in ec.Repair
Maximillian von Briesen 2019-09-06 15:20:36 -04:00 committed by Yingrong Zhao
parent 3387750280
commit fb10815229
9 changed files with 695 additions and 159 deletions
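
Taken together, the bullets above describe one shift in approach: rather than relying on the erasure decoder to detect bad shares, the repairer now verifies every downloaded piece (its hash and its satellite-signed order limit) and simply skips pieces that fail, downloading from another order limit until it has the minimum required. The following is a minimal, self-contained sketch of that idea; the names in it (piece, collectVerified) are hypothetical stand-ins and it omits the order-limit signature check. The real logic is in downloadAndVerifyPiece in the new ecrepairer file further down.

package main

import (
	"bytes"
	"crypto/sha256"
	"errors"
	"fmt"
)

// piece is a downloaded erasure share plus the hash the uplink signed for it at upload time.
type piece struct {
	num  int
	data []byte
	hash []byte
}

// collectVerified mirrors the behaviour described above: keep consuming candidate pieces
// until `required` of them pass the hash check, treating any piece whose content does not
// match its signed hash as lost.
func collectVerified(candidates []piece, required int) ([]piece, error) {
	verified := make([]piece, 0, required)
	for _, p := range candidates {
		sum := sha256.Sum256(p.data)
		if !bytes.Equal(sum[:], p.hash) {
			continue // corrupted piece: skip it and try the next candidate
		}
		verified = append(verified, p)
		if len(verified) == required {
			return verified, nil // enough verified pieces to reconstruct the segment
		}
	}
	return nil, errors.New("not enough verified pieces to repair the segment")
}

func main() {
	good := []byte("erasure share")
	sum := sha256.Sum256(good)
	candidates := []piece{
		{num: 0, data: good, hash: sum[:]},
		{num: 1, data: []byte("corrupted!!!"), hash: sum[:]}, // fails verification
		{num: 2, data: good, hash: sum[:]},
	}
	verified, err := collectVerified(candidates, 2)
	fmt.Println(len(verified), err) // 2 <nil>: the corrupted piece was skipped, not decoded
}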


@ -463,16 +463,22 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, revocationDB exten
peer.Overlay.Service,
config.Checker)
segmentRepairer := repairer.NewSegmentRepairer(
log.Named("repairer"),
peer.Metainfo.Service,
peer.Orders.Service,
peer.Overlay.Service,
peer.Transport,
config.Repairer.Timeout,
config.Repairer.MaxExcessRateOptimalThreshold,
signing.SigneeFromPeerIdentity(peer.Identity.PeerIdentity()),
)
peer.Repair.Repairer = repairer.NewService(
peer.Log.Named("repairer"),
peer.DB.RepairQueue(),
&config.Repairer,
config.Repairer.Interval,
config.Repairer.MaxRepair,
peer.Transport,
peer.Metainfo.Service,
peer.Orders.Service,
peer.Overlay.Service,
segmentRepairer,
)
peer.Repair.Inspector = irreparable.NewInspector(peer.DB.Irreparable())


@ -161,9 +161,10 @@ func (checker *Checker) updateIrreparableSegmentStatus(ctx context.Context, poin
numHealthy := int32(len(pieces) - len(missingPieces))
redundancy := pointer.Remote.Redundancy
// we repair when the number of healthy pieces is less than or equal to the repair threshold
// we repair when the number of healthy pieces is less than or equal to the repair threshold and is greater or equal to
// minimum required pieces in redundancy
// except for the case when the repair and success thresholds are the same (a case usually seen during testing)
if numHealthy > redundancy.MinReq && numHealthy <= redundancy.RepairThreshold && numHealthy < redundancy.SuccessThreshold {
if numHealthy >= redundancy.MinReq && numHealthy <= redundancy.RepairThreshold && numHealthy < redundancy.SuccessThreshold {
if len(missingPieces) == 0 {
checker.logger.Error("Missing pieces is zero in checker, but this should be impossible -- bad redundancy scheme:",
zap.String("path", path),
@ -187,9 +188,7 @@ func (checker *Checker) updateIrreparableSegmentStatus(ctx context.Context, poin
if err != nil {
checker.logger.Error("error deleting entry from irreparable db: ", zap.Error(err))
}
// we need one additional piece for error correction. If only the minimum is remaining the file can't be repaired and is lost.
// except for the case when minimum and repair thresholds are the same (a case usually seen during testing)
} else if numHealthy <= redundancy.MinReq && numHealthy < redundancy.RepairThreshold {
} else if numHealthy < redundancy.MinReq && numHealthy < redundancy.RepairThreshold {
// make an entry into the irreparable table
segmentInfo := &pb.IrreparableSegment{
@ -241,9 +240,10 @@ func (obs *checkerObserver) RemoteSegment(ctx context.Context, path storj.Path,
redundancy := pointer.Remote.Redundancy
// we repair when the number of healthy pieces is less than or equal to the repair threshold
// we repair when the number of healthy pieces is less than or equal to the repair threshold and is greater or equal to
// minimum required pieces in redundancy
// except for the case when the repair and success thresholds are the same (a case usually seen during testing)
if numHealthy > redundancy.MinReq && numHealthy <= redundancy.RepairThreshold && numHealthy < redundancy.SuccessThreshold {
if numHealthy >= redundancy.MinReq && numHealthy <= redundancy.RepairThreshold && numHealthy < redundancy.SuccessThreshold {
if len(missingPieces) == 0 {
obs.log.Error("Missing pieces is zero in checker, but this should be impossible -- bad redundancy scheme:",
zap.String("path", path),
@ -270,9 +270,7 @@ func (obs *checkerObserver) RemoteSegment(ctx context.Context, path storj.Path,
obs.log.Error("error deleting entry from irreparable db", zap.Error(err))
return nil
}
// we need one additional piece for error correction. If only the minimum is remaining the file can't be repaired and is lost.
// except for the case when minimum and repair thresholds are the same (a case usually seen during testing)
} else if numHealthy <= redundancy.MinReq && numHealthy < redundancy.RepairThreshold {
} else if numHealthy < redundancy.MinReq && numHealthy < redundancy.RepairThreshold {
pathElements := storj.SplitPath(path)
// check to make sure there are at least *4* path elements. the first three
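
To make the new checker conditions concrete, here is a small standalone classification of the bands they define (illustrative arithmetic only, not satellite code), using an example redundancy scheme of minReq=3, repairThreshold=8, successThreshold=9, total=10:

package main

import "fmt"

// classify restates the updated checker logic: a segment is queued for repair when
// minReq <= healthy <= repairThreshold (and healthy < successThreshold), and it goes to
// the irreparable DB only when healthy < minReq. Before this change the repair band
// started at minReq+1 and the irreparable band at healthy <= minReq.
func classify(healthy, minReq, repairThreshold, successThreshold int32) string {
	switch {
	case healthy >= minReq && healthy <= repairThreshold && healthy < successThreshold:
		return "repair"
	case healthy < minReq && healthy < repairThreshold:
		return "irreparable"
	default:
		return "healthy enough"
	}
}

func main() {
	for _, healthy := range []int32{9, 8, 4, 3, 2} {
		fmt.Printf("%d healthy pieces -> %s\n", healthy, classify(healthy, 3, 8, 9))
	}
	// a segment with exactly 3 healthy pieces is now queued for repair; previously it
	// would have been treated as irreparable because repair required minReq+1 pieces.
}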


@ -85,11 +85,14 @@ func TestIdentifyIrreparableSegments(t *testing.T) {
})
expectedLostPieces[int32(i)] = true
}
// when number of healthy piece is less than minimum required number of piece in redundancy,
// the piece is considered irreparable and will be put into irreparable DB
pointer := &pb.Pointer{
CreationDate: time.Now(),
Remote: &pb.RemoteSegment{
Redundancy: &pb.RedundancyScheme{
MinReq: int32(3),
MinReq: int32(4),
RepairThreshold: int32(8),
SuccessThreshold: int32(9),
Total: int32(10),


@ -5,6 +5,7 @@ package repair_test
import (
"context"
"io"
"math"
"testing"
@ -19,6 +20,8 @@ import (
"storj.io/storj/pkg/storj"
"storj.io/storj/satellite"
"storj.io/storj/satellite/overlay"
"storj.io/storj/storage"
"storj.io/storj/storagenode"
"storj.io/storj/uplink"
)
@ -79,7 +82,7 @@ func TestDataRepair(t *testing.T) {
numPieces := len(remotePieces)
// disqualify one storage node
toDisqualify := 1
toKill := numPieces - toDisqualify - int(minReq+1)
toKill := numPieces - toDisqualify - int(minReq)
require.True(t, toKill >= 1)
maxNumRepairedPieces := int(
math.Ceil(
@ -159,6 +162,149 @@ func TestDataRepair(t *testing.T) {
})
}
// TestCorruptDataRepair does the following:
// - Uploads test data
// - Kills all but the minimum number of nodes carrying the uploaded segment
// - On one of the remaining nodes, corrupt the piece data being stored by that node
// - Triggers data repair, which attempts to repair the data from the remaining nodes to
// the numbers of nodes determined by the upload repair max threshold
// - Expects that the repair failed and the pointer was not updated
func TestCorruptDataRepair(t *testing.T) {
const RepairMaxExcessRateOptimalThreshold = 0.05
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1,
StorageNodeCount: 14,
UplinkCount: 1,
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
config.Overlay.Node.OnlineWindow = 0
config.Repairer.MaxExcessRateOptimalThreshold = RepairMaxExcessRateOptimalThreshold
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
uplinkPeer := planet.Uplinks[0]
satellitePeer := planet.Satellites[0]
// stop discovery service so that we do not get a race condition when we delete nodes from overlay
satellitePeer.Discovery.Service.Discovery.Stop()
satellitePeer.Discovery.Service.Refresh.Stop()
// stop audit to prevent possible interactions i.e. repair timeout problems
satellitePeer.Audit.Service.Loop.Stop()
satellitePeer.Audit.Chore.Loop.Stop()
satellitePeer.Audit.Worker.Loop.Stop()
satellitePeer.Repair.Checker.Loop.Pause()
satellitePeer.Repair.Repairer.Loop.Pause()
var testData = testrand.Bytes(8 * memory.KiB)
// first, upload some remote data
err := uplinkPeer.UploadWithConfig(ctx, satellitePeer, &uplink.RSConfig{
MinThreshold: 3,
RepairThreshold: 5,
SuccessThreshold: 7,
MaxThreshold: 10,
}, "testbucket", "test/path", testData)
require.NoError(t, err)
pointer, path := getRemoteSegment(t, ctx, satellitePeer)
// calculate how many storagenodes to kill
redundancy := pointer.GetRemote().GetRedundancy()
minReq := redundancy.GetMinReq()
remotePieces := pointer.GetRemote().GetRemotePieces()
numPieces := len(remotePieces)
toKill := numPieces - int(minReq)
require.True(t, toKill >= 1)
// kill nodes and track lost pieces
nodesToKill := make(map[storj.NodeID]bool)
originalNodes := make(map[storj.NodeID]bool)
var corruptedNode *storagenode.Peer
var corruptedNodeID storj.NodeID
var corruptedPiece storj.PieceID
for i, piece := range remotePieces {
originalNodes[piece.NodeId] = true
if i >= toKill {
// this means the node will be kept alive for repair
// choose a node and pieceID to corrupt so repair fails
if corruptedNodeID.IsZero() || corruptedPiece.IsZero() {
corruptedNodeID = piece.NodeId
corruptedPiece = pointer.GetRemote().RootPieceId.Derive(corruptedNodeID, piece.PieceNum)
}
continue
}
nodesToKill[piece.NodeId] = true
}
require.NotNil(t, corruptedNodeID)
require.NotNil(t, corruptedPiece)
for _, node := range planet.StorageNodes {
if node.ID() == corruptedNodeID {
corruptedNode = node
}
if nodesToKill[node.ID()] {
err = planet.StopPeer(node)
require.NoError(t, err)
_, err = satellitePeer.Overlay.Service.UpdateUptime(ctx, node.ID(), false)
require.NoError(t, err)
}
}
require.NotNil(t, corruptedNode)
blobRef := storage.BlobRef{
Namespace: satellitePeer.ID().Bytes(),
Key: corruptedPiece.Bytes(),
}
// get currently stored piece data from storagenode
reader, err := corruptedNode.Storage2.BlobsCache.Open(ctx, blobRef)
require.NoError(t, err)
pieceSize, err := reader.Size()
require.NoError(t, err)
require.True(t, pieceSize > 0)
pieceData := make([]byte, pieceSize)
n, err := io.ReadFull(reader, pieceData)
require.NoError(t, err)
require.EqualValues(t, n, pieceSize)
// delete piece data
err = corruptedNode.Storage2.BlobsCache.Delete(ctx, blobRef)
require.NoError(t, err)
// corrupt data and write back to storagenode
pieceData[0]++ // if we don't do this, this test should fail
writer, err := corruptedNode.Storage2.BlobsCache.Create(ctx, blobRef, pieceSize)
require.NoError(t, err)
n, err = writer.Write(pieceData)
require.NoError(t, err)
require.EqualValues(t, n, pieceSize)
err = writer.Commit(ctx)
require.NoError(t, err)
satellitePeer.Repair.Checker.Loop.Restart()
satellitePeer.Repair.Checker.Loop.TriggerWait()
satellitePeer.Repair.Checker.Loop.Pause()
satellitePeer.Repair.Repairer.Loop.Restart()
satellitePeer.Repair.Repairer.Loop.TriggerWait()
satellitePeer.Repair.Repairer.Loop.Pause()
satellitePeer.Repair.Repairer.Limiter.Wait()
// repair should fail, so segment should contain all the original nodes
metainfoService := satellitePeer.Metainfo.Service
pointer, err = metainfoService.Get(ctx, path)
require.NoError(t, err)
remotePieces = pointer.GetRemote().GetRemotePieces()
for _, piece := range remotePieces {
require.Contains(t, originalNodes, piece.NodeId, "there should be no new nodes in pointer")
}
})
}
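The expectation in this test reduces to counting: with the 3/5/7/10 configuration above, exactly minReq = 3 nodes survive, and corrupting a piece on one of them means its content no longer matches the uplink-signed hash, so downloadAndVerifyPiece rejects it and Get is left with only two verified pieces. A toy restatement of that arithmetic (purely illustrative):

package main

import "fmt"

func main() {
	// RS configuration used by the test: 3/5/7/10
	const minReq = 3
	const numPieces = 10

	toKill := numPieces - minReq    // 7 nodes are stopped
	survivors := numPieces - toKill // 3 nodes keep their pieces
	verified := survivors - 1       // the corrupted piece fails the hash check

	fmt.Printf("verified %d of %d required pieces; repair fails: %v\n",
		verified, minReq, verified < minReq)
}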
// TestRemoveIrreparableSegmentFromQueue
// - Upload tests data to 7 nodes
// - Kill nodes so that repair threshold > online nodes > minimum threshold


@ -0,0 +1,448 @@
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package repairer
import (
"bytes"
"context"
"io"
"io/ioutil"
"sort"
"sync"
"sync/atomic"
"time"
"github.com/vivint/infectious"
"github.com/zeebo/errs"
"go.uber.org/zap"
"storj.io/storj/internal/errs2"
"storj.io/storj/internal/sync2"
"storj.io/storj/pkg/pb"
"storj.io/storj/pkg/pkcrypto"
"storj.io/storj/pkg/signing"
"storj.io/storj/pkg/storj"
"storj.io/storj/pkg/transport"
"storj.io/storj/uplink/eestream"
"storj.io/storj/uplink/piecestore"
)
// ECRepairer allows the repairer to download, verify, and upload pieces from storagenodes.
type ECRepairer struct {
log *zap.Logger
transport transport.Client
satelliteSignee signing.Signee
}
// NewECRepairer creates a new repairer for interfacing with storagenodes.
func NewECRepairer(log *zap.Logger, tc transport.Client, satelliteSignee signing.Signee) *ECRepairer {
return &ECRepairer{
log: log,
transport: tc,
satelliteSignee: satelliteSignee,
}
}
func (ec *ECRepairer) dialPiecestore(ctx context.Context, n *pb.Node) (*piecestore.Client, error) {
logger := ec.log.Named(n.Id.String())
return piecestore.Dial(ctx, ec.transport, n, logger, piecestore.DefaultConfig)
}
// Get downloads pieces from storagenodes using the provided order limits, and decodes those pieces into a segment.
// It attempts to download from the minimum required number based on the redundancy scheme.
// After downloading a piece, the ECRepairer will verify the hash and original order limit for that piece.
// If verification fails, another piece will be downloaded until we reach the minimum required or run out of order limits.
func (ec *ECRepairer) Get(ctx context.Context, limits []*pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, es eestream.ErasureScheme, dataSize int64) (_ io.ReadCloser, err error) {
defer mon.Task()(&ctx)(&err)
if len(limits) != es.TotalCount() {
return nil, Error.New("number of limits slice (%d) does not match total count (%d) of erasure scheme", len(limits), es.TotalCount())
}
nonNilLimits := nonNilCount(limits)
if nonNilLimits < es.RequiredCount() {
return nil, Error.New("number of non-nil limits (%d) is less than required count (%d) of erasure scheme", nonNilCount(limits), es.RequiredCount())
}
pieceSize := eestream.CalcPieceSize(dataSize, es)
var successfulPieces, inProgress int
unusedLimits := nonNilLimits
pieceReaders := make(map[int]io.ReadCloser)
limiter := sync2.NewLimiter(es.RequiredCount())
cond := sync.NewCond(&sync.Mutex{})
for currentLimitIndex, limit := range limits {
if limit == nil {
continue
}
currentLimitIndex, limit := currentLimitIndex, limit
limiter.Go(ctx, func() {
cond.L.Lock()
defer cond.Signal()
defer cond.L.Unlock()
for {
if successfulPieces >= es.RequiredCount() {
// already downloaded minimum number of pieces
cond.Broadcast()
return
}
if successfulPieces+inProgress+unusedLimits < es.RequiredCount() {
// not enough available limits left to get required number of pieces
cond.Broadcast()
return
}
if successfulPieces+inProgress >= es.RequiredCount() {
cond.Wait()
continue
}
unusedLimits--
inProgress++
cond.L.Unlock()
downloadedPiece, err := ec.downloadAndVerifyPiece(ctx, limit, privateKey, pieceSize)
cond.L.Lock()
inProgress--
if err != nil {
ec.log.Debug("Failed to download pieces for repair.", zap.Error(err))
return
}
pieceReaders[currentLimitIndex] = ioutil.NopCloser(bytes.NewReader(downloadedPiece))
successfulPieces++
return
}
})
}
limiter.Wait()
if successfulPieces < es.RequiredCount() {
return nil, Error.New("couldn't download enough pieces, number of successful downloaded pieces (%d) is less than required number (%d)", successfulPieces, es.RequiredCount())
}
fec, err := infectious.NewFEC(es.RequiredCount(), es.TotalCount())
if err != nil {
return nil, Error.Wrap(err)
}
esScheme := eestream.NewUnsafeRSScheme(fec, es.ErasureShareSize())
expectedSize := pieceSize * int64(es.RequiredCount())
ctx, cancel := context.WithCancel(ctx)
decodeReader := eestream.DecodeReaders(ctx, cancel, ec.log.Named("decode readers"), pieceReaders, esScheme, expectedSize, 0, false)
return decodeReader, nil
}
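The coordination in Get is the subtle part: each non-nil limit gets a goroutine, at most RequiredCount downloads are in flight at once, and a single condition variable decides whether a goroutine starts a download, waits for an in-flight one to finish, or gives up because success is no longer possible. Below is a stripped-down, self-contained illustration of that pattern; fakeDownload and the counts are made up, and the real code additionally collects pieceReaders and bounds goroutines with sync2.Limiter.

package main

import (
	"fmt"
	"math/rand"
	"sync"
)

func main() {
	const required = 3 // es.RequiredCount()
	const sources = 6  // number of non-nil order limits

	var successful, inProgress int
	unused := sources

	cond := sync.NewCond(&sync.Mutex{})
	var wg sync.WaitGroup

	// stand-in for downloadAndVerifyPiece: succeeds most of the time
	fakeDownload := func() bool { return rand.Intn(4) != 0 }

	for i := 0; i < sources; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			cond.L.Lock()
			defer cond.Signal()
			defer cond.L.Unlock()
			for {
				if successful >= required {
					cond.Broadcast() // enough verified pieces; wake everyone so they exit
					return
				}
				if successful+inProgress+unused < required {
					cond.Broadcast() // the required count can no longer be reached
					return
				}
				if successful+inProgress >= required {
					cond.Wait() // enough downloads already in flight; wait for one to finish
					continue
				}
				unused--
				inProgress++
				cond.L.Unlock()
				ok := fakeDownload() // the real code does network I/O outside the lock
				cond.L.Lock()
				inProgress--
				if ok {
					successful++
				}
				return
			}
		}()
	}
	wg.Wait()
	fmt.Printf("finished with %d of %d required pieces\n", successful, required)
}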
// downloadAndVerifyPiece downloads a piece from a storagenode,
// expects the original order limit to have the correct piece public key,
// and expects the hash of the data to match the signed hash provided by the storagenode.
func (ec *ECRepairer) downloadAndVerifyPiece(ctx context.Context, limit *pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, pieceSize int64) (data []byte, err error) {
// contact node
ps, err := ec.dialPiecestore(ctx, &pb.Node{
Id: limit.GetLimit().StorageNodeId,
Address: limit.GetStorageNodeAddress(),
})
if err != nil {
return nil, err
}
downloader, err := ps.Download(ctx, limit.GetLimit(), privateKey, 0, pieceSize)
if err != nil {
return nil, err
}
defer func() { err = errs.Combine(err, downloader.Close()) }()
pieceBytes, err := ioutil.ReadAll(downloader)
if err != nil {
return nil, err
}
if int64(len(pieceBytes)) != pieceSize {
return nil, Error.New("didn't download the correct amount of data, want %d, got %d", pieceSize, len(pieceBytes))
}
// get signed piece hash and original order limit
hash, originalLimit := downloader.GetHashAndLimit()
if hash == nil {
return nil, Error.New("hash was not sent from storagenode")
}
if originalLimit == nil {
return nil, Error.New("original order limit was not sent from storagenode")
}
// verify order limit from storage node is signed by the satellite
if err := verifyOrderLimitSignature(ctx, ec.satelliteSignee, originalLimit); err != nil {
return nil, err
}
// verify the hashes from storage node
calculatedHash := pkcrypto.SHA256Hash(pieceBytes)
if err := verifyPieceHash(ctx, originalLimit, hash, calculatedHash); err != nil {
return nil, err
}
return pieceBytes, nil
}
func verifyPieceHash(ctx context.Context, limit *pb.OrderLimit, hash *pb.PieceHash, expectedHash []byte) (err error) {
defer mon.Task()(&ctx)(&err)
if limit == nil || hash == nil || len(expectedHash) == 0 {
return Error.New("invalid arguments")
}
if limit.PieceId != hash.PieceId {
return Error.New("piece id changed")
}
if !bytes.Equal(hash.Hash, expectedHash) {
return Error.New("hashes don't match")
}
if err := signing.VerifyUplinkPieceHashSignature(ctx, limit.UplinkPublicKey, hash); err != nil {
return Error.New("invalid piece hash signature")
}
return nil
}
func verifyOrderLimitSignature(ctx context.Context, satellite signing.Signee, limit *pb.OrderLimit) (err error) {
if err := signing.VerifyOrderLimitSignature(ctx, satellite, limit); err != nil {
return Error.New("invalid order limit signature: %v", err)
}
return nil
}
// Repair takes a provided segment, encodes it with the provided redundancy strategy,
// and uploads the pieces in need of repair to new nodes provided by order limits.
func (ec *ECRepairer) Repair(ctx context.Context, limits []*pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, rs eestream.RedundancyStrategy, data io.Reader, expiration time.Time, timeout time.Duration, path storj.Path) (successfulNodes []*pb.Node, successfulHashes []*pb.PieceHash, err error) {
defer mon.Task()(&ctx)(&err)
pieceCount := len(limits)
if pieceCount != rs.TotalCount() {
return nil, nil, Error.New("size of limits slice (%d) does not match total count (%d) of erasure scheme", pieceCount, rs.TotalCount())
}
if !unique(limits) {
return nil, nil, Error.New("duplicated nodes are not allowed")
}
readers, err := eestream.EncodeReader(ctx, ec.log, ioutil.NopCloser(data), rs)
if err != nil {
return nil, nil, err
}
// info contains data about a single piece transfer
type info struct {
i int
err error
hash *pb.PieceHash
}
// this channel is used to synchronize concurrently uploaded pieces with the overall repair
infos := make(chan info, pieceCount)
psCtx, cancel := context.WithCancel(ctx)
defer cancel()
for i, addressedLimit := range limits {
go func(i int, addressedLimit *pb.AddressedOrderLimit) {
hash, err := ec.putPiece(psCtx, ctx, addressedLimit, privateKey, readers[i], expiration)
infos <- info{i: i, err: err, hash: hash}
}(i, addressedLimit)
}
ec.log.Info("Starting a timer for repair so that the number of pieces will be closer to the success threshold",
zap.Duration("Timer", timeout),
zap.String("Path", path),
zap.Int("Node Count", nonNilCount(limits)),
zap.Int("Optimal Threshold", rs.OptimalThreshold()),
)
var successfulCount, failureCount, cancellationCount int32
timer := time.AfterFunc(timeout, func() {
if ctx.Err() != context.Canceled {
ec.log.Info("Timer expired. Canceling the long tail...",
zap.String("Path", path),
zap.Int32("Successfully repaired", atomic.LoadInt32(&successfulCount)),
)
cancel()
}
})
successfulNodes = make([]*pb.Node, pieceCount)
successfulHashes = make([]*pb.PieceHash, pieceCount)
for range limits {
info := <-infos
if limits[info.i] == nil {
continue
}
if info.err != nil {
if !errs2.IsCanceled(info.err) {
failureCount++
} else {
cancellationCount++
}
ec.log.Debug("Repair to storage node failed",
zap.String("Path", path),
zap.String("NodeID", limits[info.i].GetLimit().StorageNodeId.String()),
zap.Error(info.err),
)
continue
}
successfulNodes[info.i] = &pb.Node{
Id: limits[info.i].GetLimit().StorageNodeId,
Address: limits[info.i].GetStorageNodeAddress(),
}
successfulHashes[info.i] = info.hash
successfulCount++
}
// Ensure timer is stopped
_ = timer.Stop()
// TODO: clean up the partially uploaded segment's pieces
defer func() {
select {
case <-ctx.Done():
err = Error.New("repair cancelled")
default:
}
}()
if successfulCount == 0 {
return nil, nil, Error.New("repair %v to all nodes failed", path)
}
ec.log.Info("Successfully repaired",
zap.String("Path", path),
zap.Int32("Success Count", atomic.LoadInt32(&successfulCount)),
)
mon.IntVal("repair_segment_pieces_total").Observe(int64(pieceCount))
mon.IntVal("repair_segment_pieces_successful").Observe(int64(successfulCount))
mon.IntVal("repair_segment_pieces_failed").Observe(int64(failureCount))
mon.IntVal("repair_segment_pieces_canceled").Observe(int64(cancellationCount))
return successfulNodes, successfulHashes, nil
}
func (ec *ECRepairer) putPiece(ctx, parent context.Context, limit *pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, data io.ReadCloser, expiration time.Time) (hash *pb.PieceHash, err error) {
nodeName := "nil"
if limit != nil {
nodeName = limit.GetLimit().StorageNodeId.String()[0:8]
}
defer mon.Task()(&ctx, "node: "+nodeName)(&err)
defer func() { err = errs.Combine(err, data.Close()) }()
if limit == nil {
_, _ = io.Copy(ioutil.Discard, data)
return nil, nil
}
storageNodeID := limit.GetLimit().StorageNodeId
pieceID := limit.GetLimit().PieceId
ps, err := ec.dialPiecestore(ctx, &pb.Node{
Id: storageNodeID,
Address: limit.GetStorageNodeAddress(),
})
if err != nil {
ec.log.Debug("Failed dialing for putting piece to node",
zap.String("PieceID", pieceID.String()),
zap.String("NodeID", storageNodeID.String()),
zap.Error(err),
)
return nil, err
}
defer func() { err = errs.Combine(err, ps.Close()) }()
upload, err := ps.Upload(ctx, limit.GetLimit(), privateKey)
if err != nil {
ec.log.Debug("Failed requesting upload of pieces to node",
zap.String("PieceID", pieceID.String()),
zap.String("NodeID", storageNodeID.String()),
zap.Error(err),
)
return nil, err
}
defer func() {
if ctx.Err() != nil || err != nil {
hash = nil
err = errs.Combine(err, upload.Cancel(ctx))
return
}
h, closeErr := upload.Commit(ctx)
hash = h
err = errs.Combine(err, closeErr)
}()
_, err = sync2.Copy(ctx, upload, data)
// Canceled context means the piece upload was interrupted by user or due
// to slow connection. No error logging for this case.
if ctx.Err() == context.Canceled {
if parent.Err() == context.Canceled {
ec.log.Info("Upload to node canceled by user", zap.String("NodeID", storageNodeID.String()))
} else {
ec.log.Debug("Node cut from upload due to slow connection", zap.String("NodeID", storageNodeID.String()))
}
err = context.Canceled
} else if err != nil {
nodeAddress := "nil"
if limit.GetStorageNodeAddress() != nil {
nodeAddress = limit.GetStorageNodeAddress().GetAddress()
}
ec.log.Debug("Failed uploading piece to node",
zap.String("PieceID", pieceID.String()),
zap.String("NodeID", storageNodeID.String()),
zap.String("Node Address", nodeAddress),
zap.Error(err),
)
}
return hash, err
}
func nonNilCount(limits []*pb.AddressedOrderLimit) int {
total := 0
for _, limit := range limits {
if limit != nil {
total++
}
}
return total
}
func unique(limits []*pb.AddressedOrderLimit) bool {
if len(limits) < 2 {
return true
}
ids := make(storj.NodeIDList, len(limits))
for i, addressedLimit := range limits {
if addressedLimit != nil {
ids[i] = addressedLimit.GetLimit().StorageNodeId
}
}
// sort the ids and check for identical neighbors
sort.Sort(ids)
// sort.Slice(ids, func(i, k int) bool { return ids[i].Less(ids[k]) })
for i := 1; i < len(ids); i++ {
if ids[i] != (storj.NodeID{}) && ids[i] == ids[i-1] {
return false
}
}
return true
}


@ -14,13 +14,8 @@ import (
"storj.io/storj/internal/memory"
"storj.io/storj/internal/sync2"
"storj.io/storj/pkg/pb"
"storj.io/storj/pkg/transport"
"storj.io/storj/satellite/metainfo"
"storj.io/storj/satellite/orders"
"storj.io/storj/satellite/overlay"
"storj.io/storj/satellite/repair/queue"
"storj.io/storj/storage"
"storj.io/storj/uplink/ecclient"
)
// Error is a standard error class for this package.
@ -49,16 +44,13 @@ type Service struct {
}
// NewService creates repairing service
func NewService(log *zap.Logger, queue queue.RepairQueue, config *Config, interval time.Duration, concurrency int, transport transport.Client, metainfo *metainfo.Service, orders *orders.Service, cache *overlay.Service) *Service {
client := ecclient.NewClient(log.Named("ecclient"), transport, config.MaxBufferMem.Int())
repairer := NewSegmentRepairer(log.Named("repairer"), metainfo, orders, cache, client, config.Timeout, config.MaxExcessRateOptimalThreshold)
func NewService(log *zap.Logger, queue queue.RepairQueue, config *Config, repairer *SegmentRepairer) *Service {
return &Service{
log: log,
queue: queue,
config: config,
Limiter: sync2.NewLimiter(concurrency),
Loop: *sync2.NewCycle(interval),
Limiter: sync2.NewLimiter(config.MaxRepair),
Loop: *sync2.NewCycle(config.Interval),
repairer: repairer,
}
}


@ -12,11 +12,12 @@ import (
"go.uber.org/zap"
"storj.io/storj/pkg/pb"
"storj.io/storj/pkg/signing"
"storj.io/storj/pkg/storj"
"storj.io/storj/pkg/transport"
"storj.io/storj/satellite/metainfo"
"storj.io/storj/satellite/orders"
"storj.io/storj/satellite/overlay"
"storj.io/storj/uplink/ecclient"
"storj.io/storj/uplink/eestream"
)
@ -29,7 +30,7 @@ type SegmentRepairer struct {
metainfo *metainfo.Service
orders *orders.Service
overlay *overlay.Service
ec ecclient.Client
ec *ECRepairer
timeout time.Duration
// multiplierOptimalThreshold is the value that multiplied by the optimal
@ -45,8 +46,8 @@ type SegmentRepairer struct {
// when negative, 0 is applied.
func NewSegmentRepairer(
log *zap.Logger, metainfo *metainfo.Service, orders *orders.Service,
overlay *overlay.Service, ec ecclient.Client, timeout time.Duration,
excessOptimalThreshold float64,
overlay *overlay.Service, tc transport.Client, timeout time.Duration,
excessOptimalThreshold float64, satelliteSignee signing.Signee,
) *SegmentRepairer {
if excessOptimalThreshold < 0 {
@ -58,7 +59,7 @@ func NewSegmentRepairer(
metainfo: metainfo,
orders: orders,
overlay: overlay,
ec: ec.WithForceErrorDetection(true),
ec: NewECRepairer(log.Named("ec repairer"), tc, satelliteSignee),
timeout: timeout,
multiplierOptimalThreshold: 1 + excessOptimalThreshold,
}
@ -100,8 +101,8 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
}
numHealthy := len(pieces) - len(missingPieces)
// irreparable piece, we need k+1 to detect corrupted pieces
if int32(numHealthy) < pointer.Remote.Redundancy.MinReq+1 {
// irreparable piece
if int32(numHealthy) < pointer.Remote.Redundancy.MinReq {
mon.Meter("repair_nodes_unavailable").Mark(1)
return true, Error.Wrap(IrreparableError.New("segment %v cannot be repaired: only %d healthy pieces, %d required", path, numHealthy, pointer.Remote.Redundancy.MinReq+1))
}
@ -170,20 +171,15 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
}
// Download the segment using just the healthy pieces
rr, err := repairer.ec.Get(ctx, getOrderLimits, getPrivateKey, redundancy, pointer.GetSegmentSize())
segmentReader, err := repairer.ec.Get(ctx, getOrderLimits, getPrivateKey, redundancy, pointer.GetSegmentSize())
if err != nil {
// .Get() seems to only fail from input validation, so it would keep failing
return true, Error.Wrap(err)
}
r, err := rr.Range(ctx, 0, rr.Size())
if err != nil {
return false, Error.Wrap(err)
}
defer func() { err = errs.Combine(err, r.Close()) }()
defer func() { err = errs.Combine(err, segmentReader.Close()) }()
// Upload the repaired pieces
successfulNodes, hashes, err := repairer.ec.Repair(ctx, putLimits, putPrivateKey, redundancy, r, expiration, repairer.timeout, path)
successfulNodes, hashes, err := repairer.ec.Repair(ctx, putLimits, putPrivateKey, redundancy, segmentReader, expiration, repairer.timeout, path)
if err != nil {
return false, Error.Wrap(err)
}


@ -31,7 +31,6 @@ var mon = monkit.Package()
// Client defines an interface for storing erasure coded data to piece store nodes
type Client interface {
Put(ctx context.Context, limits []*pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, rs eestream.RedundancyStrategy, data io.Reader, expiration time.Time) (successfulNodes []*pb.Node, successfulHashes []*pb.PieceHash, err error)
Repair(ctx context.Context, limits []*pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, rs eestream.RedundancyStrategy, data io.Reader, expiration time.Time, timeout time.Duration, path storj.Path) (successfulNodes []*pb.Node, successfulHashes []*pb.PieceHash, err error)
Get(ctx context.Context, limits []*pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, es eestream.ErasureScheme, size int64) (ranger.Ranger, error)
Delete(ctx context.Context, limits []*pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey) error
WithForceErrorDetection(force bool) Client
@ -179,120 +178,6 @@ func (ec *ecClient) Put(ctx context.Context, limits []*pb.AddressedOrderLimit, p
return successfulNodes, successfulHashes, nil
}
func (ec *ecClient) Repair(ctx context.Context, limits []*pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, rs eestream.RedundancyStrategy, data io.Reader, expiration time.Time, timeout time.Duration, path storj.Path) (successfulNodes []*pb.Node, successfulHashes []*pb.PieceHash, err error) {
defer mon.Task()(&ctx)(&err)
pieceCount := len(limits)
if pieceCount != rs.TotalCount() {
return nil, nil, Error.New("size of limits slice (%d) does not match total count (%d) of erasure scheme", pieceCount, rs.TotalCount())
}
if !unique(limits) {
return nil, nil, Error.New("duplicated nodes are not allowed")
}
padded := eestream.PadReader(ioutil.NopCloser(data), rs.StripeSize())
readers, err := eestream.EncodeReader(ctx, ec.log, padded, rs)
if err != nil {
return nil, nil, err
}
type info struct {
i int
err error
hash *pb.PieceHash
}
infos := make(chan info, pieceCount)
psCtx, cancel := context.WithCancel(ctx)
defer cancel()
for i, addressedLimit := range limits {
go func(i int, addressedLimit *pb.AddressedOrderLimit) {
hash, err := ec.putPiece(psCtx, ctx, addressedLimit, privateKey, readers[i], expiration)
infos <- info{i: i, err: err, hash: hash}
}(i, addressedLimit)
}
ec.log.Info("Starting a timer for repair so that the number of pieces will be closer to the success threshold",
zap.Duration("Timer", timeout),
zap.String("Path", path),
zap.Int("Node Count", nonNilCount(limits)),
zap.Int("Optimal Threshold", rs.OptimalThreshold()),
)
var successfulCount, failureCount, cancellationCount int32
timer := time.AfterFunc(timeout, func() {
if ctx.Err() != context.Canceled {
ec.log.Info("Timer expired. Canceling the long tail...",
zap.String("Path", path),
zap.Int32("Successfully repaired", atomic.LoadInt32(&successfulCount)),
)
cancel()
}
})
successfulNodes = make([]*pb.Node, pieceCount)
successfulHashes = make([]*pb.PieceHash, pieceCount)
for range limits {
info := <-infos
if limits[info.i] == nil {
continue
}
if info.err != nil {
if !errs2.IsCanceled(info.err) {
failureCount++
} else {
cancellationCount++
}
ec.log.Debug("Repair to storage node failed",
zap.String("Path", path),
zap.String("NodeID", limits[info.i].GetLimit().StorageNodeId.String()),
zap.Error(info.err),
)
continue
}
successfulNodes[info.i] = &pb.Node{
Id: limits[info.i].GetLimit().StorageNodeId,
Address: limits[info.i].GetStorageNodeAddress(),
}
successfulHashes[info.i] = info.hash
successfulCount++
}
// Ensure timer is stopped
_ = timer.Stop()
// TODO: clean up the partially uploaded segment's pieces
defer func() {
select {
case <-ctx.Done():
err = Error.New("repair cancelled")
// ec.Delete(context.Background(), nodes, pieceID, pba.SatelliteId), //TODO
default:
}
}()
if successfulCount == 0 {
return nil, nil, Error.New("repair %v to all nodes failed", path)
}
ec.log.Info("Successfully repaired",
zap.String("Path", path),
zap.Int32("Success Count", atomic.LoadInt32(&successfulCount)),
)
mon.IntVal("repair_segment_pieces_total").Observe(int64(pieceCount))
mon.IntVal("repair_segment_pieces_successful").Observe(int64(successfulCount))
mon.IntVal("repair_segment_pieces_failed").Observe(int64(failureCount))
mon.IntVal("repair_segment_pieces_canceled").Observe(int64(cancellationCount))
return successfulNodes, successfulHashes, nil
}
func (ec *ecClient) putPiece(ctx, parent context.Context, limit *pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, data io.ReadCloser, expiration time.Time) (hash *pb.PieceHash, err error) {
nodeName := "nil"
if limit != nil {


@ -0,0 +1,62 @@
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package eestream
import (
"github.com/vivint/infectious"
)
type unsafeRSScheme struct {
fc *infectious.FEC
erasureShareSize int
}
// NewUnsafeRSScheme returns a Reed-Solomon-based ErasureScheme without error correction.
func NewUnsafeRSScheme(fc *infectious.FEC, erasureShareSize int) ErasureScheme {
return &unsafeRSScheme{fc: fc, erasureShareSize: erasureShareSize}
}
func (s *unsafeRSScheme) EncodeSingle(input, output []byte, num int) (err error) {
return s.fc.EncodeSingle(input, output, num)
}
func (s *unsafeRSScheme) Encode(input []byte, output func(num int, data []byte)) (
err error) {
return s.fc.Encode(input, func(s infectious.Share) {
output(s.Number, s.Data)
})
}
func (s *unsafeRSScheme) Decode(out []byte, in map[int][]byte) ([]byte, error) {
shares := make([]infectious.Share, 0, len(in))
for num, data := range in {
shares = append(shares, infectious.Share{Number: num, Data: data})
}
stripe := make([]byte, s.RequiredCount()*s.ErasureShareSize())
err := s.fc.Rebuild(shares, func(share infectious.Share) {
copy(stripe[share.Number*s.ErasureShareSize():], share.Data)
})
if err != nil {
return nil, err
}
return stripe, nil
}
func (s *unsafeRSScheme) ErasureShareSize() int {
return s.erasureShareSize
}
func (s *unsafeRSScheme) StripeSize() int {
return s.erasureShareSize * s.fc.Required()
}
func (s *unsafeRSScheme) TotalCount() int {
return s.fc.Total()
}
func (s *unsafeRSScheme) RequiredCount() int {
return s.fc.Required()
}
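
A small usage sketch of the new scheme, showing only the Encode/Decode round trip (the 2-of-4 configuration, the share size, and the sample data are made up). Decode performs plain reconstruction with no error detection, which is exactly why ECRepairer verifies piece hashes before handing shares to it.

package main

import (
	"bytes"
	"fmt"

	"github.com/vivint/infectious"

	"storj.io/storj/uplink/eestream"
)

func main() {
	const required, total = 2, 4
	const shareSize = 4

	fec, err := infectious.NewFEC(required, total)
	if err != nil {
		panic(err)
	}
	scheme := eestream.NewUnsafeRSScheme(fec, shareSize)

	stripe := []byte("8 bytes!") // StripeSize() == required * shareSize == 8

	// encode one stripe into `total` erasure shares
	shares := make(map[int][]byte, total)
	err = scheme.Encode(stripe, func(num int, data []byte) {
		shares[num] = append([]byte(nil), data...) // copy: the callback buffer may be reused
	})
	if err != nil {
		panic(err)
	}

	// keep only `required` shares, as ECRepairer.Get does after verifying piece hashes
	subset := map[int][]byte{0: shares[0], 3: shares[3]}

	rebuilt, err := scheme.Decode(nil, subset)
	if err != nil {
		panic(err)
	}
	fmt.Println(bytes.Equal(rebuilt, stripe)) // true: reconstruction without error correction
}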