upload: Add more info to returned error response & to logs (#3218)

* uplink/storage/segments: return an error when below the optimal threshold
  Return an error if the store gets fewer uploaded pieces than indicated by
  the optimal threshold.
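
  A minimal sketch of that guard, under simplified assumptions: the error
  class and the helper name here are illustrative only, while the real change
  is the uplink/storage/segments hunk at the end of this diff.

    package segments

    import "github.com/zeebo/errs"

    // Error stands in for the package's own error class.
    var Error = errs.Class("segment error")

    // checkOptimalThreshold is a hypothetical helper showing the guard added
    // to segmentStore.Put: refuse to commit a segment when fewer pieces were
    // uploaded than the redundancy scheme's optimal (success) threshold.
    func checkOptimalThreshold(uploadedPieces, optimalThreshold int) error {
        if uploadedPieces < optimalThreshold {
            return Error.New("uploaded results (%d) are below the optimal threshold (%d)",
                uploadedPieces, optimalThreshold)
        }
        return nil
    }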

* satellite/metainfo: Fix gRPC status error & add reason
  This commit fixes the CommitSegment endpoint method to return an
  "Invalid Argument" status code when the uplink submits invalid data, which
  is detected when filtering out invalid pieces in the filterValidPieces
  method.

  Because filterValidPieces is also used by CommitSegmentOld, that method has
  been changed accordingly.

  * Add an initial check in CommitSegment to detect earlier whether the
    uplink sends an invalid number of uploaded pieces.
  * Add more information to some log messages.
  * Return more information to the uplink when it sends enough invalid
    pieces to make it impossible to finish the operation successfully.
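
  A sketch of how this extra detail is assembled for the uplink, modeled on
  the filterValidPieces hunk below. The invalidPiece struct and the rpcstatus
  calls come from the diff; the helper name, its signature, and the import
  paths are assumptions made for illustration.

    package metainfo

    import (
        "fmt"

        "storj.io/storj/pkg/rpc/rpcstatus" // import path assumed
        "storj.io/storj/pkg/storj"         // import path assumed
    )

    // invalidPiece mirrors the struct introduced in the diff: one entry per
    // piece dropped from the pointer, with the reason it was rejected.
    type invalidPiece struct {
        NodeID   storj.NodeID
        PieceNum int32
        Reason   string
    }

    // repairThresholdError is a hypothetical helper that builds the detailed
    // "Invalid Argument" status returned when too few valid pieces remain.
    func repairThresholdError(validPieces, repairThreshold int, invalid []invalidPiece) error {
        errMsg := fmt.Sprintf(
            "Number of valid pieces (%d) is less than or equal to the repair threshold (%d). Found %d invalid pieces",
            validPieces, repairThreshold, len(invalid),
        )
        if len(invalid) > 0 {
            errMsg += ". Invalid Pieces:"
            for _, p := range invalid {
                errMsg = fmt.Sprintf("%s\nNodeID: %v, PieceNum: %d, Reason: %s",
                    errMsg, p.NodeID, p.PieceNum, p.Reason)
            }
        }
        return rpcstatus.Error(rpcstatus.InvalidArgument, errMsg)
    }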

* satellite/metainfo: Swap some "sugar" loggers to normal ones
  Swap "sugar" loggers to normal ones because they impact the performance
  in production systems and they should only be used under specific
  circumstances which were none of the ones changed.
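
  For context, a small self-contained illustration of the swap using zap's
  public API (the project ID is a placeholder value): the sugared call
  formats everything into a single string, while the structured call keeps
  typed fields and avoids the extra sugar layer.

    package main

    import "go.uber.org/zap"

    func main() {
        log, _ := zap.NewProduction()
        defer func() { _ = log.Sync() }()

        projectID := "11111111-2222-3333-4444-555555555555" // placeholder

        // Before: sugared logger with printf-style formatting.
        log.Sugar().Errorf("Could not track new storage usage by project %v", projectID)

        // After: structured logger with typed fields, as done in this commit.
        log.Error("Could not track new storage usage by project",
            zap.String("projectID", projectID),
        )
    }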
Ivan Fraixedes 2019-10-17 20:01:40 +02:00 committed by GitHub
parent f4162bd33f
commit 071d1c4313
4 changed files with 140 additions and 25 deletions


@@ -7,6 +7,7 @@ import (
"context"
"crypto/sha256"
"errors"
"fmt"
"strconv"
"time"
@@ -241,7 +242,7 @@ func (endpoint *Endpoint) CommitSegmentOld(ctx context.Context, req *pb.SegmentC
err = endpoint.filterValidPieces(ctx, req.Pointer, req.OriginalLimits)
if err != nil {
return nil, rpcstatus.Error(rpcstatus.Internal, err.Error())
return nil, err
}
path, err := CreatePath(ctx, keyInfo.ProjectID, req.Segment, req.Bucket, req.Path)
@@ -459,6 +460,10 @@ func createBucketID(projectID uuid.UUID, bucket []byte) []byte {
return []byte(storj.JoinPaths(entries...))
}
// filterValidPieces filter out the invalid remote pieces held by pointer.
//
// The method always return a gRPC status error so the caller can directly
// return it to the client.
func (endpoint *Endpoint) filterValidPieces(ctx context.Context, pointer *pb.Pointer, limits []*pb.OrderLimit) (err error) {
defer mon.Task()(&ctx)(&err)
@@ -470,23 +475,44 @@ func (endpoint *Endpoint) filterValidPieces(ctx context.Context, pointer *pb.Poi
return err
}
var remotePieces []*pb.RemotePiece
allSizesValid := true
lastPieceSize := int64(0)
for _, piece := range remote.RemotePieces {
type invalidPiece struct {
NodeID storj.NodeID
PieceNum int32
Reason string
}
var (
remotePieces []*pb.RemotePiece
invalidPieces []invalidPiece
lastPieceSize int64
allSizesValid = true
)
for _, piece := range remote.RemotePieces {
// Verify storagenode signature on piecehash
peerID, ok := peerIDMap[piece.NodeId]
if !ok {
endpoint.log.Warn("Identity chain unknown for node", zap.String("nodeID", piece.NodeId.String()))
endpoint.log.Warn("Identity chain unknown for node. Piece removed from pointer",
zap.Stringer("nodeID", piece.NodeId),
zap.Int32("pieceID", piece.PieceNum),
)
invalidPieces = append(invalidPieces, invalidPiece{
NodeID: piece.NodeId,
PieceNum: piece.PieceNum,
Reason: "Identity chain unknown for node",
})
continue
}
signee := signing.SigneeFromPeerIdentity(peerID)
err = endpoint.validatePieceHash(ctx, piece, limits, signee)
if err != nil {
// TODO maybe this should be logged also to uplink too
endpoint.log.Warn("Problem validating piece hash", zap.Error(err))
endpoint.log.Warn("Problem validating piece hash. Pieces removed from pointer", zap.Error(err))
invalidPieces = append(invalidPieces, invalidPiece{
NodeID: piece.NodeId,
PieceNum: piece.PieceNum,
Reason: err.Error(),
})
continue
}
@@ -502,35 +528,86 @@ func (endpoint *Endpoint) filterValidPieces(ctx context.Context, pointer *pb.Poi
if allSizesValid {
redundancy, err := eestream.NewRedundancyStrategyFromProto(pointer.GetRemote().GetRedundancy())
if err != nil {
return Error.Wrap(err)
endpoint.log.Debug("pointer contains an invalid redundancy strategy", zap.Error(Error.Wrap(err)))
return rpcstatus.Errorf(rpcstatus.InvalidArgument,
"invalid redundancy strategy; MinReq and/or Total are invalid: %s", err,
)
}
expectedPieceSize := eestream.CalcPieceSize(pointer.SegmentSize, redundancy)
if expectedPieceSize != lastPieceSize {
return Error.New("expected piece size is different from provided (%v != %v)", expectedPieceSize, lastPieceSize)
endpoint.log.Debug("expected piece size is different from provided",
zap.Int64("expectedSize", expectedPieceSize),
zap.Int64("actualSize", lastPieceSize),
)
return rpcstatus.Errorf(rpcstatus.InvalidArgument,
"expected piece size is different from provided (%d != %d)",
expectedPieceSize, lastPieceSize,
)
}
} else {
return Error.New("all pieces needs to have the same size")
errMsg := "all pieces needs to have the same size"
endpoint.log.Debug(errMsg)
return rpcstatus.Error(rpcstatus.InvalidArgument, errMsg)
}
// We repair when the number of healthy files is less than or equal to the repair threshold
// except for the case when the repair and success thresholds are the same (a case usually seen during testing).
if int32(len(remotePieces)) <= remote.Redundancy.RepairThreshold && int32(len(remotePieces)) < remote.Redundancy.SuccessThreshold {
return Error.New("Number of valid pieces (%d) is less than or equal to the repair threshold (%d)",
if numPieces := int32(len(remotePieces)); numPieces <= remote.Redundancy.RepairThreshold && numPieces < remote.Redundancy.SuccessThreshold {
endpoint.log.Debug("Number of valid pieces is less than or equal to the repair threshold",
zap.Int("totalReceivedPieces", len(remote.RemotePieces)),
zap.Int("validPieces", len(remotePieces)),
zap.Int("invalidPieces", len(invalidPieces)),
zap.Int32("repairThreshold", remote.Redundancy.RepairThreshold),
)
errMsg := fmt.Sprintf("Number of valid pieces (%d) is less than or equal to the repair threshold (%d). Found %d invalid pieces",
len(remotePieces),
remote.Redundancy.RepairThreshold,
len(remote.RemotePieces),
)
if len(invalidPieces) > 0 {
errMsg = fmt.Sprintf("%s. Invalid Pieces:", errMsg)
for _, p := range invalidPieces {
errMsg = fmt.Sprintf("%s\nNodeID: %v, PieceNum: %d, Reason: %s",
errMsg, p.NodeID, p.PieceNum, p.Reason,
)
}
}
return rpcstatus.Error(rpcstatus.InvalidArgument, errMsg)
}
if int32(len(remotePieces)) < remote.Redundancy.SuccessThreshold {
return Error.New("Number of valid pieces (%d) is less than the success threshold (%d)",
endpoint.log.Debug("Number of valid pieces is less than the success threshold",
zap.Int("totalReceivedPieces", len(remote.RemotePieces)),
zap.Int("validPieces", len(remotePieces)),
zap.Int("invalidPieces", len(invalidPieces)),
zap.Int32("successThreshold", remote.Redundancy.SuccessThreshold),
)
errMsg := fmt.Sprintf("Number of valid pieces (%d) is less than the success threshold (%d). Found %d invalid pieces",
len(remotePieces),
remote.Redundancy.SuccessThreshold,
len(remote.RemotePieces),
)
if len(invalidPieces) > 0 {
errMsg = fmt.Sprintf("%s. Invalid Pieces:", errMsg)
for _, p := range invalidPieces {
errMsg = fmt.Sprintf("%s\nNodeID: %v, PieceNum: %d, Reason: %s",
errMsg, p.NodeID, p.PieceNum, p.Reason,
)
}
}
return rpcstatus.Error(rpcstatus.InvalidArgument, errMsg)
}
remote.RemotePieces = remotePieces
}
return nil
}
@@ -541,7 +618,8 @@ func (endpoint *Endpoint) mapNodesFor(ctx context.Context, pieces []*pb.RemotePi
}
peerIDList, err := endpoint.peerIdentities.BatchGet(ctx, nodeIDList)
if err != nil {
return nil, Error.Wrap(err)
endpoint.log.Error("retrieving batch of the peer identities of nodes", zap.Error(Error.Wrap(err)))
return nil, rpcstatus.Error(rpcstatus.Internal, "retrieving nodes peer identities")
}
peerIDMap := make(map[storj.NodeID]*identity.PeerIdentity, len(peerIDList))
for _, peerID := range peerIDList {
@@ -1414,6 +1492,18 @@ func (endpoint *Endpoint) CommitSegment(ctx context.Context, req *pb.SegmentComm
return nil, rpcstatus.Error(rpcstatus.Unauthenticated, err.Error())
}
if numResults := len(req.UploadResult); numResults < int(streamID.Redundancy.GetSuccessThreshold()) {
endpoint.log.Debug("the results of uploaded pieces for the segment is below the redundancy optimal threshold",
zap.Int("upload pieces results", numResults),
zap.Int32("redundancy optimal threshold", streamID.Redundancy.GetSuccessThreshold()),
zap.Stringer("segment ID", req.SegmentId),
)
return nil, rpcstatus.Errorf(rpcstatus.InvalidArgument,
"the number of results of uploaded pieces (%d) is below the optimal threshold (%d)",
numResults, streamID.Redundancy.GetSuccessThreshold(),
)
}
pieces := make([]*pb.RemotePiece, len(req.UploadResult))
for i, result := range req.UploadResult {
pieces[i] = &pb.RemotePiece{
@@ -1461,7 +1551,7 @@ func (endpoint *Endpoint) CommitSegment(ctx context.Context, req *pb.SegmentComm
err = endpoint.filterValidPieces(ctx, pointer, orderLimits)
if err != nil {
return nil, rpcstatus.Error(rpcstatus.Internal, err.Error())
return nil, err
}
path, err := CreatePath(ctx, keyInfo.ProjectID, int64(segmentID.Index), streamID.Bucket, streamID.EncryptedPath)
@@ -1474,8 +1564,9 @@ func (endpoint *Endpoint) CommitSegment(ctx context.Context, req *pb.SegmentComm
return nil, rpcstatus.Error(rpcstatus.Internal, err.Error())
}
if exceeded {
endpoint.log.Sugar().Errorf("monthly project limits are %s of storage and bandwidth usage. This limit has been exceeded for storage for projectID %s.",
limit, keyInfo.ProjectID,
endpoint.log.Error("The project limit of storage and bandwidth has been exceeded",
zap.Int64("limit", limit.Int64()),
zap.Stringer("project id", keyInfo.ProjectID),
)
return nil, rpcstatus.Error(rpcstatus.ResourceExhausted, "Exceeded Usage Limit")
}
@@ -1492,13 +1583,21 @@ func (endpoint *Endpoint) CommitSegment(ctx context.Context, req *pb.SegmentComm
if pointer.Type == pb.Pointer_REMOTE {
//We cannot have more redundancy than total/min
if float64(remoteUsed) > (float64(pointer.SegmentSize)/float64(pointer.Remote.Redundancy.MinReq))*float64(pointer.Remote.Redundancy.Total) {
endpoint.log.Sugar().Debugf("data size mismatch, got segment: %d, pieces: %d, RS Min, Total: %d,%d", pointer.SegmentSize, remoteUsed, pointer.Remote.Redundancy.MinReq, pointer.Remote.Redundancy.Total)
endpoint.log.Debug("data size mismatch",
zap.Int64("segment", pointer.SegmentSize),
zap.Int64("pieces", remoteUsed),
zap.Int32("redundancy minimum requested", pointer.Remote.Redundancy.MinReq),
zap.Int32("redundancy total", pointer.Remote.Redundancy.Total),
)
return nil, rpcstatus.Error(rpcstatus.InvalidArgument, "mismatched segment size and piece usage")
}
}
if err := endpoint.projectUsage.AddProjectStorageUsage(ctx, keyInfo.ProjectID, inlineUsed, remoteUsed); err != nil {
endpoint.log.Sugar().Errorf("Could not track new storage usage by project %v: %v", keyInfo.ProjectID, err)
endpoint.log.Error("Could not track new storage usage by project",
zap.Stringer("projectID", keyInfo.ProjectID),
zap.Error(err),
)
// but continue. it's most likely our own fault that we couldn't track it, and the only thing
// that will be affected is our per-project bandwidth and storage limits.
}


@@ -13,6 +13,7 @@ import (
"github.com/gogo/protobuf/proto"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/zeebo/errs"
"go.uber.org/zap"
"storj.io/storj/internal/errs2"
@@ -350,6 +351,8 @@ func TestCommitSegment(t *testing.T) {
}
_, err = metainfo.CommitSegment(ctx, "bucket", "path", -1, pointer, limits)
require.Error(t, err)
err = errs.Unwrap(err)
require.Equal(t, rpcstatus.Code(err), rpcstatus.InvalidArgument)
require.Contains(t, err.Error(), "is less than or equal to the repair threshold")
}


@@ -366,26 +366,34 @@ func (endpoint *Endpoint) validatePieceHash(ctx context.Context, piece *pb.Remot
defer mon.Task()(&ctx)(&err)
if piece.Hash == nil {
return errs.New("no piece hash, removing from pointer %v (%v)", piece.NodeId, piece.PieceNum)
return errs.New("no piece hash. NodeID: %v, PieceNum: %d", piece.NodeId, piece.PieceNum)
}
err = signing.VerifyPieceHashSignature(ctx, signee, piece.Hash)
if err != nil {
return errs.New("piece hash signature could not be verified for node %v: %v", piece.NodeId, err)
return errs.New("piece hash signature could not be verified for node (NodeID: %v, PieceNum: %d): %+v",
piece.NodeId, piece.PieceNum, err,
)
}
timestamp := piece.Hash.Timestamp
if timestamp.Before(time.Now().Add(-pieceHashExpiration)) {
return errs.New("piece hash timestamp is too old (%v), removing from pointer %v (num: %v)", timestamp, piece.NodeId, piece.PieceNum)
return errs.New("piece hash timestamp is too old (%v). NodeId: %v, PieceNum: %d)",
timestamp, piece.NodeId, piece.PieceNum,
)
}
limit := limits[piece.PieceNum]
if limit != nil {
switch {
case limit.PieceId != piece.Hash.PieceId:
return errs.New("piece hash pieceID doesn't match limit pieceID, removing from pointer (%v != %v)", piece.Hash.PieceId, limit.PieceId)
return errs.New("piece hash pieceID (%v) doesn't match limit pieceID (%v). NodeID: %v, PieceNum: %d",
piece.Hash.PieceId, limit.PieceId, piece.NodeId, piece.PieceNum,
)
case limit.Limit < piece.Hash.PieceSize:
return errs.New("piece hash PieceSize is larger than order limit, removing from pointer (%v > %v)", piece.Hash.PieceSize, limit.Limit)
return errs.New("piece hash PieceSize (%d) is larger than order limit (%d). NodeID: %v, PieceNum: %d",
piece.Hash.PieceSize, limit.Limit, piece.NodeId, piece.PieceNum,
)
}
}
return nil


@@ -133,6 +133,11 @@ func (s *segmentStore) Put(ctx context.Context, streamID storj.StreamID, data io
Hash: successfulHashes[i],
})
}
if l := len(uploadResults); l < s.rs.OptimalThreshold() {
return Meta{}, Error.New("uploaded results (%d) are below the optimal threshold (%d)", l, s.rs.OptimalThreshold())
}
err = s.metainfo.CommitSegmentNew(ctx, metainfo.CommitSegmentParams{
SegmentID: segmentID,
SizeEncryptedData: sizedReader.Size(),