upload: Add more info to returned error response & to logs (#3218)

* uplink/storage/segments: return an error when below the optimal threshold
  Return an error if the store gets fewer uploaded pieces than indicated by
  the optimal threshold.
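
  A minimal sketch of that guard, under simplified assumptions: the error
  class and the helper name here are illustrative only, while the real change
  is the uplink/storage/segments hunk at the end of this diff.

    package segments

    import "github.com/zeebo/errs"

    // Error stands in for the package's own error class.
    var Error = errs.Class("segment error")

    // checkOptimalThreshold is a hypothetical helper showing the guard added
    // to segmentStore.Put: refuse to commit a segment when fewer pieces were
    // uploaded than the redundancy scheme's optimal (success) threshold.
    func checkOptimalThreshold(uploadedPieces, optimalThreshold int) error {
        if uploadedPieces < optimalThreshold {
            return Error.New("uploaded results (%d) are below the optimal threshold (%d)",
                uploadedPieces, optimalThreshold)
        }
        return nil
    }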

* satellite/metainfo: Fix gRPC status error & add reason
  This commit fixes the CommitSegment endpoint method to return an
  "Invalid Argument" status code when the uplink submits invalid data, which
  is detected when filtering out invalid pieces in the filterValidPieces
  method.

  Because filterValidPieces is also used by CommitSegmentOld, that method has
  been changed accordingly.

  * Add an initial check in CommitSegment to detect earlier whether the
    uplink sends an invalid number of uploaded pieces.
  * Add more information to some log messages.
  * Return more information to the uplink when it sends enough invalid
    pieces to make it impossible to finish the operation successfully.
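
  A sketch of how this extra detail is assembled for the uplink, modeled on
  the filterValidPieces hunk below. The invalidPiece struct and the rpcstatus
  calls come from the diff; the helper name, its signature, and the import
  paths are assumptions made for illustration.

    package metainfo

    import (
        "fmt"

        "storj.io/storj/pkg/rpc/rpcstatus" // import path assumed
        "storj.io/storj/pkg/storj"         // import path assumed
    )

    // invalidPiece mirrors the struct introduced in the diff: one entry per
    // piece dropped from the pointer, with the reason it was rejected.
    type invalidPiece struct {
        NodeID   storj.NodeID
        PieceNum int32
        Reason   string
    }

    // repairThresholdError is a hypothetical helper that builds the detailed
    // "Invalid Argument" status returned when too few valid pieces remain.
    func repairThresholdError(validPieces, repairThreshold int, invalid []invalidPiece) error {
        errMsg := fmt.Sprintf(
            "Number of valid pieces (%d) is less than or equal to the repair threshold (%d). Found %d invalid pieces",
            validPieces, repairThreshold, len(invalid),
        )
        if len(invalid) > 0 {
            errMsg += ". Invalid Pieces:"
            for _, p := range invalid {
                errMsg = fmt.Sprintf("%s\nNodeID: %v, PieceNum: %d, Reason: %s",
                    errMsg, p.NodeID, p.PieceNum, p.Reason)
            }
        }
        return rpcstatus.Error(rpcstatus.InvalidArgument, errMsg)
    }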

* satellite/metainfo: Swap some "sugar" loggers to normal ones
  Swap "sugar" loggers to normal ones because they impact the performance
  in production systems and they should only be used under specific
  circumstances which were none of the ones changed.
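
  For context, a small self-contained illustration of the swap using zap's
  public API (the project ID is a placeholder value): the sugared call
  formats everything into a single string, while the structured call keeps
  typed fields and avoids the extra sugar layer.

    package main

    import "go.uber.org/zap"

    func main() {
        log, _ := zap.NewProduction()
        defer func() { _ = log.Sync() }()

        projectID := "11111111-2222-3333-4444-555555555555" // placeholder

        // Before: sugared logger with printf-style formatting.
        log.Sugar().Errorf("Could not track new storage usage by project %v", projectID)

        // After: structured logger with typed fields, as done in this commit.
        log.Error("Could not track new storage usage by project",
            zap.String("projectID", projectID),
        )
    }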
Ivan Fraixedes 2019-10-17 20:01:40 +02:00 committed by GitHub
parent f4162bd33f
commit 071d1c4313
4 changed files with 140 additions and 25 deletions


@@ -7,6 +7,7 @@ import (
"context"
"crypto/sha256"
"errors"
"fmt"
"strconv"
"time"
@@ -241,7 +242,7 @@ func (endpoint *Endpoint) CommitSegmentOld(ctx context.Context, req *pb.SegmentC
err = endpoint.filterValidPieces(ctx, req.Pointer, req.OriginalLimits)
if err != nil {
return nil, rpcstatus.Error(rpcstatus.Internal, err.Error())
return nil, err
}
path, err := CreatePath(ctx, keyInfo.ProjectID, req.Segment, req.Bucket, req.Path)
@@ -459,6 +460,10 @@ func createBucketID(projectID uuid.UUID, bucket []byte) []byte {
return []byte(storj.JoinPaths(entries...))
}
// filterValidPieces filter out the invalid remote pieces held by pointer.
//
// The method always return a gRPC status error so the caller can directly
// return it to the client.
func (endpoint *Endpoint) filterValidPieces(ctx context.Context, pointer *pb.Pointer, limits []*pb.OrderLimit) (err error) {
defer mon.Task()(&ctx)(&err)
@@ -470,23 +475,44 @@ func (endpoint *Endpoint) filterValidPieces(ctx context.Context, pointer *pb.Poi
return err
}
var remotePieces []*pb.RemotePiece
allSizesValid := true
lastPieceSize := int64(0)
for _, piece := range remote.RemotePieces {
type invalidPiece struct {
NodeID storj.NodeID
PieceNum int32
Reason string
}
var (
remotePieces []*pb.RemotePiece
invalidPieces []invalidPiece
lastPieceSize int64
allSizesValid = true
)
for _, piece := range remote.RemotePieces {
// Verify storagenode signature on piecehash
peerID, ok := peerIDMap[piece.NodeId]
if !ok {
endpoint.log.Warn("Identity chain unknown for node", zap.String("nodeID", piece.NodeId.String()))
endpoint.log.Warn("Identity chain unknown for node. Piece removed from pointer",
zap.Stringer("nodeID", piece.NodeId),
zap.Int32("pieceID", piece.PieceNum),
)
invalidPieces = append(invalidPieces, invalidPiece{
NodeID: piece.NodeId,
PieceNum: piece.PieceNum,
Reason: "Identity chain unknown for node",
})
continue
}
signee := signing.SigneeFromPeerIdentity(peerID)
err = endpoint.validatePieceHash(ctx, piece, limits, signee)
if err != nil {
// TODO maybe this should be logged also to uplink too
endpoint.log.Warn("Problem validating piece hash", zap.Error(err))
endpoint.log.Warn("Problem validating piece hash. Pieces removed from pointer", zap.Error(err))
invalidPieces = append(invalidPieces, invalidPiece{
NodeID: piece.NodeId,
PieceNum: piece.PieceNum,
Reason: err.Error(),
})
continue
}
@@ -502,35 +528,86 @@ func (endpoint *Endpoint) filterValidPieces(ctx context.Context, pointer *pb.Poi
if allSizesValid {
redundancy, err := eestream.NewRedundancyStrategyFromProto(pointer.GetRemote().GetRedundancy())
if err != nil {
return Error.Wrap(err)
endpoint.log.Debug("pointer contains an invalid redundancy strategy", zap.Error(Error.Wrap(err)))
return rpcstatus.Errorf(rpcstatus.InvalidArgument,
"invalid redundancy strategy; MinReq and/or Total are invalid: %s", err,
)
}
expectedPieceSize := eestream.CalcPieceSize(pointer.SegmentSize, redundancy)
if expectedPieceSize != lastPieceSize {
return Error.New("expected piece size is different from provided (%v != %v)", expectedPieceSize, lastPieceSize)
endpoint.log.Debug("expected piece size is different from provided",
zap.Int64("expectedSize", expectedPieceSize),
zap.Int64("actualSize", lastPieceSize),
)
return rpcstatus.Errorf(rpcstatus.InvalidArgument,
"expected piece size is different from provided (%d != %d)",
expectedPieceSize, lastPieceSize,
)
}
} else {
return Error.New("all pieces needs to have the same size")
errMsg := "all pieces needs to have the same size"
endpoint.log.Debug(errMsg)
return rpcstatus.Error(rpcstatus.InvalidArgument, errMsg)
}
// We repair when the number of healthy files is less than or equal to the repair threshold
// except for the case when the repair and success thresholds are the same (a case usually seen during testing).
if int32(len(remotePieces)) <= remote.Redundancy.RepairThreshold && int32(len(remotePieces)) < remote.Redundancy.SuccessThreshold {
return Error.New("Number of valid pieces (%d) is less than or equal to the repair threshold (%d)",
if numPieces := int32(len(remotePieces)); numPieces <= remote.Redundancy.RepairThreshold && numPieces < remote.Redundancy.SuccessThreshold {
endpoint.log.Debug("Number of valid pieces is less than or equal to the repair threshold",
zap.Int("totalReceivedPieces", len(remote.RemotePieces)),
zap.Int("validPieces", len(remotePieces)),
zap.Int("invalidPieces", len(invalidPieces)),
zap.Int32("repairThreshold", remote.Redundancy.RepairThreshold),
)
errMsg := fmt.Sprintf("Number of valid pieces (%d) is less than or equal to the repair threshold (%d). Found %d invalid pieces",
len(remotePieces),
remote.Redundancy.RepairThreshold,
len(remote.RemotePieces),
)
if len(invalidPieces) > 0 {
errMsg = fmt.Sprintf("%s. Invalid Pieces:", errMsg)
for _, p := range invalidPieces {
errMsg = fmt.Sprintf("%s\nNodeID: %v, PieceNum: %d, Reason: %s",
errMsg, p.NodeID, p.PieceNum, p.Reason,
)
}
}
return rpcstatus.Error(rpcstatus.InvalidArgument, errMsg)
}
if int32(len(remotePieces)) < remote.Redundancy.SuccessThreshold {
return Error.New("Number of valid pieces (%d) is less than the success threshold (%d)",
endpoint.log.Debug("Number of valid pieces is less than the success threshold",
zap.Int("totalReceivedPieces", len(remote.RemotePieces)),
zap.Int("validPieces", len(remotePieces)),
zap.Int("invalidPieces", len(invalidPieces)),
zap.Int32("successThreshold", remote.Redundancy.SuccessThreshold),
)
errMsg := fmt.Sprintf("Number of valid pieces (%d) is less than the success threshold (%d). Found %d invalid pieces",
len(remotePieces),
remote.Redundancy.SuccessThreshold,
len(remote.RemotePieces),
)
if len(invalidPieces) > 0 {
errMsg = fmt.Sprintf("%s. Invalid Pieces:", errMsg)
for _, p := range invalidPieces {
errMsg = fmt.Sprintf("%s\nNodeID: %v, PieceNum: %d, Reason: %s",
errMsg, p.NodeID, p.PieceNum, p.Reason,
)
}
}
return rpcstatus.Error(rpcstatus.InvalidArgument, errMsg)
}
remote.RemotePieces = remotePieces
}
return nil
}
@@ -541,7 +618,8 @@ func (endpoint *Endpoint) mapNodesFor(ctx context.Context, pieces []*pb.RemotePi
}
peerIDList, err := endpoint.peerIdentities.BatchGet(ctx, nodeIDList)
if err != nil {
return nil, Error.Wrap(err)
endpoint.log.Error("retrieving batch of the peer identities of nodes", zap.Error(Error.Wrap(err)))
return nil, rpcstatus.Error(rpcstatus.Internal, "retrieving nodes peer identities")
}
peerIDMap := make(map[storj.NodeID]*identity.PeerIdentity, len(peerIDList))
for _, peerID := range peerIDList {
@@ -1414,6 +1492,18 @@ func (endpoint *Endpoint) CommitSegment(ctx context.Context, req *pb.SegmentComm
return nil, rpcstatus.Error(rpcstatus.Unauthenticated, err.Error())
}
if numResults := len(req.UploadResult); numResults < int(streamID.Redundancy.GetSuccessThreshold()) {
endpoint.log.Debug("the results of uploaded pieces for the segment is below the redundancy optimal threshold",
zap.Int("upload pieces results", numResults),
zap.Int32("redundancy optimal threshold", streamID.Redundancy.GetSuccessThreshold()),
zap.Stringer("segment ID", req.SegmentId),
)
return nil, rpcstatus.Errorf(rpcstatus.InvalidArgument,
"the number of results of uploaded pieces (%d) is below the optimal threshold (%d)",
numResults, streamID.Redundancy.GetSuccessThreshold(),
)
}
pieces := make([]*pb.RemotePiece, len(req.UploadResult))
for i, result := range req.UploadResult {
pieces[i] = &pb.RemotePiece{
@@ -1461,7 +1551,7 @@ func (endpoint *Endpoint) CommitSegment(ctx context.Context, req *pb.SegmentComm
err = endpoint.filterValidPieces(ctx, pointer, orderLimits)
if err != nil {
return nil, rpcstatus.Error(rpcstatus.Internal, err.Error())
return nil, err
}
path, err := CreatePath(ctx, keyInfo.ProjectID, int64(segmentID.Index), streamID.Bucket, streamID.EncryptedPath)
@@ -1474,8 +1564,9 @@ func (endpoint *Endpoint) CommitSegment(ctx context.Context, req *pb.SegmentComm
return nil, rpcstatus.Error(rpcstatus.Internal, err.Error())
}
if exceeded {
endpoint.log.Sugar().Errorf("monthly project limits are %s of storage and bandwidth usage. This limit has been exceeded for storage for projectID %s.",
limit, keyInfo.ProjectID,
endpoint.log.Error("The project limit of storage and bandwidth has been exceeded",
zap.Int64("limit", limit.Int64()),
zap.Stringer("project id", keyInfo.ProjectID),
)
return nil, rpcstatus.Error(rpcstatus.ResourceExhausted, "Exceeded Usage Limit")
}
@@ -1492,13 +1583,21 @@ func (endpoint *Endpoint) CommitSegment(ctx context.Context, req *pb.SegmentComm
if pointer.Type == pb.Pointer_REMOTE {
//We cannot have more redundancy than total/min
if float64(remoteUsed) > (float64(pointer.SegmentSize)/float64(pointer.Remote.Redundancy.MinReq))*float64(pointer.Remote.Redundancy.Total) {
endpoint.log.Sugar().Debugf("data size mismatch, got segment: %d, pieces: %d, RS Min, Total: %d,%d", pointer.SegmentSize, remoteUsed, pointer.Remote.Redundancy.MinReq, pointer.Remote.Redundancy.Total)
endpoint.log.Debug("data size mismatch",
zap.Int64("segment", pointer.SegmentSize),
zap.Int64("pieces", remoteUsed),
zap.Int32("redundancy minimum requested", pointer.Remote.Redundancy.MinReq),
zap.Int32("redundancy total", pointer.Remote.Redundancy.Total),
)
return nil, rpcstatus.Error(rpcstatus.InvalidArgument, "mismatched segment size and piece usage")
}
}
if err := endpoint.projectUsage.AddProjectStorageUsage(ctx, keyInfo.ProjectID, inlineUsed, remoteUsed); err != nil {
endpoint.log.Sugar().Errorf("Could not track new storage usage by project %v: %v", keyInfo.ProjectID, err)
endpoint.log.Error("Could not track new storage usage by project",
zap.Stringer("projectID", keyInfo.ProjectID),
zap.Error(err),
)
// but continue. it's most likely our own fault that we couldn't track it, and the only thing
// that will be affected is our per-project bandwidth and storage limits.
}


@@ -13,6 +13,7 @@ import (
"github.com/gogo/protobuf/proto"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/zeebo/errs"
"go.uber.org/zap"
"storj.io/storj/internal/errs2"
@@ -350,6 +351,8 @@ func TestCommitSegment(t *testing.T) {
}
_, err = metainfo.CommitSegment(ctx, "bucket", "path", -1, pointer, limits)
require.Error(t, err)
err = errs.Unwrap(err)
require.Equal(t, rpcstatus.Code(err), rpcstatus.InvalidArgument)
require.Contains(t, err.Error(), "is less than or equal to the repair threshold")
}


@@ -366,26 +366,34 @@ func (endpoint *Endpoint) validatePieceHash(ctx context.Context, piece *pb.Remot
defer mon.Task()(&ctx)(&err)
if piece.Hash == nil {
return errs.New("no piece hash, removing from pointer %v (%v)", piece.NodeId, piece.PieceNum)
return errs.New("no piece hash. NodeID: %v, PieceNum: %d", piece.NodeId, piece.PieceNum)
}
err = signing.VerifyPieceHashSignature(ctx, signee, piece.Hash)
if err != nil {
return errs.New("piece hash signature could not be verified for node %v: %v", piece.NodeId, err)
return errs.New("piece hash signature could not be verified for node (NodeID: %v, PieceNum: %d): %+v",
piece.NodeId, piece.PieceNum, err,
)
}
timestamp := piece.Hash.Timestamp
if timestamp.Before(time.Now().Add(-pieceHashExpiration)) {
return errs.New("piece hash timestamp is too old (%v), removing from pointer %v (num: %v)", timestamp, piece.NodeId, piece.PieceNum)
return errs.New("piece hash timestamp is too old (%v). NodeId: %v, PieceNum: %d)",
timestamp, piece.NodeId, piece.PieceNum,
)
}
limit := limits[piece.PieceNum]
if limit != nil {
switch {
case limit.PieceId != piece.Hash.PieceId:
return errs.New("piece hash pieceID doesn't match limit pieceID, removing from pointer (%v != %v)", piece.Hash.PieceId, limit.PieceId)
return errs.New("piece hash pieceID (%v) doesn't match limit pieceID (%v). NodeID: %v, PieceNum: %d",
piece.Hash.PieceId, limit.PieceId, piece.NodeId, piece.PieceNum,
)
case limit.Limit < piece.Hash.PieceSize:
return errs.New("piece hash PieceSize is larger than order limit, removing from pointer (%v > %v)", piece.Hash.PieceSize, limit.Limit)
return errs.New("piece hash PieceSize (%d) is larger than order limit (%d). NodeID: %v, PieceNum: %d",
piece.Hash.PieceSize, limit.Limit, piece.NodeId, piece.PieceNum,
)
}
}
return nil


@@ -133,6 +133,11 @@ func (s *segmentStore) Put(ctx context.Context, streamID storj.StreamID, data io
Hash: successfulHashes[i],
})
}
if l := len(uploadResults); l < s.rs.OptimalThreshold() {
return Meta{}, Error.New("uploaded results (%d) are below the optimal threshold (%d)", l, s.rs.OptimalThreshold())
}
err = s.metainfo.CommitSegmentNew(ctx, metainfo.CommitSegmentParams{
SegmentID: segmentID,
SizeEncryptedData: sizedReader.Size(),