storj/satellite/orders/service.go

614 lines
22 KiB
Go
Raw Permalink Normal View History

2019-03-28 20:09:23 +00:00
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package orders
import (
"context"
[V3-1927] Repairer uploads to max threshold instead of success… (#2423) * pkg/datarepair: Add test to check num upload pieces Add a new test for ensuring the number of pieces that the repair process upload when a segment is injured. * satellite/orders: Don't create "put order limits" over total Repair must not create "put order limits" more than the total count. * pkg/datarepair: Update upload repair pieces test Update the test which checks the number of pieces which are uploaded during a repair for using the same excess over the success threshold value than the implementation. * satellites/orders: Limit repair put order for not being total Limit the number of put orders to be used by repair for only uploading pieces to a % excess over the successful threshold. * pkg/datarepair: Change DataRepair test to pass again Make some changes in the DataRepair test to make pass again after the repair upload repaired pieces only until a % excess over success threshold. Also update the steps description of the DataRepair test after it has been changed, to match on what's now, besides to leave it more generic for avoiding having to update it on minimal future refactorings. * satellite: Make repair excess optimal threshold configurable Add a new configuration parameter to the satellite for being able to configure the percentage excess over the optimal threshold, used for determining how many pieces should be repaired/uploaded, rather than having the value hard coded. * repairer: Add configurable param to segments/repairer Add a new parameters to the segment/repairer to calculate the maximum number of excess nodes, based on the optimal threshold, that repaired pieces can be uploaded. This new parameter has been added for not returning more nodes than the number of upload orders for data repair satellite service calculate for repairing pieces. * pkg/storage/ec: Update log message in clien.Repair * satellite: Update configuration lock file
2019-07-11 23:44:47 +01:00
"math"
mathrand "math/rand"
"sync"
2019-03-28 20:09:23 +00:00
"time"
"github.com/zeebo/errs"
"go.uber.org/zap"
"storj.io/common/pb"
"storj.io/common/signing"
"storj.io/common/storj"
"storj.io/storj/satellite/internalpb"
"storj.io/storj/satellite/metabase"
"storj.io/storj/satellite/overlay"
2019-03-28 20:09:23 +00:00
)
var (
// ErrDownloadFailedNotEnoughPieces is returned when download failed due to missing pieces.
ErrDownloadFailedNotEnoughPieces = errs.Class("not enough pieces for download")
// ErrDecryptOrderMetadata is returned when a step of decrypting metadata fails.
ErrDecryptOrderMetadata = errs.Class("decrytping order metadata")
)
// Config is a configuration struct for orders Service.
type Config struct {
EncryptionKeys EncryptionKeys `help:"encryption keys to encrypt info in orders" default:""`
Expiration time.Duration `help:"how long until an order expires" default:"24h" testDefault:"168h"` // default is 1 day
FlushBatchSize int `help:"how many items in the rollups write cache before they are flushed to the database" devDefault:"20" releaseDefault:"1000" testDefault:"10"`
FlushInterval time.Duration `help:"how often to flush the rollups write cache to the database" devDefault:"30s" releaseDefault:"1m" testDefault:"$TESTINTERVAL"`
NodeStatusLogging bool `hidden:"true" help:"deprecated, log the offline/disqualification status of nodes" default:"false" testDefault:"true"`
OrdersSemaphoreSize int `help:"how many concurrent orders to process at once. zero is unlimited" default:"2"`
}
// Overlay defines the overlay dependency of orders.Service.
// use `go install github.com/golang/mock/mockgen@v1.6.0` if missing
//
//go:generate mockgen -destination mock_test.go -package orders . OverlayForOrders
type Overlay interface {
CachedGetOnlineNodesForGet(context.Context, []storj.NodeID) (map[storj.NodeID]*overlay.SelectedNode, error)
GetOnlineNodesForAuditRepair(context.Context, []storj.NodeID) (map[storj.NodeID]*overlay.NodeReputation, error)
Get(ctx context.Context, nodeID storj.NodeID) (*overlay.NodeDossier, error)
IsOnline(node *overlay.NodeDossier) bool
}
2019-03-28 20:09:23 +00:00
// Service for creating order limits.
2019-09-10 14:24:16 +01:00
//
// architecture: Service
2019-03-28 20:09:23 +00:00
type Service struct {
log *zap.Logger
satellite signing.Signer
overlay Overlay
orders DB
encryptionKeys EncryptionKeys
orderExpiration time.Duration
rngMu sync.Mutex
rng *mathrand.Rand
2019-03-28 20:09:23 +00:00
}
// NewService creates new service for creating order limits.
[V3-1927] Repairer uploads to max threshold instead of success… (#2423) * pkg/datarepair: Add test to check num upload pieces Add a new test for ensuring the number of pieces that the repair process upload when a segment is injured. * satellite/orders: Don't create "put order limits" over total Repair must not create "put order limits" more than the total count. * pkg/datarepair: Update upload repair pieces test Update the test which checks the number of pieces which are uploaded during a repair for using the same excess over the success threshold value than the implementation. * satellites/orders: Limit repair put order for not being total Limit the number of put orders to be used by repair for only uploading pieces to a % excess over the successful threshold. * pkg/datarepair: Change DataRepair test to pass again Make some changes in the DataRepair test to make pass again after the repair upload repaired pieces only until a % excess over success threshold. Also update the steps description of the DataRepair test after it has been changed, to match on what's now, besides to leave it more generic for avoiding having to update it on minimal future refactorings. * satellite: Make repair excess optimal threshold configurable Add a new configuration parameter to the satellite for being able to configure the percentage excess over the optimal threshold, used for determining how many pieces should be repaired/uploaded, rather than having the value hard coded. * repairer: Add configurable param to segments/repairer Add a new parameters to the segment/repairer to calculate the maximum number of excess nodes, based on the optimal threshold, that repaired pieces can be uploaded. This new parameter has been added for not returning more nodes than the number of upload orders for data repair satellite service calculate for repairing pieces. * pkg/storage/ec: Update log message in clien.Repair * satellite: Update configuration lock file
2019-07-11 23:44:47 +01:00
func NewService(
log *zap.Logger, satellite signing.Signer, overlay Overlay,
orders DB, config Config,
) (*Service, error) {
if config.EncryptionKeys.Default.IsZero() {
return nil, Error.New("encryption keys must be specified to include encrypted metadata")
}
2019-03-28 20:09:23 +00:00
return &Service{
log: log,
satellite: satellite,
overlay: overlay,
orders: orders,
encryptionKeys: config.EncryptionKeys,
orderExpiration: config.Expiration,
rng: mathrand.New(mathrand.NewSource(time.Now().UnixNano())),
}, nil
2019-03-28 20:09:23 +00:00
}
// VerifyOrderLimitSignature verifies that the signature inside order limit belongs to the satellite.
func (service *Service) VerifyOrderLimitSignature(ctx context.Context, signed *pb.OrderLimit) (err error) {
defer mon.Task()(&ctx)(&err)
return signing.VerifyOrderLimitSignature(ctx, service.satellite, signed)
2019-03-28 20:09:23 +00:00
}
func (service *Service) updateBandwidth(ctx context.Context, bucket metabase.BucketLocation, addressedOrderLimits ...*pb.AddressedOrderLimit) (err error) {
defer mon.Task()(&ctx)(&err)
if len(addressedOrderLimits) == 0 {
return nil
}
var action pb.PieceAction
var bucketAllocation int64
for _, addressedOrderLimit := range addressedOrderLimits {
if addressedOrderLimit != nil && addressedOrderLimit.Limit != nil {
orderLimit := addressedOrderLimit.Limit
action = orderLimit.Action
bucketAllocation += orderLimit.Limit
}
}
now := time.Now().UTC()
intervalStart := time.Date(now.Year(), now.Month(), now.Day(), now.Hour(), 0, 0, 0, now.Location())
// TODO: all of this below should be a single db transaction. in fact, this whole function should probably be part of an existing transaction
if err := service.orders.UpdateBucketBandwidthAllocation(ctx, bucket.ProjectID, []byte(bucket.BucketName), action, bucketAllocation, intervalStart); err != nil {
return Error.Wrap(err)
}
return nil
}
// CreateGetOrderLimits creates the order limits for downloading the pieces of a segment.
func (service *Service) CreateGetOrderLimits(ctx context.Context, bucket metabase.BucketLocation, segment metabase.Segment, desiredNodes int32, overrideLimit int64) (_ []*pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, err error) {
defer mon.Task()(&ctx)(&err)
orderLimit := segment.PieceSize()
if overrideLimit > 0 && overrideLimit < orderLimit {
orderLimit = overrideLimit
}
2019-03-28 20:09:23 +00:00
nodeIDs := make([]storj.NodeID, len(segment.Pieces))
for i, piece := range segment.Pieces {
nodeIDs[i] = piece.StorageNode
}
nodes, err := service.overlay.CachedGetOnlineNodesForGet(ctx, nodeIDs)
if err != nil {
service.log.Debug("error getting nodes from overlay", zap.Error(err))
return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
}
if segment.Placement != storj.EveryCountry {
for id, node := range nodes {
if !segment.Placement.AllowedCountry(node.CountryCode) {
delete(nodes, id)
}
}
}
signer, err := NewSignerGet(service, segment.RootPieceID, time.Now(), orderLimit, bucket)
if err != nil {
return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
}
neededLimits := segment.Redundancy.DownloadNodes()
if desiredNodes > neededLimits {
neededLimits = desiredNodes
}
pieces := segment.Pieces
for _, pieceIndex := range service.perm(len(pieces)) {
piece := pieces[pieceIndex]
node, ok := nodes[piece.StorageNode]
if !ok {
continue
}
_, err := signer.Sign(ctx, resolveStorageNode_Selected(node, true), int32(piece.Number))
if err != nil {
return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
}
if len(signer.AddressedLimits) >= int(neededLimits) {
break
}
}
if len(signer.AddressedLimits) < int(segment.Redundancy.RequiredShares) {
mon.Meter("download_failed_not_enough_pieces_uplink").Mark(1) //mon:locked
return nil, storj.PiecePrivateKey{}, ErrDownloadFailedNotEnoughPieces.New("not enough orderlimits: got %d, required %d", len(signer.AddressedLimits), segment.Redundancy.RequiredShares)
}
if err := service.updateBandwidth(ctx, bucket, signer.AddressedLimits...); err != nil {
return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
}
signer.AddressedLimits, err = sortLimits(signer.AddressedLimits, segment)
if err != nil {
return nil, storj.PiecePrivateKey{}, err
}
// workaround to avoid sending nil values on top level
for i := range signer.AddressedLimits {
if signer.AddressedLimits[i] == nil {
signer.AddressedLimits[i] = &pb.AddressedOrderLimit{}
}
}
return signer.AddressedLimits, signer.PrivateKey, nil
2019-03-28 20:09:23 +00:00
}
func (service *Service) perm(n int) []int {
service.rngMu.Lock()
defer service.rngMu.Unlock()
return service.rng.Perm(n)
}
// sortLimits sorts order limits and fill missing ones with nil values.
func sortLimits(limits []*pb.AddressedOrderLimit, segment metabase.Segment) ([]*pb.AddressedOrderLimit, error) {
sorted := make([]*pb.AddressedOrderLimit, segment.Redundancy.TotalShares)
for _, piece := range segment.Pieces {
if int16(piece.Number) >= segment.Redundancy.TotalShares {
return nil, Error.New("piece number is greater than redundancy total shares: got %d, max %d",
piece.Number, (segment.Redundancy.TotalShares - 1))
}
sorted[piece.Number] = getLimitByStorageNodeID(limits, piece.StorageNode)
}
return sorted, nil
}
func getLimitByStorageNodeID(limits []*pb.AddressedOrderLimit, storageNodeID storj.NodeID) *pb.AddressedOrderLimit {
for _, limit := range limits {
if limit == nil || limit.GetLimit() == nil {
continue
}
if limit.GetLimit().StorageNodeId == storageNodeID {
return limit
}
}
return nil
}
2019-03-28 20:09:23 +00:00
// CreatePutOrderLimits creates the order limits for uploading pieces to nodes.
func (service *Service) CreatePutOrderLimits(ctx context.Context, bucket metabase.BucketLocation, nodes []*overlay.SelectedNode, pieceExpiration time.Time, maxPieceSize int64) (_ storj.PieceID, _ []*pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, err error) {
defer mon.Task()(&ctx)(&err)
signer, err := NewSignerPut(service, pieceExpiration, time.Now(), maxPieceSize, bucket)
2019-03-28 20:09:23 +00:00
if err != nil {
return storj.PieceID{}, nil, storj.PiecePrivateKey{}, Error.Wrap(err)
2019-03-28 20:09:23 +00:00
}
for pieceNum, node := range nodes {
_, err := signer.Sign(ctx, resolveStorageNode_Selected(node, true), int32(pieceNum))
if err != nil {
return storj.PieceID{}, nil, storj.PiecePrivateKey{}, Error.Wrap(err)
2019-03-28 20:09:23 +00:00
}
}
return signer.RootPieceID, signer.AddressedLimits, signer.PrivateKey, nil
2019-03-28 20:09:23 +00:00
}
// ReplacePutOrderLimits replaces order limits for uploading pieces to nodes.
func (service *Service) ReplacePutOrderLimits(ctx context.Context, rootPieceID storj.PieceID, addressedLimits []*pb.AddressedOrderLimit, nodes []*overlay.SelectedNode, pieceNumbers []int32) (_ []*pb.AddressedOrderLimit, err error) {
defer mon.Task()(&ctx)(&err)
pieceIDDeriver := rootPieceID.Deriver()
newAddressedLimits := make([]*pb.AddressedOrderLimit, len(addressedLimits))
copy(newAddressedLimits, addressedLimits)
for i, pieceNumber := range pieceNumbers {
if pieceNumber < 0 || int(pieceNumber) >= len(addressedLimits) {
return nil, Error.New("invalid piece number %d", pieceNumber)
}
// TODO: clone?
newAddressedLimit := *addressedLimits[pieceNumber].Limit
newAddressedLimit.StorageNodeId = nodes[i].ID
newAddressedLimit.PieceId = pieceIDDeriver.Derive(nodes[i].ID, pieceNumber)
newAddressedLimit.SatelliteSignature = nil
newAddressedLimits[pieceNumber].Limit, err = signing.SignOrderLimit(ctx, service.satellite, &newAddressedLimit)
if err != nil {
return nil, ErrSigner.Wrap(err)
}
newAddressedLimits[pieceNumber].StorageNodeAddress = resolveStorageNode_Selected(nodes[i], true).Address
}
return newAddressedLimits, nil
}
// CreateAuditOrderLimits creates the order limits for auditing the pieces of a segment.
func (service *Service) CreateAuditOrderLimits(ctx context.Context, segment metabase.Segment, skip map[storj.NodeID]bool) (_ []*pb.AddressedOrderLimit, _ storj.PiecePrivateKey, cachedNodesInfo map[storj.NodeID]overlay.NodeReputation, err error) {
defer mon.Task()(&ctx)(&err)
nodeIDs := make([]storj.NodeID, len(segment.Pieces))
for i, piece := range segment.Pieces {
nodeIDs[i] = piece.StorageNode
}
nodes, err := service.overlay.GetOnlineNodesForAuditRepair(ctx, nodeIDs)
if err != nil {
service.log.Debug("error getting nodes from overlay", zap.Error(err))
satellite/audit: use LastIPAndPort preferentially This preserves the last_ip_and_port field from node lookups through CreateAuditOrderLimits() and CreateAuditOrderLimit(), so that later calls to (*Verifier).GetShare() can try to use that IP and port. If a connection to the given IP and port cannot be made, or the connection cannot be verified and secured with the target node identity, an attempt is made to connect to the original node address instead. A similar change is not necessary to the other Create*OrderLimits functions, because they already replace node addresses with the cached IP and port as appropriate. We might want to consider making a similar change to CreateGetRepairOrderLimits(), though. The audit situation is unique because the ramifications are especially powerful when we get the address wrong. Failing a single audit can have a heavy cost to a storage node. We need to make extra effort in order to avoid imposing that cost unfairly. Situation 1: If an audit fails because the repair worker failed to make a DNS query (which might well be the fault on the satellite side), and we have last_ip_and_port information available for the target node, it would be unfair not to try connecting to that last_ip_and_port address. Situation 2: If a node has changed addresses recently and the operator correctly changed its DNS entry, but we don't bother querying DNS, it would be unfair to penalize the node for our failure to connect to it. So the audit worker must try both last_ip_and_port _and_ the node address as supplied by the SNO. We elect here to try last_ip_and_port first, on the grounds that (a) it is expected to work in the large majority of cases, and (b) there should not be any security concerns with connecting to an out-or-date address, and (c) avoiding DNS queries on the satellite side helps alleviate satellite operational load. Change-Id: I9bf6c6c79866d879adecac6144a6c346f4f61200
2020-09-30 05:53:43 +01:00
return nil, storj.PiecePrivateKey{}, nil, Error.Wrap(err)
}
bucket := metabase.BucketLocation{}
signer, err := NewSignerAudit(service, segment.RootPieceID, time.Now(), int64(segment.Redundancy.ShareSize), bucket)
if err != nil {
satellite/audit: use LastIPAndPort preferentially This preserves the last_ip_and_port field from node lookups through CreateAuditOrderLimits() and CreateAuditOrderLimit(), so that later calls to (*Verifier).GetShare() can try to use that IP and port. If a connection to the given IP and port cannot be made, or the connection cannot be verified and secured with the target node identity, an attempt is made to connect to the original node address instead. A similar change is not necessary to the other Create*OrderLimits functions, because they already replace node addresses with the cached IP and port as appropriate. We might want to consider making a similar change to CreateGetRepairOrderLimits(), though. The audit situation is unique because the ramifications are especially powerful when we get the address wrong. Failing a single audit can have a heavy cost to a storage node. We need to make extra effort in order to avoid imposing that cost unfairly. Situation 1: If an audit fails because the repair worker failed to make a DNS query (which might well be the fault on the satellite side), and we have last_ip_and_port information available for the target node, it would be unfair not to try connecting to that last_ip_and_port address. Situation 2: If a node has changed addresses recently and the operator correctly changed its DNS entry, but we don't bother querying DNS, it would be unfair to penalize the node for our failure to connect to it. So the audit worker must try both last_ip_and_port _and_ the node address as supplied by the SNO. We elect here to try last_ip_and_port first, on the grounds that (a) it is expected to work in the large majority of cases, and (b) there should not be any security concerns with connecting to an out-or-date address, and (c) avoiding DNS queries on the satellite side helps alleviate satellite operational load. Change-Id: I9bf6c6c79866d879adecac6144a6c346f4f61200
2020-09-30 05:53:43 +01:00
return nil, storj.PiecePrivateKey{}, nil, Error.Wrap(err)
}
cachedNodesInfo = make(map[storj.NodeID]overlay.NodeReputation)
var nodeErrors errs.Group
var limitsCount int16
limits := make([]*pb.AddressedOrderLimit, segment.Redundancy.TotalShares)
for _, piece := range segment.Pieces {
if skip[piece.StorageNode] {
continue
}
node, ok := nodes[piece.StorageNode]
if !ok {
nodeErrors.Add(errs.New("node %q is not reliable", piece.StorageNode))
continue
}
cachedNodesInfo[piece.StorageNode] = *node
limit, err := signer.Sign(ctx, resolveStorageNode_Reputation(node), int32(piece.Number))
2019-03-28 20:09:23 +00:00
if err != nil {
satellite/audit: use LastIPAndPort preferentially This preserves the last_ip_and_port field from node lookups through CreateAuditOrderLimits() and CreateAuditOrderLimit(), so that later calls to (*Verifier).GetShare() can try to use that IP and port. If a connection to the given IP and port cannot be made, or the connection cannot be verified and secured with the target node identity, an attempt is made to connect to the original node address instead. A similar change is not necessary to the other Create*OrderLimits functions, because they already replace node addresses with the cached IP and port as appropriate. We might want to consider making a similar change to CreateGetRepairOrderLimits(), though. The audit situation is unique because the ramifications are especially powerful when we get the address wrong. Failing a single audit can have a heavy cost to a storage node. We need to make extra effort in order to avoid imposing that cost unfairly. Situation 1: If an audit fails because the repair worker failed to make a DNS query (which might well be the fault on the satellite side), and we have last_ip_and_port information available for the target node, it would be unfair not to try connecting to that last_ip_and_port address. Situation 2: If a node has changed addresses recently and the operator correctly changed its DNS entry, but we don't bother querying DNS, it would be unfair to penalize the node for our failure to connect to it. So the audit worker must try both last_ip_and_port _and_ the node address as supplied by the SNO. We elect here to try last_ip_and_port first, on the grounds that (a) it is expected to work in the large majority of cases, and (b) there should not be any security concerns with connecting to an out-or-date address, and (c) avoiding DNS queries on the satellite side helps alleviate satellite operational load. Change-Id: I9bf6c6c79866d879adecac6144a6c346f4f61200
2020-09-30 05:53:43 +01:00
return nil, storj.PiecePrivateKey{}, nil, Error.Wrap(err)
2019-03-28 20:09:23 +00:00
}
limits[piece.Number] = limit
2019-03-28 20:09:23 +00:00
limitsCount++
}
if limitsCount < segment.Redundancy.RequiredShares {
err = ErrDownloadFailedNotEnoughPieces.New("not enough nodes available: got %d, required %d", limitsCount, segment.Redundancy.RequiredShares)
satellite/audit: use LastIPAndPort preferentially This preserves the last_ip_and_port field from node lookups through CreateAuditOrderLimits() and CreateAuditOrderLimit(), so that later calls to (*Verifier).GetShare() can try to use that IP and port. If a connection to the given IP and port cannot be made, or the connection cannot be verified and secured with the target node identity, an attempt is made to connect to the original node address instead. A similar change is not necessary to the other Create*OrderLimits functions, because they already replace node addresses with the cached IP and port as appropriate. We might want to consider making a similar change to CreateGetRepairOrderLimits(), though. The audit situation is unique because the ramifications are especially powerful when we get the address wrong. Failing a single audit can have a heavy cost to a storage node. We need to make extra effort in order to avoid imposing that cost unfairly. Situation 1: If an audit fails because the repair worker failed to make a DNS query (which might well be the fault on the satellite side), and we have last_ip_and_port information available for the target node, it would be unfair not to try connecting to that last_ip_and_port address. Situation 2: If a node has changed addresses recently and the operator correctly changed its DNS entry, but we don't bother querying DNS, it would be unfair to penalize the node for our failure to connect to it. So the audit worker must try both last_ip_and_port _and_ the node address as supplied by the SNO. We elect here to try last_ip_and_port first, on the grounds that (a) it is expected to work in the large majority of cases, and (b) there should not be any security concerns with connecting to an out-or-date address, and (c) avoiding DNS queries on the satellite side helps alleviate satellite operational load. Change-Id: I9bf6c6c79866d879adecac6144a6c346f4f61200
2020-09-30 05:53:43 +01:00
return nil, storj.PiecePrivateKey{}, nil, errs.Combine(err, nodeErrors.Err())
2019-03-28 20:09:23 +00:00
}
return limits, signer.PrivateKey, cachedNodesInfo, nil
2019-03-28 20:09:23 +00:00
}
// CreateAuditOrderLimit creates an order limit for auditing a single piece from a segment.
func (service *Service) CreateAuditOrderLimit(ctx context.Context, nodeID storj.NodeID, pieceNum uint16, rootPieceID storj.PieceID, shareSize int32) (limit *pb.AddressedOrderLimit, _ storj.PiecePrivateKey, nodeInfo *overlay.NodeReputation, err error) {
// TODO reduce number of params ?
defer mon.Task()(&ctx)(&err)
signer, err := NewSignerAudit(service, rootPieceID, time.Now(), int64(shareSize), metabase.BucketLocation{})
if err != nil {
return nil, storj.PiecePrivateKey{}, nodeInfo, Error.Wrap(err)
}
return service.createAuditOrderLimitWithSigner(ctx, nodeID, pieceNum, signer)
}
// CreateAuditPieceOrderLimit creates an order limit for auditing a single
// piece from a segment, requesting that the original order limit and piece
// hash be included.
//
// Unfortunately, because of the way the protocol works historically, we
// must use GET_REPAIR for this operation instead of GET_AUDIT.
func (service *Service) CreateAuditPieceOrderLimit(ctx context.Context, nodeID storj.NodeID, pieceNum uint16, rootPieceID storj.PieceID, pieceSize int32) (limit *pb.AddressedOrderLimit, _ storj.PiecePrivateKey, nodeInfo *overlay.NodeReputation, err error) {
defer mon.Task()(&ctx)(&err)
signer, err := NewSignerRepairGet(service, rootPieceID, time.Now(), int64(pieceSize), metabase.BucketLocation{})
if err != nil {
return nil, storj.PiecePrivateKey{}, nodeInfo, Error.Wrap(err)
}
return service.createAuditOrderLimitWithSigner(ctx, nodeID, pieceNum, signer)
}
func (service *Service) createAuditOrderLimitWithSigner(ctx context.Context, nodeID storj.NodeID, pieceNum uint16, signer *Signer) (limit *pb.AddressedOrderLimit, _ storj.PiecePrivateKey, nodeInfo *overlay.NodeReputation, err error) {
defer mon.Task()(&ctx)(&err)
node, err := service.overlay.Get(ctx, nodeID)
if err != nil {
return nil, storj.PiecePrivateKey{}, nil, Error.Wrap(err)
}
nodeInfo = &overlay.NodeReputation{
ID: nodeID,
Address: node.Address,
LastNet: node.LastNet,
LastIPPort: node.LastIPPort,
Reputation: node.Reputation.Status,
}
if node.Disqualified != nil {
return nil, storj.PiecePrivateKey{}, nodeInfo, overlay.ErrNodeDisqualified.New("%v", nodeID)
}
if node.ExitStatus.ExitFinishedAt != nil {
return nil, storj.PiecePrivateKey{}, nodeInfo, overlay.ErrNodeFinishedGE.New("%v", nodeID)
}
if !service.overlay.IsOnline(node) {
return nil, storj.PiecePrivateKey{}, nodeInfo, overlay.ErrNodeOffline.New("%v", nodeID)
}
orderLimit, err := signer.Sign(ctx, resolveStorageNode(&node.Node, node.LastIPPort, false), int32(pieceNum))
if err != nil {
return nil, storj.PiecePrivateKey{}, nodeInfo, Error.Wrap(err)
}
return orderLimit, signer.PrivateKey, nodeInfo, nil
}
[V3-1927] Repairer uploads to max threshold instead of success… (#2423) * pkg/datarepair: Add test to check num upload pieces Add a new test for ensuring the number of pieces that the repair process upload when a segment is injured. * satellite/orders: Don't create "put order limits" over total Repair must not create "put order limits" more than the total count. * pkg/datarepair: Update upload repair pieces test Update the test which checks the number of pieces which are uploaded during a repair for using the same excess over the success threshold value than the implementation. * satellites/orders: Limit repair put order for not being total Limit the number of put orders to be used by repair for only uploading pieces to a % excess over the successful threshold. * pkg/datarepair: Change DataRepair test to pass again Make some changes in the DataRepair test to make pass again after the repair upload repaired pieces only until a % excess over success threshold. Also update the steps description of the DataRepair test after it has been changed, to match on what's now, besides to leave it more generic for avoiding having to update it on minimal future refactorings. * satellite: Make repair excess optimal threshold configurable Add a new configuration parameter to the satellite for being able to configure the percentage excess over the optimal threshold, used for determining how many pieces should be repaired/uploaded, rather than having the value hard coded. * repairer: Add configurable param to segments/repairer Add a new parameters to the segment/repairer to calculate the maximum number of excess nodes, based on the optimal threshold, that repaired pieces can be uploaded. This new parameter has been added for not returning more nodes than the number of upload orders for data repair satellite service calculate for repairing pieces. * pkg/storage/ec: Update log message in clien.Repair * satellite: Update configuration lock file
2019-07-11 23:44:47 +01:00
// CreateGetRepairOrderLimits creates the order limits for downloading the
// healthy pieces of segment as the source for repair.
[V3-1927] Repairer uploads to max threshold instead of success… (#2423) * pkg/datarepair: Add test to check num upload pieces Add a new test for ensuring the number of pieces that the repair process upload when a segment is injured. * satellite/orders: Don't create "put order limits" over total Repair must not create "put order limits" more than the total count. * pkg/datarepair: Update upload repair pieces test Update the test which checks the number of pieces which are uploaded during a repair for using the same excess over the success threshold value than the implementation. * satellites/orders: Limit repair put order for not being total Limit the number of put orders to be used by repair for only uploading pieces to a % excess over the successful threshold. * pkg/datarepair: Change DataRepair test to pass again Make some changes in the DataRepair test to make pass again after the repair upload repaired pieces only until a % excess over success threshold. Also update the steps description of the DataRepair test after it has been changed, to match on what's now, besides to leave it more generic for avoiding having to update it on minimal future refactorings. * satellite: Make repair excess optimal threshold configurable Add a new configuration parameter to the satellite for being able to configure the percentage excess over the optimal threshold, used for determining how many pieces should be repaired/uploaded, rather than having the value hard coded. * repairer: Add configurable param to segments/repairer Add a new parameters to the segment/repairer to calculate the maximum number of excess nodes, based on the optimal threshold, that repaired pieces can be uploaded. This new parameter has been added for not returning more nodes than the number of upload orders for data repair satellite service calculate for repairing pieces. * pkg/storage/ec: Update log message in clien.Repair * satellite: Update configuration lock file
2019-07-11 23:44:47 +01:00
//
// The length of the returned orders slice is the total number of pieces of the
// segment, setting to null the ones which don't correspond to a healthy piece.
func (service *Service) CreateGetRepairOrderLimits(ctx context.Context, segment metabase.Segment, healthy metabase.Pieces) (_ []*pb.AddressedOrderLimit, _ storj.PiecePrivateKey, cachedNodesInfo map[storj.NodeID]overlay.NodeReputation, err error) {
defer mon.Task()(&ctx)(&err)
pieceSize := segment.PieceSize()
totalPieces := segment.Redundancy.TotalShares
2019-03-28 20:09:23 +00:00
nodeIDs := make([]storj.NodeID, len(segment.Pieces))
for i, piece := range segment.Pieces {
nodeIDs[i] = piece.StorageNode
}
nodes, err := service.overlay.GetOnlineNodesForAuditRepair(ctx, nodeIDs)
if err != nil {
service.log.Debug("error getting nodes from overlay", zap.Error(err))
return nil, storj.PiecePrivateKey{}, nil, Error.Wrap(err)
}
signer, err := NewSignerRepairGet(service, segment.RootPieceID, time.Now(), pieceSize, metabase.BucketLocation{})
if err != nil {
return nil, storj.PiecePrivateKey{}, nil, Error.Wrap(err)
}
cachedNodesInfo = make(map[storj.NodeID]overlay.NodeReputation, len(healthy))
var nodeErrors errs.Group
var limitsCount int
2019-03-28 20:09:23 +00:00
limits := make([]*pb.AddressedOrderLimit, totalPieces)
for _, piece := range healthy {
node, ok := nodes[piece.StorageNode]
if !ok {
nodeErrors.Add(errs.New("node %q is not reliable", piece.StorageNode))
continue
}
cachedNodesInfo[piece.StorageNode] = *node
limit, err := signer.Sign(ctx, resolveStorageNode_Reputation(node), int32(piece.Number))
2019-03-28 20:09:23 +00:00
if err != nil {
return nil, storj.PiecePrivateKey{}, nil, Error.Wrap(err)
2019-03-28 20:09:23 +00:00
}
limits[piece.Number] = limit
limitsCount++
2019-03-28 20:09:23 +00:00
}
if limitsCount < int(segment.Redundancy.RequiredShares) {
err = ErrDownloadFailedNotEnoughPieces.New("not enough nodes available: got %d, required %d", limitsCount, segment.Redundancy.RequiredShares)
return nil, storj.PiecePrivateKey{}, nil, errs.Combine(err, nodeErrors.Err())
}
return limits, signer.PrivateKey, cachedNodesInfo, nil
2019-03-28 20:09:23 +00:00
}
// CreatePutRepairOrderLimits creates the order limits for uploading the repaired pieces of segment to newNodes.
func (service *Service) CreatePutRepairOrderLimits(ctx context.Context, segment metabase.Segment, getOrderLimits []*pb.AddressedOrderLimit, healthySet map[int32]struct{}, newNodes []*overlay.SelectedNode, optimalThresholdMultiplier float64, numPiecesInExcludedCountries int) (_ []*pb.AddressedOrderLimit, _ storj.PiecePrivateKey, err error) {
defer mon.Task()(&ctx)(&err)
2019-03-28 20:09:23 +00:00
// Create the order limits for being used to upload the repaired pieces
pieceSize := segment.PieceSize()
totalPieces := int(segment.Redundancy.TotalShares)
totalPiecesAfterRepair := int(math.Ceil(float64(segment.Redundancy.OptimalShares)*optimalThresholdMultiplier)) + numPiecesInExcludedCountries
if totalPiecesAfterRepair > totalPieces {
totalPiecesAfterRepair = totalPieces
}
var numRetrievablePieces int
for _, o := range getOrderLimits {
if o != nil {
numRetrievablePieces++
}
}
totalPiecesToRepair := totalPiecesAfterRepair - len(healthySet)
limits := make([]*pb.AddressedOrderLimit, totalPieces)
expirationDate := time.Time{}
if segment.ExpiresAt != nil {
expirationDate = *segment.ExpiresAt
}
signer, err := NewSignerRepairPut(service, segment.RootPieceID, expirationDate, time.Now(), pieceSize, metabase.BucketLocation{})
2019-03-28 20:09:23 +00:00
if err != nil {
return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
2019-03-28 20:09:23 +00:00
}
var pieceNum int32
for _, node := range newNodes {
for int(pieceNum) < totalPieces {
_, isHealthy := healthySet[pieceNum]
if !isHealthy {
break
}
pieceNum++
2019-03-28 20:09:23 +00:00
}
if int(pieceNum) >= totalPieces { // should not happen
return nil, storj.PiecePrivateKey{}, Error.New("piece num greater than total pieces: %d >= %d", pieceNum, totalPieces)
2019-03-28 20:09:23 +00:00
}
limit, err := signer.Sign(ctx, resolveStorageNode_Selected(node, false), pieceNum)
if err != nil {
return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
2019-03-28 20:09:23 +00:00
}
limits[pieceNum] = limit
pieceNum++
totalPiecesToRepair--
[V3-1927] Repairer uploads to max threshold instead of success… (#2423) * pkg/datarepair: Add test to check num upload pieces Add a new test for ensuring the number of pieces that the repair process upload when a segment is injured. * satellite/orders: Don't create "put order limits" over total Repair must not create "put order limits" more than the total count. * pkg/datarepair: Update upload repair pieces test Update the test which checks the number of pieces which are uploaded during a repair for using the same excess over the success threshold value than the implementation. * satellites/orders: Limit repair put order for not being total Limit the number of put orders to be used by repair for only uploading pieces to a % excess over the successful threshold. * pkg/datarepair: Change DataRepair test to pass again Make some changes in the DataRepair test to make pass again after the repair upload repaired pieces only until a % excess over success threshold. Also update the steps description of the DataRepair test after it has been changed, to match on what's now, besides to leave it more generic for avoiding having to update it on minimal future refactorings. * satellite: Make repair excess optimal threshold configurable Add a new configuration parameter to the satellite for being able to configure the percentage excess over the optimal threshold, used for determining how many pieces should be repaired/uploaded, rather than having the value hard coded. * repairer: Add configurable param to segments/repairer Add a new parameters to the segment/repairer to calculate the maximum number of excess nodes, based on the optimal threshold, that repaired pieces can be uploaded. This new parameter has been added for not returning more nodes than the number of upload orders for data repair satellite service calculate for repairing pieces. * pkg/storage/ec: Update log message in clien.Repair * satellite: Update configuration lock file
2019-07-11 23:44:47 +01:00
if totalPiecesToRepair == 0 {
break
2019-03-28 20:09:23 +00:00
}
}
return limits, signer.PrivateKey, nil
2019-03-28 20:09:23 +00:00
}
2019-04-05 08:42:56 +01:00
// CreateGracefulExitPutOrderLimit creates an order limit for graceful exit put transfers.
func (service *Service) CreateGracefulExitPutOrderLimit(ctx context.Context, bucket metabase.BucketLocation, nodeID storj.NodeID, pieceNum int32, rootPieceID storj.PieceID, shareSize int32) (limit *pb.AddressedOrderLimit, _ storj.PiecePrivateKey, err error) {
defer mon.Task()(&ctx)(&err)
// should this use KnownReliable or similar?
node, err := service.overlay.Get(ctx, nodeID)
if err != nil {
return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
}
if node.Disqualified != nil {
return nil, storj.PiecePrivateKey{}, overlay.ErrNodeDisqualified.New("%v", nodeID)
}
if !service.overlay.IsOnline(node) {
return nil, storj.PiecePrivateKey{}, overlay.ErrNodeOffline.New("%v", nodeID)
}
signer, err := NewSignerGracefulExit(service, rootPieceID, time.Now(), shareSize, bucket)
if err != nil {
return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
}
limit, err = signer.Sign(ctx, resolveStorageNode(&node.Node, node.LastIPPort, true), pieceNum)
if err != nil {
return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
}
return limit, signer.PrivateKey, nil
}
// UpdateGetInlineOrder updates amount of inline GET bandwidth for given bucket.
func (service *Service) UpdateGetInlineOrder(ctx context.Context, bucket metabase.BucketLocation, amount int64) (err error) {
defer mon.Task()(&ctx)(&err)
now := time.Now().UTC()
2019-04-05 08:42:56 +01:00
intervalStart := time.Date(now.Year(), now.Month(), now.Day(), now.Hour(), 0, 0, 0, now.Location())
return service.orders.UpdateBucketBandwidthInline(ctx, bucket.ProjectID, []byte(bucket.BucketName), pb.PieceAction_GET, amount, intervalStart)
2019-04-05 08:42:56 +01:00
}
// UpdatePutInlineOrder updates amount of inline PUT bandwidth for given bucket.
func (service *Service) UpdatePutInlineOrder(ctx context.Context, bucket metabase.BucketLocation, amount int64) (err error) {
defer mon.Task()(&ctx)(&err)
now := time.Now().UTC()
2019-04-05 08:42:56 +01:00
intervalStart := time.Date(now.Year(), now.Month(), now.Day(), now.Hour(), 0, 0, 0, now.Location())
return service.orders.UpdateBucketBandwidthInline(ctx, bucket.ProjectID, []byte(bucket.BucketName), pb.PieceAction_PUT, amount, intervalStart)
2019-04-05 08:42:56 +01:00
}
// DecryptOrderMetadata decrypts the order metadata.
func (service *Service) DecryptOrderMetadata(ctx context.Context, order *pb.OrderLimit) (_ *internalpb.OrderLimitMetadata, err error) {
defer mon.Task()(&ctx)(&err)
var orderKeyID EncryptionKeyID
copy(orderKeyID[:], order.EncryptedMetadataKeyId)
key := service.encryptionKeys.Default
if key.ID != orderKeyID {
val, ok := service.encryptionKeys.KeyByID[orderKeyID]
if !ok {
return nil, ErrDecryptOrderMetadata.New("no encryption key found that matches the order.EncryptedMetadataKeyId")
}
key = EncryptionKey{
ID: orderKeyID,
Key: val,
}
}
return key.DecryptMetadata(order.SerialNumber, order.EncryptedMetadata)
}
func resolveStorageNode_Selected(node *overlay.SelectedNode, resolveDNS bool) *pb.Node {
return resolveStorageNode(&pb.Node{
Id: node.ID,
Address: node.Address,
}, node.LastIPPort, resolveDNS)
}
func resolveStorageNode_Reputation(node *overlay.NodeReputation) *pb.Node {
return resolveStorageNode(&pb.Node{
Id: node.ID,
Address: node.Address,
}, node.LastIPPort, false)
}
func resolveStorageNode(node *pb.Node, lastIPPort string, resolveDNS bool) *pb.Node {
if resolveDNS && lastIPPort != "" {
node = pb.CopyNode(node) // we mutate
node.Address.Address = lastIPPort
}
return node
}