// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package orders

import (
	"context"
	"math"
	mathrand "math/rand"
	"sync"
	"time"

	"github.com/zeebo/errs"
	"go.uber.org/zap"

	"storj.io/common/pb"
	"storj.io/common/signing"
	"storj.io/common/storj"
	"storj.io/common/uuid"
	"storj.io/storj/satellite/internalpb"
	"storj.io/storj/satellite/metainfo/metabase"
	"storj.io/storj/satellite/overlay"
	"storj.io/uplink/private/eestream"
)

var (
	// ErrDownloadFailedNotEnoughPieces is returned when download failed due to missing pieces.
	ErrDownloadFailedNotEnoughPieces = errs.Class("not enough pieces for download")
	// ErrDecryptOrderMetadata is returned when a step of decrypting metadata fails.
	ErrDecryptOrderMetadata = errs.Class("decrypting order metadata")
)

// Config is a configuration struct for orders Service.
type Config struct {
	EncryptionKeys      EncryptionKeys `help:"encryption keys to encrypt info in orders" default:""`
	Expiration          time.Duration  `help:"how long until an order expires" default:"48h"` // 2 days
	FlushBatchSize      int            `help:"how many items in the rollups write cache before they are flushed to the database" devDefault:"20" releaseDefault:"10000"`
	FlushInterval       time.Duration  `help:"how often to flush the rollups write cache to the database" devDefault:"30s" releaseDefault:"1m"`
	NodeStatusLogging   bool           `hidden:"true" help:"deprecated, log the offline/disqualification status of nodes" default:"false"`
	OrdersSemaphoreSize int            `help:"how many concurrent orders to process at once. zero is unlimited" default:"2"`
}
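
// These fields surface as satellite configuration flags whose names the config
// loader derives from the struct path; the flags below are illustrative of
// that convention, not an exhaustive list:
//
//	--orders.expiration 48h0m0s
//	--orders.flush-batch-size 10000
//	--orders.flush-interval 1m0s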

// BucketsDB returns information about buckets.
type BucketsDB interface {
	// GetBucketID returns an existing bucket id.
	GetBucketID(ctx context.Context, bucket metabase.BucketLocation) (id uuid.UUID, err error)
}

// Service for creating order limits.
//
// architecture: Service
type Service struct {
	log       *zap.Logger
	satellite signing.Signer
	overlay   *overlay.Service
	orders    DB
	buckets   BucketsDB

	encryptionKeys EncryptionKeys

	orderExpiration time.Duration

	rngMu sync.Mutex
	rng   *mathrand.Rand
}

// NewService creates new service for creating order limits.
func NewService(
	log *zap.Logger, satellite signing.Signer, overlay *overlay.Service,
	orders DB, buckets BucketsDB,
	config Config,
) (*Service, error) {
	if config.EncryptionKeys.Default.IsZero() {
		return nil, Error.New("encryption keys must be specified to include encrypted metadata")
	}

	return &Service{
		log:       log,
		satellite: satellite,
		overlay:   overlay,
		orders:    orders,
		buckets:   buckets,

		encryptionKeys: config.EncryptionKeys,

		orderExpiration: config.Expiration,

		rng: mathrand.New(mathrand.NewSource(time.Now().UnixNano())),
	}, nil
}
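
// A minimal construction sketch; the orders DB, buckets DB, signer, and key
// values below are placeholders for the satellite's real dependencies:
//
//	service, err := NewService(
//		log.Named("orders"), signer, overlayService,
//		ordersDB, bucketsDB,
//		Config{EncryptionKeys: keys, Expiration: 48 * time.Hour},
//	)
//	if err != nil {
//		return nil, err
//	}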

// VerifyOrderLimitSignature verifies that the signature inside order limit belongs to the satellite.
func (service *Service) VerifyOrderLimitSignature(ctx context.Context, signed *pb.OrderLimit) (err error) {
	defer mon.Task()(&ctx)(&err)
	return signing.VerifyOrderLimitSignature(ctx, service.satellite, signed)
}

func (service *Service) updateBandwidth(ctx context.Context, bucket metabase.BucketLocation, addressedOrderLimits ...*pb.AddressedOrderLimit) (err error) {
	defer mon.Task()(&ctx)(&err)
	if len(addressedOrderLimits) == 0 {
		return nil
	}

	// All limits passed in one call share the same action, so recording the
	// last one seen is sufficient; the allocated bandwidth is summed.
	var action pb.PieceAction
	var bucketAllocation int64

	for _, addressedOrderLimit := range addressedOrderLimits {
		if addressedOrderLimit != nil && addressedOrderLimit.Limit != nil {
			orderLimit := addressedOrderLimit.Limit
			action = orderLimit.Action
			bucketAllocation += orderLimit.Limit
		}
	}

	now := time.Now().UTC()
	intervalStart := time.Date(now.Year(), now.Month(), now.Day(), now.Hour(), 0, 0, 0, now.Location())
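
	// The rounding above pins the allocation to the current hour boundary;
	// for a UTC timestamp it behaves the same as the shorter form:
	//
	//	intervalStart := time.Now().UTC().Truncate(time.Hour)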

	// TODO: all of this below should be a single db transaction. in fact, this whole function should probably be part of an existing transaction
	if err := service.orders.UpdateBucketBandwidthAllocation(ctx, bucket.ProjectID, []byte(bucket.BucketName), action, bucketAllocation, intervalStart); err != nil {
		return Error.Wrap(err)
	}

	return nil
}

// CreateGetOrderLimits creates the order limits for downloading the pieces of a segment.
func (service *Service) CreateGetOrderLimits(ctx context.Context, bucket metabase.BucketLocation, segment metabase.Segment) (_ []*pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, err error) {
	defer mon.Task()(&ctx)(&err)

	redundancy, err := eestream.NewRedundancyStrategyFromStorj(segment.Redundancy)
	if err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}
	pieceSize := eestream.CalcPieceSize(int64(segment.EncryptedSize), redundancy)

	nodeIDs := make([]storj.NodeID, len(segment.Pieces))
	for i, piece := range segment.Pieces {
		nodeIDs[i] = piece.StorageNode
	}

	nodes, err := service.overlay.GetOnlineNodesForGetDelete(ctx, nodeIDs)
	if err != nil {
		service.log.Debug("error getting nodes from overlay", zap.Error(err))
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}

	signer, err := NewSignerGet(service, segment.RootPieceID, time.Now(), pieceSize, bucket)
	if err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}

	neededLimits := segment.Redundancy.DownloadNodes()

	// Walk the pieces in random order and stop as soon as enough limits have
	// been signed, so downloads are not always served by the same subset of
	// nodes.
	pieces := segment.Pieces
	for _, pieceIndex := range service.perm(len(pieces)) {
		piece := pieces[pieceIndex]
		node, ok := nodes[piece.StorageNode]
		if !ok {
			continue
		}

		// Prefer the cached IP and port, avoiding a DNS lookup on download.
		address := node.Address.Address
		if node.LastIPPort != "" {
			address = node.LastIPPort
		}

		_, err := signer.Sign(ctx, storj.NodeURL{
			ID:      piece.StorageNode,
			Address: address,
		}, int32(piece.Number))
		if err != nil {
			return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
		}

		if len(signer.AddressedLimits) >= int(neededLimits) {
			break
		}
	}

	if len(signer.AddressedLimits) < redundancy.RequiredCount() {
		mon.Meter("download_failed_not_enough_pieces_uplink").Mark(1) //mon:locked
		return nil, storj.PiecePrivateKey{}, ErrDownloadFailedNotEnoughPieces.New("not enough orderlimits: got %d, required %d", len(signer.AddressedLimits), redundancy.RequiredCount())
	}

	if err := service.updateBandwidth(ctx, bucket, signer.AddressedLimits...); err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}

	return signer.AddressedLimits, signer.PrivateKey, nil
}
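
// Download caller sketch (names are illustrative): the returned limits and
// piece private key are handed back to the uplink, which uses them to request
// pieces directly from the storage nodes:
//
//	limits, piecePrivateKey, err := service.CreateGetOrderLimits(ctx, bucket, segment)
//	if err != nil {
//		return err
//	}
//	// include limits and piecePrivateKey in the download response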

// perm returns a random permutation of [0, n). A math/rand source is not safe
// for concurrent use, so access to the service's rng is serialized.
func (service *Service) perm(n int) []int {
	service.rngMu.Lock()
	defer service.rngMu.Unlock()
	return service.rng.Perm(n)
}

// CreatePutOrderLimits creates the order limits for uploading pieces to nodes.
func (service *Service) CreatePutOrderLimits(ctx context.Context, bucket metabase.BucketLocation, nodes []*overlay.SelectedNode, pieceExpiration time.Time, maxPieceSize int64) (_ storj.PieceID, _ []*pb.AddressedOrderLimit, privateKey storj.PiecePrivateKey, err error) {
	defer mon.Task()(&ctx)(&err)

	signer, err := NewSignerPut(service, pieceExpiration, time.Now(), maxPieceSize, bucket)
	if err != nil {
		return storj.PieceID{}, nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}

	for pieceNum, node := range nodes {
		// Prefer the cached IP and port, avoiding a DNS lookup on upload.
		address := node.Address.Address
		if node.LastIPPort != "" {
			address = node.LastIPPort
		}
		_, err := signer.Sign(ctx, storj.NodeURL{ID: node.ID, Address: address}, int32(pieceNum))
		if err != nil {
			return storj.PieceID{}, nil, storj.PiecePrivateKey{}, Error.Wrap(err)
		}
	}

	if err := service.updateBandwidth(ctx, bucket, signer.AddressedLimits...); err != nil {
		return storj.PieceID{}, nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}

	return signer.RootPieceID, signer.AddressedLimits, signer.PrivateKey, nil
}

// CreateAuditOrderLimits creates the order limits for auditing the pieces of a segment.
func (service *Service) CreateAuditOrderLimits(ctx context.Context, bucket metabase.BucketLocation, segment metabase.Segment, skip map[storj.NodeID]bool) (_ []*pb.AddressedOrderLimit, _ storj.PiecePrivateKey, cachedIPsAndPorts map[storj.NodeID]string, err error) {
	defer mon.Task()(&ctx)(&err)

	nodeIDs := make([]storj.NodeID, len(segment.Pieces))
	for i, piece := range segment.Pieces {
		nodeIDs[i] = piece.StorageNode
	}

	nodes, err := service.overlay.GetOnlineNodesForGetDelete(ctx, nodeIDs)
	if err != nil {
		service.log.Debug("error getting nodes from overlay", zap.Error(err))
		return nil, storj.PiecePrivateKey{}, nil, Error.Wrap(err)
	}

	signer, err := NewSignerAudit(service, segment.RootPieceID, time.Now(), int64(segment.Redundancy.ShareSize), bucket)
	if err != nil {
		return nil, storj.PiecePrivateKey{}, nil, Error.Wrap(err)
	}

	// The cached IP and port of each node is returned to the caller so the
	// audit worker can try it before falling back to the node address from
	// the order limit. Failing a single audit is costly for a storage node,
	// so both addresses are attempted before a failure is recorded, and
	// trying the cached address first avoids DNS queries on the satellite.
	cachedIPsAndPorts = make(map[storj.NodeID]string)
	var nodeErrors errs.Group
	var limitsCount int16
	limits := make([]*pb.AddressedOrderLimit, segment.Redundancy.TotalShares)
	for _, piece := range segment.Pieces {
		if skip[piece.StorageNode] {
			continue
		}
		node, ok := nodes[piece.StorageNode]
		if !ok {
			nodeErrors.Add(errs.New("node %q is not reliable", piece.StorageNode))
			continue
		}

		address := node.Address.Address
		if node.LastIPPort != "" {
			cachedIPsAndPorts[piece.StorageNode] = node.LastIPPort
		}
		limit, err := signer.Sign(ctx, storj.NodeURL{
			ID:      piece.StorageNode,
			Address: address,
		}, int32(piece.Number))
		if err != nil {
			return nil, storj.PiecePrivateKey{}, nil, Error.Wrap(err)
		}

		limits[piece.Number] = limit
		limitsCount++
	}

	if limitsCount < segment.Redundancy.RequiredShares {
		err = Error.New("not enough nodes available: got %d, required %d", limitsCount, segment.Redundancy.RequiredShares)
		return nil, storj.PiecePrivateKey{}, nil, errs.Combine(err, nodeErrors.Err())
	}

	if err := service.updateBandwidth(ctx, bucket, limits...); err != nil {
		return nil, storj.PiecePrivateKey{}, nil, Error.Wrap(err)
	}

	return limits, signer.PrivateKey, cachedIPsAndPorts, nil
}
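
// Audit caller sketch (illustrative): the verifier dials the cached IP:port
// first and falls back to the address in the order limit if the connection
// cannot be established or verified, so a stale cache entry alone never fails
// an audit:
//
//	target := limit.GetStorageNodeAddress().GetAddress()
//	if cached, ok := cachedIPsAndPorts[nodeID]; ok {
//		target = cached // try the last known IP:port first, avoiding DNS
//	}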

// CreateAuditOrderLimit creates an order limit for auditing a single piece from a segment.
func (service *Service) CreateAuditOrderLimit(ctx context.Context, bucket metabase.BucketLocation, nodeID storj.NodeID, pieceNum uint16, rootPieceID storj.PieceID, shareSize int32) (limit *pb.AddressedOrderLimit, _ storj.PiecePrivateKey, cachedIPAndPort string, err error) {
	// TODO reduce number of params ?
	defer mon.Task()(&ctx)(&err)

	node, err := service.overlay.Get(ctx, nodeID)
	if err != nil {
		return nil, storj.PiecePrivateKey{}, "", Error.Wrap(err)
	}
	if node.Disqualified != nil {
		return nil, storj.PiecePrivateKey{}, "", overlay.ErrNodeDisqualified.New("%v", nodeID)
	}
	if node.ExitStatus.ExitFinishedAt != nil {
		return nil, storj.PiecePrivateKey{}, "", overlay.ErrNodeFinishedGE.New("%v", nodeID)
	}
	if !service.overlay.IsOnline(node) {
		return nil, storj.PiecePrivateKey{}, "", overlay.ErrNodeOffline.New("%v", nodeID)
	}

	signer, err := NewSignerAudit(service, rootPieceID, time.Now(), int64(shareSize), bucket)
	if err != nil {
		return nil, storj.PiecePrivateKey{}, "", Error.Wrap(err)
	}

	orderLimit, err := signer.Sign(ctx, storj.NodeURL{
		ID:      nodeID,
		Address: node.Address.Address,
	}, int32(pieceNum))
	if err != nil {
		return nil, storj.PiecePrivateKey{}, "", Error.Wrap(err)
	}

	if err := service.updateBandwidth(ctx, bucket, orderLimit); err != nil {
		return nil, storj.PiecePrivateKey{}, "", Error.Wrap(err)
	}

	return orderLimit, signer.PrivateKey, node.LastIPPort, nil
}

// CreateGetRepairOrderLimits creates the order limits for downloading the
// healthy pieces of the segment as the source for repair.
//
// The length of the returned orders slice is the total number of pieces of the
// segment, setting to nil the ones which don't correspond to a healthy piece.
func (service *Service) CreateGetRepairOrderLimits(ctx context.Context, bucket metabase.BucketLocation, segment metabase.Segment, healthy metabase.Pieces) (_ []*pb.AddressedOrderLimit, _ storj.PiecePrivateKey, err error) {
	defer mon.Task()(&ctx)(&err)

	redundancy, err := eestream.NewRedundancyStrategyFromStorj(segment.Redundancy)
	if err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}

	pieceSize := eestream.CalcPieceSize(int64(segment.EncryptedSize), redundancy)
	totalPieces := redundancy.TotalCount()

	nodeIDs := make([]storj.NodeID, len(segment.Pieces))
	for i, piece := range segment.Pieces {
		nodeIDs[i] = piece.StorageNode
	}

	nodes, err := service.overlay.GetOnlineNodesForGetDelete(ctx, nodeIDs)
	if err != nil {
		service.log.Debug("error getting nodes from overlay", zap.Error(err))
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}

	signer, err := NewSignerRepairGet(service, segment.RootPieceID, time.Now(), pieceSize, bucket)
	if err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}

	var nodeErrors errs.Group
	var limitsCount int
	limits := make([]*pb.AddressedOrderLimit, totalPieces)
	for _, piece := range healthy {
		node, ok := nodes[piece.StorageNode]
		if !ok {
			nodeErrors.Add(errs.New("node %q is not reliable", piece.StorageNode))
			continue
		}

		limit, err := signer.Sign(ctx, storj.NodeURL{
			ID:      piece.StorageNode,
			Address: node.Address.Address,
		}, int32(piece.Number))
		if err != nil {
			return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
		}

		limits[piece.Number] = limit
		limitsCount++
	}

	if limitsCount < redundancy.RequiredCount() {
		err = Error.New("not enough nodes available: got %d, required %d", limitsCount, redundancy.RequiredCount())
		return nil, storj.PiecePrivateKey{}, errs.Combine(err, nodeErrors.Err())
	}

	if err := service.updateBandwidth(ctx, bucket, limits...); err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}

	return limits, signer.PrivateKey, nil
}
|
|
|
|
|
// CreatePutRepairOrderLimits creates the order limits for uploading the repaired pieces of the segment to newNodes.
func (service *Service) CreatePutRepairOrderLimits(ctx context.Context, bucket metabase.BucketLocation, segment metabase.Segment, getOrderLimits []*pb.AddressedOrderLimit, newNodes []*overlay.SelectedNode, optimalThresholdMultiplier float64) (_ []*pb.AddressedOrderLimit, _ storj.PiecePrivateKey, err error) {
	defer mon.Task()(&ctx)(&err)

	// Create the order limits used to upload the repaired pieces.
	redundancy, err := eestream.NewRedundancyStrategyFromStorj(segment.Redundancy)
	if err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}
	pieceSize := eestream.CalcPieceSize(int64(segment.EncryptedSize), redundancy)

	totalPieces := redundancy.TotalCount()
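
	// Repair over-provisions: it targets the optimal threshold scaled by
	// optimalThresholdMultiplier, capped at the total piece count. For
	// example (illustrative numbers only), an optimal threshold of 80 with
	// a multiplier of 1.05 targets ceil(80 * 1.05) = 84 pieces.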
	totalPiecesAfterRepair := int(math.Ceil(float64(redundancy.OptimalThreshold()) * optimalThresholdMultiplier))
	if totalPiecesAfterRepair > totalPieces {
		totalPiecesAfterRepair = totalPieces
	}

	var numCurrentPieces int
	for _, o := range getOrderLimits {
		if o != nil {
			numCurrentPieces++
		}
	}

	totalPiecesToRepair := totalPiecesAfterRepair - numCurrentPieces

	limits := make([]*pb.AddressedOrderLimit, totalPieces)

	expirationDate := time.Time{} // TODO previously 'pointer.ExpirationDate'
	signer, err := NewSignerRepairPut(service, segment.RootPieceID, expirationDate, time.Now(), pieceSize, bucket)
	if err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}
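
	// Walk the piece numbers in order, assigning each new node to the next
	// position that has no healthy piece (a nil entry in getOrderLimits),
	// so the repaired shares fill exactly the gaps left by lost pieces.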
	var pieceNum int32
	for _, node := range newNodes {
		for int(pieceNum) < totalPieces && getOrderLimits[pieceNum] != nil {
			pieceNum++
		}

		if int(pieceNum) >= totalPieces { // should not happen
			return nil, storj.PiecePrivateKey{}, Error.New("piece num greater than total pieces: %d >= %d", pieceNum, totalPieces)
		}

		limit, err := signer.Sign(ctx, storj.NodeURL{
			ID:      node.ID,
			Address: node.Address.Address,
		}, pieceNum)
		if err != nil {
			return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
		}

		limits[pieceNum] = limit
		pieceNum++
		totalPiecesToRepair--

		if totalPiecesToRepair == 0 {
			break
		}
	}

	if err := service.updateBandwidth(ctx, bucket, limits...); err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}

	return limits, signer.PrivateKey, nil
}

// CreateGracefulExitPutOrderLimit creates an order limit for graceful exit put transfers.
func (service *Service) CreateGracefulExitPutOrderLimit(ctx context.Context, bucket metabase.BucketLocation, nodeID storj.NodeID, pieceNum int32, rootPieceID storj.PieceID, shareSize int32) (limit *pb.AddressedOrderLimit, _ storj.PiecePrivateKey, err error) {
	defer mon.Task()(&ctx)(&err)

	// should this use KnownReliable or similar?
	node, err := service.overlay.Get(ctx, nodeID)
	if err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}
	if node.Disqualified != nil {
		return nil, storj.PiecePrivateKey{}, overlay.ErrNodeDisqualified.New("%v", nodeID)
	}
	if !service.overlay.IsOnline(node) {
		return nil, storj.PiecePrivateKey{}, overlay.ErrNodeOffline.New("%v", nodeID)
	}

	signer, err := NewSignerGracefulExit(service, rootPieceID, time.Now(), shareSize, bucket)
	if err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}
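
	// Prefer the cached last known IP and port when present; the advertised
	// address may be a DNS name that would need resolving.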
	address := node.Address.Address
	if node.LastIPPort != "" {
		address = node.LastIPPort
	}
	nodeURL := storj.NodeURL{ID: nodeID, Address: address}
	limit, err = signer.Sign(ctx, nodeURL, pieceNum)
	if err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}

	if err := service.updateBandwidth(ctx, bucket, limit); err != nil {
		return nil, storj.PiecePrivateKey{}, Error.Wrap(err)
	}

	return limit, signer.PrivateKey, nil
}

// UpdateGetInlineOrder updates the amount of inline GET bandwidth for the given bucket.
func (service *Service) UpdateGetInlineOrder(ctx context.Context, bucket metabase.BucketLocation, amount int64) (err error) {
	defer mon.Task()(&ctx)(&err)
	now := time.Now().UTC()
	intervalStart := time.Date(now.Year(), now.Month(), now.Day(), now.Hour(), 0, 0, 0, now.Location())
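
	// Inline segments live on the satellite rather than on storage nodes,
	// so the bandwidth is recorded against the bucket only, truncated to
	// the hour to match the rollup granularity.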
	return service.orders.UpdateBucketBandwidthInline(ctx, bucket.ProjectID, []byte(bucket.BucketName), pb.PieceAction_GET, amount, intervalStart)
}

// UpdatePutInlineOrder updates the amount of inline PUT bandwidth for the given bucket.
func (service *Service) UpdatePutInlineOrder(ctx context.Context, bucket metabase.BucketLocation, amount int64) (err error) {
	defer mon.Task()(&ctx)(&err)
	now := time.Now().UTC()
	intervalStart := time.Date(now.Year(), now.Month(), now.Day(), now.Hour(), 0, 0, 0, now.Location())

	return service.orders.UpdateBucketBandwidthInline(ctx, bucket.ProjectID, []byte(bucket.BucketName), pb.PieceAction_PUT, amount, intervalStart)
}

// DecryptOrderMetadata decrypts the order metadata.
func (service *Service) DecryptOrderMetadata(ctx context.Context, order *pb.OrderLimit) (_ *internalpb.OrderLimitMetadata, err error) {
	defer mon.Task()(&ctx)(&err)

	var orderKeyID EncryptionKeyID
	copy(orderKeyID[:], order.EncryptedMetadataKeyId)
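
	// Orders carry the ID of the key that encrypted their metadata: try the
	// current default key first, then fall back to the configured key set,
	// so orders issued before a key rotation remain decryptable.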
	key := service.encryptionKeys.Default
	if key.ID != orderKeyID {
		val, ok := service.encryptionKeys.KeyByID[orderKeyID]
		if !ok {
			return nil, ErrDecryptOrderMetadata.New("no encryption key found that matches the order.EncryptedMetadataKeyId")
		}
		key = EncryptionKey{
			ID:  orderKeyID,
			Key: val,
		}
	}
	return key.DecryptMetadata(order.SerialNumber, order.EncryptedMetadata)
}