// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package audit

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"math/rand"
	"time"

	"github.com/davecgh/go-spew/spew"
	"github.com/spacemonkeygo/monkit/v3"
	"github.com/vivint/infectious"
	"github.com/zeebo/errs"
	"go.uber.org/zap"

	"storj.io/common/errs2"
	"storj.io/common/identity"
	"storj.io/common/memory"
	"storj.io/common/pb"
	"storj.io/common/rpc"
	"storj.io/common/rpc/rpcpool"
	"storj.io/common/rpc/rpcstatus"
	"storj.io/common/storj"
	"storj.io/storj/satellite/metabase"
	"storj.io/storj/satellite/orders"
	"storj.io/storj/satellite/overlay"
	"storj.io/uplink/private/piecestore"
)

var (
	mon = monkit.Package()

	// ErrNotEnoughShares is the errs class for when not enough shares are available to do an audit.
	ErrNotEnoughShares = errs.Class("not enough shares for successful audit")
	// ErrSegmentDeleted is the errs class when the audited segment was deleted during the audit.
	ErrSegmentDeleted = errs.Class("segment deleted during audit")
	// ErrSegmentModified is the errs class used when a segment has been changed in any way.
	ErrSegmentModified = errs.Class("segment has been modified")
)
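
// The error classes above are checked with their Has method. A minimal sketch
// of how a caller might branch on them (illustrative only; the surrounding
// variable names are assumptions, not part of this package):
//
//	report, err := verifier.Verify(ctx, segment, nil)
//	if ErrNotEnoughShares.Has(err) {
//		// too few shares were retrievable; the report still lists offline
//		// and unknown nodes and should be recorded rather than discarded.
//	}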

// Share represents required information about an audited share.
type Share struct {
	Error    error
	PieceNum int
	NodeID   storj.NodeID
	Data     []byte
}

// Verifier helps verify the correctness of a given stripe.
//
// architecture: Worker
type Verifier struct {
	log                *zap.Logger
	metabase           *metabase.DB
	orders             *orders.Service
	auditor            *identity.PeerIdentity
	dialer             rpc.Dialer
	overlay            *overlay.Service
	containment        Containment
	minBytesPerSecond  memory.Size
	minDownloadTimeout time.Duration

	nowFn                            func() time.Time
	OnTestingCheckSegmentAlteredHook func()
}

// NewVerifier creates a Verifier.
func NewVerifier(log *zap.Logger, metabase *metabase.DB, dialer rpc.Dialer, overlay *overlay.Service, containment Containment, orders *orders.Service, id *identity.FullIdentity, minBytesPerSecond memory.Size, minDownloadTimeout time.Duration) *Verifier {
	return &Verifier{
		log:                log,
		metabase:           metabase,
		orders:             orders,
		auditor:            id.PeerIdentity(),
		dialer:             dialer,
		overlay:            overlay,
		containment:        containment,
		minBytesPerSecond:  minBytesPerSecond,
		minDownloadTimeout: minDownloadTimeout,
		nowFn:              time.Now,
	}
}
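
// A minimal usage sketch (illustrative only): an audit worker constructs one
// Verifier and calls Verify per selected segment. The concrete values below
// (logger and DB handles, the 128 B/s floor, the 5 minute timeout) are
// assumptions for illustration, not the satellite's actual configuration.
//
//	verifier := NewVerifier(log, metabaseDB, dialer, overlayService,
//		containment, ordersService, fullIdentity,
//		128*memory.B, 5*time.Minute)
//	report, err := verifier.Verify(ctx, segment, nil)
//	// the report is recorded even when err is non-nil, e.g. for ErrNotEnoughShares.
//	_, _ = report, err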

// Verify downloads shares then verifies the data correctness at a random stripe.
func (verifier *Verifier) Verify(ctx context.Context, segment Segment, skip map[storj.NodeID]bool) (report Report, err error) {
	defer mon.Task()(&ctx)(&err)

	var segmentInfo metabase.Segment
	defer func() {
		recordStats(report, len(segmentInfo.Pieces), err)
	}()

	if segment.Expired(verifier.nowFn()) {
		verifier.log.Debug("segment expired before Verify")
		return Report{}, nil
	}

	segmentInfo, err = verifier.metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
		StreamID: segment.StreamID,
		Position: segment.Position,
	})
	if err != nil {
		if metabase.ErrSegmentNotFound.Has(err) {
			verifier.log.Debug("segment deleted before Verify")
			return Report{}, nil
		}
		return Report{}, err
	}

	randomIndex, err := GetRandomStripe(ctx, segmentInfo)
	if err != nil {
		return Report{}, err
	}

	var offlineNodes storj.NodeIDList
	var failedNodes storj.NodeIDList
	var unknownNodes storj.NodeIDList
	containedNodes := make(map[int]storj.NodeID)
	sharesToAudit := make(map[int]Share)

	orderLimits, privateKey, cachedNodesInfo, err := verifier.orders.CreateAuditOrderLimits(ctx, segmentInfo, skip)
	if err != nil {
		if orders.ErrDownloadFailedNotEnoughPieces.Has(err) {
			mon.Counter("not_enough_shares_for_audit").Inc(1)   //mon:locked
			mon.Counter("audit_not_enough_nodes_online").Inc(1) //mon:locked
			err = ErrNotEnoughShares.Wrap(err)
		}
		return Report{}, err
	}

	cachedNodesReputation := make(map[storj.NodeID]overlay.ReputationStatus, len(cachedNodesInfo))
	for id, info := range cachedNodesInfo {
		cachedNodesReputation[id] = info.Reputation
	}
	defer func() { report.NodesReputation = cachedNodesReputation }()

	// NOTE offlineNodes will include disqualified nodes because they aren't in
	// the skip list
	offlineNodes = getOfflineNodes(segmentInfo, orderLimits, skip)
	if len(offlineNodes) > 0 {
		verifier.log.Debug("Verify: order limits not created for some nodes (offline/disqualified)",
			zap.Strings("Node IDs", offlineNodes.Strings()),
			zap.String("Segment", segmentInfoString(segment)))
	}

	shares, err := verifier.DownloadShares(ctx, orderLimits, privateKey, cachedNodesInfo, randomIndex, segmentInfo.Redundancy.ShareSize)
	if err != nil {
		return Report{
			Offlines: offlineNodes,
		}, err
	}

	err = verifier.checkIfSegmentAltered(ctx, segmentInfo)
	if err != nil {
		if ErrSegmentDeleted.Has(err) {
			verifier.log.Debug("segment deleted during Verify")
			return Report{}, nil
		}
		if ErrSegmentModified.Has(err) {
			verifier.log.Debug("segment modified during Verify")
			return Report{}, nil
		}
		return Report{
			Offlines: offlineNodes,
		}, err
	}

	for pieceNum, share := range shares {
		if share.Error == nil {
			// no error -- share downloaded successfully
			sharesToAudit[pieceNum] = share
			continue
		}

		// TODO: just because an error came from the common/rpc package
		// does not decisively mean that the problem is something to do
		// with dialing. instead of trying to guess what different
		// error classes mean, we should make GetShare inside
		// DownloadShares return more direct information about when
		// the failure happened.
		if rpc.Error.Has(share.Error) {
			if errors.Is(share.Error, context.DeadlineExceeded) || errs.Is(share.Error, context.DeadlineExceeded) {
				// dial timeout
				offlineNodes = append(offlineNodes, share.NodeID)
				verifier.log.Debug("Verify: dial timeout (offline)",
					zap.Stringer("Node ID", share.NodeID),
					zap.String("Segment", segmentInfoString(segment)),
					zap.Error(share.Error))
				continue
			}
			if errs2.IsRPC(share.Error, rpcstatus.Unknown) {
				// dial failed -- offline node
				// TODO: we should never assume what an unknown
				// error means. This should be looking for an explicit
				// indication that dialing failed, not assuming dialing
				// failed because the rpc status is unknown
				offlineNodes = append(offlineNodes, share.NodeID)
				verifier.log.Debug("Verify: dial failed (offline)",
					zap.Stringer("Node ID", share.NodeID),
					zap.String("Segment", segmentInfoString(segment)),
					zap.Error(share.Error))
				continue
			}
			// unknown transport error
			unknownNodes = append(unknownNodes, share.NodeID)
			verifier.log.Info("Verify: unknown transport error (skipped)",
				zap.Stringer("Node ID", share.NodeID),
				zap.String("Segment", segmentInfoString(segment)),
				zap.Error(share.Error),
				zap.String("ErrorType", spew.Sprintf("%#+v", share.Error)))
			continue
		}

		if errs2.IsRPC(share.Error, rpcstatus.NotFound) {
			// missing share
			failedNodes = append(failedNodes, share.NodeID)
			verifier.log.Info("Verify: piece not found (audit failed)",
				zap.Stringer("Node ID", share.NodeID),
				zap.String("Segment", segmentInfoString(segment)),
				zap.Error(share.Error))
			continue
		}

		if errs2.IsRPC(share.Error, rpcstatus.DeadlineExceeded) {
			// dial successful, but download timed out
			containedNodes[pieceNum] = share.NodeID
			verifier.log.Info("Verify: download timeout (contained)",
				zap.Stringer("Node ID", share.NodeID),
				zap.String("Segment", segmentInfoString(segment)),
				zap.Error(share.Error))
			continue
		}

		// unknown error
		unknownNodes = append(unknownNodes, share.NodeID)
		verifier.log.Info("Verify: unknown error (skipped)",
			zap.Stringer("Node ID", share.NodeID),
			zap.String("Segment", segmentInfoString(segment)),
			zap.Error(share.Error),
			zap.String("ErrorType", spew.Sprintf("%#+v", share.Error)))
	}

	mon.IntVal("verify_shares_downloaded_successfully").Observe(int64(len(sharesToAudit))) //mon:locked

	required := segmentInfo.Redundancy.RequiredShares
	total := segmentInfo.Redundancy.TotalShares

	if len(sharesToAudit) < int(required) {
		mon.Counter("not_enough_shares_for_audit").Inc(1) //mon:locked
		// if we have reached this point, most likely something went wrong
		// like a network problem or a forgotten delete. Don't fail nodes.
		// We have an alert on this. Check the logs and see what happened.
		if len(offlineNodes)+len(containedNodes) > len(sharesToAudit)+len(failedNodes)+len(unknownNodes) {
			mon.Counter("audit_suspected_network_problem").Inc(1) //mon:locked
		} else {
			mon.Counter("audit_not_enough_shares_acquired").Inc(1) //mon:locked
		}
		report := Report{
			Offlines: offlineNodes,
			Unknown:  unknownNodes,
		}
		return report, ErrNotEnoughShares.New("got: %d, required: %d, failed: %d, offline: %d, unknown: %d, contained: %d",
			len(sharesToAudit), required, len(failedNodes), len(offlineNodes), len(unknownNodes), len(containedNodes))
	}

	// ensure we get values, even if only zero values, so that redash can have an alert based on these
	mon.Counter("not_enough_shares_for_audit").Inc(0)      //mon:locked
	mon.Counter("audit_not_enough_nodes_online").Inc(0)    //mon:locked
	mon.Counter("audit_not_enough_shares_acquired").Inc(0) //mon:locked
	mon.Counter("could_not_verify_audit_shares").Inc(0)    //mon:locked
	mon.Counter("audit_suspected_network_problem").Inc(0)  //mon:locked

	pieceNums, _, err := auditShares(ctx, required, total, sharesToAudit)
	if err != nil {
		mon.Counter("could_not_verify_audit_shares").Inc(1) //mon:locked
		verifier.log.Error("could not verify shares", zap.String("Segment", segmentInfoString(segment)), zap.Error(err))
		return Report{
			Fails:    failedNodes,
			Offlines: offlineNodes,
			Unknown:  unknownNodes,
		}, err
	}

	for _, pieceNum := range pieceNums {
		verifier.log.Info("Verify: share data altered (audit failed)",
			zap.Stringer("Node ID", shares[pieceNum].NodeID),
			zap.String("Segment", segmentInfoString(segment)))
		failedNodes = append(failedNodes, shares[pieceNum].NodeID)
	}

	successNodes := getSuccessNodes(ctx, shares, failedNodes, offlineNodes, unknownNodes, containedNodes)

	pendingAudits, err := createPendingAudits(ctx, containedNodes, segment)
	if err != nil {
		return Report{
			Successes: successNodes,
			Fails:     failedNodes,
			Offlines:  offlineNodes,
			Unknown:   unknownNodes,
		}, err
	}

	return Report{
		Successes:     successNodes,
		Fails:         failedNodes,
		Offlines:      offlineNodes,
		PendingAudits: pendingAudits,
		Unknown:       unknownNodes,
	}, nil
}
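
// To summarize the classification above (a reading aid, not new behavior):
// Successes hold nodes whose share matched the Reed-Solomon reconstruction,
// Fails hold nodes with missing or altered pieces, Offlines hold nodes that
// could not be dialed or had no order limit, PendingAudits hold contained
// nodes whose download timed out after a successful dial, and Unknown holds
// nodes that failed with unrecognized errors.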

// segmentInfoString returns a compact StreamID/Position identifier for the
// audited segment, used in log messages.
func segmentInfoString(segment Segment) string {
	return fmt.Sprintf("%s/%d",
		segment.StreamID.String(),
		segment.Position.Encode(),
	)
}

// DownloadShares downloads shares from the nodes where remote pieces are located.
func (verifier *Verifier) DownloadShares(ctx context.Context, limits []*pb.AddressedOrderLimit, piecePrivateKey storj.PiecePrivateKey, cachedNodesInfo map[storj.NodeID]overlay.NodeReputation, stripeIndex int32, shareSize int32) (shares map[int]Share, err error) {
	defer mon.Task()(&ctx)(&err)

	shares = make(map[int]Share, len(limits))
	// buffered to len(limits) so that result senders never block
	ch := make(chan *Share, len(limits))

	for i, limit := range limits {
		if limit == nil {
			// send a nil placeholder so the collection loop below stays in
			// sync with len(limits) even for skipped nodes
			ch <- nil
			continue
		}

		var ipPort string
		node, ok := cachedNodesInfo[limit.Limit.StorageNodeId]
		if ok && node.LastIPPort != "" {
			ipPort = node.LastIPPort
		}

		go func(i int, limit *pb.AddressedOrderLimit) {
			share, err := verifier.GetShare(ctx, limit, piecePrivateKey, ipPort, stripeIndex, shareSize, i)
			if err != nil {
				share = Share{
					Error:    err,
					PieceNum: i,
					NodeID:   limit.GetLimit().StorageNodeId,
					Data:     nil,
				}
			}
			ch <- &share
		}(i, limit)
	}

	for range limits {
		share := <-ch
		if share != nil {
			shares[share.PieceNum] = *share
		}
	}

	return shares, nil
}

// IdentifyContainedNodes returns the set of all contained nodes out of the
// holders of pieces in the given segment.
func (verifier *Verifier) IdentifyContainedNodes(ctx context.Context, segment Segment) (skipList map[storj.NodeID]bool, err error) {
	segmentInfo, err := verifier.metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
		StreamID: segment.StreamID,
		Position: segment.Position,
	})
	if err != nil {
		return nil, err
	}

	skipList = make(map[storj.NodeID]bool)
	for _, piece := range segmentInfo.Pieces {
		_, err := verifier.containment.Get(ctx, piece.StorageNode)
		if err != nil {
			if ErrContainedNotFound.Has(err) {
				continue
			}
			verifier.log.Error("can not determine if node is contained", zap.Stringer("node-id", piece.StorageNode), zap.Error(err))
			continue
		}
		skipList[piece.StorageNode] = true
	}
	return skipList, nil
}
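
// A minimal sketch (illustrative only) of how the skip list produced by
// IdentifyContainedNodes is typically fed back into Verify, so that nodes
// already under containment are not audited again in the same pass; the exact
// wiring lives in the audit worker, not here:
//
//	skip, err := verifier.IdentifyContainedNodes(ctx, segment)
//	if err == nil {
//		report, verifyErr := verifier.Verify(ctx, segment, skip)
//		_, _ = report, verifyErr
//	}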

// GetShare uses the piecestore client to download a single share from a node.
func (verifier *Verifier) GetShare(ctx context.Context, limit *pb.AddressedOrderLimit, piecePrivateKey storj.PiecePrivateKey, cachedIPAndPort string, stripeIndex, shareSize int32, pieceNum int) (share Share, err error) {
	defer mon.Task()(&ctx)(&err)

	bandwidthMsgSize := shareSize

	// determine the time allotted for receiving data from a storage node
	timedCtx := ctx
	if verifier.minBytesPerSecond > 0 {
		maxTransferTime := time.Duration(int64(time.Second) * int64(bandwidthMsgSize) / verifier.minBytesPerSecond.Int64())
		if maxTransferTime < verifier.minDownloadTimeout {
			maxTransferTime = verifier.minDownloadTimeout
		}
		var cancel func()
		timedCtx, cancel = context.WithTimeout(ctx, maxTransferTime)
		defer cancel()
	}

	targetNodeID := limit.GetLimit().StorageNodeId
	log := verifier.log.Named(targetNodeID.String())
	var ps *piecestore.Client

	// if cached IP is given, try connecting there first; this avoids penalizing
	// a node for a satellite-side DNS failure, while the SNO-supplied address
	// below remains the fallback
	if cachedIPAndPort != "" {
		nodeAddr := storj.NodeURL{
			ID:      targetNodeID,
			Address: cachedIPAndPort,
		}
		ps, err = piecestore.Dial(rpcpool.WithForceDial(timedCtx), verifier.dialer, nodeAddr, piecestore.DefaultConfig)
		if err != nil {
			log.Debug("failed to connect to audit target node at cached IP", zap.String("cached-ip-and-port", cachedIPAndPort), zap.Error(err))
		}
	}

	// if no cached IP was given, or connecting to cached IP failed, use node address
	if ps == nil {
		nodeAddr := storj.NodeURL{
			ID:      targetNodeID,
			Address: limit.GetStorageNodeAddress().Address,
		}
		ps, err = piecestore.Dial(rpcpool.WithForceDial(timedCtx), verifier.dialer, nodeAddr, piecestore.DefaultConfig)
		if err != nil {
			return Share{}, Error.Wrap(err)
		}
	}

	defer func() {
		err := ps.Close()
		if err != nil {
			verifier.log.Error("audit verifier failed to close conn to node", zap.Error(err))
		}
	}()

	offset := int64(shareSize) * int64(stripeIndex)

	downloader, err := ps.Download(timedCtx, limit.GetLimit(), piecePrivateKey, offset, int64(shareSize))
	if err != nil {
		return Share{}, err
	}
	defer func() { err = errs.Combine(err, downloader.Close()) }()

	buf := make([]byte, shareSize)
	_, err = io.ReadFull(downloader, buf)
	if err != nil {
		return Share{}, err
	}

	return Share{
		Error:    nil,
		PieceNum: pieceNum,
		NodeID:   targetNodeID,
		Data:     buf,
	}, nil
}
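
// Worked example for the arithmetic in GetShare (illustrative numbers only):
// with shareSize = 65536 bytes and minBytesPerSecond = 128 B/s, the computed
// maxTransferTime is 65536/128 = 512 s; if minDownloadTimeout were larger,
// say 600 s, that floor would win instead. For stripeIndex = 10 and the same
// share size, the download starts at offset 10*65536 = 655360 within the
// piece and reads exactly one share (65536 bytes).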

// checkIfSegmentAltered checks if oldSegment has been altered since it was selected for audit.
func (verifier *Verifier) checkIfSegmentAltered(ctx context.Context, oldSegment metabase.Segment) (err error) {
	defer mon.Task()(&ctx)(&err)

	if verifier.OnTestingCheckSegmentAlteredHook != nil {
		verifier.OnTestingCheckSegmentAlteredHook()
	}

	newSegment, err := verifier.metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
		StreamID: oldSegment.StreamID,
		Position: oldSegment.Position,
	})
	if err != nil {
		if metabase.ErrSegmentNotFound.Has(err) {
			return ErrSegmentDeleted.New("StreamID: %q Position: %d", oldSegment.StreamID.String(), oldSegment.Position.Encode())
		}
		return err
	}

	if !oldSegment.Pieces.Equal(newSegment.Pieces) {
		return ErrSegmentModified.New("StreamID: %q Position: %d", oldSegment.StreamID.String(), oldSegment.Position.Encode())
	}
	return nil
}

// SetNow allows tests to have the server act as if the current time is whatever they want.
func (verifier *Verifier) SetNow(nowFn func() time.Time) {
	verifier.nowFn = nowFn
}

// auditShares takes the downloaded shares and uses infectious's Correct function to check that they
// haven't been altered. auditShares returns a slice containing the piece numbers of altered shares,
// and a slice of the corrected shares.
func auditShares(ctx context.Context, required, total int16, originals map[int]Share) (pieceNums []int, corrected []infectious.Share, err error) {
	defer mon.Task()(&ctx)(&err)

	f, err := infectious.NewFEC(int(required), int(total))
	if err != nil {
		return nil, nil, err
	}

	copies, err := makeCopies(ctx, originals)
	if err != nil {
		return nil, nil, err
	}

	err = f.Correct(copies)
	if err != nil {
		return nil, nil, err
	}

	for _, share := range copies {
		if !bytes.Equal(originals[share.Number].Data, share.Data) {
			pieceNums = append(pieceNums, share.Number)
		}
	}
	return pieceNums, copies, nil
}
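
// For example (illustrative numbers only): with required = 29 and total = 110,
// Correct repairs the downloaded share set in place; any share whose corrected
// bytes differ from what the node actually returned shows up in pieceNums, and
// Verify then marks that node as failed.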

// makeCopies takes in a map of audit Shares and deep copies their data to a slice of infectious Shares.
func makeCopies(ctx context.Context, originals map[int]Share) (copies []infectious.Share, err error) {
	defer mon.Task()(&ctx)(&err)

	copies = make([]infectious.Share, 0, len(originals))
	for _, original := range originals {
		copies = append(copies, infectious.Share{
			Data:   append([]byte{}, original.Data...),
			Number: original.PieceNum})
	}
	return copies, nil
}

// getOfflineNodes returns those storage nodes from the segment that have
// neither an order limit nor an entry in the skip list.
func getOfflineNodes(segment metabase.Segment, limits []*pb.AddressedOrderLimit, skip map[storj.NodeID]bool) storj.NodeIDList {
	var offlines storj.NodeIDList

	nodesWithLimit := make(map[storj.NodeID]bool, len(limits))
	for _, limit := range limits {
		if limit != nil {
			nodesWithLimit[limit.GetLimit().StorageNodeId] = true
		}
	}

	for _, piece := range segment.Pieces {
		if !nodesWithLimit[piece.StorageNode] && !skip[piece.StorageNode] {
			offlines = append(offlines, piece.StorageNode)
		}
	}

	return offlines
}

// getSuccessNodes uses the failed, offline, unknown, and contained node lists
// to determine which nodes passed the audit.
func getSuccessNodes(ctx context.Context, shares map[int]Share, failedNodes, offlineNodes, unknownNodes storj.NodeIDList, containedNodes map[int]storj.NodeID) (successNodes storj.NodeIDList) {
	defer mon.Task()(&ctx)(nil)

	fails := make(map[storj.NodeID]bool)
	for _, fail := range failedNodes {
		fails[fail] = true
	}
	for _, offline := range offlineNodes {
		fails[offline] = true
	}
	for _, unknown := range unknownNodes {
		fails[unknown] = true
	}
	for _, contained := range containedNodes {
		fails[contained] = true
	}

	for _, share := range shares {
		if !fails[share.NodeID] {
			successNodes = append(successNodes, share.NodeID)
		}
	}

	return successNodes
}

// createPendingAudits creates a reverification job for every contained node,
// pinning down the exact (node, stream, position, piece) to re-check later.
func createPendingAudits(ctx context.Context, containedNodes map[int]storj.NodeID, segment Segment) (pending []*ReverificationJob, err error) {
	defer mon.Task()(&ctx)(&err)

	if len(containedNodes) == 0 {
		return nil, nil
	}

	pending = make([]*ReverificationJob, 0, len(containedNodes))
	for pieceNum, nodeID := range containedNodes {
		pending = append(pending, &ReverificationJob{
			Locator: PieceLocator{
				NodeID:   nodeID,
				StreamID: segment.StreamID,
				Position: segment.Position,
				PieceNum: pieceNum,
			},
		})
	}

	return pending, nil
}

// GetRandomStripe takes a segment and returns a random stripe index within that segment.
func GetRandomStripe(ctx context.Context, segment metabase.Segment) (index int32, err error) {
	defer mon.Task()(&ctx)(&err)

	// the last segment could be smaller than stripe size
	if segment.EncryptedSize < segment.Redundancy.StripeSize() {
		return 0, nil
	}

	var src cryptoSource
	rnd := rand.New(src)
	numStripes := segment.Redundancy.StripeCount(segment.EncryptedSize)
	randomStripeIndex := rnd.Int31n(numStripes)

	return randomStripeIndex, nil
}
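
// Worked example (illustrative numbers only): a segment with an EncryptedSize
// of 1 MiB and a stripe size of 4 KiB has 256 stripes, so the returned index
// is uniform over [0, 255]. Segments smaller than one stripe always audit
// index 0.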

// recordStats emits monkit metrics describing the outcome of a single audit.
func recordStats(report Report, totalPieces int, verifyErr error) {
	// If an audit was able to complete without auditing any nodes, that means
	// the segment has been altered.
	if verifyErr == nil && len(report.Successes) == 0 {
		return
	}

	numOffline := len(report.Offlines)
	numSuccessful := len(report.Successes)
	numFailed := len(report.Fails)
	numContained := len(report.PendingAudits)
	numUnknown := len(report.Unknown)

	totalAudited := numSuccessful + numFailed + numOffline + numContained
	auditedPercentage := float64(totalAudited) / float64(totalPieces)
	offlinePercentage := float64(0)
	successfulPercentage := float64(0)
	failedPercentage := float64(0)
	containedPercentage := float64(0)
	unknownPercentage := float64(0)
	if totalAudited > 0 {
		offlinePercentage = float64(numOffline) / float64(totalAudited)
		successfulPercentage = float64(numSuccessful) / float64(totalAudited)
		failedPercentage = float64(numFailed) / float64(totalAudited)
		containedPercentage = float64(numContained) / float64(totalAudited)
		unknownPercentage = float64(numUnknown) / float64(totalAudited)
	}

	mon.Meter("audit_success_nodes_global").Mark(numSuccessful)     //mon:locked
	mon.Meter("audit_fail_nodes_global").Mark(numFailed)            //mon:locked
	mon.Meter("audit_offline_nodes_global").Mark(numOffline)        //mon:locked
	mon.Meter("audit_contained_nodes_global").Mark(numContained)    //mon:locked
	mon.Meter("audit_unknown_nodes_global").Mark(numUnknown)        //mon:locked
	mon.Meter("audit_total_nodes_global").Mark(totalAudited)        //mon:locked
	mon.Meter("audit_total_pointer_nodes_global").Mark(totalPieces) //mon:locked

	mon.IntVal("audit_success_nodes").Observe(int64(numSuccessful))           //mon:locked
	mon.IntVal("audit_fail_nodes").Observe(int64(numFailed))                  //mon:locked
	mon.IntVal("audit_offline_nodes").Observe(int64(numOffline))              //mon:locked
	mon.IntVal("audit_contained_nodes").Observe(int64(numContained))          //mon:locked
	mon.IntVal("audit_unknown_nodes").Observe(int64(numUnknown))              //mon:locked
	mon.IntVal("audit_total_nodes").Observe(int64(totalAudited))              //mon:locked
	mon.IntVal("audit_total_pointer_nodes").Observe(int64(totalPieces))       //mon:locked
	mon.FloatVal("audited_percentage").Observe(auditedPercentage)             //mon:locked
	mon.FloatVal("audit_offline_percentage").Observe(offlinePercentage)       //mon:locked
	mon.FloatVal("audit_successful_percentage").Observe(successfulPercentage) //mon:locked
	mon.FloatVal("audit_failed_percentage").Observe(failedPercentage)         //mon:locked
	mon.FloatVal("audit_contained_percentage").Observe(containedPercentage)   //mon:locked
	mon.FloatVal("audit_unknown_percentage").Observe(unknownPercentage)       //mon:locked
}