// Copyright (C) 2022 Storj Labs, Inc.
// See LICENSE for copying information.

package audit

import (
	"bytes"
	"context"
	"io"
	"time"

	"github.com/zeebo/errs"
	"go.uber.org/zap"

	"storj.io/common/errs2"
	"storj.io/common/pb"
	"storj.io/common/pkcrypto"
	"storj.io/common/rpc"
	"storj.io/common/rpc/rpcstatus"
	"storj.io/common/signing"
	"storj.io/common/storj"
	"storj.io/common/uuid"
	"storj.io/storj/satellite/metabase"
	"storj.io/storj/satellite/overlay"
	"storj.io/uplink/private/piecestore"
)

// PieceLocator specifies all information necessary to look up a particular piece
// on a particular satellite.
type PieceLocator struct {
	StreamID uuid.UUID
	Position metabase.SegmentPosition
	NodeID   storj.NodeID
	PieceNum int
}

// ReverificationJob represents a job as received from the reverification
// audit queue.
type ReverificationJob struct {
	Locator       PieceLocator
	InsertedAt    time.Time
	ReverifyCount int
	LastAttempt   time.Time
}

// Reverifier pulls jobs from the reverification queue and fulfills them
// by performing the requested reverifications.
//
// architecture: Worker
type Reverifier struct {
	*Verifier

	log *zap.Logger
	db  ReverifyQueue

	// retryInterval defines a limit on how frequently we will retry
	// reverification audits. At least this long should elapse between
	// attempts.
	retryInterval time.Duration
}
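
// The retry gating described by retryInterval can be illustrated with a rough
// sketch. The helper below is hypothetical (it is not part of this package or
// of the ReverifyQueue interface); it only summarizes the intended behavior:
//
//	// shouldAttempt reports whether enough time has passed since the last
//	// attempt for a reverification job to be retried.
//	func shouldAttempt(job *ReverificationJob, retryInterval time.Duration, now time.Time) bool {
//		return job.LastAttempt.IsZero() || now.Sub(job.LastAttempt) >= retryInterval
//	}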

// Outcome enumerates the possible results of a piecewise audit.
//
// Note that it is very similar to reputation.AuditType, but it is
// different in scope and needs a slightly different set of values.
type Outcome int

const (
	// OutcomeNotPerformed indicates an audit was not performed, for any of a
	// variety of reasons, but that it should be reattempted later.
	OutcomeNotPerformed Outcome = iota
	// OutcomeNotNecessary indicates that an audit is no longer required,
	// for example because the segment has been updated or no longer exists.
	OutcomeNotNecessary
	// OutcomeSuccess indicates that an audit took place and the piece was
	// fully validated.
	OutcomeSuccess
	// OutcomeFailure indicates that an audit took place but that the node
	// failed the audit, either because it did not have the piece or the
	// data was incorrect.
	OutcomeFailure
	// OutcomeTimedOut indicates the audit could not be completed because
	// it took too long. The audit should be retried later.
	OutcomeTimedOut
	// OutcomeNodeOffline indicates that the audit could not be completed
	// because the node could not be contacted. The audit should be
	// retried later.
	OutcomeNodeOffline
	// OutcomeUnknownError indicates that the audit could not be completed
	// because of an error not otherwise expected or recognized. The
	// audit should be retried later.
	OutcomeUnknownError
)
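
// As a rough illustration (not part of this package's API), a caller could
// fold these values into a single retry decision. The helper below is
// hypothetical and only restates the retry semantics documented above:
//
//	// shouldRetry reports whether a reverification should be requeued for
//	// another attempt rather than being treated as a final result.
//	func shouldRetry(outcome Outcome) bool {
//		switch outcome {
//		case OutcomeNotPerformed, OutcomeTimedOut, OutcomeNodeOffline, OutcomeUnknownError:
//			return true
//		default:
//			return false
//		}
//	}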

// NewReverifier creates a Reverifier.
func NewReverifier(log *zap.Logger, verifier *Verifier, db ReverifyQueue, config Config) *Reverifier {
	return &Reverifier{
		log:           log,
		Verifier:      verifier,
		db:            db,
		retryInterval: config.ReverificationRetryInterval,
	}
}
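
// A minimal usage sketch, assuming a Verifier, a ReverifyQueue implementation,
// and a Config with ReverificationRetryInterval are already available; the
// variable names below are illustrative only:
//
//	reverifier := NewReverifier(log.Named("reverifier"), verifier, reverifyQueue, config)
//	outcome, reputation := reverifier.ReverifyPiece(ctx, log, &PieceLocator{
//		StreamID: streamID,
//		Position: position,
//		NodeID:   nodeID,
//		PieceNum: pieceNum,
//	})
//	_ = outcome
//	_ = reputation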

// ReverifyPiece acquires a piece from a single node and verifies its
// contents, its hash, and its order limit.
func (reverifier *Reverifier) ReverifyPiece(ctx context.Context, logger *zap.Logger, locator *PieceLocator) (outcome Outcome, reputation overlay.ReputationStatus) {
	defer mon.Task()(&ctx)(nil)

	outcome, reputation, err := reverifier.DoReverifyPiece(ctx, logger, locator)
	if err != nil {
		logger.Error("could not perform reverification due to error", zap.Error(err))
		return outcome, reputation
	}

	var (
		successes int
		offlines  int
		fails     int
		pending   int
		unknown   int
	)
	switch outcome {
	case OutcomeNotPerformed, OutcomeNotNecessary:
	case OutcomeSuccess:
		successes++
	case OutcomeFailure:
		fails++
	case OutcomeTimedOut:
		pending++
	case OutcomeNodeOffline:
		offlines++
	case OutcomeUnknownError:
		unknown++
	}
	mon.Meter("reverify_successes_global").Mark(successes) //mon:locked
	mon.Meter("reverify_offlines_global").Mark(offlines)   //mon:locked
	mon.Meter("reverify_fails_global").Mark(fails)         //mon:locked
	mon.Meter("reverify_contained_global").Mark(pending)   //mon:locked
	mon.Meter("reverify_unknown_global").Mark(unknown)     //mon:locked

	return outcome, reputation
}

// DoReverifyPiece acquires a piece from a single node and verifies its
// contents, its hash, and its order limit.
func (reverifier *Reverifier) DoReverifyPiece(ctx context.Context, logger *zap.Logger, locator *PieceLocator) (outcome Outcome, reputation overlay.ReputationStatus, err error) {
	defer mon.Task()(&ctx)(&err)

	// First, we must ensure that the specified node still holds the indicated piece.
	segment, err := reverifier.metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
		StreamID: locator.StreamID,
		Position: locator.Position,
	})
	if err != nil {
		if metabase.ErrSegmentNotFound.Has(err) {
			logger.Debug("segment no longer exists")
			return OutcomeNotNecessary, reputation, nil
		}
		return OutcomeNotPerformed, reputation, Error.Wrap(err)
	}
	if segment.Expired(reverifier.nowFn()) {
		logger.Debug("segment expired before ReverifyPiece")
		return OutcomeNotNecessary, reputation, nil
	}
	piece, found := segment.Pieces.FindByNum(locator.PieceNum)
	if !found || piece.StorageNode != locator.NodeID {
		logger.Debug("piece is no longer held by the indicated node")
		return OutcomeNotNecessary, reputation, nil
	}

	// TODO remove this when old entries with empty StreamID are deleted
	if locator.StreamID.IsZero() {
		logger.Debug("ReverifyPiece: skip pending audit with empty StreamID")
		return OutcomeNotNecessary, reputation, nil
	}

	pieceSize := segment.PieceSize()

	limit, piecePrivateKey, cachedNodeInfo, err := reverifier.orders.CreateAuditPieceOrderLimit(ctx, locator.NodeID, uint16(locator.PieceNum), segment.RootPieceID, int32(pieceSize))
	if err != nil {
		if overlay.ErrNodeDisqualified.Has(err) {
			logger.Debug("ReverifyPiece: order limit not created (node is already disqualified)")
			return OutcomeNotNecessary, reputation, nil
		}
		if overlay.ErrNodeFinishedGE.Has(err) {
			logger.Debug("ReverifyPiece: order limit not created (node has completed graceful exit)")
			return OutcomeNotNecessary, reputation, nil
		}
		if overlay.ErrNodeOffline.Has(err) {
			logger.Debug("ReverifyPiece: order limit not created (node considered offline)")
			return OutcomeNodeOffline, reputation, nil
		}
		return OutcomeNotPerformed, reputation, Error.Wrap(err)
	}

	reputation = cachedNodeInfo.Reputation
	pieceData, pieceHash, pieceOriginalLimit, err := reverifier.GetPiece(ctx, limit, piecePrivateKey, cachedNodeInfo.LastIPPort, int32(pieceSize))
	if err != nil {
		if rpc.Error.Has(err) {
			if errs.Is(err, context.DeadlineExceeded) {
				// dial timeout
				return OutcomeTimedOut, reputation, nil
			}
			if errs2.IsRPC(err, rpcstatus.Unknown) {
				// dial failed -- offline node
				return OutcomeNodeOffline, reputation, nil
			}
			// unknown transport error
			logger.Info("ReverifyPiece: unknown transport error", zap.Error(err))
			return OutcomeUnknownError, reputation, nil
		}
		if errs2.IsRPC(err, rpcstatus.NotFound) {
			// Fetch the segment metadata again and see if it has been altered in the interim
			err := reverifier.checkIfSegmentAltered(ctx, segment)
			if err != nil {
				// if so, we skip this audit
				logger.Debug("ReverifyPiece: audit source segment changed during reverification", zap.Error(err))
				return OutcomeNotNecessary, reputation, nil
			}
			// missing share
			logger.Info("ReverifyPiece: audit failure; node indicates piece not found")
			return OutcomeFailure, reputation, nil
		}
		if errs2.IsRPC(err, rpcstatus.DeadlineExceeded) {
			// dial successful, but download timed out
			return OutcomeTimedOut, reputation, nil
		}
		// unknown error
		logger.Info("ReverifyPiece: unknown error from node", zap.Error(err))
		return OutcomeUnknownError, reputation, nil
	}

	// We have successfully acquired the piece from the node. Now, we must verify its contents.
	if pieceHash == nil {
		logger.Info("ReverifyPiece: audit failure; node did not send piece hash as requested")
		return OutcomeFailure, reputation, nil
	}
	if pieceOriginalLimit == nil {
		logger.Info("ReverifyPiece: audit failure; node did not send original order limit as requested")
		return OutcomeFailure, reputation, nil
	}
	// check for the correct size
	if int64(len(pieceData)) != pieceSize {
		logger.Info("ReverifyPiece: audit failure; downloaded piece has incorrect size", zap.Int64("expected-size", pieceSize), zap.Int("received-size", len(pieceData)))
		outcome = OutcomeFailure
		// continue to run, so we can check if the piece was legitimately changed before
		// blaming the node
	} else {
		// check for a matching hash
		downloadedHash := pkcrypto.SHA256Hash(pieceData)
		if !bytes.Equal(downloadedHash, pieceHash.Hash) {
			logger.Info("ReverifyPiece: audit failure; downloaded piece does not match hash", zap.ByteString("downloaded", downloadedHash), zap.ByteString("expected", pieceHash.Hash))
			outcome = OutcomeFailure
			// continue to run, so we can check if the piece was legitimately changed
			// before blaming the node
		} else {
			// check that the order limit and hash sent by the storagenode were
			// correctly signed (order limit signed by this satellite, hash signed
			// by the uplink public key in the order limit)
			signer := signing.SigneeFromPeerIdentity(reverifier.auditor)
			if err := signing.VerifyOrderLimitSignature(ctx, signer, pieceOriginalLimit); err != nil {
				return OutcomeFailure, reputation, nil
			}
			if err := signing.VerifyUplinkPieceHashSignature(ctx, pieceOriginalLimit.UplinkPublicKey, pieceHash); err != nil {
				return OutcomeFailure, reputation, nil
			}
		}
	}

	if err := reverifier.checkIfSegmentAltered(ctx, segment); err != nil {
		logger.Debug("ReverifyPiece: audit source segment changed during reverification", zap.Error(err))
		return OutcomeNotNecessary, reputation, nil
	}
	if outcome == OutcomeFailure {
		return OutcomeFailure, reputation, nil
	}

	return OutcomeSuccess, reputation, nil
}

// GetPiece uses the piecestore client to download a piece (and the associated
// original OrderLimit and PieceHash) from a node.
func (reverifier *Reverifier) GetPiece(ctx context.Context, limit *pb.AddressedOrderLimit, piecePrivateKey storj.PiecePrivateKey, cachedIPAndPort string, pieceSize int32) (pieceData []byte, hash *pb.PieceHash, origLimit *pb.OrderLimit, err error) {
	defer mon.Task()(&ctx)(&err)

	// determines number of seconds allotted for receiving data from a storage node
	timedCtx := ctx
	if reverifier.minBytesPerSecond > 0 {
		maxTransferTime := time.Duration(int64(time.Second) * int64(pieceSize) / reverifier.minBytesPerSecond.Int64())
		if maxTransferTime < reverifier.minDownloadTimeout {
			maxTransferTime = reverifier.minDownloadTimeout
		}
		var cancel func()
		timedCtx, cancel = context.WithTimeout(ctx, maxTransferTime)
		defer cancel()
	}

	targetNodeID := limit.GetLimit().StorageNodeId
	log := reverifier.log.With(zap.Stringer("node-id", targetNodeID), zap.Stringer("piece-id", limit.GetLimit().PieceId))
	var ps *piecestore.Client

	// if cached IP is given, try connecting there first
	if cachedIPAndPort != "" {
		nodeAddr := storj.NodeURL{
			ID:      targetNodeID,
			Address: cachedIPAndPort,
		}
		ps, err = piecestore.Dial(timedCtx, reverifier.dialer, nodeAddr, piecestore.DefaultConfig)
		if err != nil {
			log.Debug("failed to connect to audit target node at cached IP", zap.String("cached-ip-and-port", cachedIPAndPort), zap.Error(err))
		}
	}

	// if no cached IP was given, or connecting to cached IP failed, use node address
	if ps == nil {
		nodeAddr := storj.NodeURL{
			ID:      targetNodeID,
			Address: limit.GetStorageNodeAddress().Address,
		}
		ps, err = piecestore.Dial(timedCtx, reverifier.dialer, nodeAddr, piecestore.DefaultConfig)
		if err != nil {
			return nil, nil, nil, Error.Wrap(err)
		}
	}

	defer func() {
		err := ps.Close()
		if err != nil {
			log.Error("audit reverifier failed to close conn to node", zap.Error(err))
		}
	}()

	downloader, err := ps.Download(timedCtx, limit.GetLimit(), piecePrivateKey, 0, int64(pieceSize))
	if err != nil {
		return nil, nil, nil, Error.Wrap(err)
	}
	defer func() { err = errs.Combine(err, Error.Wrap(downloader.Close())) }()

	buf := make([]byte, pieceSize)
	_, err = io.ReadFull(downloader, buf)
	if err != nil {
		return nil, nil, nil, Error.Wrap(err)
	}

	hash, originLimit := downloader.GetHashAndLimit()

	return buf, hash, originLimit, nil
}