// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package audit_test

import (
	"context"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"github.com/zeebo/errs"
	"go.uber.org/zap"

	"storj.io/common/errs2"
	"storj.io/common/memory"
	"storj.io/common/peertls/tlsopts"
	"storj.io/common/rpc"
	"storj.io/common/rpc/rpcstatus"
	"storj.io/common/storj"
	"storj.io/common/testcontext"
	"storj.io/common/testrand"
	"storj.io/storj/private/testblobs"
	"storj.io/storj/private/testplanet"
	"storj.io/storj/satellite"
	"storj.io/storj/satellite/audit"
	"storj.io/storj/satellite/metabase"
	"storj.io/storj/storagenode"
)

// TestDownloadSharesHappyPath checks that the Share.Error field of all shares
// returned by the DownloadShares method contains no error if all shares were
// downloaded successfully.
func TestDownloadSharesHappyPath(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		uplink := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := uplink.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
			StreamID: queueSegment.StreamID,
			Position: queueSegment.Position,
		})
		require.NoError(t, err)

		randomIndex, err := audit.GetRandomStripe(ctx, segment)
		require.NoError(t, err)

		shareSize := segment.Redundancy.ShareSize

		limits, privateKey, cachedIPsAndPorts, err := satellite.Orders.Service.CreateAuditOrderLimits(ctx, segment, nil)
		require.NoError(t, err)

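		// DownloadShares receives the nodes' cached IP and port so it can try the
		// last known address first and fall back to the node's advertised address
		// if that connection cannot be made or verified.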
		shares, err := audits.Verifier.DownloadShares(ctx, limits, privateKey, cachedIPsAndPorts, randomIndex, shareSize)
		require.NoError(t, err)

		for _, share := range shares {
			assert.NoError(t, share.Error)
		}
	})
}

// TestDownloadSharesOfflineNode checks that the Share.Error field of the
// shares returned by the DownloadShares method for offline nodes contains an
// error that:
// - has the rpc.Error class
// - is not a context.DeadlineExceeded error
// - is not an RPC error
//
// If this test fails, this most probably means we made a backward-incompatible
// change that affects the audit service.
func TestDownloadSharesOfflineNode(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		uplink := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := uplink.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
			StreamID: queueSegment.StreamID,
			Position: queueSegment.Position,
		})
		require.NoError(t, err)

		randomIndex, err := audit.GetRandomStripe(ctx, segment)
		require.NoError(t, err)

		shareSize := segment.Redundancy.ShareSize

		limits, privateKey, cachedIPsAndPorts, err := satellite.Orders.Service.CreateAuditOrderLimits(ctx, segment, nil)
		require.NoError(t, err)

		// stop the first node in the segment
		stoppedNodeID := segment.Pieces[0].StorageNode
		err = planet.StopNodeAndUpdate(ctx, planet.FindNode(stoppedNodeID))
		require.NoError(t, err)

		shares, err := audits.Verifier.DownloadShares(ctx, limits, privateKey, cachedIPsAndPorts, randomIndex, shareSize)
		require.NoError(t, err)

		for _, share := range shares {
			if share.NodeID == stoppedNodeID {
				assert.True(t, rpc.Error.Has(share.Error), "unexpected error: %+v", share.Error)
				assert.False(t, errs.Is(share.Error, context.DeadlineExceeded), "unexpected error: %+v", share.Error)
				assert.True(t, errs2.IsRPC(share.Error, rpcstatus.Unknown), "unexpected error: %+v", share.Error)
			} else {
				assert.NoError(t, share.Error)
			}
		}
	})
}

// TestDownloadSharesMissingPiece checks that the Share.Error field of the
// shares returned by the DownloadShares method for nodes that don't have the
// audited piece contains an RPC error with code NotFound.
//
// If this test fails, this most probably means we made a backward-incompatible
// change that affects the audit service.
func TestDownloadSharesMissingPiece(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		uplink := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := uplink.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
			StreamID: queueSegment.StreamID,
			Position: queueSegment.Position,
		})
		require.NoError(t, err)

		randomIndex, err := audit.GetRandomStripe(ctx, segment)
		require.NoError(t, err)

		// replace the piece id of the selected stripe with a new random one
		// to simulate a missing piece on the storage nodes
		segment.RootPieceID = storj.NewPieceID()

		shareSize := segment.Redundancy.ShareSize

		limits, privateKey, cachedIPsAndPorts, err := satellite.Orders.Service.CreateAuditOrderLimits(ctx, segment, nil)
		require.NoError(t, err)

		shares, err := audits.Verifier.DownloadShares(ctx, limits, privateKey, cachedIPsAndPorts, randomIndex, shareSize)
		require.NoError(t, err)

		for _, share := range shares {
			assert.True(t, errs2.IsRPC(share.Error, rpcstatus.NotFound), "unexpected error: %+v", share.Error)
		}
	})
}

// TestDownloadSharesDialTimeout checks that the Share.Error field of the
// shares returned by the DownloadShares method for nodes that time out on
// dialing contains an error that:
// - has the rpc.Error class
// - is a context.DeadlineExceeded error
// - is not an RPC error
//
// If this test fails, this most probably means we made a backward-incompatible
// change that affects the audit service.
func TestDownloadSharesDialTimeout(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		upl := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := upl.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
			StreamID: queueSegment.StreamID,
			Position: queueSegment.Position,
		})
		require.NoError(t, err)

		randomIndex, err := audit.GetRandomStripe(ctx, segment)
		require.NoError(t, err)

		tlsOptions, err := tlsopts.NewOptions(satellite.Identity, tlsopts.Config{}, nil)
		require.NoError(t, err)

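		// The artificial dial latency (200s) far exceeds the 20ms dial timeout,
		// so every attempt to dial a storage node fails with a timeout.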
		dialer := rpc.NewDefaultDialer(tlsOptions)
		dialer.DialTimeout = 20 * time.Millisecond
		dialer.DialLatency = 200 * time.Second

		connector := rpc.NewDefaultTCPConnector(nil)
		connector.TransferRate = 1 * memory.KB
		dialer.Connector = connector

		// This config value will create a very short timeframe allowed for receiving
		// data from storage nodes. This will cause the context to be canceled with a
		// timeout.
		minBytesPerSecond := 100 * memory.KiB

		verifier := audit.NewVerifier(
			satellite.Log.Named("verifier"),
			satellite.Metainfo.Metabase,
			dialer,
			satellite.Overlay.Service,
			satellite.DB.Containment(),
			satellite.Orders.Service,
			satellite.Identity,
			minBytesPerSecond,
			5*time.Second)

		shareSize := segment.Redundancy.ShareSize

		limits, privateKey, cachedIPsAndPorts, err := satellite.Orders.Service.CreateAuditOrderLimits(ctx, segment, nil)
		require.NoError(t, err)

		shares, err := verifier.DownloadShares(ctx, limits, privateKey, cachedIPsAndPorts, randomIndex, shareSize)
		require.NoError(t, err)

		for _, share := range shares {
			assert.True(t, rpc.Error.Has(share.Error), "unexpected error: %+v", share.Error)
			assert.True(t, errs.Is(share.Error, context.DeadlineExceeded), "unexpected error: %+v", share.Error)
		}
	})
}

// TestDownloadSharesDownloadTimeout checks that the Share.Error field of the
// shares returned by the DownloadShares method for nodes that are successfully
// dialed, but time out during the download of the share contains an error that:
// - is an RPC error with code DeadlineExceeded
// - does not have the rpc.Error class
//
// If this test fails, this most probably means we made a backward-incompatible
// change that affects the audit service.
func TestDownloadSharesDownloadTimeout(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 1,
		Reconfigure: testplanet.Reconfigure{
			StorageNodeDB: func(index int, db storagenode.DB, log *zap.Logger) (storagenode.DB, error) {
				return testblobs.NewSlowDB(log.Named("slowdb"), db), nil
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		storageNodeDB := planet.StorageNodes[0].DB.(*testblobs.SlowDB)

		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		upl := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := upl.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
			StreamID: queueSegment.StreamID,
			Position: queueSegment.Position,
		})
		require.NoError(t, err)

		randomIndex, err := audit.GetRandomStripe(ctx, segment)
		require.NoError(t, err)

		// This config value will create a very short timeframe allowed for receiving
		// data from storage nodes. This will cause the context to be canceled with a
		// timeout.
		minBytesPerSecond := 100 * memory.KiB

		verifier := audit.NewVerifier(
			satellite.Log.Named("verifier"),
			satellite.Metainfo.Metabase,
			satellite.Dialer,
			satellite.Overlay.Service,
			satellite.DB.Containment(),
			satellite.Orders.Service,
			satellite.Identity,
			minBytesPerSecond,
			150*time.Millisecond)

		shareSize := segment.Redundancy.ShareSize

		limits, privateKey, cachedIPsAndPorts, err := satellite.Orders.Service.CreateAuditOrderLimits(ctx, segment, nil)
		require.NoError(t, err)

		// make downloads on storage node slower than the timeout on the satellite for downloading shares
		delay := 200 * time.Millisecond
		storageNodeDB.SetLatency(delay)

		shares, err := verifier.DownloadShares(ctx, limits, privateKey, cachedIPsAndPorts, randomIndex, shareSize)
		require.NoError(t, err)

		require.Len(t, shares, 1)
		share := shares[0]
		assert.True(t, errs2.IsRPC(share.Error, rpcstatus.DeadlineExceeded), "unexpected error: %+v", share.Error)
		assert.False(t, rpc.Error.Has(share.Error), "unexpected error: %+v", share.Error)
	})
}

func TestVerifierHappyPath(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		ul := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
			StreamID: queueSegment.StreamID,
			Position: queueSegment.Position,
		})
		require.NoError(t, err)

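		// with all nodes online and all pieces intact, every piece should audit
		// successfully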
		report, err := audits.Verifier.Verify(ctx, queueSegment, nil)
		require.NoError(t, err)

		assert.Len(t, report.Successes, len(segment.Pieces))
		assert.Len(t, report.Fails, 0)
		assert.Len(t, report.Offlines, 0)
		assert.Len(t, report.PendingAudits, 0)
	})
}

func TestVerifierExpired(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		ul := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := ul.UploadWithExpiration(ctx, satellite, "testbucket", "test/path", testData, time.Now().Add(1*time.Hour))
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		// move time into the future so the segment is expired
		audits.Verifier.SetNow(func() time.Time {
			return time.Now().Add(2 * time.Hour)
		})

		// Verify should not return an error
		report, err := audits.Verifier.Verify(ctx, queueSegment, nil)
		require.NoError(t, err)

		assert.Len(t, report.Successes, 0)
		assert.Len(t, report.Fails, 0)
		assert.Len(t, report.Offlines, 0)
		assert.Len(t, report.PendingAudits, 0)
	})
}

func TestVerifierOfflineNode(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		ul := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
			StreamID: queueSegment.StreamID,
			Position: queueSegment.Position,
		})
		require.NoError(t, err)

		// stop the first node in the segment
		stoppedNodeID := segment.Pieces[0].StorageNode
		err = planet.StopNodeAndUpdate(ctx, planet.FindNode(stoppedNodeID))
		require.NoError(t, err)

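		// the stopped node should be reported as offline; the remaining pieces
		// should still audit successfully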
		report, err := audits.Verifier.Verify(ctx, queueSegment, nil)
		require.NoError(t, err)

		assert.Len(t, report.Successes, len(segment.Pieces)-1)
		assert.Len(t, report.Fails, 0)
		assert.Len(t, report.Offlines, 1)
		assert.Len(t, report.PendingAudits, 0)
	})
}

func TestVerifierMissingPiece(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		ul := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
			StreamID: queueSegment.StreamID,
			Position: queueSegment.Position,
		})
		require.NoError(t, err)

		// delete the piece from the first node
		origNumPieces := len(segment.Pieces)
		piece := segment.Pieces[0]
		pieceID := segment.RootPieceID.Derive(piece.StorageNode, int32(piece.Number))
		node := planet.FindNode(piece.StorageNode)
		err = node.Storage2.Store.Delete(ctx, satellite.ID(), pieceID)
		require.NoError(t, err)

		report, err := audits.Verifier.Verify(ctx, queueSegment, nil)
		require.NoError(t, err)

		assert.Len(t, report.Successes, origNumPieces-1)
		assert.Len(t, report.Fails, 1)
		assert.Len(t, report.Offlines, 0)
		assert.Len(t, report.PendingAudits, 0)
	})
}

func TestVerifierDialTimeout(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		ul := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
			StreamID: queueSegment.StreamID,
			Position: queueSegment.Position,
		})
		require.NoError(t, err)

		tlsOptions, err := tlsopts.NewOptions(satellite.Identity, tlsopts.Config{}, nil)
		require.NoError(t, err)

		dialer := rpc.NewDefaultDialer(tlsOptions)
		dialer.DialTimeout = 20 * time.Millisecond
		dialer.DialLatency = 200 * time.Second

		connector := rpc.NewDefaultTCPConnector(nil)
		connector.TransferRate = 1 * memory.KB
		dialer.Connector = connector

		// This config value will create a very short timeframe allowed for receiving
		// data from storage nodes. This will cause the context to be canceled with a
		// timeout.
		minBytesPerSecond := 100 * memory.KiB

		verifier := audit.NewVerifier(
			satellite.Log.Named("verifier"),
			satellite.Metainfo.Metabase,
			dialer,
			satellite.Overlay.Service,
			satellite.DB.Containment(),
			satellite.Orders.Service,
			satellite.Identity,
			minBytesPerSecond,
			5*time.Second)

		report, err := verifier.Verify(ctx, queueSegment, nil)
		require.True(t, audit.ErrNotEnoughShares.Has(err), "unexpected error: %+v", err)

		assert.Len(t, report.Successes, 0)
		assert.Len(t, report.Fails, 0)
		assert.Len(t, report.Offlines, len(segment.Pieces))
		assert.Len(t, report.PendingAudits, 0)
	})
}

func TestVerifierDeletedSegment(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		ul := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		segment, err := queue.Next()
		require.NoError(t, err)

		// delete the file
		err = ul.DeleteObject(ctx, satellite, "testbucket", "test/path")
		require.NoError(t, err)

		// Verify should not return an error, but report should be empty
		report, err := audits.Verifier.Verify(ctx, segment, nil)
		require.NoError(t, err)
		assert.Empty(t, report)
	})
}

func TestVerifierModifiedSegment(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		ul := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		audits.Verifier.OnTestingCheckSegmentAlteredHook = func() {
			// remove one piece from the segment so that checkIfSegmentAltered fails
			segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
				StreamID: queueSegment.StreamID,
				Position: queueSegment.Position,
			})
			require.NoError(t, err)

			err = satellite.Metainfo.Metabase.UpdateSegmentPieces(ctx, metabase.UpdateSegmentPieces{
				StreamID:      queueSegment.StreamID,
				Position:      queueSegment.Position,
				OldPieces:     segment.Pieces,
				NewPieces:     append([]metabase.Piece{segment.Pieces[0]}, segment.Pieces[2:]...),
				NewRedundancy: segment.Redundancy,
			})
			require.NoError(t, err)
		}

		// Verify should not return an error, but report should be empty
		report, err := audits.Verifier.Verify(ctx, queueSegment, nil)
		require.NoError(t, err)
		assert.Empty(t, report)
	})
}

func TestVerifierReplacedSegment(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		ul := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		segment, err := queue.Next()
		require.NoError(t, err)

		audits.Verifier.OnTestingCheckSegmentAlteredHook = func() {
			// replace the file so that checkIfSegmentAltered fails
			err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
			require.NoError(t, err)
		}

		// Verify should not return an error, but report should be empty
		report, err := audits.Verifier.Verify(ctx, segment, nil)
		require.NoError(t, err)
		assert.Empty(t, report)
	})
}

func TestVerifierModifiedSegmentFailsOnce(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		ul := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
			StreamID: queueSegment.StreamID,
			Position: queueSegment.Position,
		})
		require.NoError(t, err)

		// delete the piece from the first node
		origNumPieces := len(segment.Pieces)
		piece := segment.Pieces[0]
		pieceID := segment.RootPieceID.Derive(piece.StorageNode, int32(piece.Number))
		node := planet.FindNode(piece.StorageNode)
		err = node.Storage2.Store.Delete(ctx, satellite.ID(), pieceID)
		require.NoError(t, err)

		report, err := audits.Verifier.Verify(ctx, queueSegment, nil)
		require.NoError(t, err)

		assert.Len(t, report.Successes, origNumPieces-1)
		assert.Len(t, report.Fails, 1)
		assert.Equal(t, report.Fails[0], piece.StorageNode)
		assert.Len(t, report.Offlines, 0)
		require.Len(t, report.PendingAudits, 0)
	})
}

// TestVerifierSlowDownload checks that a node that times out while sending data to the
// audit service gets put into containment mode.
func TestVerifierSlowDownload(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
		Reconfigure: testplanet.Reconfigure{
			StorageNodeDB: func(index int, db storagenode.DB, log *zap.Logger) (storagenode.DB, error) {
				return testblobs.NewSlowDB(log.Named("slowdb"), db), nil
			},
			Satellite: testplanet.Combine(
				func(log *zap.Logger, index int, config *satellite.Config) {
					// These config values are chosen to force the slow node to time out without timing out on the three normal nodes
					config.Audit.MinBytesPerSecond = 100 * memory.KiB
					config.Audit.MinDownloadTimeout = 950 * time.Millisecond
				},
				testplanet.ReconfigureRS(2, 2, 4, 4),
			),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		ul := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
			StreamID: queueSegment.StreamID,
			Position: queueSegment.Position,
		})
		require.NoError(t, err)

		slowNode := planet.FindNode(segment.Pieces[0].StorageNode)
		slowNodeDB := slowNode.DB.(*testblobs.SlowDB)
		// make downloads on storage node slower than the timeout on the satellite for downloading shares
		delay := 1 * time.Second
		slowNodeDB.SetLatency(delay)

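		// the slow node should land in PendingAudits (containment) rather than in
		// Fails or Offlines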
		report, err := audits.Verifier.Verify(ctx, queueSegment, nil)
		require.NoError(t, err)

		assert.NotContains(t, report.Successes, slowNode.ID())
		assert.Len(t, report.Fails, 0)
		assert.Len(t, report.Offlines, 0)
		assert.Len(t, report.Unknown, 0)
		assert.Len(t, report.PendingAudits, 1)
		assert.Equal(t, report.PendingAudits[0].NodeID, slowNode.ID())
	})
}

// TestVerifierUnknownError checks that a node that returns an unknown error in response to an audit request
// does not get marked as successful, failed, or contained.
func TestVerifierUnknownError(t *testing.T) {
	testplanet.Run(t, testplanet.Config{
		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
		Reconfigure: testplanet.Reconfigure{
			StorageNodeDB: func(index int, db storagenode.DB, log *zap.Logger) (storagenode.DB, error) {
				return testblobs.NewBadDB(log.Named("baddb"), db), nil
			},
			Satellite: testplanet.ReconfigureRS(2, 2, 4, 4),
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		satellite := planet.Satellites[0]
		audits := satellite.Audit

		audits.Worker.Loop.Pause()
		audits.Chore.Loop.Pause()

		ul := planet.Uplinks[0]
		testData := testrand.Bytes(8 * memory.KiB)

		err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
		require.NoError(t, err)

		audits.Chore.Loop.TriggerWait()
		queue := audits.Queues.Fetch()
		queueSegment, err := queue.Next()
		require.NoError(t, err)

		segment, err := satellite.Metainfo.Metabase.GetSegmentByPosition(ctx, metabase.GetSegmentByPosition{
			StreamID: queueSegment.StreamID,
			Position: queueSegment.Position,
		})
		require.NoError(t, err)

		badNode := planet.FindNode(segment.Pieces[0].StorageNode)
		badNodeDB := badNode.DB.(*testblobs.BadDB)
		// return an error when the verifier attempts to download from this node
		badNodeDB.SetError(errs.New("unknown error"))

		report, err := audits.Verifier.Verify(ctx, queueSegment, nil)
		require.NoError(t, err)

		require.Len(t, report.Successes, 3)
		require.Len(t, report.Fails, 0)
		require.Len(t, report.Offlines, 0)
		require.Len(t, report.PendingAudits, 0)
		require.Len(t, report.Unknown, 1)
		require.Equal(t, report.Unknown[0], badNode.ID())
	})
}