11a44cdd88
Change-Id: I8822dea0d1b7b99e0b828e0373a0308a42dde2be
880 lines
28 KiB
Go
880 lines
28 KiB
Go
// Copyright (C) 2019 Storj Labs, Inc.
|
|
// See LICENSE for copying information.
|
|
|
|
package audit_test
|
|
|
|
import (
|
|
"context"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
"github.com/zeebo/errs"
|
|
"go.uber.org/zap"
|
|
|
|
"storj.io/common/errs2"
|
|
"storj.io/common/memory"
|
|
"storj.io/common/pb"
|
|
"storj.io/common/peertls/tlsopts"
|
|
"storj.io/common/rpc"
|
|
"storj.io/common/rpc/rpcstatus"
|
|
"storj.io/common/storj"
|
|
"storj.io/common/testcontext"
|
|
"storj.io/common/testrand"
|
|
"storj.io/storj/private/testblobs"
|
|
"storj.io/storj/private/testplanet"
|
|
"storj.io/storj/satellite"
|
|
"storj.io/storj/satellite/audit"
|
|
"storj.io/storj/storage"
|
|
"storj.io/storj/storagenode"
|
|
)
|
|
|
|
// TestDownloadSharesHappyPath checks that the Share.Error field of all shares
|
|
// returned by the DownloadShares method contain no error if all shares were
|
|
// downloaded successfully.
|
|
func TestDownloadSharesHappyPath(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
uplink := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := uplink.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
projects, err := satellite.DB.Console().Projects().GetAll(ctx)
|
|
require.NoError(t, err)
|
|
|
|
bucketID := []byte(storj.JoinPaths(projects[0].ID.String(), "testbucket"))
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
randomIndex, err := audit.GetRandomStripe(ctx, pointer)
|
|
require.NoError(t, err)
|
|
|
|
shareSize := pointer.GetRemote().GetRedundancy().GetErasureShareSize()
|
|
limits, privateKey, err := satellite.Orders.Service.CreateAuditOrderLimits(ctx, bucketID, pointer, nil)
|
|
require.NoError(t, err)
|
|
|
|
shares, err := audits.Verifier.DownloadShares(ctx, limits, privateKey, randomIndex, shareSize)
|
|
require.NoError(t, err)
|
|
|
|
for _, share := range shares {
|
|
assert.NoError(t, share.Error)
|
|
}
|
|
})
|
|
}
|
|
|
|
// TestDownloadSharesOfflineNode checks that the Share.Error field of the
|
|
// shares returned by the DownloadShares method for offline nodes contain an
|
|
// error that:
|
|
// - has the rpc.Error class
|
|
// - is not a context.DeadlineExceeded error
|
|
// - is not an RPC error
|
|
//
|
|
// If this test fails, this most probably means we made a backward-incompatible
|
|
// change that affects the audit service.
|
|
func TestDownloadSharesOfflineNode(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
uplink := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := uplink.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
projects, err := satellite.DB.Console().Projects().GetAll(ctx)
|
|
require.NoError(t, err)
|
|
|
|
bucketID := []byte(storj.JoinPaths(projects[0].ID.String(), "testbucket"))
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
randomIndex, err := audit.GetRandomStripe(ctx, pointer)
|
|
require.NoError(t, err)
|
|
|
|
shareSize := pointer.GetRemote().GetRedundancy().GetErasureShareSize()
|
|
limits, privateKey, err := satellite.Orders.Service.CreateAuditOrderLimits(ctx, bucketID, pointer, nil)
|
|
require.NoError(t, err)
|
|
|
|
// stop the first node in the pointer
|
|
stoppedNodeID := pointer.GetRemote().GetRemotePieces()[0].NodeId
|
|
err = stopStorageNode(ctx, planet, stoppedNodeID)
|
|
require.NoError(t, err)
|
|
|
|
shares, err := audits.Verifier.DownloadShares(ctx, limits, privateKey, randomIndex, shareSize)
|
|
require.NoError(t, err)
|
|
|
|
for _, share := range shares {
|
|
if share.NodeID == stoppedNodeID {
|
|
assert.True(t, rpc.Error.Has(share.Error), "unexpected error: %+v", share.Error)
|
|
assert.False(t, errs.Is(share.Error, context.DeadlineExceeded), "unexpected error: %+v", share.Error)
|
|
assert.True(t, errs2.IsRPC(share.Error, rpcstatus.Unknown), "unexpected error: %+v", share.Error)
|
|
} else {
|
|
assert.NoError(t, share.Error)
|
|
}
|
|
}
|
|
})
|
|
}
|
|
|
|
// TestDownloadSharesMissingPiece checks that the Share.Error field of the
|
|
// shares returned by the DownloadShares method for nodes that don't have the
|
|
// audited piece contain an RPC error with code NotFound.
|
|
//
|
|
// If this test fails, this most probably means we made a backward-incompatible
|
|
// change that affects the audit service.
|
|
func TestDownloadSharesMissingPiece(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
uplink := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := uplink.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
randomIndex, err := audit.GetRandomStripe(ctx, pointer)
|
|
require.NoError(t, err)
|
|
|
|
projects, err := satellite.DB.Console().Projects().GetAll(ctx)
|
|
require.NoError(t, err)
|
|
|
|
bucketID := []byte(storj.JoinPaths(projects[0].ID.String(), "testbucket"))
|
|
|
|
// replace the piece id of the selected stripe with a new random one
|
|
// to simulate missing piece on the storage nodes
|
|
pointer.GetRemote().RootPieceId = storj.NewPieceID()
|
|
|
|
shareSize := pointer.GetRemote().GetRedundancy().GetErasureShareSize()
|
|
limits, privateKey, err := satellite.Orders.Service.CreateAuditOrderLimits(ctx, bucketID, pointer, nil)
|
|
require.NoError(t, err)
|
|
|
|
shares, err := audits.Verifier.DownloadShares(ctx, limits, privateKey, randomIndex, shareSize)
|
|
require.NoError(t, err)
|
|
|
|
for _, share := range shares {
|
|
assert.True(t, errs2.IsRPC(share.Error, rpcstatus.NotFound), "unexpected error: %+v", share.Error)
|
|
}
|
|
})
|
|
}
|
|
|
|
// TestDownloadSharesDialTimeout checks that the Share.Error field of the
|
|
// shares returned by the DownloadShares method for nodes that time out on
|
|
// dialing contain an error that:
|
|
// - has the rpc.Error class
|
|
// - is a context.DeadlineExceeded error
|
|
// - is not an RPC error
|
|
//
|
|
// If this test fails, this most probably means we made a backward-incompatible
|
|
// change that affects the audit service.
|
|
func TestDownloadSharesDialTimeout(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
upl := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := upl.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
randomIndex, err := audit.GetRandomStripe(ctx, pointer)
|
|
require.NoError(t, err)
|
|
|
|
projects, err := satellite.DB.Console().Projects().GetAll(ctx)
|
|
require.NoError(t, err)
|
|
|
|
bucketID := []byte(storj.JoinPaths(projects[0].ID.String(), "testbucket"))
|
|
|
|
tlsOptions, err := tlsopts.NewOptions(satellite.Identity, tlsopts.Config{}, nil)
|
|
require.NoError(t, err)
|
|
|
|
dialer := rpc.NewDefaultDialer(tlsOptions)
|
|
dialer.DialTimeout = 20 * time.Millisecond
|
|
dialer.DialLatency = 200 * time.Second
|
|
dialer.TransferRate = 1 * memory.KB
|
|
|
|
// This config value will create a very short timeframe allowed for receiving
|
|
// data from storage nodes. This will cause context to cancel with timeout.
|
|
minBytesPerSecond := 100 * memory.KiB
|
|
|
|
verifier := audit.NewVerifier(
|
|
satellite.Log.Named("verifier"),
|
|
satellite.Metainfo.Service,
|
|
dialer,
|
|
satellite.Overlay.Service,
|
|
satellite.DB.Containment(),
|
|
satellite.Orders.Service,
|
|
satellite.Identity,
|
|
minBytesPerSecond,
|
|
5*time.Second)
|
|
|
|
shareSize := pointer.GetRemote().GetRedundancy().GetErasureShareSize()
|
|
limits, privateKey, err := satellite.Orders.Service.CreateAuditOrderLimits(ctx, bucketID, pointer, nil)
|
|
require.NoError(t, err)
|
|
|
|
shares, err := verifier.DownloadShares(ctx, limits, privateKey, randomIndex, shareSize)
|
|
require.NoError(t, err)
|
|
|
|
for _, share := range shares {
|
|
assert.True(t, rpc.Error.Has(share.Error), "unexpected error: %+v", share.Error)
|
|
assert.True(t, errs.Is(share.Error, context.DeadlineExceeded), "unexpected error: %+v", share.Error)
|
|
}
|
|
})
|
|
}
|
|
|
|
// TestDownloadSharesDownloadTimeout checks that the Share.Error field of the
|
|
// shares returned by the DownloadShares method for nodes that are successfully
|
|
// dialed, but time out during the download of the share contain an error that:
|
|
// - is an RPC error with code DeadlineExceeded
|
|
// - does not have the rpc.Error class
|
|
//
|
|
// If this test fails, this most probably means we made a backward-incompatible
|
|
// change that affects the audit service.
|
|
func TestDownloadSharesDownloadTimeout(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 1,
|
|
Reconfigure: testplanet.Reconfigure{
|
|
StorageNodeDB: func(index int, db storagenode.DB, log *zap.Logger) (storagenode.DB, error) {
|
|
return testblobs.NewSlowDB(log.Named("slowdb"), db), nil
|
|
},
|
|
},
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
storageNodeDB := planet.StorageNodes[0].DB.(*testblobs.SlowDB)
|
|
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
upl := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := upl.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
projects, err := satellite.DB.Console().Projects().GetAll(ctx)
|
|
require.NoError(t, err)
|
|
|
|
bucketID := []byte(storj.JoinPaths(projects[0].ID.String(), "testbucket"))
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
randomIndex, err := audit.GetRandomStripe(ctx, pointer)
|
|
require.NoError(t, err)
|
|
|
|
// This config value will create a very short timeframe allowed for receiving
|
|
// data from storage nodes. This will cause context to cancel with timeout.
|
|
minBytesPerSecond := 100 * memory.KiB
|
|
|
|
verifier := audit.NewVerifier(
|
|
satellite.Log.Named("verifier"),
|
|
satellite.Metainfo.Service,
|
|
satellite.Dialer,
|
|
satellite.Overlay.Service,
|
|
satellite.DB.Containment(),
|
|
satellite.Orders.Service,
|
|
satellite.Identity,
|
|
minBytesPerSecond,
|
|
150*time.Millisecond)
|
|
|
|
shareSize := pointer.GetRemote().GetRedundancy().GetErasureShareSize()
|
|
limits, privateKey, err := satellite.Orders.Service.CreateAuditOrderLimits(ctx, bucketID, pointer, nil)
|
|
require.NoError(t, err)
|
|
|
|
// make downloads on storage node slower than the timeout on the satellite for downloading shares
|
|
delay := 200 * time.Millisecond
|
|
storageNodeDB.SetLatency(delay)
|
|
|
|
shares, err := verifier.DownloadShares(ctx, limits, privateKey, randomIndex, shareSize)
|
|
require.NoError(t, err)
|
|
|
|
require.Len(t, shares, 1)
|
|
share := shares[0]
|
|
assert.True(t, errs2.IsRPC(share.Error, rpcstatus.DeadlineExceeded), "unexpected error: %+v", share.Error)
|
|
assert.False(t, rpc.Error.Has(share.Error), "unexpected error: %+v", share.Error)
|
|
})
|
|
}
|
|
|
|
func TestVerifierHappyPath(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
ul := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
report, err := audits.Verifier.Verify(ctx, path, nil)
|
|
require.NoError(t, err)
|
|
|
|
assert.Len(t, report.Successes, len(pointer.GetRemote().GetRemotePieces()))
|
|
assert.Len(t, report.Fails, 0)
|
|
assert.Len(t, report.Offlines, 0)
|
|
assert.Len(t, report.PendingAudits, 0)
|
|
})
|
|
}
|
|
|
|
func TestVerifierExpired(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
ul := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
// set pointer's expiration date to be already expired
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
oldPointerBytes, err := pb.Marshal(pointer)
|
|
require.NoError(t, err)
|
|
newPointer := &pb.Pointer{}
|
|
err = pb.Unmarshal(oldPointerBytes, newPointer)
|
|
require.NoError(t, err)
|
|
newPointer.ExpirationDate = time.Now().Add(-1 * time.Hour)
|
|
newPointerBytes, err := pb.Marshal(newPointer)
|
|
require.NoError(t, err)
|
|
err = satellite.Metainfo.Database.CompareAndSwap(ctx, storage.Key(path), oldPointerBytes, newPointerBytes)
|
|
require.NoError(t, err)
|
|
|
|
report, err := audits.Verifier.Verify(ctx, path, nil)
|
|
require.Error(t, err)
|
|
require.True(t, audit.ErrSegmentExpired.Has(err))
|
|
|
|
// Verify should delete the expired segment
|
|
pointer, err = satellite.Metainfo.Service.Get(ctx, path)
|
|
require.Error(t, err)
|
|
require.Nil(t, pointer)
|
|
|
|
assert.Len(t, report.Successes, 0)
|
|
assert.Len(t, report.Fails, 0)
|
|
assert.Len(t, report.Offlines, 0)
|
|
assert.Len(t, report.PendingAudits, 0)
|
|
})
|
|
}
|
|
|
|
func TestVerifierOfflineNode(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
ul := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
// stop the first node in the pointer
|
|
stoppedNodeID := pointer.GetRemote().GetRemotePieces()[0].NodeId
|
|
err = stopStorageNode(ctx, planet, stoppedNodeID)
|
|
require.NoError(t, err)
|
|
|
|
report, err := audits.Verifier.Verify(ctx, path, nil)
|
|
require.NoError(t, err)
|
|
|
|
assert.Len(t, report.Successes, len(pointer.GetRemote().GetRemotePieces())-1)
|
|
assert.Len(t, report.Fails, 0)
|
|
assert.Len(t, report.Offlines, 1)
|
|
assert.Len(t, report.PendingAudits, 0)
|
|
})
|
|
}
|
|
|
|
func TestVerifierMissingPiece(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
ul := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
// delete the piece from the first node
|
|
origNumPieces := len(pointer.GetRemote().GetRemotePieces())
|
|
piece := pointer.GetRemote().GetRemotePieces()[0]
|
|
pieceID := pointer.GetRemote().RootPieceId.Derive(piece.NodeId, piece.PieceNum)
|
|
node := planet.FindNode(piece.NodeId)
|
|
err = node.Storage2.Store.Delete(ctx, satellite.ID(), pieceID)
|
|
require.NoError(t, err)
|
|
|
|
report, err := audits.Verifier.Verify(ctx, path, nil)
|
|
require.NoError(t, err)
|
|
|
|
assert.Len(t, report.Successes, origNumPieces-1)
|
|
assert.Len(t, report.Fails, 1)
|
|
assert.Len(t, report.Offlines, 0)
|
|
assert.Len(t, report.PendingAudits, 0)
|
|
})
|
|
}
|
|
|
|
// TestVerifierMissingPieceHashesNotVerified tests that if piece hashes were not verified for a pointer,
|
|
// a node that fails an audit for that pointer does not get marked as failing an audit, but is removed from
|
|
// the pointer.
|
|
func TestVerifierMissingPieceHashesNotVerified(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
ul := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
// update pointer to have PieceHashesVerified false
|
|
err = satellite.Metainfo.Service.UnsynchronizedDelete(ctx, path)
|
|
require.NoError(t, err)
|
|
pointer.PieceHashesVerified = false
|
|
err = satellite.Metainfo.Service.Put(ctx, path, pointer)
|
|
require.NoError(t, err)
|
|
|
|
// delete the piece from the first node
|
|
origNumPieces := len(pointer.GetRemote().GetRemotePieces())
|
|
piece := pointer.GetRemote().GetRemotePieces()[0]
|
|
pieceID := pointer.GetRemote().RootPieceId.Derive(piece.NodeId, piece.PieceNum)
|
|
node := planet.FindNode(piece.NodeId)
|
|
err = node.Storage2.Store.Delete(ctx, satellite.ID(), pieceID)
|
|
require.NoError(t, err)
|
|
|
|
report, err := audits.Verifier.Verify(ctx, path, nil)
|
|
require.NoError(t, err)
|
|
|
|
assert.Len(t, report.Successes, origNumPieces-1)
|
|
// expect no failed audit
|
|
assert.Len(t, report.Fails, 0)
|
|
assert.Len(t, report.Offlines, 0)
|
|
assert.Len(t, report.PendingAudits, 0)
|
|
})
|
|
}
|
|
|
|
func TestVerifierDialTimeout(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
ul := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
tlsOptions, err := tlsopts.NewOptions(satellite.Identity, tlsopts.Config{}, nil)
|
|
require.NoError(t, err)
|
|
|
|
dialer := rpc.NewDefaultDialer(tlsOptions)
|
|
dialer.DialTimeout = 20 * time.Millisecond
|
|
dialer.DialLatency = 200 * time.Second
|
|
dialer.TransferRate = 1 * memory.KB
|
|
|
|
// This config value will create a very short timeframe allowed for receiving
|
|
// data from storage nodes. This will cause context to cancel with timeout.
|
|
minBytesPerSecond := 100 * memory.KiB
|
|
|
|
verifier := audit.NewVerifier(
|
|
satellite.Log.Named("verifier"),
|
|
satellite.Metainfo.Service,
|
|
dialer,
|
|
satellite.Overlay.Service,
|
|
satellite.DB.Containment(),
|
|
satellite.Orders.Service,
|
|
satellite.Identity,
|
|
minBytesPerSecond,
|
|
5*time.Second)
|
|
|
|
report, err := verifier.Verify(ctx, path, nil)
|
|
require.True(t, audit.ErrNotEnoughShares.Has(err), "unexpected error: %+v", err)
|
|
|
|
assert.Len(t, report.Successes, 0)
|
|
assert.Len(t, report.Fails, 0)
|
|
assert.Len(t, report.Offlines, len(pointer.GetRemote().GetRemotePieces()))
|
|
assert.Len(t, report.PendingAudits, 0)
|
|
})
|
|
}
|
|
|
|
func TestVerifierDeletedSegment(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
ul := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
// delete the file
|
|
err = ul.DeleteObject(ctx, satellite, "testbucket", "test/path")
|
|
require.NoError(t, err)
|
|
|
|
report, err := audits.Verifier.Verify(ctx, path, nil)
|
|
require.True(t, audit.ErrSegmentDeleted.Has(err))
|
|
assert.Empty(t, report)
|
|
})
|
|
}
|
|
|
|
func TestVerifierModifiedSegment(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
metainfo := satellite.Metainfo.Service
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
ul := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
audits.Verifier.OnTestingCheckSegmentAlteredHook = func() {
|
|
// remove one piece from the segment so that checkIfSegmentAltered fails
|
|
pointer, err := metainfo.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
pieceToRemove := pointer.Remote.RemotePieces[0]
|
|
_, err = metainfo.UpdatePieces(ctx, path, pointer, nil, []*pb.RemotePiece{pieceToRemove})
|
|
require.NoError(t, err)
|
|
}
|
|
|
|
report, err := audits.Verifier.Verify(ctx, path, nil)
|
|
require.True(t, audit.ErrSegmentModified.Has(err))
|
|
assert.Empty(t, report)
|
|
})
|
|
}
|
|
|
|
func TestVerifierReplacedSegment(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
ul := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
audits.Verifier.OnTestingCheckSegmentAlteredHook = func() {
|
|
// replace the file so that checkIfSegmentAltered fails
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
}
|
|
|
|
report, err := audits.Verifier.Verify(ctx, path, nil)
|
|
require.True(t, audit.ErrSegmentDeleted.Has(err))
|
|
assert.Empty(t, report)
|
|
})
|
|
}
|
|
|
|
func TestVerifierModifiedSegmentFailsOnce(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
ul := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
// delete the piece from the first node
|
|
origNumPieces := len(pointer.GetRemote().GetRemotePieces())
|
|
piece := pointer.GetRemote().GetRemotePieces()[0]
|
|
pieceID := pointer.GetRemote().RootPieceId.Derive(piece.NodeId, piece.PieceNum)
|
|
node := planet.FindNode(piece.NodeId)
|
|
err = node.Storage2.Store.Delete(ctx, satellite.ID(), pieceID)
|
|
require.NoError(t, err)
|
|
|
|
report, err := audits.Verifier.Verify(ctx, path, nil)
|
|
require.NoError(t, err)
|
|
|
|
assert.Len(t, report.Successes, origNumPieces-1)
|
|
assert.Len(t, report.Fails, 1)
|
|
assert.Equal(t, report.Fails[0], piece.NodeId)
|
|
assert.Len(t, report.Offlines, 0)
|
|
require.Len(t, report.PendingAudits, 0)
|
|
})
|
|
}
|
|
|
|
// TestVerifierSlowDownload checks that a node that times out while sending data to the
|
|
// audit service gets put into containment mode
|
|
func TestVerifierSlowDownload(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
Reconfigure: testplanet.Reconfigure{
|
|
StorageNodeDB: func(index int, db storagenode.DB, log *zap.Logger) (storagenode.DB, error) {
|
|
return testblobs.NewSlowDB(log.Named("slowdb"), db), nil
|
|
},
|
|
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
|
// These config values are chosen to force the slow node to time out without timing out on the three normal nodes
|
|
config.Audit.MinBytesPerSecond = 100 * memory.KiB
|
|
config.Audit.MinDownloadTimeout = 500 * time.Millisecond
|
|
|
|
config.Metainfo.RS.MinThreshold = 2
|
|
config.Metainfo.RS.RepairThreshold = 2
|
|
config.Metainfo.RS.SuccessThreshold = 4
|
|
config.Metainfo.RS.TotalThreshold = 4
|
|
},
|
|
},
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
ul := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
slowNode := pointer.Remote.RemotePieces[0].NodeId
|
|
for _, node := range planet.StorageNodes {
|
|
if node.ID() == slowNode {
|
|
slowNodeDB := node.DB.(*testblobs.SlowDB)
|
|
// make downloads on storage node slower than the timeout on the satellite for downloading shares
|
|
delay := 1 * time.Second
|
|
slowNodeDB.SetLatency(delay)
|
|
break
|
|
}
|
|
}
|
|
|
|
report, err := audits.Verifier.Verify(ctx, path, nil)
|
|
require.NoError(t, err)
|
|
|
|
require.Len(t, report.Successes, 3)
|
|
require.Len(t, report.Fails, 0)
|
|
require.Len(t, report.Offlines, 0)
|
|
require.Len(t, report.PendingAudits, 1)
|
|
require.Len(t, report.Unknown, 0)
|
|
require.Equal(t, report.PendingAudits[0].NodeID, slowNode)
|
|
})
|
|
}
|
|
|
|
// TestVerifierUnknownError checks that a node that returns an unknown error in response to an audit request
|
|
// does not get marked as successful, failed, or contained
|
|
func TestVerifierUnknownError(t *testing.T) {
|
|
testplanet.Run(t, testplanet.Config{
|
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
|
Reconfigure: testplanet.Reconfigure{
|
|
StorageNodeDB: func(index int, db storagenode.DB, log *zap.Logger) (storagenode.DB, error) {
|
|
return testblobs.NewBadDB(log.Named("baddb"), db), nil
|
|
},
|
|
Satellite: testplanet.ReconfigureRS(2, 2, 4, 4),
|
|
},
|
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
satellite := planet.Satellites[0]
|
|
audits := satellite.Audit
|
|
queue := audits.Queue
|
|
|
|
audits.Worker.Loop.Pause()
|
|
|
|
ul := planet.Uplinks[0]
|
|
testData := testrand.Bytes(8 * memory.KiB)
|
|
|
|
err := ul.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
|
require.NoError(t, err)
|
|
|
|
audits.Chore.Loop.TriggerWait()
|
|
path, err := queue.Next()
|
|
require.NoError(t, err)
|
|
|
|
pointer, err := satellite.Metainfo.Service.Get(ctx, path)
|
|
require.NoError(t, err)
|
|
|
|
badNode := pointer.Remote.RemotePieces[0].NodeId
|
|
for _, node := range planet.StorageNodes {
|
|
if node.ID() == badNode {
|
|
badNodeDB := node.DB.(*testblobs.BadDB)
|
|
// return an error when the verifier attempts to download from this node
|
|
badNodeDB.SetError(errs.New("unknown error"))
|
|
break
|
|
}
|
|
}
|
|
|
|
report, err := audits.Verifier.Verify(ctx, path, nil)
|
|
require.NoError(t, err)
|
|
|
|
require.Len(t, report.Successes, 3)
|
|
require.Len(t, report.Fails, 0)
|
|
require.Len(t, report.Offlines, 0)
|
|
require.Len(t, report.PendingAudits, 0)
|
|
require.Len(t, report.Unknown, 1)
|
|
require.Equal(t, report.Unknown[0], badNode)
|
|
})
|
|
}
|