satellite/audit: fix containment mode (#3085)

* add test to make sure we reverify the share recorded in the containment db rather than one from the pointer passed into reverify

* use pending audit information only when running reverify
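
In short, Reverify now resolves which piece to audit from the pending audit stored in the containment db (via the pending audit's segment path) rather than from the pointer it was handed. Below is a condensed sketch of that lookup, excerpted from the new Reverify goroutine in the second diff further down; all identifiers come from that diff, and this is not a standalone program:

	// Resolve the piece number from the pending audit's own segment path,
	// not from the pointer passed into Reverify.
	pendingPointer, err := verifier.metainfo.Get(ctx, pending.Path)
	if err != nil {
		ch <- result{nodeID: pending.NodeID, status: erred, err: err}
		return
	}
	var pieceNum int32
	found := false
	for _, piece := range pendingPointer.GetRemote().GetRemotePieces() {
		if piece.NodeId == pending.NodeID {
			pieceNum = piece.PieceNum
			found = true
		}
	}
	if !found {
		ch <- result{nodeID: pending.NodeID, status: erred, err: err}
		return
	}
	limit, piecePrivateKey, err := verifier.orders.CreateAuditOrderLimit(ctx, createBucketID(pending.Path), pending.NodeID, pieceNum, pending.PieceID, pending.ShareSize)
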
Maximillian von Briesen 2019-09-18 19:45:15 -04:00 committed by littleskunk
parent 1c72e80e40
commit a4048fd529
2 changed files with 184 additions and 53 deletions

View File

@@ -519,3 +519,113 @@ func TestReverifyModifiedSegment(t *testing.T) {
 		require.True(t, audit.ErrContainedNotFound.Has(err))
 	})
 }
+
+func TestReverifyDifferentShare(t *testing.T) {
+	testplanet.Run(t, testplanet.Config{
+		SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
+	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
+		// - uploads random data to two files
+		// - get a random stripe to audit from file 1
+		// - creates one pending audit for a node holding a piece for that stripe
+		// - the actual share is downloaded to make sure ExpectedShareHash is correct
+		// - delete piece for file 1 from the selected node
+		// - calls reverify on some stripe from file 2
+		// - expects one storage node to be marked as a fail in the audit report
+		// - (if file 2 is used during reverify, the node will pass the audit and the test should fail)
+
+		satellite := planet.Satellites[0]
+		audits := satellite.Audit
+		queue := audits.Queue
+
+		audits.Worker.Loop.Pause()
+
+		ul := planet.Uplinks[0]
+		testData1 := testrand.Bytes(8 * memory.KiB)
+		testData2 := testrand.Bytes(8 * memory.KiB)
+
+		err := ul.Upload(ctx, satellite, "testbucket", "test/path1", testData1)
+		require.NoError(t, err)
+
+		err = ul.Upload(ctx, satellite, "testbucket", "test/path2", testData2)
+		require.NoError(t, err)
+
+		audits.Chore.Loop.TriggerWait()
+		path1, err := queue.Next()
+		require.NoError(t, err)
+		path2, err := queue.Next()
+		require.NoError(t, err)
+		require.NotEqual(t, path1, path2)
+
+		pointer1, err := satellite.Metainfo.Service.Get(ctx, path1)
+		require.NoError(t, err)
+		pointer2, err := satellite.Metainfo.Service.Get(ctx, path2)
+		require.NoError(t, err)
+
+		// find a node that contains a piece for both files
+		// save that node ID and the piece number associated with it for pointer1
+		var selectedNode storj.NodeID
+		var selectedPieceNum int32
+		p1Nodes := make(map[storj.NodeID]int32)
+		for _, piece := range pointer1.GetRemote().GetRemotePieces() {
+			p1Nodes[piece.NodeId] = piece.PieceNum
+		}
+		for _, piece := range pointer2.GetRemote().GetRemotePieces() {
+			pieceNum, ok := p1Nodes[piece.NodeId]
+			if ok {
+				selectedNode = piece.NodeId
+				selectedPieceNum = pieceNum
+				break
+			}
+		}
+		require.NotEqual(t, selectedNode, storj.NodeID{})
+
+		randomIndex, err := audit.GetRandomStripe(ctx, pointer1)
+		require.NoError(t, err)
+
+		orders := satellite.Orders.Service
+		containment := satellite.DB.Containment()
+
+		projects, err := satellite.DB.Console().Projects().GetAll(ctx)
+		require.NoError(t, err)
+
+		bucketID := []byte(storj.JoinPaths(projects[0].ID.String(), "testbucket"))
+		shareSize := pointer1.GetRemote().GetRedundancy().GetErasureShareSize()
+
+		rootPieceID := pointer1.GetRemote().RootPieceId
+		limit, privateKey, err := orders.CreateAuditOrderLimit(ctx, bucketID, selectedNode, selectedPieceNum, rootPieceID, shareSize)
+		require.NoError(t, err)
+
+		share, err := audits.Verifier.GetShare(ctx, limit, privateKey, randomIndex, shareSize, int(selectedPieceNum))
+		require.NoError(t, err)
+
+		pending := &audit.PendingAudit{
+			NodeID:            selectedNode,
+			PieceID:           rootPieceID,
+			StripeIndex:       randomIndex,
+			ShareSize:         shareSize,
+			ExpectedShareHash: pkcrypto.SHA256Hash(share.Data),
+			ReverifyCount:     0,
+			Path:              path1,
+		}
+
+		err = containment.IncrementPending(ctx, pending)
+		require.NoError(t, err)
+
+		// delete the piece for pointer1 from the selected node
+		pieceID := pointer1.GetRemote().RootPieceId.Derive(selectedNode, selectedPieceNum)
+		node := getStorageNode(planet, selectedNode)
+		err = node.Storage2.Store.Delete(ctx, satellite.ID(), pieceID)
+		require.NoError(t, err)
+
+		// reverify with path 2. Since the selected node was put in containment for path1,
+		// it should be audited for path1 and fail
+		report, err := audits.Verifier.Reverify(ctx, path2)
+		require.NoError(t, err)
+
+		require.Len(t, report.Successes, 0)
+		require.Len(t, report.Offlines, 0)
+		require.Len(t, report.PendingAudits, 0)
+		require.Len(t, report.Fails, 1)
+		require.Equal(t, report.Fails[0], selectedNode)
+	})
+}

View File

@@ -340,41 +340,62 @@ func (verifier *Verifier) Reverify(ctx context.Context, path storj.Path) (report
 		}
 		containedInSegment++
-		go func(pending *PendingAudit, piece *pb.RemotePiece) {
-			limit, piecePrivateKey, err := verifier.orders.CreateAuditOrderLimit(ctx, createBucketID(path), pending.NodeID, piece.PieceNum, pending.PieceID, pending.ShareSize)
-			if err != nil {
-				if overlay.ErrNodeDisqualified.Has(err) {
-					_, errDelete := verifier.containment.Delete(ctx, piece.NodeId)
-					if errDelete != nil {
-						verifier.log.Debug("Error deleting disqualified node from containment db", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(err))
-						err = errs.Combine(err, errDelete)
-					}
-					ch <- result{nodeID: piece.NodeId, status: erred, err: err}
-					verifier.log.Debug("Reverify: order limit not created (disqualified)", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId))
-					return
-				}
-				if overlay.ErrNodeOffline.Has(err) {
-					ch <- result{nodeID: piece.NodeId, status: offline}
-					verifier.log.Debug("Reverify: order limit not created (offline)", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId))
-					return
-				}
-				ch <- result{nodeID: piece.NodeId, status: erred, err: err}
-				verifier.log.Debug("Reverify: error creating order limit", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(err))
-				return
-			}
-			share, err := verifier.GetShare(ctx, limit, piecePrivateKey, pending.StripeIndex, pending.ShareSize, int(piece.PieceNum))
-			// check if the pending audit was deleted while downloading the share
-			_, getErr := verifier.containment.Get(ctx, piece.NodeId)
-			if getErr != nil {
-				if ErrContainedNotFound.Has(getErr) {
-					ch <- result{nodeID: piece.NodeId, status: skipped}
-					verifier.log.Debug("Reverify: pending audit deleted during reverification", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(getErr))
-					return
-				}
-				ch <- result{nodeID: piece.NodeId, status: erred, err: getErr}
-				verifier.log.Debug("Reverify: error getting from containment db", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(getErr))
+		go func(pending *PendingAudit) {
+			// TODO perhaps we should save piece number as part of the pending audit so we do not need to use metainfo here
+			pendingPointer, err := verifier.metainfo.Get(ctx, pending.Path)
+			if err != nil {
+				ch <- result{nodeID: pending.NodeID, status: erred, err: err}
+				verifier.log.Debug("Reverify: error getting pending pointer from metainfo", zap.String("Segment Path", pending.Path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
+				return
+			}
+			var pieceNum int32
+			found := false
+			for _, piece := range pendingPointer.GetRemote().GetRemotePieces() {
+				if piece.NodeId == pending.NodeID {
+					pieceNum = piece.PieceNum
+					found = true
+				}
+			}
+			if !found {
+				ch <- result{nodeID: pending.NodeID, status: erred, err: err}
+				verifier.log.Debug("Reverify: could not find node in pointer to audit", zap.String("Segment Path", pending.Path), zap.Stringer("Node ID", pending.NodeID))
+				return
+			}
+			limit, piecePrivateKey, err := verifier.orders.CreateAuditOrderLimit(ctx, createBucketID(pending.Path), pending.NodeID, pieceNum, pending.PieceID, pending.ShareSize)
+			if err != nil {
+				if overlay.ErrNodeDisqualified.Has(err) {
+					_, errDelete := verifier.containment.Delete(ctx, pending.NodeID)
+					if errDelete != nil {
+						verifier.log.Debug("Error deleting disqualified node from containment db", zap.String("Segment Path", pending.Path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
+						err = errs.Combine(err, errDelete)
+					}
+					ch <- result{nodeID: pending.NodeID, status: erred, err: err}
+					verifier.log.Debug("Reverify: order limit not created (disqualified)", zap.String("Segment Path", pending.Path), zap.Stringer("Node ID", pending.NodeID))
+					return
+				}
+				if overlay.ErrNodeOffline.Has(err) {
+					ch <- result{nodeID: pending.NodeID, status: offline}
+					verifier.log.Debug("Reverify: order limit not created (offline)", zap.String("Segment Path", pending.Path), zap.Stringer("Node ID", pending.NodeID))
+					return
+				}
+				ch <- result{nodeID: pending.NodeID, status: erred, err: err}
+				verifier.log.Debug("Reverify: error creating order limit", zap.String("Segment Path", pending.Path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
+				return
+			}
+			share, err := verifier.GetShare(ctx, limit, piecePrivateKey, pending.StripeIndex, pending.ShareSize, int(pieceNum))
+			// check if the pending audit was deleted while downloading the share
+			_, getErr := verifier.containment.Get(ctx, pending.NodeID)
+			if getErr != nil {
+				if ErrContainedNotFound.Has(getErr) {
+					ch <- result{nodeID: pending.NodeID, status: skipped}
+					verifier.log.Debug("Reverify: pending audit deleted during reverification", zap.String("Segment Path", pending.Path), zap.Stringer("Node ID", pending.NodeID), zap.Error(getErr))
+					return
+				}
+				ch <- result{nodeID: pending.NodeID, status: erred, err: getErr}
+				verifier.log.Debug("Reverify: error getting from containment db", zap.String("Segment Path", pending.Path), zap.Stringer("Node ID", pending.NodeID), zap.Error(getErr))
 				return
 			}
@@ -383,71 +404,71 @@ func (verifier *Verifier) Reverify(ctx context.Context, path storj.Path) (report
 				if transport.Error.Has(err) {
 					if errs.Is(err, context.DeadlineExceeded) {
 						// dial timeout
-						ch <- result{nodeID: piece.NodeId, status: offline}
-						verifier.log.Debug("Reverify: dial timeout (offline)", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(err))
+						ch <- result{nodeID: pending.NodeID, status: offline}
+						verifier.log.Debug("Reverify: dial timeout (offline)", zap.String("Segment Path", path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
 						return
 					}
 					if errs2.IsRPC(err, codes.Unknown) {
 						// dial failed -- offline node
-						verifier.log.Debug("Reverify: dial failed (offline)", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(err))
-						ch <- result{nodeID: piece.NodeId, status: offline}
+						verifier.log.Debug("Reverify: dial failed (offline)", zap.String("Segment Path", path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
+						ch <- result{nodeID: pending.NodeID, status: offline}
 						return
 					}
 					// unknown transport error
-					ch <- result{nodeID: piece.NodeId, status: contained, pendingAudit: pending}
-					verifier.log.Debug("Reverify: unknown transport error (contained)", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(err))
+					ch <- result{nodeID: pending.NodeID, status: contained, pendingAudit: pending}
+					verifier.log.Debug("Reverify: unknown transport error (contained)", zap.String("Segment Path", path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
 					return
 				}
 				if errs2.IsRPC(err, codes.NotFound) {
 					// Get the original segment pointer in the metainfo
-					oldPtr, err := verifier.checkIfSegmentAltered(ctx, pending.Path, pointer)
+					oldPtr, err := verifier.checkIfSegmentAltered(ctx, pending.Path, pendingPointer)
 					if err != nil {
-						ch <- result{nodeID: piece.NodeId, status: success}
-						verifier.log.Debug("Reverify: audit source deleted before reverification", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(err))
+						ch <- result{nodeID: pending.NodeID, status: success}
+						verifier.log.Debug("Reverify: audit source deleted before reverification", zap.String("Segment Path", path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
 						return
 					}
 					// remove failed audit pieces from the pointer so as to only penalize once for failed audits
 					err = verifier.removeFailedPieces(ctx, pending.Path, oldPtr, storj.NodeIDList{pending.NodeID})
 					if err != nil {
-						verifier.log.Warn("Reverify: failed to delete failed pieces", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(err))
+						verifier.log.Warn("Reverify: failed to delete failed pieces", zap.String("Segment Path", path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
 					}
 					// missing share
-					ch <- result{nodeID: piece.NodeId, status: failed}
-					verifier.log.Debug("Reverify: piece not found (audit failed)", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(err))
+					ch <- result{nodeID: pending.NodeID, status: failed}
+					verifier.log.Debug("Reverify: piece not found (audit failed)", zap.String("Segment Path", path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
 					return
 				}
 				if errs2.IsRPC(err, codes.DeadlineExceeded) {
 					// dial successful, but download timed out
-					ch <- result{nodeID: piece.NodeId, status: contained, pendingAudit: pending}
-					verifier.log.Debug("Reverify: download timeout (contained)", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(err))
+					ch <- result{nodeID: pending.NodeID, status: contained, pendingAudit: pending}
+					verifier.log.Debug("Reverify: download timeout (contained)", zap.String("Segment Path", path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
 					return
 				}
 				// unknown error
-				ch <- result{nodeID: piece.NodeId, status: contained, pendingAudit: pending}
-				verifier.log.Debug("Reverify: unknown error (contained)", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(err))
+				ch <- result{nodeID: pending.NodeID, status: contained, pendingAudit: pending}
+				verifier.log.Debug("Reverify: unknown error (contained)", zap.String("Segment Path", path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
 				return
 			}
 			downloadedHash := pkcrypto.SHA256Hash(share.Data)
 			if bytes.Equal(downloadedHash, pending.ExpectedShareHash) {
-				ch <- result{nodeID: piece.NodeId, status: success}
-				verifier.log.Debug("Reverify: hashes match (audit success)", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId))
+				ch <- result{nodeID: pending.NodeID, status: success}
+				verifier.log.Debug("Reverify: hashes match (audit success)", zap.String("Segment Path", path), zap.Stringer("Node ID", pending.NodeID))
 			} else {
-				oldPtr, err := verifier.checkIfSegmentAltered(ctx, pending.Path, pointer)
+				oldPtr, err := verifier.checkIfSegmentAltered(ctx, pending.Path, pendingPointer)
 				if err != nil {
-					ch <- result{nodeID: piece.NodeId, status: success}
-					verifier.log.Debug("Reverify: audit source deleted before reverification", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(err))
+					ch <- result{nodeID: pending.NodeID, status: success}
+					verifier.log.Debug("Reverify: audit source deleted before reverification", zap.String("Segment Path", path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
 					return
 				}
 				// remove failed audit pieces from the pointer so as to only penalize once for failed audits
 				err = verifier.removeFailedPieces(ctx, pending.Path, oldPtr, storj.NodeIDList{pending.NodeID})
 				if err != nil {
-					verifier.log.Warn("Reverify: failed to delete failed pieces", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId), zap.Error(err))
+					verifier.log.Warn("Reverify: failed to delete failed pieces", zap.String("Segment Path", pending.Path), zap.Stringer("Node ID", pending.NodeID), zap.Error(err))
 				}
-				verifier.log.Debug("Reverify: hashes mismatch (audit failed)", zap.String("Segment Path", path), zap.Stringer("Node ID", piece.NodeId),
+				verifier.log.Debug("Reverify: hashes mismatch (audit failed)", zap.String("Segment Path", pending.Path), zap.Stringer("Node ID", pending.NodeID),
 					zap.Binary("expected hash", pending.ExpectedShareHash), zap.Binary("downloaded hash", downloadedHash))
-				ch <- result{nodeID: piece.NodeId, status: failed}
+				ch <- result{nodeID: pending.NodeID, status: failed}
 			}
-		}(pending, piece)
+		}(pending)
 	}
 	report = &Report{}