satellite/repair: only record audit result if segment can be downloaded

If satellite can't find enough nodes to successfully download a segment,
it probably is not the fault of storage nodes.

Change-Id: I681f66056df0bb940da9edb3a7dbb3658c0a56cb
This commit is contained in:
Yingrong Zhao 2021-11-12 16:04:30 -05:00 committed by Yingrong Zhao
parent 3de7f8d5af
commit 336500c04d
2 changed files with 36 additions and 57 deletions

View File

@ -569,27 +569,16 @@ func testFailedDataRepair(t *testing.T, inMemoryRepair bool) {
nodesReputationAfter[piece.StorageNode] = *info nodesReputationAfter[piece.StorageNode] = *info
} }
// repair should update audit status // repair shouldn't update audit status
for _, piece := range availablePieces[1 : len(availablePieces)-1] { for _, piece := range availablePieces {
successfulNodeReputation := nodesReputation[piece.StorageNode] successfulNodeReputation := nodesReputation[piece.StorageNode]
successfulNodeReputationAfter := nodesReputationAfter[piece.StorageNode] successfulNodeReputationAfter := nodesReputationAfter[piece.StorageNode]
require.Equal(t, successfulNodeReputation.TotalAuditCount+1, successfulNodeReputationAfter.TotalAuditCount) require.Equal(t, successfulNodeReputation.TotalAuditCount, successfulNodeReputationAfter.TotalAuditCount)
require.Equal(t, successfulNodeReputation.AuditSuccessCount+1, successfulNodeReputationAfter.AuditSuccessCount) require.Equal(t, successfulNodeReputation.AuditSuccessCount, successfulNodeReputationAfter.AuditSuccessCount)
require.True(t, successfulNodeReputation.AuditReputationAlpha < successfulNodeReputationAfter.AuditReputationAlpha) require.Equal(t, successfulNodeReputation.AuditReputationAlpha, successfulNodeReputationAfter.AuditReputationAlpha)
require.True(t, successfulNodeReputation.AuditReputationBeta >= successfulNodeReputationAfter.AuditReputationBeta) require.Equal(t, successfulNodeReputation.AuditReputationBeta, successfulNodeReputationAfter.AuditReputationBeta)
} }
offlineNodeReputation := nodesReputation[offlinePiece.StorageNode]
offlineNodeReputationAfter := nodesReputationAfter[offlinePiece.StorageNode]
require.Equal(t, offlineNodeReputation.TotalAuditCount+1, offlineNodeReputationAfter.TotalAuditCount)
require.Equal(t, int32(0), offlineNodeReputationAfter.AuditHistory.Windows[0].OnlineCount)
badNodeReputation := nodesReputation[unknownPiece.StorageNode]
badNodeReputationAfter := nodesReputationAfter[unknownPiece.StorageNode]
require.Equal(t, badNodeReputation.TotalAuditCount+1, badNodeReputationAfter.TotalAuditCount)
require.True(t, badNodeReputation.UnknownAuditReputationBeta < badNodeReputationAfter.UnknownAuditReputationBeta)
require.True(t, badNodeReputation.UnknownAuditReputationAlpha >= badNodeReputationAfter.UnknownAuditReputationAlpha)
// repair should fail, so segment should contain all the original nodes // repair should fail, so segment should contain all the original nodes
segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket") segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
for _, piece := range segmentAfter.Pieces { for _, piece := range segmentAfter.Pieces {
@ -1077,22 +1066,16 @@ func testMissingPieceDataRepairFailed(t *testing.T, inMemoryRepair bool) {
nodesReputationAfter[piece.StorageNode] = *info nodesReputationAfter[piece.StorageNode] = *info
} }
// repair should update audit status // repair shouldn't update audit status
for _, piece := range successful { for _, piece := range successful {
successfulNodeReputation := nodesReputation[piece.StorageNode] successfulNodeReputation := nodesReputation[piece.StorageNode]
successfulNodeReputationAfter := nodesReputationAfter[piece.StorageNode] successfulNodeReputationAfter := nodesReputationAfter[piece.StorageNode]
require.Equal(t, successfulNodeReputation.TotalAuditCount+1, successfulNodeReputationAfter.TotalAuditCount) require.Equal(t, successfulNodeReputation.TotalAuditCount, successfulNodeReputationAfter.TotalAuditCount)
require.Equal(t, successfulNodeReputation.AuditSuccessCount+1, successfulNodeReputationAfter.AuditSuccessCount) require.Equal(t, successfulNodeReputation.AuditSuccessCount, successfulNodeReputationAfter.AuditSuccessCount)
require.True(t, successfulNodeReputation.AuditReputationAlpha < successfulNodeReputationAfter.AuditReputationAlpha) require.Equal(t, successfulNodeReputation.AuditReputationAlpha, successfulNodeReputationAfter.AuditReputationAlpha)
require.True(t, successfulNodeReputation.AuditReputationBeta >= successfulNodeReputationAfter.AuditReputationBeta) require.Equal(t, successfulNodeReputation.AuditReputationBeta, successfulNodeReputationAfter.AuditReputationBeta)
} }
missingPieceNodeReputation := nodesReputation[missingPiece.StorageNode]
missingPieceNodeReputationAfter := nodesReputationAfter[missingPiece.StorageNode]
require.Equal(t, missingPieceNodeReputation.TotalAuditCount+1, missingPieceNodeReputationAfter.TotalAuditCount)
require.True(t, missingPieceNodeReputation.AuditReputationBeta < missingPieceNodeReputationAfter.AuditReputationBeta)
require.True(t, missingPieceNodeReputation.AuditReputationAlpha >= missingPieceNodeReputationAfter.AuditReputationAlpha)
// repair should fail, so segment should contain all the original nodes // repair should fail, so segment should contain all the original nodes
segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket") segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
for _, piece := range segmentAfter.Pieces { for _, piece := range segmentAfter.Pieces {
@ -1325,22 +1308,16 @@ func testCorruptDataRepairFailed(t *testing.T, inMemoryRepair bool) {
nodesReputationAfter[piece.StorageNode] = *info nodesReputationAfter[piece.StorageNode] = *info
} }
// repair should update audit status // repair shouldn't update audit status
for _, piece := range successful { for _, piece := range successful {
successfulNodeReputation := nodesReputation[piece.StorageNode] successfulNodeReputation := nodesReputation[piece.StorageNode]
successfulNodeReputationAfter := nodesReputationAfter[piece.StorageNode] successfulNodeReputationAfter := nodesReputationAfter[piece.StorageNode]
require.Equal(t, successfulNodeReputation.TotalAuditCount+1, successfulNodeReputationAfter.TotalAuditCount) require.Equal(t, successfulNodeReputation.TotalAuditCount, successfulNodeReputationAfter.TotalAuditCount)
require.Equal(t, successfulNodeReputation.AuditSuccessCount+1, successfulNodeReputationAfter.AuditSuccessCount) require.Equal(t, successfulNodeReputation.AuditSuccessCount, successfulNodeReputationAfter.AuditSuccessCount)
require.True(t, successfulNodeReputation.AuditReputationAlpha < successfulNodeReputationAfter.AuditReputationAlpha) require.Equal(t, successfulNodeReputation.AuditReputationAlpha, successfulNodeReputationAfter.AuditReputationAlpha)
require.True(t, successfulNodeReputation.AuditReputationBeta >= successfulNodeReputationAfter.AuditReputationBeta) require.Equal(t, successfulNodeReputation.AuditReputationBeta, successfulNodeReputationAfter.AuditReputationBeta)
} }
corruptedNodeReputation := nodesReputation[corruptedPiece.StorageNode]
corruptedNodeReputationAfter := nodesReputationAfter[corruptedPiece.StorageNode]
require.Equal(t, corruptedNodeReputation.TotalAuditCount+1, corruptedNodeReputationAfter.TotalAuditCount)
require.True(t, corruptedNodeReputation.AuditReputationBeta < corruptedNodeReputationAfter.AuditReputationBeta)
require.True(t, corruptedNodeReputation.AuditReputationAlpha >= corruptedNodeReputationAfter.AuditReputationAlpha)
// repair should fail, so segment should contain all the original nodes // repair should fail, so segment should contain all the original nodes
segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket") segmentAfter, _ := getRemoteSegment(ctx, t, satellite, planet.Uplinks[0].Projects[0].ID, "testbucket")
for _, piece := range segmentAfter.Pieces { for _, piece := range segmentAfter.Pieces {

View File

@ -315,24 +315,6 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
if len(piecesReport.Contained) > 0 { if len(piecesReport.Contained) > 0 {
repairer.log.Debug("unexpected contained pieces during repair", zap.Int("count", len(piecesReport.Contained))) repairer.log.Debug("unexpected contained pieces during repair", zap.Int("count", len(piecesReport.Contained)))
} }
var report audit.Report
for _, piece := range piecesReport.Successful {
report.Successes = append(report.Successes, piece.StorageNode)
}
for _, piece := range piecesReport.Failed {
report.Fails = append(report.Fails, piece.StorageNode)
}
for _, piece := range piecesReport.Offline {
report.Offlines = append(report.Offlines, piece.StorageNode)
}
for _, piece := range piecesReport.Unknown {
report.Unknown = append(report.Unknown, piece.StorageNode)
}
_, reportErr := repairer.reporter.RecordAudits(ctx, report)
if reportErr != nil {
// failed updates should not affect repair, therefore we will not return the error
repairer.log.Debug("failed to record audit", zap.Error(reportErr))
}
if err != nil { if err != nil {
// If the context was closed during the Get phase, it will appear here as though // If the context was closed during the Get phase, it will appear here as though
@ -363,6 +345,26 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
} }
defer func() { err = errs.Combine(err, segmentReader.Close()) }() defer func() { err = errs.Combine(err, segmentReader.Close()) }()
// only report audit result when segment can be successfully downloaded
var report audit.Report
for _, piece := range piecesReport.Successful {
report.Successes = append(report.Successes, piece.StorageNode)
}
for _, piece := range piecesReport.Failed {
report.Fails = append(report.Fails, piece.StorageNode)
}
for _, piece := range piecesReport.Offline {
report.Offlines = append(report.Offlines, piece.StorageNode)
}
for _, piece := range piecesReport.Unknown {
report.Unknown = append(report.Unknown, piece.StorageNode)
}
_, reportErr := repairer.reporter.RecordAudits(ctx, report)
if reportErr != nil {
// failed updates should not affect repair, therefore we will not return the error
repairer.log.Debug("failed to record audit", zap.Error(reportErr))
}
// Upload the repaired pieces // Upload the repaired pieces
successfulNodes, _, err := repairer.ec.Repair(ctx, putLimits, putPrivateKey, redundancy, segmentReader, repairer.timeout, minSuccessfulNeeded) successfulNodes, _, err := repairer.ec.Repair(ctx, putLimits, putPrivateKey, redundancy, segmentReader, repairer.timeout, minSuccessfulNeeded)
if err != nil { if err != nil {