From da9f1f06117014fd2066cc5848e4ff4fc646955d Mon Sep 17 00:00:00 2001 From: Cameron Ayer Date: Tue, 10 Nov 2020 09:49:19 -0500 Subject: [PATCH] satellite/repair: add monkit counter for segments below minimum required The current monkit reporting for "remote_segments_lost" is not usable for triggering alerts, as it has reported no data. To allow alerting, two new metrics "checker_segments_below_min_req" and "repairer_segments_below_min_req" will increment by zero on each segment unless it is below the minimum required piece count. The two metrics report what is found by the checker and the repairer respectively. Change-Id: I98a68bb189eaf68a833d25cf5db9e68df535b9d7 --- monkit.lock | 2 ++ satellite/repair/checker/checker.go | 4 ++++ satellite/repair/repairer/segments.go | 6 +++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/monkit.lock b/monkit.lock index fd29869a0..75cf35722 100644 --- a/monkit.lock +++ b/monkit.lock @@ -58,6 +58,7 @@ storj.io/storj/satellite/repair/checker."checker_segment_age" IntVal storj.io/storj/satellite/repair/checker."checker_segment_healthy_count" IntVal storj.io/storj/satellite/repair/checker."checker_segment_time_until_irreparable" IntVal storj.io/storj/satellite/repair/checker."checker_segment_total_count" IntVal +storj.io/storj/satellite/repair/checker."checker_segments_below_min_req" Counter storj.io/storj/satellite/repair/checker."healthy_segments_removed_from_queue" IntVal storj.io/storj/satellite/repair/checker."new_remote_segments_needing_repair" IntVal storj.io/storj/satellite/repair/checker."remote_files_checked" IntVal @@ -88,6 +89,7 @@ storj.io/storj/satellite/repair/repairer."repair_segment_size" IntVal storj.io/storj/satellite/repair/repairer."repair_success" Meter storj.io/storj/satellite/repair/repairer."repair_too_many_nodes_failed" Meter storj.io/storj/satellite/repair/repairer."repair_unnecessary" Meter +storj.io/storj/satellite/repair/repairer."repairer_segments_below_min_req" Counter storj.io/storj/satellite/repair/repairer."segment_deleted_before_repair" Meter storj.io/storj/satellite/repair/repairer."segment_repair_count" IntVal storj.io/storj/satellite/repair/repairer."segment_time_until_repair" IntVal diff --git a/satellite/repair/checker/checker.go b/satellite/repair/checker/checker.go index 2c7116ce4..b369d0dc2 100644 --- a/satellite/repair/checker/checker.go +++ b/satellite/repair/checker/checker.go @@ -268,6 +268,9 @@ func (obs *checkerObserver) RemoteSegment(ctx context.Context, segment *metainfo obs.monStats.remoteSegmentsChecked++ + // ensure we get values, even if only zero values, so that redash can have an alert based on this + mon.Counter("checker_segments_below_min_req").Inc(0) //mon:locked + pieces := segment.Pieces if len(pieces) == 0 { obs.log.Debug("no pieces on remote segment") @@ -344,6 +347,7 @@ func (obs *checkerObserver) RemoteSegment(ctx context.Context, segment *metainfo mon.IntVal("checker_segment_time_until_irreparable").Observe(int64(segmentAge.Seconds())) //mon:locked obs.monStats.remoteSegmentsLost++ + mon.Counter("checker_segments_below_min_req").Inc(1) //mon:locked // make an entry into the irreparable table segmentInfo := &internalpb.IrreparableSegment{ Path: key, diff --git a/satellite/repair/repairer/segments.go b/satellite/repair/repairer/segments.go index 5368114a4..2fab87832 100644 --- a/satellite/repair/repairer/segments.go +++ b/satellite/repair/repairer/segments.go @@ -141,7 +141,8 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s numHealthy := len(pieces) - len(missingPieces) // irreparable piece if int32(numHealthy) < pointer.Remote.Redundancy.MinReq { - mon.Meter("repair_nodes_unavailable").Mark(1) //mon:locked + mon.Counter("repairer_segments_below_min_req").Inc(1) //mon:locked + mon.Meter("repair_nodes_unavailable").Mark(1) //mon:locked return true, &irreparableError{ path: path, piecesAvailable: int32(numHealthy), @@ -150,6 +151,9 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s } } + // ensure we get values, even if only zero values, so that redash can have an alert based on this + mon.Counter("repairer_segments_below_min_req").Inc(0) //mon:locked + repairThreshold := pointer.Remote.Redundancy.RepairThreshold if repairer.repairOverride != 0 { repairThreshold = int32(repairer.repairOverride)