satellite/repair: add monkit counter for segments below minimum required

The current monkit reporting for "remote_segments_lost" is not usable
for triggering alerts, because it has reported no data. To allow
alerting, two new metrics, "checker_segments_below_min_req" and
"repairer_segments_below_min_req", are incremented by zero for every
segment processed and by one for each segment found to be below the
minimum required piece count, so they always report a value. The two
metrics record what is found by the checker and the repairer
respectively.
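
A minimal sketch of the zero-increment pattern described above, assuming
the monkit v3 API (github.com/spacemonkeygo/monkit/v3) that the diffs
below use; the helper name and its parameters are hypothetical and not
part of this change:

package example

import (
	monkit "github.com/spacemonkeygo/monkit/v3"
)

var mon = monkit.Package()

// recordSegmentHealth illustrates the pattern: increment by zero for
// every segment so the counter always emits a data point, and by one
// only when the segment has fewer healthy pieces than the minimum
// required for reconstruction.
func recordSegmentHealth(numHealthy, minRequired int) {
	mon.Counter("checker_segments_below_min_req").Inc(0)
	if numHealthy < minRequired {
		mon.Counter("checker_segments_below_min_req").Inc(1)
	}
}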

Change-Id: I98a68bb189eaf68a833d25cf5db9e68df535b9d7
Cameron Ayer 2020-11-10 09:49:19 -05:00 committed by Stefan Benten
parent 2ff7925e65
commit da9f1f0611
3 changed files with 11 additions and 1 deletion


@@ -58,6 +58,7 @@ storj.io/storj/satellite/repair/checker."checker_segment_age" IntVal
storj.io/storj/satellite/repair/checker."checker_segment_healthy_count" IntVal
storj.io/storj/satellite/repair/checker."checker_segment_time_until_irreparable" IntVal
storj.io/storj/satellite/repair/checker."checker_segment_total_count" IntVal
storj.io/storj/satellite/repair/checker."checker_segments_below_min_req" Counter
storj.io/storj/satellite/repair/checker."healthy_segments_removed_from_queue" IntVal
storj.io/storj/satellite/repair/checker."new_remote_segments_needing_repair" IntVal
storj.io/storj/satellite/repair/checker."remote_files_checked" IntVal
@@ -88,6 +89,7 @@ storj.io/storj/satellite/repair/repairer."repair_segment_size" IntVal
storj.io/storj/satellite/repair/repairer."repair_success" Meter
storj.io/storj/satellite/repair/repairer."repair_too_many_nodes_failed" Meter
storj.io/storj/satellite/repair/repairer."repair_unnecessary" Meter
storj.io/storj/satellite/repair/repairer."repairer_segments_below_min_req" Counter
storj.io/storj/satellite/repair/repairer."segment_deleted_before_repair" Meter
storj.io/storj/satellite/repair/repairer."segment_repair_count" IntVal
storj.io/storj/satellite/repair/repairer."segment_time_until_repair" IntVal


@@ -268,6 +268,9 @@ func (obs *checkerObserver) RemoteSegment(ctx context.Context, segment *metainfo
obs.monStats.remoteSegmentsChecked++
// ensure we get values, even if only zero values, so that redash can have an alert based on this
mon.Counter("checker_segments_below_min_req").Inc(0) //mon:locked
pieces := segment.Pieces
if len(pieces) == 0 {
obs.log.Debug("no pieces on remote segment")
@@ -344,6 +347,7 @@ func (obs *checkerObserver) RemoteSegment(ctx context.Context, segment *metainfo
mon.IntVal("checker_segment_time_until_irreparable").Observe(int64(segmentAge.Seconds())) //mon:locked
obs.monStats.remoteSegmentsLost++
mon.Counter("checker_segments_below_min_req").Inc(1) //mon:locked
// make an entry into the irreparable table
segmentInfo := &internalpb.IrreparableSegment{
Path: key,


@@ -141,7 +141,8 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
numHealthy := len(pieces) - len(missingPieces)
// irreparable piece
if int32(numHealthy) < pointer.Remote.Redundancy.MinReq {
mon.Meter("repair_nodes_unavailable").Mark(1) //mon:locked
mon.Counter("repairer_segments_below_min_req").Inc(1) //mon:locked
mon.Meter("repair_nodes_unavailable").Mark(1) //mon:locked
return true, &irreparableError{
path: path,
piecesAvailable: int32(numHealthy),
@@ -150,6 +151,9 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
}
}
// ensure we get values, even if only zero values, so that redash can have an alert based on this
mon.Counter("repairer_segments_below_min_req").Inc(0) //mon:locked
repairThreshold := pointer.Remote.Redundancy.RepairThreshold
if repairer.repairOverride != 0 {
repairThreshold = int32(repairer.repairOverride)