satellite/repair: add monkit counter for segments below minimum required
The current monkit reporting for "remote_segments_lost" is not usable for triggering alerts, because the series has reported no data. To allow alerting, two new metrics, "checker_segments_below_min_req" and "repairer_segments_below_min_req", increment by zero on every segment and by one whenever a segment is below the minimum required piece count, so the series always has data points. The two metrics report what is found by the checker and the repairer respectively.

Change-Id: I98a68bb189eaf68a833d25cf5db9e68df535b9d7
parent 2ff7925e65
commit da9f1f0611
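The counting pattern the message describes is simple enough to sketch. A minimal illustration, assuming the spacemonkeygo/monkit v3 API that storj uses; checkSegment, healthyPieces, and minRequired are hypothetical stand-ins, not code from this commit:

package checker

import monkit "github.com/spacemonkeygo/monkit/v3"

var mon = monkit.Package()

// checkSegment mirrors the reporting pattern: every segment bumps the
// counter by zero so the series always has data points to alert on, and
// only a segment below the minimum required piece count bumps it by one.
func checkSegment(healthyPieces, minRequired int) {
	// ensure we get values, even if only zero values
	mon.Counter("checker_segments_below_min_req").Inc(0)

	if healthyPieces < minRequired {
		mon.Counter("checker_segments_below_min_req").Inc(1)
	}
}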
@@ -58,6 +58,7 @@ storj.io/storj/satellite/repair/checker."checker_segment_age" IntVal
 storj.io/storj/satellite/repair/checker."checker_segment_healthy_count" IntVal
 storj.io/storj/satellite/repair/checker."checker_segment_time_until_irreparable" IntVal
 storj.io/storj/satellite/repair/checker."checker_segment_total_count" IntVal
+storj.io/storj/satellite/repair/checker."checker_segments_below_min_req" Counter
 storj.io/storj/satellite/repair/checker."healthy_segments_removed_from_queue" IntVal
 storj.io/storj/satellite/repair/checker."new_remote_segments_needing_repair" IntVal
 storj.io/storj/satellite/repair/checker."remote_files_checked" IntVal
@@ -88,6 +89,7 @@ storj.io/storj/satellite/repair/repairer."repair_segment_size" IntVal
 storj.io/storj/satellite/repair/repairer."repair_success" Meter
 storj.io/storj/satellite/repair/repairer."repair_too_many_nodes_failed" Meter
 storj.io/storj/satellite/repair/repairer."repair_unnecessary" Meter
+storj.io/storj/satellite/repair/repairer."repairer_segments_below_min_req" Counter
 storj.io/storj/satellite/repair/repairer."segment_deleted_before_repair" Meter
 storj.io/storj/satellite/repair/repairer."segment_repair_count" IntVal
 storj.io/storj/satellite/repair/repairer."segment_time_until_repair" IntVal
@@ -268,6 +268,9 @@ func (obs *checkerObserver) RemoteSegment(ctx context.Context, segment *metainfo
 
 	obs.monStats.remoteSegmentsChecked++
 
+	// ensure we get values, even if only zero values, so that redash can have an alert based on this
+	mon.Counter("checker_segments_below_min_req").Inc(0) //mon:locked
+
 	pieces := segment.Pieces
 	if len(pieces) == 0 {
 		obs.log.Debug("no pieces on remote segment")
@@ -344,6 +347,7 @@ func (obs *checkerObserver) RemoteSegment(ctx context.Context, segment *metainfo
 		mon.IntVal("checker_segment_time_until_irreparable").Observe(int64(segmentAge.Seconds())) //mon:locked
 
 		obs.monStats.remoteSegmentsLost++
+		mon.Counter("checker_segments_below_min_req").Inc(1) //mon:locked
 		// make an entry into the irreparable table
 		segmentInfo := &internalpb.IrreparableSegment{
 			Path: key,
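Taken together, the two checker hunks give the counter a value on every observed segment: Inc(0) materializes the series even when nothing is lost, and Inc(1) records an actual below-minimum segment. A sketch of how an exporter might then enumerate the series, assuming monkit v3's Registry.Stats; the printing loop is illustrative only, not the satellite's actual export path:

package main

import (
	"fmt"

	monkit "github.com/spacemonkeygo/monkit/v3"
)

var mon = monkit.Package()

func main() {
	// The zero increment materializes the series even when no segment
	// has been lost, so every scrape sees a value.
	mon.Counter("checker_segments_below_min_req").Inc(0)

	// Enumerate every stat in the default registry, as an exporter would.
	monkit.Default.Stats(func(key monkit.SeriesKey, field string, val float64) {
		fmt.Printf("%s %s=%v\n", key.Measurement, field, val)
	})
}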
@@ -141,7 +141,8 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
 	numHealthy := len(pieces) - len(missingPieces)
 	// irreparable piece
 	if int32(numHealthy) < pointer.Remote.Redundancy.MinReq {
-		mon.Meter("repair_nodes_unavailable").Mark(1) //mon:locked
+		mon.Counter("repairer_segments_below_min_req").Inc(1) //mon:locked
+		mon.Meter("repair_nodes_unavailable").Mark(1)         //mon:locked
 		return true, &irreparableError{
 			path:            path,
 			piecesAvailable: int32(numHealthy),
@@ -150,6 +151,9 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
 		}
 	}
 
+	// ensure we get values, even if only zero values, so that redash can have an alert based on this
+	mon.Counter("repairer_segments_below_min_req").Inc(0) //mon:locked
+
 	repairThreshold := pointer.Remote.Redundancy.RepairThreshold
 	if repairer.repairOverride != 0 {
 		repairThreshold = int32(repairer.repairOverride)
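The repairer side uses the same zero-increment idea, which could be sanity-checked in isolation. A hypothetical test sketch, not part of this commit, again assuming the monkit v3 API (ScopeNamed, NewRegistry, and the Measurement field of SeriesKey):

package repairer_test

import (
	"testing"

	monkit "github.com/spacemonkeygo/monkit/v3"
)

func TestZeroIncrementRegistersSeries(t *testing.T) {
	// A private registry keeps the test independent of global state.
	registry := monkit.NewRegistry()
	scope := registry.ScopeNamed("storj.io/storj/satellite/repair/repairer")

	// Incrementing by zero is enough to make the series visible to Stats.
	scope.Counter("repairer_segments_below_min_req").Inc(0)

	found := false
	registry.Stats(func(key monkit.SeriesKey, field string, val float64) {
		if key.Measurement == "repairer_segments_below_min_req" {
			found = true
		}
	})
	if !found {
		t.Fatal("expected a series for repairer_segments_below_min_req")
	}
}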