From d5540c89a16e61f531fe6d654c4abd00145cae70 Mon Sep 17 00:00:00 2001 From: Moby von Briesen Date: Mon, 24 Feb 2020 18:06:52 -0500 Subject: [PATCH] satellite/repair/checker: add monkit metrics for segments immediately above repair threshold Record counts for segments at health=rt+1 through health=rt+5 for every checker iteration. Change-Id: I2a00c0bc34d17beb21cacdeab4dac77f755faefe --- monkit.lock | 5 +++++ satellite/repair/checker/checker.go | 26 +++++++++++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/monkit.lock b/monkit.lock index 23618ba86..3fab6e0e5 100644 --- a/monkit.lock +++ b/monkit.lock @@ -61,6 +61,11 @@ storj.io/storj/satellite/repair/checker."remote_files_lost" IntVal storj.io/storj/satellite/repair/checker."remote_segments_checked" IntVal storj.io/storj/satellite/repair/checker."remote_segments_lost" IntVal storj.io/storj/satellite/repair/checker."remote_segments_needing_repair" IntVal +storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_1" IntVal +storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_2" IntVal +storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_3" IntVal +storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_4" IntVal +storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_5" IntVal storj.io/storj/satellite/repair/repairer."download_failed_not_enough_pieces_repair" Meter storj.io/storj/satellite/repair/repairer."healthy_ratio_after_repair" FloatVal storj.io/storj/satellite/repair/repairer."healthy_ratio_before_repair" FloatVal diff --git a/satellite/repair/checker/checker.go b/satellite/repair/checker/checker.go index 964381218..fe1a8281d 100644 --- a/satellite/repair/checker/checker.go +++ b/satellite/repair/checker/checker.go @@ -44,6 +44,8 @@ type durabilityStats struct { remoteSegmentsNeedingRepair int64 remoteSegmentsLost int64 remoteSegmentInfo []string + // remoteSegmentsOverThreshold[0]=# of healthy=rt+1, remoteSegmentsOverThreshold[1]=# of healthy=rt+2, etc... + remoteSegmentsOverThreshold [5]int64 } // Checker contains the information needed to do checks for missing pieces @@ -126,11 +128,16 @@ func (checker *Checker) IdentifyInjuredSegments(ctx context.Context) (err error) return err } - mon.IntVal("remote_files_checked").Observe(observer.monStats.objectsChecked) //locked - mon.IntVal("remote_segments_checked").Observe(observer.monStats.remoteSegmentsChecked) //locked - mon.IntVal("remote_segments_needing_repair").Observe(observer.monStats.remoteSegmentsNeedingRepair) //locked - mon.IntVal("remote_segments_lost").Observe(observer.monStats.remoteSegmentsLost) //locked - mon.IntVal("remote_files_lost").Observe(int64(len(observer.monStats.remoteSegmentInfo))) //locked + mon.IntVal("remote_files_checked").Observe(observer.monStats.objectsChecked) //locked + mon.IntVal("remote_segments_checked").Observe(observer.monStats.remoteSegmentsChecked) //locked + mon.IntVal("remote_segments_needing_repair").Observe(observer.monStats.remoteSegmentsNeedingRepair) //locked + mon.IntVal("remote_segments_lost").Observe(observer.monStats.remoteSegmentsLost) //locked + mon.IntVal("remote_files_lost").Observe(int64(len(observer.monStats.remoteSegmentInfo))) //locked + mon.IntVal("remote_segments_over_threshold_1").Observe(observer.monStats.remoteSegmentsOverThreshold[0]) //locked + mon.IntVal("remote_segments_over_threshold_2").Observe(observer.monStats.remoteSegmentsOverThreshold[1]) //locked + mon.IntVal("remote_segments_over_threshold_3").Observe(observer.monStats.remoteSegmentsOverThreshold[2]) //locked + mon.IntVal("remote_segments_over_threshold_4").Observe(observer.monStats.remoteSegmentsOverThreshold[3]) //locked + mon.IntVal("remote_segments_over_threshold_5").Observe(observer.monStats.remoteSegmentsOverThreshold[4]) //locked return nil } @@ -312,6 +319,15 @@ func (obs *checkerObserver) RemoteSegment(ctx context.Context, path metainfo.Sco obs.log.Error("error handling irreparable segment to queue", zap.Error(err)) return nil } + } else if numHealthy > redundancy.RepairThreshold && numHealthy <= (redundancy.RepairThreshold+int32(len(obs.monStats.remoteSegmentsOverThreshold))) { + // record metrics for segments right above repair threshold + // numHealthy=repairThreshold+1 through numHealthy=repairThreshold+5 + for i := range obs.monStats.remoteSegmentsOverThreshold { + if numHealthy == (redundancy.RepairThreshold + int32(i) + 1) { + obs.monStats.remoteSegmentsOverThreshold[i]++ + break + } + } } return nil