satellite/repair/checker: add monkit metrics for segments immediately above repair threshold

Record counts of segments at health = rt+1 through rt+5 (rt = repair threshold) on every
checker iteration; a minimal sketch of the bucketing follows the commit metadata below.

Change-Id: I2a00c0bc34d17beb21cacdeab4dac77f755faefe
Moby von Briesen 2020-02-24 18:06:52 -05:00 committed by Jennifer Li Johnson
parent 46228fee92
commit d5540c89a1
2 changed files with 26 additions and 5 deletions
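
In outline: on each checker iteration, every remote segment whose health lands in the window (repair threshold, repair threshold + 5] is tallied into one of five buckets, and each bucket is reported once at the end of the iteration. A minimal standalone sketch of that bucketing, with hypothetical names and numbers (tallyOverThreshold and the sample healths are not part of the patch):

    package main

    import "fmt"

    // tallyOverThreshold counts healths that land in (repairThreshold, repairThreshold+5].
    // Slot i holds the number of segments with health == repairThreshold + i + 1.
    func tallyOverThreshold(healths []int32, repairThreshold int32) [5]int64 {
        var buckets [5]int64
        for _, h := range healths {
            if h > repairThreshold && h <= repairThreshold+int32(len(buckets)) {
                buckets[h-repairThreshold-1]++
            }
        }
        return buckets
    }

    func main() {
        healths := []int32{10, 11, 11, 12, 15, 16}   // hypothetical segment healths
        fmt.Println(tallyOverThreshold(healths, 10)) // hypothetical repair threshold 10 -> [2 1 0 0 1]
    }

The real checker accumulates the same buckets one segment at a time in durabilityStats and hands them to monkit, as the two files below show.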


@@ -61,6 +61,11 @@ storj.io/storj/satellite/repair/checker."remote_files_lost" IntVal
storj.io/storj/satellite/repair/checker."remote_segments_checked" IntVal
storj.io/storj/satellite/repair/checker."remote_segments_lost" IntVal
storj.io/storj/satellite/repair/checker."remote_segments_needing_repair" IntVal
storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_1" IntVal
storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_2" IntVal
storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_3" IntVal
storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_4" IntVal
storj.io/storj/satellite/repair/checker."remote_segments_over_threshold_5" IntVal
storj.io/storj/satellite/repair/repairer."download_failed_not_enough_pieces_repair" Meter
storj.io/storj/satellite/repair/repairer."healthy_ratio_after_repair" FloatVal
storj.io/storj/satellite/repair/repairer."healthy_ratio_before_repair" FloatVal
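
Each entry above names a stat registered on the checker package's monkit scope; the five new counters are plain IntVals observed once per iteration. A minimal sketch of how such a stat is declared and observed, assuming the github.com/spacemonkeygo/monkit/v3 import path (the reportOverThreshold helper and the Sprintf loop are illustrative only, not part of the patch):

    package checker

    import (
        "fmt"

        "github.com/spacemonkeygo/monkit/v3"
    )

    // mon is the package-level monkit scope; stats observed on it appear under
    // the storj.io/storj/satellite/repair/checker prefix listed above.
    var mon = monkit.Package()

    // reportOverThreshold records one sample per bucket for the current iteration.
    func reportOverThreshold(buckets [5]int64) {
        for i, n := range buckets {
            mon.IntVal(fmt.Sprintf("remote_segments_over_threshold_%d", i+1)).Observe(n)
        }
    }

The patch spells each metric name out as a string literal rather than formatting it, which keeps the names greppable and matches the entries added above.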


@@ -44,6 +44,8 @@ type durabilityStats struct {
remoteSegmentsNeedingRepair int64
remoteSegmentsLost int64
remoteSegmentInfo []string
// remoteSegmentsOverThreshold[0] = # of segments with health == repair threshold + 1, [1] = repair threshold + 2, and so on up to repair threshold + 5
remoteSegmentsOverThreshold [5]int64
}
// Checker contains the information needed to do checks for missing pieces
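
Because index i in remoteSegmentsOverThreshold stands for health = repair threshold + i + 1, the slot for a given segment could also be computed with plain arithmetic instead of the scan used further down; a hedged sketch of that alternative (overThresholdIndex is a hypothetical helper, not in the patch):

    // overThresholdIndex returns the remoteSegmentsOverThreshold slot for numHealthy,
    // or ok=false when numHealthy is at/below the repair threshold or beyond the window.
    func overThresholdIndex(numHealthy, repairThreshold int32) (slot int, ok bool) {
        slot = int(numHealthy - repairThreshold - 1)
        if slot < 0 || slot >= 5 {
            return 0, false
        }
        return slot, true
    }

The committed code keeps the loop-and-compare form, which never indexes outside the array; both approaches fill the same buckets.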
@@ -131,6 +133,11 @@ func (checker *Checker) IdentifyInjuredSegments(ctx context.Context) (err error)
mon.IntVal("remote_segments_needing_repair").Observe(observer.monStats.remoteSegmentsNeedingRepair) //locked
mon.IntVal("remote_segments_lost").Observe(observer.monStats.remoteSegmentsLost) //locked
mon.IntVal("remote_files_lost").Observe(int64(len(observer.monStats.remoteSegmentInfo))) //locked
mon.IntVal("remote_segments_over_threshold_1").Observe(observer.monStats.remoteSegmentsOverThreshold[0]) //locked
mon.IntVal("remote_segments_over_threshold_2").Observe(observer.monStats.remoteSegmentsOverThreshold[1]) //locked
mon.IntVal("remote_segments_over_threshold_3").Observe(observer.monStats.remoteSegmentsOverThreshold[2]) //locked
mon.IntVal("remote_segments_over_threshold_4").Observe(observer.monStats.remoteSegmentsOverThreshold[3]) //locked
mon.IntVal("remote_segments_over_threshold_5").Observe(observer.monStats.remoteSegmentsOverThreshold[4]) //locked
return nil
}
@@ -312,6 +319,15 @@ func (obs *checkerObserver) RemoteSegment(ctx context.Context, path metainfo.Sco
obs.log.Error("error handling irreparable segment to queue", zap.Error(err))
return nil
}
} else if numHealthy > redundancy.RepairThreshold && numHealthy <= (redundancy.RepairThreshold+int32(len(obs.monStats.remoteSegmentsOverThreshold))) {
// record metrics for segments right above the repair threshold:
// numHealthy = repairThreshold+1 through repairThreshold+5
for i := range obs.monStats.remoteSegmentsOverThreshold {
if numHealthy == (redundancy.RepairThreshold + int32(i) + 1) {
obs.monStats.remoteSegmentsOverThreshold[i]++
break
}
}
}
return nil
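
To make the window boundaries concrete: a segment exactly at the repair threshold falls to the earlier repair/irreparable branches, and a segment more than five above it is not tracked at all. A self-contained run of the same guard and loop with hypothetical numbers:

    package main

    import "fmt"

    func main() {
        const repairThreshold = int32(10) // hypothetical value
        var buckets [5]int64
        // Mirror of the loop above: only healths in (repairThreshold, repairThreshold+5] are counted.
        for _, numHealthy := range []int32{9, 10, 11, 15, 16} {
            if numHealthy > repairThreshold && numHealthy <= repairThreshold+int32(len(buckets)) {
                for i := range buckets {
                    if numHealthy == repairThreshold+int32(i)+1 {
                        buckets[i]++
                        break
                    }
                }
            }
        }
        fmt.Println(buckets) // [1 0 0 0 1]: only 11 (rt+1) and 15 (rt+5) land in a bucket
    }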