// Copyright (C) 2020 Storj Labs, Inc. // See LICENSE for copying information. package checker import ( "fmt" "github.com/spacemonkeygo/monkit/v3" "storj.io/common/uuid" ) // statsCollector holds a *stats for each redundancy scheme // seen by the checker. These are chained into the monkit scope for // monitoring as they are initialized. type statsCollector struct { stats map[string]*stats } func newStatsCollector() *statsCollector { return &statsCollector{ stats: make(map[string]*stats), } } func (collector *statsCollector) getStatsByRS(rs string) *stats { stats, ok := collector.stats[rs] if !ok { stats = newStats(rs) mon.Chain(stats) collector.stats[rs] = stats } return stats } // collectAggregates transfers the iteration aggregates into the // respective stats monkit metrics at the end of each checker iteration. // iterationAggregates is then cleared. func (collector *statsCollector) collectAggregates() { for _, stats := range collector.stats { stats.collectAggregates() stats.iterationAggregates = new(aggregateStats) } } // stats is used for collecting and reporting checker metrics. // // add any new metrics tagged with rs_scheme to this struct and set them // in newStats. type stats struct { iterationAggregates *aggregateStats objectsChecked *monkit.IntVal remoteSegmentsChecked *monkit.IntVal remoteSegmentsNeedingRepair *monkit.IntVal newRemoteSegmentsNeedingRepair *monkit.IntVal remoteSegmentsLost *monkit.IntVal objectsLost *monkit.IntVal remoteSegmentsFailedToCheck *monkit.IntVal remoteSegmentsHealthyPercentage *monkit.FloatVal // remoteSegmentsOverThreshold[0]=# of healthy=rt+1, remoteSegmentsOverThreshold[1]=# of healthy=rt+2, etc... remoteSegmentsOverThreshold1 *monkit.IntVal remoteSegmentsOverThreshold2 *monkit.IntVal remoteSegmentsOverThreshold3 *monkit.IntVal remoteSegmentsOverThreshold4 *monkit.IntVal remoteSegmentsOverThreshold5 *monkit.IntVal segmentsBelowMinReq *monkit.Counter segmentTotalCount *monkit.IntVal segmentHealthyCount *monkit.IntVal segmentAge *monkit.IntVal segmentHealth *monkit.FloatVal injuredSegmentHealth *monkit.FloatVal segmentTimeUntilIrreparable *monkit.IntVal } // aggregateStats tallies data over the full checker iteration. type aggregateStats struct { objectsChecked int64 remoteSegmentsChecked int64 remoteSegmentsNeedingRepair int64 newRemoteSegmentsNeedingRepair int64 remoteSegmentsLost int64 remoteSegmentsFailedToCheck int64 objectsLost []uuid.UUID // remoteSegmentsOverThreshold[0]=# of healthy=rt+1, remoteSegmentsOverThreshold[1]=# of healthy=rt+2, etc... remoteSegmentsOverThreshold [5]int64 } func newStats(rs string) *stats { return &stats{ iterationAggregates: new(aggregateStats), objectsChecked: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_objects_checked").WithTag("rs_scheme", rs)), remoteSegmentsChecked: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_checked").WithTag("rs_scheme", rs)), remoteSegmentsNeedingRepair: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_needing_repair").WithTag("rs_scheme", rs)), newRemoteSegmentsNeedingRepair: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "new_remote_segments_needing_repair").WithTag("rs_scheme", rs)), remoteSegmentsLost: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_lost").WithTag("rs_scheme", rs)), objectsLost: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "objects_lost").WithTag("rs_scheme", rs)), remoteSegmentsFailedToCheck: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_failed_to_check").WithTag("rs_scheme", rs)), remoteSegmentsHealthyPercentage: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_healthy_percentage").WithTag("rs_scheme", rs)), remoteSegmentsOverThreshold1: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_1").WithTag("rs_scheme", rs)), remoteSegmentsOverThreshold2: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_2").WithTag("rs_scheme", rs)), remoteSegmentsOverThreshold3: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_3").WithTag("rs_scheme", rs)), remoteSegmentsOverThreshold4: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_4").WithTag("rs_scheme", rs)), remoteSegmentsOverThreshold5: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_5").WithTag("rs_scheme", rs)), segmentsBelowMinReq: monkit.NewCounter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segments_below_min_req").WithTag("rs_scheme", rs)), segmentTotalCount: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_total_count").WithTag("rs_scheme", rs)), segmentHealthyCount: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_healthy_count").WithTag("rs_scheme", rs)), segmentAge: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_age").WithTag("rs_scheme", rs)), segmentHealth: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_health").WithTag("rs_scheme", rs)), injuredSegmentHealth: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_injured_segment_health").WithTag("rs_scheme", rs)), segmentTimeUntilIrreparable: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_time_until_irreparable").WithTag("rs_scheme", rs)), } } func (stats *stats) collectAggregates() { stats.objectsChecked.Observe(stats.iterationAggregates.objectsChecked) stats.remoteSegmentsChecked.Observe(stats.iterationAggregates.remoteSegmentsChecked) stats.remoteSegmentsNeedingRepair.Observe(stats.iterationAggregates.remoteSegmentsNeedingRepair) stats.newRemoteSegmentsNeedingRepair.Observe(stats.iterationAggregates.newRemoteSegmentsNeedingRepair) stats.remoteSegmentsLost.Observe(stats.iterationAggregates.remoteSegmentsLost) stats.objectsLost.Observe(int64(len(stats.iterationAggregates.objectsLost))) stats.remoteSegmentsFailedToCheck.Observe(stats.iterationAggregates.remoteSegmentsFailedToCheck) stats.remoteSegmentsOverThreshold1.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[0]) stats.remoteSegmentsOverThreshold2.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[1]) stats.remoteSegmentsOverThreshold3.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[2]) stats.remoteSegmentsOverThreshold4.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[3]) stats.remoteSegmentsOverThreshold5.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[4]) allUnhealthy := stats.iterationAggregates.remoteSegmentsNeedingRepair + stats.iterationAggregates.remoteSegmentsFailedToCheck allChecked := stats.iterationAggregates.remoteSegmentsChecked allHealthy := allChecked - allUnhealthy stats.remoteSegmentsHealthyPercentage.Observe(100 * float64(allHealthy) / float64(allChecked)) } // Stats implements the monkit.StatSource interface. func (stats *stats) Stats(cb func(key monkit.SeriesKey, field string, val float64)) { stats.objectsChecked.Stats(cb) stats.remoteSegmentsChecked.Stats(cb) stats.remoteSegmentsNeedingRepair.Stats(cb) stats.newRemoteSegmentsNeedingRepair.Stats(cb) stats.remoteSegmentsLost.Stats(cb) stats.objectsLost.Stats(cb) stats.remoteSegmentsFailedToCheck.Stats(cb) stats.remoteSegmentsOverThreshold1.Stats(cb) stats.remoteSegmentsOverThreshold2.Stats(cb) stats.remoteSegmentsOverThreshold3.Stats(cb) stats.remoteSegmentsOverThreshold4.Stats(cb) stats.remoteSegmentsOverThreshold5.Stats(cb) stats.remoteSegmentsHealthyPercentage.Stats(cb) stats.segmentsBelowMinReq.Stats(cb) stats.segmentTotalCount.Stats(cb) stats.segmentHealthyCount.Stats(cb) stats.segmentAge.Stats(cb) stats.segmentHealth.Stats(cb) stats.injuredSegmentHealth.Stats(cb) stats.segmentTimeUntilIrreparable.Stats(cb) } func getRSString(min, repair, success, total int) string { return fmt.Sprintf("%d/%d/%d/%d", min, repair, success, total) }