2ac72eaf16
There is a new checker field called statsCollector. It contains a map of stats pointers where the key is a stringified redundancy scheme. Each stats holds all of the tagged monkit metrics. These metrics exist under the key name "tagged_repair_stats" and are tagged with the name of each metric and the corresponding RS scheme. As the metainfo observer works on a segment, it checks statsCollector for a stats entry corresponding to the segment's redundancy scheme. If one doesn't exist, it is created and chained to the monkit scope. Now we can call Observe, Inc, etc. on the fields just like before, and they have tags! durabilityStats has also been renamed to aggregateStats. At the end of the metainfo loop, we insert the aggregateStats totals into the corresponding stats fields for metric reporting. Change-Id: I8aa1918351d246a8ef818b9712ed4cb39d1ea9c6
167 lines
8.8 KiB
Go
167 lines
8.8 KiB
Go
// Copyright (C) 2020 Storj Labs, Inc.
|
|
// See LICENSE for copying information.
|
|
|
|
package checker
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"github.com/spacemonkeygo/monkit/v3"
|
|
|
|
"storj.io/storj/satellite/metainfo/metabase"
|
|
)
|
|
|
|
// statsCollector holds a *stats for each redundancy scheme
|
|
// seen by the checker. These are chained into the monkit scope for
|
|
// monitoring as they are initialized.
|
|
type statsCollector struct {
|
|
stats map[string]*stats
|
|
}
|
|
|
|
func newStatsCollector() *statsCollector {
|
|
return &statsCollector{
|
|
stats: make(map[string]*stats),
|
|
}
|
|
}
|
|
|
|
func (collector *statsCollector) getStatsByRS(rs string) *stats {
|
|
stats, ok := collector.stats[rs]
|
|
if !ok {
|
|
stats = newStats(rs)
|
|
mon.Chain(stats)
|
|
collector.stats[rs] = stats
|
|
}
|
|
return stats
|
|
}
|
|
|
|
// collectAggregates transfers the iteration aggregates into the
|
|
// respective stats monkit metrics at the end of each checker iteration.
|
|
// iterationAggregates is then cleared.
|
|
func (collector *statsCollector) collectAggregates() {
|
|
for _, stats := range collector.stats {
|
|
stats.collectAggregates()
|
|
stats.iterationAggregates = new(aggregateStats)
|
|
}
|
|
}
|
|
|
|
// stats is used for collecting and reporting checker metrics.
|
|
//
|
|
// add any new metrics tagged with rs_scheme to this struct and set them
|
|
// in newStats.
|
|
type stats struct {
|
|
iterationAggregates *aggregateStats
|
|
|
|
objectsChecked *monkit.IntVal
|
|
remoteSegmentsChecked *monkit.IntVal
|
|
remoteSegmentsNeedingRepair *monkit.IntVal
|
|
newRemoteSegmentsNeedingRepair *monkit.IntVal
|
|
remoteSegmentsLost *monkit.IntVal
|
|
objectsLost *monkit.IntVal
|
|
remoteSegmentsFailedToCheck *monkit.IntVal
|
|
remoteSegmentsHealthyPercentage *monkit.FloatVal
|
|
|
|
// remoteSegmentsOverThreshold[0]=# of healthy=rt+1, remoteSegmentsOverThreshold[1]=# of healthy=rt+2, etc...
|
|
remoteSegmentsOverThreshold1 *monkit.IntVal
|
|
remoteSegmentsOverThreshold2 *monkit.IntVal
|
|
remoteSegmentsOverThreshold3 *monkit.IntVal
|
|
remoteSegmentsOverThreshold4 *monkit.IntVal
|
|
remoteSegmentsOverThreshold5 *monkit.IntVal
|
|
|
|
segmentsBelowMinReq *monkit.Counter
|
|
segmentTotalCount *monkit.IntVal
|
|
segmentHealthyCount *monkit.IntVal
|
|
segmentAge *monkit.IntVal
|
|
segmentHealth *monkit.FloatVal
|
|
injuredSegmentHealth *monkit.FloatVal
|
|
segmentTimeUntilIrreparable *monkit.IntVal
|
|
}
|
|
|
|
// aggregateStats tallies data over the full checker iteration.
|
|
type aggregateStats struct {
|
|
objectsChecked int64
|
|
remoteSegmentsChecked int64
|
|
remoteSegmentsNeedingRepair int64
|
|
newRemoteSegmentsNeedingRepair int64
|
|
remoteSegmentsLost int64
|
|
remoteSegmentsFailedToCheck int64
|
|
remoteSegmentInfo []metabase.ObjectLocation
|
|
|
|
// remoteSegmentsOverThreshold[0]=# of healthy=rt+1, remoteSegmentsOverThreshold[1]=# of healthy=rt+2, etc...
|
|
remoteSegmentsOverThreshold [5]int64
|
|
}
|
|
|
|
func newStats(rs string) *stats {
|
|
return &stats{
|
|
iterationAggregates: new(aggregateStats),
|
|
objectsChecked: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_objects_checked").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsChecked: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_checked").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsNeedingRepair: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_needing_repair").WithTag("rs_scheme", rs)),
|
|
newRemoteSegmentsNeedingRepair: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "new_remote_segments_needing_repair").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsLost: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_lost").WithTag("rs_scheme", rs)),
|
|
objectsLost: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "objects_lost").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsFailedToCheck: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_failed_to_check").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsHealthyPercentage: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_healthy_percentage").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsOverThreshold1: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_1").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsOverThreshold2: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_2").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsOverThreshold3: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_3").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsOverThreshold4: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_4").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsOverThreshold5: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_5").WithTag("rs_scheme", rs)),
|
|
segmentsBelowMinReq: monkit.NewCounter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segments_below_min_req").WithTag("rs_scheme", rs)),
|
|
segmentTotalCount: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_total_count").WithTag("rs_scheme", rs)),
|
|
segmentHealthyCount: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_healthy_count").WithTag("rs_scheme", rs)),
|
|
segmentAge: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_age").WithTag("rs_scheme", rs)),
|
|
segmentHealth: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_health").WithTag("rs_scheme", rs)),
|
|
injuredSegmentHealth: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_injured_segment_health").WithTag("rs_scheme", rs)),
|
|
segmentTimeUntilIrreparable: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_time_until_irreparable").WithTag("rs_scheme", rs)),
|
|
}
|
|
}
|
|
|
|
func (stats *stats) collectAggregates() {
|
|
stats.objectsChecked.Observe(stats.iterationAggregates.objectsChecked)
|
|
stats.remoteSegmentsChecked.Observe(stats.iterationAggregates.remoteSegmentsChecked)
|
|
stats.remoteSegmentsNeedingRepair.Observe(stats.iterationAggregates.remoteSegmentsNeedingRepair)
|
|
stats.newRemoteSegmentsNeedingRepair.Observe(stats.iterationAggregates.newRemoteSegmentsNeedingRepair)
|
|
stats.remoteSegmentsLost.Observe(stats.iterationAggregates.remoteSegmentsLost)
|
|
stats.objectsLost.Observe(int64(len(stats.iterationAggregates.remoteSegmentInfo)))
|
|
stats.remoteSegmentsFailedToCheck.Observe(stats.iterationAggregates.remoteSegmentsFailedToCheck)
|
|
stats.remoteSegmentsOverThreshold1.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[0])
|
|
stats.remoteSegmentsOverThreshold2.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[1])
|
|
stats.remoteSegmentsOverThreshold3.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[2])
|
|
stats.remoteSegmentsOverThreshold4.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[3])
|
|
stats.remoteSegmentsOverThreshold5.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[4])
|
|
|
|
allUnhealthy := stats.iterationAggregates.remoteSegmentsNeedingRepair + stats.iterationAggregates.remoteSegmentsFailedToCheck
|
|
allChecked := stats.iterationAggregates.remoteSegmentsChecked
|
|
allHealthy := allChecked - allUnhealthy
|
|
|
|
stats.remoteSegmentsHealthyPercentage.Observe(100 * float64(allHealthy) / float64(allChecked))
|
|
}
|
|
|
|
// Stats implements the monkit.StatSource interface.
|
|
func (stats *stats) Stats(cb func(key monkit.SeriesKey, field string, val float64)) {
|
|
stats.objectsChecked.Stats(cb)
|
|
stats.remoteSegmentsChecked.Stats(cb)
|
|
stats.remoteSegmentsNeedingRepair.Stats(cb)
|
|
stats.newRemoteSegmentsNeedingRepair.Stats(cb)
|
|
stats.remoteSegmentsLost.Stats(cb)
|
|
stats.objectsLost.Stats(cb)
|
|
stats.remoteSegmentsFailedToCheck.Stats(cb)
|
|
stats.remoteSegmentsOverThreshold1.Stats(cb)
|
|
stats.remoteSegmentsOverThreshold2.Stats(cb)
|
|
stats.remoteSegmentsOverThreshold3.Stats(cb)
|
|
stats.remoteSegmentsOverThreshold4.Stats(cb)
|
|
stats.remoteSegmentsOverThreshold5.Stats(cb)
|
|
stats.remoteSegmentsHealthyPercentage.Stats(cb)
|
|
stats.segmentsBelowMinReq.Stats(cb)
|
|
stats.segmentTotalCount.Stats(cb)
|
|
stats.segmentHealthyCount.Stats(cb)
|
|
stats.segmentAge.Stats(cb)
|
|
stats.segmentHealth.Stats(cb)
|
|
stats.injuredSegmentHealth.Stats(cb)
|
|
stats.segmentTimeUntilIrreparable.Stats(cb)
|
|
}
|
|
|
|
// getRSString renders a redundancy scheme as "min/repair/success/total",
// the string form used as the rs_scheme key.
func getRSString(min, repair, success, total int) string {
	// Sprint inserts no spaces here because every other operand is a string.
	return fmt.Sprint(min, "/", repair, "/", success, "/", total)
}
|