storj/satellite/repair/checker/checkerstats.go
Egon Elbre 267506bb20 satellite/metabase: move package one level higher
metabase has become a central concept and it's more suitable for it to
be directly nested under satellite rather than being part of metainfo.

metainfo is going to be the "endpoint" logic for handling requests.

Change-Id: I53770d6761ac1e9a1283b5aa68f471b21e784198
2021-04-21 15:54:22 +03:00

167 lines
8.8 KiB
Go

// Copyright (C) 2020 Storj Labs, Inc.
// See LICENSE for copying information.
package checker
import (
"fmt"
"github.com/spacemonkeygo/monkit/v3"
"storj.io/storj/satellite/metabase"
)
// statsCollector keeps one *stats per redundancy scheme that the checker
// has encountered. Each *stats is chained into the monkit scope at the
// moment it is first created.
type statsCollector struct {
	// stats is keyed by the redundancy scheme string.
	stats map[string]*stats
}
// newStatsCollector returns a statsCollector whose per-scheme map is
// allocated and empty.
func newStatsCollector() *statsCollector {
	collector := new(statsCollector)
	collector.stats = map[string]*stats{}
	return collector
}
// getStatsByRS returns the stats registered for the redundancy scheme
// string rs. On the first request for a scheme a new stats is created,
// chained into mon, and stored so that later calls return the same value.
func (collector *statsCollector) getStatsByRS(rs string) *stats {
	if existing, found := collector.stats[rs]; found {
		return existing
	}

	created := newStats(rs)
	mon.Chain(created)
	collector.stats[rs] = created
	return created
}
// collectAggregates is called at the end of each checker iteration. For
// every known redundancy scheme it pushes the iteration aggregates into the
// scheme's monkit metrics and then replaces the aggregates with a fresh,
// zeroed value.
func (collector *statsCollector) collectAggregates() {
	for rs := range collector.stats {
		perScheme := collector.stats[rs]
		perScheme.collectAggregates()
		perScheme.iterationAggregates = &aggregateStats{}
	}
}
// stats gathers and reports the checker metrics of a single redundancy
// scheme.
//
// Any new metric tagged with rs_scheme belongs in this struct and must be
// initialized in newStats.
type stats struct {
	// iterationAggregates accumulates the tallies of the current checker
	// iteration until collectAggregates observes them into the values below.
	iterationAggregates *aggregateStats

	// Values observed from iterationAggregates by collectAggregates.
	objectsChecked                  *monkit.IntVal
	remoteSegmentsChecked           *monkit.IntVal
	remoteSegmentsNeedingRepair     *monkit.IntVal
	newRemoteSegmentsNeedingRepair  *monkit.IntVal
	remoteSegmentsLost              *monkit.IntVal
	objectsLost                     *monkit.IntVal
	remoteSegmentsFailedToCheck     *monkit.IntVal
	remoteSegmentsHealthyPercentage *monkit.FloatVal

	// remoteSegmentsOverThresholdN is the number of segments with
	// healthy=rt+N, i.e. 1 is rt+1, 2 is rt+2, and so on.
	remoteSegmentsOverThreshold1 *monkit.IntVal
	remoteSegmentsOverThreshold2 *monkit.IntVal
	remoteSegmentsOverThreshold3 *monkit.IntVal
	remoteSegmentsOverThreshold4 *monkit.IntVal
	remoteSegmentsOverThreshold5 *monkit.IntVal

	// Metrics that are not fed from iterationAggregates in this file.
	segmentsBelowMinReq         *monkit.Counter
	segmentTotalCount           *monkit.IntVal
	segmentHealthyCount         *monkit.IntVal
	segmentAge                  *monkit.IntVal
	segmentHealth               *monkit.FloatVal
	injuredSegmentHealth        *monkit.FloatVal
	segmentTimeUntilIrreparable *monkit.IntVal
}
// aggregateStats holds the running tallies of one full checker iteration.
type aggregateStats struct {
	objectsChecked                 int64
	remoteSegmentsChecked          int64
	remoteSegmentsNeedingRepair    int64
	newRemoteSegmentsNeedingRepair int64
	remoteSegmentsLost             int64
	remoteSegmentsFailedToCheck    int64

	// remoteSegmentInfo lists object locations; its length is what gets
	// observed as the objects-lost metric.
	remoteSegmentInfo []metabase.ObjectLocation

	// remoteSegmentsOverThreshold[i] is the number of segments with
	// healthy=rt+(i+1): index 0 is rt+1, index 1 is rt+2, and so on.
	remoteSegmentsOverThreshold [5]int64
}
// newStats builds the metric set for the redundancy scheme string rs.
// Every metric uses the "tagged_repair_stats" series with a "name" tag
// identifying the metric and an "rs_scheme" tag set to rs.
func newStats(rs string) *stats {
	// seriesKey produces the tagged key shared by all metrics of this scheme.
	seriesKey := func(name string) monkit.SeriesKey {
		return monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", name).WithTag("rs_scheme", rs)
	}
	intVal := func(name string) *monkit.IntVal { return monkit.NewIntVal(seriesKey(name)) }
	floatVal := func(name string) *monkit.FloatVal { return monkit.NewFloatVal(seriesKey(name)) }
	counter := func(name string) *monkit.Counter { return monkit.NewCounter(seriesKey(name)) }

	return &stats{
		iterationAggregates:             new(aggregateStats),
		objectsChecked:                  intVal("remote_objects_checked"),
		remoteSegmentsChecked:           intVal("remote_segments_checked"),
		remoteSegmentsNeedingRepair:     intVal("remote_segments_needing_repair"),
		newRemoteSegmentsNeedingRepair:  intVal("new_remote_segments_needing_repair"),
		remoteSegmentsLost:              intVal("remote_segments_lost"),
		objectsLost:                     intVal("objects_lost"),
		remoteSegmentsFailedToCheck:     intVal("remote_segments_failed_to_check"),
		remoteSegmentsHealthyPercentage: floatVal("remote_segments_healthy_percentage"),
		remoteSegmentsOverThreshold1:    intVal("remote_segments_over_threshold_1"),
		remoteSegmentsOverThreshold2:    intVal("remote_segments_over_threshold_2"),
		remoteSegmentsOverThreshold3:    intVal("remote_segments_over_threshold_3"),
		remoteSegmentsOverThreshold4:    intVal("remote_segments_over_threshold_4"),
		remoteSegmentsOverThreshold5:    intVal("remote_segments_over_threshold_5"),
		segmentsBelowMinReq:             counter("checker_segments_below_min_req"),
		segmentTotalCount:               intVal("checker_segment_total_count"),
		segmentHealthyCount:             intVal("checker_segment_healthy_count"),
		segmentAge:                      intVal("checker_segment_age"),
		segmentHealth:                   floatVal("checker_segment_health"),
		injuredSegmentHealth:            floatVal("checker_injured_segment_health"),
		segmentTimeUntilIrreparable:     intVal("checker_segment_time_until_irreparable"),
	}
}
// collectAggregates observes the tallies accumulated in iterationAggregates
// into the corresponding monkit values. It does not reset
// iterationAggregates; statsCollector.collectAggregates replaces it after
// calling this method.
//
// The healthy percentage is only observed when at least one remote segment
// was checked in the iteration.
func (stats *stats) collectAggregates() {
	agg := stats.iterationAggregates

	stats.objectsChecked.Observe(agg.objectsChecked)
	stats.remoteSegmentsChecked.Observe(agg.remoteSegmentsChecked)
	stats.remoteSegmentsNeedingRepair.Observe(agg.remoteSegmentsNeedingRepair)
	stats.newRemoteSegmentsNeedingRepair.Observe(agg.newRemoteSegmentsNeedingRepair)
	stats.remoteSegmentsLost.Observe(agg.remoteSegmentsLost)
	// The objects-lost value is the number of recorded object locations.
	stats.objectsLost.Observe(int64(len(agg.remoteSegmentInfo)))
	stats.remoteSegmentsFailedToCheck.Observe(agg.remoteSegmentsFailedToCheck)
	stats.remoteSegmentsOverThreshold1.Observe(agg.remoteSegmentsOverThreshold[0])
	stats.remoteSegmentsOverThreshold2.Observe(agg.remoteSegmentsOverThreshold[1])
	stats.remoteSegmentsOverThreshold3.Observe(agg.remoteSegmentsOverThreshold[2])
	stats.remoteSegmentsOverThreshold4.Observe(agg.remoteSegmentsOverThreshold[3])
	stats.remoteSegmentsOverThreshold5.Observe(agg.remoteSegmentsOverThreshold[4])

	allChecked := agg.remoteSegmentsChecked
	if allChecked == 0 {
		// A scheme stays registered after its first appearance, so an
		// iteration may check no segments for it. Dividing by zero here
		// would yield NaN (or ±Inf) in float64 arithmetic; skip the
		// observation instead of recording a meaningless percentage.
		return
	}

	// Segments needing repair and segments that failed to check both count
	// as unhealthy.
	allUnhealthy := agg.remoteSegmentsNeedingRepair + agg.remoteSegmentsFailedToCheck
	allHealthy := allChecked - allUnhealthy
	stats.remoteSegmentsHealthyPercentage.Observe(100 * float64(allHealthy) / float64(allChecked))
}
// Stats implements the monkit.StatSource interface by forwarding cb to
// each contained metric in a fixed order.
func (stats *stats) Stats(cb func(key monkit.SeriesKey, field string, val float64)) {
	sources := []monkit.StatSource{
		stats.objectsChecked,
		stats.remoteSegmentsChecked,
		stats.remoteSegmentsNeedingRepair,
		stats.newRemoteSegmentsNeedingRepair,
		stats.remoteSegmentsLost,
		stats.objectsLost,
		stats.remoteSegmentsFailedToCheck,
		stats.remoteSegmentsOverThreshold1,
		stats.remoteSegmentsOverThreshold2,
		stats.remoteSegmentsOverThreshold3,
		stats.remoteSegmentsOverThreshold4,
		stats.remoteSegmentsOverThreshold5,
		stats.remoteSegmentsHealthyPercentage,
		stats.segmentsBelowMinReq,
		stats.segmentTotalCount,
		stats.segmentHealthyCount,
		stats.segmentAge,
		stats.segmentHealth,
		stats.injuredSegmentHealth,
		stats.segmentTimeUntilIrreparable,
	}

	for _, source := range sources {
		source.Stats(cb)
	}
}
// getRSString renders the four redundancy scheme numbers as the
// slash-separated string "min/repair/success/total".
func getRSString(min, repair, success, total int) string {
	const layout = "%d/%d/%d/%d"

	rs := fmt.Sprintf(layout, min, repair, success, total)
	return rs
}