267506bb20
metabase has become a central concept and it's more suitable for it to be directly nested under satellite rather than being part of metainfo. metainfo is going to be the "endpoint" logic for handling requests. Change-Id: I53770d6761ac1e9a1283b5aa68f471b21e784198
167 lines
8.8 KiB
Go
167 lines
8.8 KiB
Go
// Copyright (C) 2020 Storj Labs, Inc.
// See LICENSE for copying information.
|
|
|
|
package checker
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"github.com/spacemonkeygo/monkit/v3"
|
|
|
|
"storj.io/storj/satellite/metabase"
|
|
)
|
|
|
|
// statsCollector holds a *stats for each redundancy scheme
|
|
// seen by the checker. These are chained into the monkit scope for
|
|
// monitoring as they are initialized.
|
|
type statsCollector struct {
|
|
stats map[string]*stats
|
|
}
|
|
|
|
func newStatsCollector() *statsCollector {
|
|
return &statsCollector{
|
|
stats: make(map[string]*stats),
|
|
}
|
|
}
|
|
|
|
func (collector *statsCollector) getStatsByRS(rs string) *stats {
|
|
stats, ok := collector.stats[rs]
|
|
if !ok {
|
|
stats = newStats(rs)
|
|
mon.Chain(stats)
|
|
collector.stats[rs] = stats
|
|
}
|
|
return stats
|
|
}
|
|
|
|
// collectAggregates transfers the iteration aggregates into the
|
|
// respective stats monkit metrics at the end of each checker iteration.
|
|
// iterationAggregates is then cleared.
|
|
func (collector *statsCollector) collectAggregates() {
|
|
for _, stats := range collector.stats {
|
|
stats.collectAggregates()
|
|
stats.iterationAggregates = new(aggregateStats)
|
|
}
|
|
}
|
|
|
|
// stats is used for collecting and reporting checker metrics.
|
|
//
|
|
// add any new metrics tagged with rs_scheme to this struct and set them
|
|
// in newStats.
|
|
type stats struct {
|
|
iterationAggregates *aggregateStats
|
|
|
|
objectsChecked *monkit.IntVal
|
|
remoteSegmentsChecked *monkit.IntVal
|
|
remoteSegmentsNeedingRepair *monkit.IntVal
|
|
newRemoteSegmentsNeedingRepair *monkit.IntVal
|
|
remoteSegmentsLost *monkit.IntVal
|
|
objectsLost *monkit.IntVal
|
|
remoteSegmentsFailedToCheck *monkit.IntVal
|
|
remoteSegmentsHealthyPercentage *monkit.FloatVal
|
|
|
|
// remoteSegmentsOverThreshold[0]=# of healthy=rt+1, remoteSegmentsOverThreshold[1]=# of healthy=rt+2, etc...
|
|
remoteSegmentsOverThreshold1 *monkit.IntVal
|
|
remoteSegmentsOverThreshold2 *monkit.IntVal
|
|
remoteSegmentsOverThreshold3 *monkit.IntVal
|
|
remoteSegmentsOverThreshold4 *monkit.IntVal
|
|
remoteSegmentsOverThreshold5 *monkit.IntVal
|
|
|
|
segmentsBelowMinReq *monkit.Counter
|
|
segmentTotalCount *monkit.IntVal
|
|
segmentHealthyCount *monkit.IntVal
|
|
segmentAge *monkit.IntVal
|
|
segmentHealth *monkit.FloatVal
|
|
injuredSegmentHealth *monkit.FloatVal
|
|
segmentTimeUntilIrreparable *monkit.IntVal
|
|
}
|
|
|
|
// aggregateStats tallies data over the full checker iteration.
|
|
type aggregateStats struct {
|
|
objectsChecked int64
|
|
remoteSegmentsChecked int64
|
|
remoteSegmentsNeedingRepair int64
|
|
newRemoteSegmentsNeedingRepair int64
|
|
remoteSegmentsLost int64
|
|
remoteSegmentsFailedToCheck int64
|
|
remoteSegmentInfo []metabase.ObjectLocation
|
|
|
|
// remoteSegmentsOverThreshold[0]=# of healthy=rt+1, remoteSegmentsOverThreshold[1]=# of healthy=rt+2, etc...
|
|
remoteSegmentsOverThreshold [5]int64
|
|
}
|
|
|
|
func newStats(rs string) *stats {
|
|
return &stats{
|
|
iterationAggregates: new(aggregateStats),
|
|
objectsChecked: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_objects_checked").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsChecked: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_checked").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsNeedingRepair: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_needing_repair").WithTag("rs_scheme", rs)),
|
|
newRemoteSegmentsNeedingRepair: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "new_remote_segments_needing_repair").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsLost: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_lost").WithTag("rs_scheme", rs)),
|
|
objectsLost: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "objects_lost").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsFailedToCheck: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_failed_to_check").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsHealthyPercentage: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_healthy_percentage").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsOverThreshold1: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_1").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsOverThreshold2: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_2").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsOverThreshold3: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_3").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsOverThreshold4: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_4").WithTag("rs_scheme", rs)),
|
|
remoteSegmentsOverThreshold5: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_5").WithTag("rs_scheme", rs)),
|
|
segmentsBelowMinReq: monkit.NewCounter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segments_below_min_req").WithTag("rs_scheme", rs)),
|
|
segmentTotalCount: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_total_count").WithTag("rs_scheme", rs)),
|
|
segmentHealthyCount: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_healthy_count").WithTag("rs_scheme", rs)),
|
|
segmentAge: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_age").WithTag("rs_scheme", rs)),
|
|
segmentHealth: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_health").WithTag("rs_scheme", rs)),
|
|
injuredSegmentHealth: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_injured_segment_health").WithTag("rs_scheme", rs)),
|
|
segmentTimeUntilIrreparable: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_time_until_irreparable").WithTag("rs_scheme", rs)),
|
|
}
|
|
}
|
|
|
|
func (stats *stats) collectAggregates() {
|
|
stats.objectsChecked.Observe(stats.iterationAggregates.objectsChecked)
|
|
stats.remoteSegmentsChecked.Observe(stats.iterationAggregates.remoteSegmentsChecked)
|
|
stats.remoteSegmentsNeedingRepair.Observe(stats.iterationAggregates.remoteSegmentsNeedingRepair)
|
|
stats.newRemoteSegmentsNeedingRepair.Observe(stats.iterationAggregates.newRemoteSegmentsNeedingRepair)
|
|
stats.remoteSegmentsLost.Observe(stats.iterationAggregates.remoteSegmentsLost)
|
|
stats.objectsLost.Observe(int64(len(stats.iterationAggregates.remoteSegmentInfo)))
|
|
stats.remoteSegmentsFailedToCheck.Observe(stats.iterationAggregates.remoteSegmentsFailedToCheck)
|
|
stats.remoteSegmentsOverThreshold1.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[0])
|
|
stats.remoteSegmentsOverThreshold2.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[1])
|
|
stats.remoteSegmentsOverThreshold3.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[2])
|
|
stats.remoteSegmentsOverThreshold4.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[3])
|
|
stats.remoteSegmentsOverThreshold5.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[4])
|
|
|
|
allUnhealthy := stats.iterationAggregates.remoteSegmentsNeedingRepair + stats.iterationAggregates.remoteSegmentsFailedToCheck
|
|
allChecked := stats.iterationAggregates.remoteSegmentsChecked
|
|
allHealthy := allChecked - allUnhealthy
|
|
|
|
stats.remoteSegmentsHealthyPercentage.Observe(100 * float64(allHealthy) / float64(allChecked))
|
|
}
|
|
|
|
// Stats implements the monkit.StatSource interface.
|
|
func (stats *stats) Stats(cb func(key monkit.SeriesKey, field string, val float64)) {
|
|
stats.objectsChecked.Stats(cb)
|
|
stats.remoteSegmentsChecked.Stats(cb)
|
|
stats.remoteSegmentsNeedingRepair.Stats(cb)
|
|
stats.newRemoteSegmentsNeedingRepair.Stats(cb)
|
|
stats.remoteSegmentsLost.Stats(cb)
|
|
stats.objectsLost.Stats(cb)
|
|
stats.remoteSegmentsFailedToCheck.Stats(cb)
|
|
stats.remoteSegmentsOverThreshold1.Stats(cb)
|
|
stats.remoteSegmentsOverThreshold2.Stats(cb)
|
|
stats.remoteSegmentsOverThreshold3.Stats(cb)
|
|
stats.remoteSegmentsOverThreshold4.Stats(cb)
|
|
stats.remoteSegmentsOverThreshold5.Stats(cb)
|
|
stats.remoteSegmentsHealthyPercentage.Stats(cb)
|
|
stats.segmentsBelowMinReq.Stats(cb)
|
|
stats.segmentTotalCount.Stats(cb)
|
|
stats.segmentHealthyCount.Stats(cb)
|
|
stats.segmentAge.Stats(cb)
|
|
stats.segmentHealth.Stats(cb)
|
|
stats.injuredSegmentHealth.Stats(cb)
|
|
stats.segmentTimeUntilIrreparable.Stats(cb)
|
|
}
|
|
|
|
// getRSString renders the four redundancy scheme values as a single
// slash-separated string in the order min/repair/success/total.
func getRSString(min, repair, success, total int) string {
	const rsFormat = "%d/%d/%d/%d"
	return fmt.Sprintf(rsFormat, min, repair, success, total)
}
|