satellite/repair/repairer: a new set of rs_scheme tagged metrics

Change-Id: Ibecd9265da881247eeb85ba185ee8877a7243777
This commit is contained in:
Cameron Ayer 2021-02-03 17:22:46 -05:00 committed by Yingrong Zhao
parent 520da5b1e4
commit 4a797baa73
2 changed files with 136 additions and 11 deletions

View File

@ -50,12 +50,13 @@ func (ie *irreparableError) Error() string {
// SegmentRepairer for segments.
type SegmentRepairer struct {
log *zap.Logger
metainfo *metainfo.Service
orders *orders.Service
overlay *overlay.Service
ec *ECRepairer
timeout time.Duration
log *zap.Logger
statsCollector *statsCollector
metainfo *metainfo.Service
orders *orders.Service
overlay *overlay.Service
ec *ECRepairer
timeout time.Duration
// multiplierOptimalThreshold is the value that multiplied by the optimal
// threshold results in the maximum limit of number of nodes to upload
@ -85,6 +86,7 @@ func NewSegmentRepairer(
return &SegmentRepairer{
log: log,
statsCollector: newStatsCollector(),
metainfo: metainfo,
orders: orders,
overlay: overlay,
@ -117,14 +119,18 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
return true, invalidRepairError.New("cannot repair inline segment")
}
mon.Meter("repair_attempts").Mark(1) //mon:locked
mon.IntVal("repair_segment_size").Observe(pointer.GetSegmentSize()) //mon:locked
redundancy, err := eestream.NewRedundancyStrategyFromProto(pointer.GetRemote().GetRedundancy())
if err != nil {
return true, invalidRepairError.New("invalid redundancy strategy: %w", err)
}
stats := repairer.getStatsByRS(pointer.Remote.GetRedundancy())
mon.Meter("repair_attempts").Mark(1) //mon:locked
stats.repairAttempts.Mark(1)
mon.IntVal("repair_segment_size").Observe(pointer.GetSegmentSize()) //mon:locked
stats.repairSegmentSize.Observe(pointer.GetSegmentSize())
var excludeNodeIDs storj.NodeIDList
var healthyPieces, unhealthyPieces []*pb.RemotePiece
healthyMap := make(map[int32]bool)
@ -138,7 +144,9 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
// irreparable piece
if int32(numHealthy) < pointer.Remote.Redundancy.MinReq {
mon.Counter("repairer_segments_below_min_req").Inc(1) //mon:locked
mon.Meter("repair_nodes_unavailable").Mark(1) //mon:locked
stats.repairerSegmentsBelowMinReq.Inc(1)
mon.Meter("repair_nodes_unavailable").Mark(1) //mon:locked
stats.repairerNodesUnavailable.Mark(1)
return true, &irreparableError{
path: path,
piecesAvailable: int32(numHealthy),
@ -149,6 +157,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
// ensure we get values, even if only zero values, so that redash can have an alert based on this
mon.Counter("repairer_segments_below_min_req").Inc(0) //mon:locked
stats.repairerSegmentsBelowMinReq.Inc(0)
repairThreshold := pointer.Remote.Redundancy.RepairThreshold
overrideValue := repairer.repairOverrides.GetOverrideValuePB(pointer.Remote.Redundancy)
@ -159,6 +168,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
// repair not needed
if int32(numHealthy) > repairThreshold {
mon.Meter("repair_unnecessary").Mark(1) //mon:locked
stats.repairUnnecessary.Mark(1)
repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold))
return true, nil
}
@ -168,6 +178,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
healthyRatioBeforeRepair = float64(numHealthy) / float64(pointer.Remote.Redundancy.Total)
}
mon.FloatVal("healthy_ratio_before_repair").Observe(healthyRatioBeforeRepair) //mon:locked
stats.healthyRatioBeforeRepair.Observe(healthyRatioBeforeRepair)
lostPiecesSet := sliceToSet(missingPieces)
@ -257,6 +268,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
// to wait for nodes to come back online.
if irreparableErr, ok := err.(*irreparableError); ok {
mon.Meter("repair_too_many_nodes_failed").Mark(1) //mon:locked
stats.repairTooManyNodesFailed.Mark(1)
irreparableErr.segmentInfo = pointer
return true, irreparableErr
}
@ -296,17 +308,22 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
// repair "succeeded" in that the segment is now healthier than it was, but it is
// not as healthy as we want it to be.
mon.Meter("repair_failed").Mark(1) //mon:locked
stats.repairFailed.Mark(1)
case healthyAfterRepair < pointer.Remote.Redundancy.SuccessThreshold:
mon.Meter("repair_partial").Mark(1) //mon:locked
stats.repairPartial.Mark(1)
default:
mon.Meter("repair_success").Mark(1) //mon:locked
stats.repairSuccess.Mark(1)
}
healthyRatioAfterRepair := 0.0
if pointer.Remote.Redundancy.Total != 0 {
healthyRatioAfterRepair = float64(healthyAfterRepair) / float64(pointer.Remote.Redundancy.Total)
}
mon.FloatVal("healthy_ratio_after_repair").Observe(healthyRatioAfterRepair) //mon:locked
stats.healthyRatioAfterRepair.Observe(healthyRatioAfterRepair)
var toRemove []*pb.RemotePiece
if healthyAfterRepair >= pointer.Remote.Redundancy.SuccessThreshold {
@ -343,11 +360,27 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
}
mon.IntVal("segment_time_until_repair").Observe(int64(segmentAge.Seconds())) //mon:locked
mon.IntVal("segment_repair_count").Observe(int64(pointer.RepairCount)) //mon:locked
stats.segmentTimeUntilRepair.Observe((int64(segmentAge.Seconds())))
mon.IntVal("segment_repair_count").Observe(int64(pointer.RepairCount)) //mon:locked
stats.segmentRepairCount.Observe(int64(pointer.RepairCount))
return true, nil
}
// getStatsByRS returns the per-redundancy-scheme stats bundle for the given
// scheme, creating and registering it on first use.
func (repairer *SegmentRepairer) getStatsByRS(redundancy *pb.RedundancyScheme) *stats {
	return repairer.statsCollector.getStatsByRS(getRSString(repairer.loadRedundancy(redundancy)))
}
// loadRedundancy unpacks the redundancy scheme into its four parameters
// (min, repair, success, total), applying any configured repair-threshold
// override in place of the scheme's own repair threshold.
func (repairer *SegmentRepairer) loadRedundancy(redundancy *pb.RedundancyScheme) (int, int, int, int) {
	repairThreshold := int(redundancy.RepairThreshold)
	if override := repairer.repairOverrides.GetOverrideValuePB(redundancy); override != 0 {
		repairThreshold = int(override)
	}
	return int(redundancy.MinReq), repairThreshold, int(redundancy.SuccessThreshold), int(redundancy.Total)
}
func (repairer *SegmentRepairer) updateAuditFailStatus(ctx context.Context, failedAuditNodeIDs storj.NodeIDList) (failedNum int, err error) {
updateRequests := make([]*overlay.UpdateRequest, len(failedAuditNodeIDs))
for i, nodeID := range failedAuditNodeIDs {

View File

@ -0,0 +1,92 @@
// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package repairer
import (
"fmt"
"github.com/spacemonkeygo/monkit/v3"
)
// statsCollector holds a *stats for each redundancy scheme
// seen by the repairer. These are chained into the monkit scope for
// monitoring as they are initialized.
type statsCollector struct {
	// stats maps an rs scheme string (as produced by getRSString) to the
	// metrics bundle registered for that scheme.
	stats map[string]*stats
}
// newStatsCollector constructs an empty statsCollector ready for use.
func newStatsCollector() *statsCollector {
	collector := new(statsCollector)
	collector.stats = make(map[string]*stats)
	return collector
}
// getStatsByRS returns the stats bundle for the given rs scheme string. The
// first time a scheme is seen, a new bundle is created and chained into the
// monkit scope so its series are reported.
func (collector *statsCollector) getStatsByRS(rs string) *stats {
	if existing, ok := collector.stats[rs]; ok {
		return existing
	}
	created := newStats(rs)
	mon.Chain(created)
	collector.stats[rs] = created
	return created
}
// stats is used for collecting and reporting repairer metrics.
//
// All series share the "tagged_repair_stats" measurement and are tagged with
// the rs_scheme they belong to (see newStats).
//
// add any new metrics tagged with rs_scheme to this struct and set them
// in newStats.
type stats struct {
	repairAttempts              *monkit.Meter
	repairSegmentSize           *monkit.IntVal
	repairerSegmentsBelowMinReq *monkit.Counter
	repairerNodesUnavailable    *monkit.Meter
	repairUnnecessary           *monkit.Meter
	healthyRatioBeforeRepair    *monkit.FloatVal
	repairTooManyNodesFailed    *monkit.Meter
	repairFailed                *monkit.Meter
	repairPartial               *monkit.Meter
	repairSuccess               *monkit.Meter
	healthyRatioAfterRepair     *monkit.FloatVal
	segmentTimeUntilRepair      *monkit.IntVal
	segmentRepairCount          *monkit.IntVal
}
// newStats builds a stats bundle whose series all live under the
// "tagged_repair_stats" measurement, tagged with the metric name and the
// given rs_scheme value.
func newStats(rs string) *stats {
	// key builds the series key for one metric, so each field below only
	// states its metric name once.
	key := func(name string) monkit.SeriesKey {
		return monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", name).WithTag("rs_scheme", rs)
	}
	return &stats{
		repairAttempts:              monkit.NewMeter(key("repair_attempts")),
		repairSegmentSize:           monkit.NewIntVal(key("repair_segment_size")),
		repairerSegmentsBelowMinReq: monkit.NewCounter(key("repairer_segments_below_min_req")),
		repairerNodesUnavailable:    monkit.NewMeter(key("repairer_nodes_unavailable")),
		repairUnnecessary:           monkit.NewMeter(key("repair_unnecessary")),
		healthyRatioBeforeRepair:    monkit.NewFloatVal(key("healthy_ratio_before_repair")),
		repairTooManyNodesFailed:    monkit.NewMeter(key("repair_too_many_nodes_failed")),
		repairFailed:                monkit.NewMeter(key("repair_failed")),
		repairPartial:               monkit.NewMeter(key("repair_partial")),
		repairSuccess:               monkit.NewMeter(key("repair_success")),
		healthyRatioAfterRepair:     monkit.NewFloatVal(key("healthy_ratio_after_repair")),
		segmentTimeUntilRepair:      monkit.NewIntVal(key("segment_time_until_repair")),
		segmentRepairCount:          monkit.NewIntVal(key("segment_repair_count")),
	}
}
// Stats implements the monkit.StatSource interface by forwarding the
// callback to every metric held by this stats bundle.
func (stats *stats) Stats(cb func(key monkit.SeriesKey, field string, val float64)) {
	sources := []monkit.StatSource{
		stats.repairAttempts,
		stats.repairSegmentSize,
		stats.repairerSegmentsBelowMinReq,
		stats.repairerNodesUnavailable,
		stats.repairUnnecessary,
		stats.healthyRatioBeforeRepair,
		stats.repairTooManyNodesFailed,
		stats.repairFailed,
		stats.repairPartial,
		stats.repairSuccess,
		stats.healthyRatioAfterRepair,
		stats.segmentTimeUntilRepair,
		stats.segmentRepairCount,
	}
	for _, source := range sources {
		source.Stats(cb)
	}
}
// getRSString formats the four redundancy parameters as the canonical
// "min/repair/success/total" scheme string used to tag metrics.
func getRSString(min, repair, success, total int) string {
	values := []interface{}{min, repair, success, total}
	return fmt.Sprintf("%d/%d/%d/%d", values...)
}