From 4a797baa73b42e65d8e39f587bff42bfafda85d1 Mon Sep 17 00:00:00 2001
From: Cameron Ayer
Date: Wed, 3 Feb 2021 17:22:46 -0500
Subject: [PATCH] satellite/repair/repairer: a new set of rs_scheme tagged metrics

Change-Id: Ibecd9265da881247eeb85ba185ee8877a7243777
---
 satellite/repair/repairer/segments.go | 55 ++++++++++++----
 satellite/repair/repairer/stats.go    | 92 +++++++++++++++++++++++++++
 2 files changed, 136 insertions(+), 11 deletions(-)
 create mode 100644 satellite/repair/repairer/stats.go

diff --git a/satellite/repair/repairer/segments.go b/satellite/repair/repairer/segments.go
index 061cc5b88..7a8c21c23 100644
--- a/satellite/repair/repairer/segments.go
+++ b/satellite/repair/repairer/segments.go
@@ -50,12 +50,13 @@ func (ie *irreparableError) Error() string {
 
 // SegmentRepairer for segments.
 type SegmentRepairer struct {
-	log      *zap.Logger
-	metainfo *metainfo.Service
-	orders   *orders.Service
-	overlay  *overlay.Service
-	ec       *ECRepairer
-	timeout  time.Duration
+	log            *zap.Logger
+	statsCollector *statsCollector
+	metainfo       *metainfo.Service
+	orders         *orders.Service
+	overlay        *overlay.Service
+	ec             *ECRepairer
+	timeout        time.Duration
 
 	// multiplierOptimalThreshold is the value that multiplied by the optimal
 	// threshold results in the maximum limit of number of nodes to upload
@@ -85,6 +86,7 @@ func NewSegmentRepairer(
 
 	return &SegmentRepairer{
 		log:            log,
+		statsCollector: newStatsCollector(),
 		metainfo:       metainfo,
 		orders:         orders,
 		overlay:        overlay,
@@ -117,14 +119,18 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
 		return true, invalidRepairError.New("cannot repair inline segment")
 	}
 
-	mon.Meter("repair_attempts").Mark(1) //mon:locked
-	mon.IntVal("repair_segment_size").Observe(pointer.GetSegmentSize()) //mon:locked
-
 	redundancy, err := eestream.NewRedundancyStrategyFromProto(pointer.GetRemote().GetRedundancy())
 	if err != nil {
 		return true, invalidRepairError.New("invalid redundancy strategy: %w", err)
 	}
 
+	stats := repairer.getStatsByRS(pointer.Remote.GetRedundancy())
+
+	mon.Meter("repair_attempts").Mark(1) //mon:locked
+	stats.repairAttempts.Mark(1)
+	mon.IntVal("repair_segment_size").Observe(pointer.GetSegmentSize()) //mon:locked
+	stats.repairSegmentSize.Observe(pointer.GetSegmentSize())
+
 	var excludeNodeIDs storj.NodeIDList
 	var healthyPieces, unhealthyPieces []*pb.RemotePiece
 	healthyMap := make(map[int32]bool)
@@ -138,7 +144,9 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
 	// irreparable piece
 	if int32(numHealthy) < pointer.Remote.Redundancy.MinReq {
 		mon.Counter("repairer_segments_below_min_req").Inc(1) //mon:locked
-		mon.Meter("repair_nodes_unavailable").Mark(1) //mon:locked
+		stats.repairerSegmentsBelowMinReq.Inc(1)
+		mon.Meter("repair_nodes_unavailable").Mark(1) //mon:locked
+		stats.repairerNodesUnavailable.Mark(1)
 		return true, &irreparableError{
 			path:            path,
 			piecesAvailable: int32(numHealthy),
@@ -149,6 +157,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
 
 	// ensure we get values, even if only zero values, so that redash can have an alert based on this
 	mon.Counter("repairer_segments_below_min_req").Inc(0) //mon:locked
+	stats.repairerSegmentsBelowMinReq.Inc(0)
 
 	repairThreshold := pointer.Remote.Redundancy.RepairThreshold
 	overrideValue := repairer.repairOverrides.GetOverrideValuePB(pointer.Remote.Redundancy)
@@ -159,6 +168,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
 	// repair not needed
 	if int32(numHealthy) > repairThreshold {
mon.Meter("repair_unnecessary").Mark(1) //mon:locked + stats.repairUnnecessary.Mark(1) repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold)) return true, nil } @@ -168,6 +178,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s healthyRatioBeforeRepair = float64(numHealthy) / float64(pointer.Remote.Redundancy.Total) } mon.FloatVal("healthy_ratio_before_repair").Observe(healthyRatioBeforeRepair) //mon:locked + stats.healthyRatioBeforeRepair.Observe(healthyRatioBeforeRepair) lostPiecesSet := sliceToSet(missingPieces) @@ -257,6 +268,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s // to wait for nodes to come back online. if irreparableErr, ok := err.(*irreparableError); ok { mon.Meter("repair_too_many_nodes_failed").Mark(1) //mon:locked + stats.repairTooManyNodesFailed.Mark(1) irreparableErr.segmentInfo = pointer return true, irreparableErr } @@ -296,17 +308,22 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s // repair "succeeded" in that the segment is now healthier than it was, but it is // not as healthy as we want it to be. mon.Meter("repair_failed").Mark(1) //mon:locked + stats.repairFailed.Mark(1) case healthyAfterRepair < pointer.Remote.Redundancy.SuccessThreshold: mon.Meter("repair_partial").Mark(1) //mon:locked + stats.repairPartial.Mark(1) default: mon.Meter("repair_success").Mark(1) //mon:locked + stats.repairSuccess.Mark(1) } healthyRatioAfterRepair := 0.0 if pointer.Remote.Redundancy.Total != 0 { healthyRatioAfterRepair = float64(healthyAfterRepair) / float64(pointer.Remote.Redundancy.Total) } + mon.FloatVal("healthy_ratio_after_repair").Observe(healthyRatioAfterRepair) //mon:locked + stats.healthyRatioAfterRepair.Observe(healthyRatioAfterRepair) var toRemove []*pb.RemotePiece if healthyAfterRepair >= pointer.Remote.Redundancy.SuccessThreshold { @@ -343,11 +360,27 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s } mon.IntVal("segment_time_until_repair").Observe(int64(segmentAge.Seconds())) //mon:locked - mon.IntVal("segment_repair_count").Observe(int64(pointer.RepairCount)) //mon:locked + stats.segmentTimeUntilRepair.Observe((int64(segmentAge.Seconds()))) + mon.IntVal("segment_repair_count").Observe(int64(pointer.RepairCount)) //mon:locked + stats.segmentRepairCount.Observe(int64(pointer.RepairCount)) return true, nil } +func (repairer *SegmentRepairer) getStatsByRS(redundancy *pb.RedundancyScheme) *stats { + rsString := getRSString(repairer.loadRedundancy(redundancy)) + return repairer.statsCollector.getStatsByRS(rsString) +} + +func (repairer *SegmentRepairer) loadRedundancy(redundancy *pb.RedundancyScheme) (int, int, int, int) { + repair := int(redundancy.RepairThreshold) + overrideValue := repairer.repairOverrides.GetOverrideValuePB(redundancy) + if overrideValue != 0 { + repair = int(overrideValue) + } + return int(redundancy.MinReq), repair, int(redundancy.SuccessThreshold), int(redundancy.Total) +} + func (repairer *SegmentRepairer) updateAuditFailStatus(ctx context.Context, failedAuditNodeIDs storj.NodeIDList) (failedNum int, err error) { updateRequests := make([]*overlay.UpdateRequest, len(failedAuditNodeIDs)) for i, nodeID := range failedAuditNodeIDs { diff --git a/satellite/repair/repairer/stats.go b/satellite/repair/repairer/stats.go new file mode 100644 index 000000000..169e651cf --- /dev/null +++ b/satellite/repair/repairer/stats.go @@ -0,0 +1,92 
+// Copyright (C) 2019 Storj Labs, Inc.
+// See LICENSE for copying information.
+
+package repairer
+
+import (
+	"fmt"
+
+	"github.com/spacemonkeygo/monkit/v3"
+)
+
+// statsCollector holds a *stats for each redundancy scheme
+// seen by the repairer. These are chained into the monkit scope for
+// monitoring as they are initialized.
+type statsCollector struct {
+	stats map[string]*stats
+}
+
+func newStatsCollector() *statsCollector {
+	return &statsCollector{
+		stats: make(map[string]*stats),
+	}
+}
+
+func (collector *statsCollector) getStatsByRS(rs string) *stats {
+	stats, ok := collector.stats[rs]
+	if !ok {
+		stats = newStats(rs)
+		mon.Chain(stats)
+		collector.stats[rs] = stats
+	}
+	return stats
+}
+
+// stats is used for collecting and reporting repairer metrics.
+//
+// add any new metrics tagged with rs_scheme to this struct and set them
+// in newStats.
+type stats struct {
+	repairAttempts              *monkit.Meter
+	repairSegmentSize           *monkit.IntVal
+	repairerSegmentsBelowMinReq *monkit.Counter
+	repairerNodesUnavailable    *monkit.Meter
+	repairUnnecessary           *monkit.Meter
+	healthyRatioBeforeRepair    *monkit.FloatVal
+	repairTooManyNodesFailed    *monkit.Meter
+	repairFailed                *monkit.Meter
+	repairPartial               *monkit.Meter
+	repairSuccess               *monkit.Meter
+	healthyRatioAfterRepair     *monkit.FloatVal
+	segmentTimeUntilRepair      *monkit.IntVal
+	segmentRepairCount          *monkit.IntVal
+}
+
+func newStats(rs string) *stats {
+	return &stats{
+		repairAttempts:              monkit.NewMeter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "repair_attempts").WithTag("rs_scheme", rs)),
+		repairSegmentSize:           monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "repair_segment_size").WithTag("rs_scheme", rs)),
+		repairerSegmentsBelowMinReq: monkit.NewCounter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "repairer_segments_below_min_req").WithTag("rs_scheme", rs)),
+		repairerNodesUnavailable:    monkit.NewMeter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "repairer_nodes_unavailable").WithTag("rs_scheme", rs)),
+		repairUnnecessary:           monkit.NewMeter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "repair_unnecessary").WithTag("rs_scheme", rs)),
+		healthyRatioBeforeRepair:    monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "healthy_ratio_before_repair").WithTag("rs_scheme", rs)),
+		repairTooManyNodesFailed:    monkit.NewMeter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "repair_too_many_nodes_failed").WithTag("rs_scheme", rs)),
+		repairFailed:                monkit.NewMeter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "repair_failed").WithTag("rs_scheme", rs)),
+		repairPartial:               monkit.NewMeter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "repair_partial").WithTag("rs_scheme", rs)),
+		repairSuccess:               monkit.NewMeter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "repair_success").WithTag("rs_scheme", rs)),
+		healthyRatioAfterRepair:     monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "healthy_ratio_after_repair").WithTag("rs_scheme", rs)),
+		segmentTimeUntilRepair:      monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "segment_time_until_repair").WithTag("rs_scheme", rs)),
+		segmentRepairCount:          monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "segment_repair_count").WithTag("rs_scheme", rs)),
+	}
+}
+
+// Stats implements the monkit.StatSource interface.
+func (stats *stats) Stats(cb func(key monkit.SeriesKey, field string, val float64)) {
+	stats.repairAttempts.Stats(cb)
+	stats.repairSegmentSize.Stats(cb)
+	stats.repairerSegmentsBelowMinReq.Stats(cb)
+	stats.repairerNodesUnavailable.Stats(cb)
+	stats.repairUnnecessary.Stats(cb)
+	stats.healthyRatioBeforeRepair.Stats(cb)
+	stats.repairTooManyNodesFailed.Stats(cb)
+	stats.repairFailed.Stats(cb)
+	stats.repairPartial.Stats(cb)
+	stats.repairSuccess.Stats(cb)
+	stats.healthyRatioAfterRepair.Stats(cb)
+	stats.segmentTimeUntilRepair.Stats(cb)
+	stats.segmentRepairCount.Stats(cb)
+}
+
+func getRSString(min, repair, success, total int) string {
+	return fmt.Sprintf("%d/%d/%d/%d", min, repair, success, total)
+}
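
Note on the pattern above (not part of the patch): the new stats.go lazily builds one StatSource per redundancy scheme, chains it into the package's monkit scope, and tags every series with rs_scheme so the same metric names can be filtered per scheme. Below is a minimal, self-contained Go sketch of that pattern outside the repairer package; the package name, the cut-down rsStats type, and the "29/35/80/110" scheme string are illustrative assumptions, while the monkit calls (NewSeriesKey, WithTag, NewMeter, Chain, Stats) are the same ones the patch uses.

package main

import (
	"fmt"

	"github.com/spacemonkeygo/monkit/v3"
)

// mon is the package-level monkit scope, analogous to the repairer package's scope.
var mon = monkit.Package()

// rsStats is a cut-down, hypothetical analogue of the patch's stats type:
// a single meter whose series carries the rs_scheme tag.
type rsStats struct {
	repairAttempts *monkit.Meter
}

func newRSStats(rs string) *rsStats {
	return &rsStats{
		repairAttempts: monkit.NewMeter(monkit.NewSeriesKey("tagged_repair_stats").
			WithTag("name", "repair_attempts").
			WithTag("rs_scheme", rs)),
	}
}

// Stats makes rsStats a monkit.StatSource so it can be chained into the scope,
// the same way statsCollector.getStatsByRS calls mon.Chain in the patch.
func (s *rsStats) Stats(cb func(key monkit.SeriesKey, field string, val float64)) {
	s.repairAttempts.Stats(cb)
}

func main() {
	// "29/35/80/110" stands in for a min/repair/success/total scheme string,
	// in the same format getRSString produces.
	stats := newRSStats("29/35/80/110")
	mon.Chain(stats)

	stats.repairAttempts.Mark(1)

	// Walk the series this source reports; every key carries the rs_scheme tag.
	stats.Stats(func(key monkit.SeriesKey, field string, val float64) {
		fmt.Println(key, field, val)
	})
}

Running it prints one line per reported field, each keyed by the tagged_repair_stats measurement with name and rs_scheme tags, which is what lets downstream dashboards separate the existing metric names per redundancy scheme.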