satellite/repair: lock monkit stats in checker and repairer

Change-Id: Ia10fc8da0177389a500359ce51d21a5806f3f7b1
parent 8dea4f52db
commit 006a2824ba

monkit.lock | 17 +++++++++++++++++
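What "locking" means here: monkit.lock is a checked-in registry of metric names (package path, quoted stat name, aggregation kind), and each emitting call site gains a //locked comment so that tooling can verify the code and the lock file stay in sync. A minimal sketch of the call-site pattern, assuming monkit v2 (the exact import path depends on the version pinned in go.mod):

package repairer

import monkit "gopkg.in/spacemonkeygo/monkit.v2"

// Package-scoped registry; every stat name below is registered under this
// package's import path, matching the paths listed in monkit.lock.
var mon = monkit.Package()

func example(pieceCount int) {
	// The //locked tag marks this stat as part of the frozen set
	// recorded in monkit.lock.
	mon.IntVal("repair_segment_pieces_total").Observe(int64(pieceCount)) //locked
}

IntVal and FloatVal record value distributions, while Meter tracks an event rate — matching the IntVal/FloatVal/Meter kinds in the lock file entries below.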
@@ -61,6 +61,23 @@ storj.io/storj/satellite/repair/checker."remote_segments_checked" IntVal
 storj.io/storj/satellite/repair/checker."remote_segments_lost" IntVal
 storj.io/storj/satellite/repair/checker."remote_segments_needing_repair" IntVal
 storj.io/storj/satellite/repair/repairer."download_failed_not_enough_pieces_repair" Meter
+storj.io/storj/satellite/repair/repairer."healthy_ratio_after_repair" FloatVal
+storj.io/storj/satellite/repair/repairer."healthy_ratio_before_repair" FloatVal
+storj.io/storj/satellite/repair/repairer."repair_attempts" Meter
+storj.io/storj/satellite/repair/repairer."repair_failed" Meter
+storj.io/storj/satellite/repair/repairer."repair_nodes_unavailable" Meter
+storj.io/storj/satellite/repair/repairer."repair_partial" Meter
+storj.io/storj/satellite/repair/repairer."repair_segment_pieces_canceled" IntVal
+storj.io/storj/satellite/repair/repairer."repair_segment_pieces_failed" IntVal
+storj.io/storj/satellite/repair/repairer."repair_segment_pieces_successful" IntVal
+storj.io/storj/satellite/repair/repairer."repair_segment_pieces_total" IntVal
+storj.io/storj/satellite/repair/repairer."repair_segment_size" IntVal
+storj.io/storj/satellite/repair/repairer."repair_success" Meter
+storj.io/storj/satellite/repair/repairer."repair_unnecessary" Meter
+storj.io/storj/satellite/repair/repairer."segment_repair_count" IntVal
+storj.io/storj/satellite/repair/repairer."segment_time_until_repair" IntVal
+storj.io/storj/satellite/repair/repairer."time_for_repair" FloatVal
+storj.io/storj/satellite/repair/repairer."time_since_checker_queue" FloatVal
 storj.io/storj/satellite/satellitedb."audit_reputation_alpha" FloatVal
 storj.io/storj/satellite/satellitedb."audit_reputation_beta" FloatVal
 storj.io/storj/storage/filestore."open_file_in_trash" Meter
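One way a lock file like this can be enforced, sketched here as a hypothetical stand-alone checker (the repository's real CI check may well work differently): collect every stat tagged //locked in the Go sources and require its name to appear in monkit.lock.

// lockcheck is an illustration, not this repository's actual tooling: it
// flags any metric tagged //locked whose name is absent from monkit.lock.
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strings"
)

var (
	// Matches e.g. mon.IntVal("repair_segment_size").Observe(x) //locked
	lockedCall = regexp.MustCompile(`mon\.\w+\("([^"]+)"\).*//locked`)
	// Matches the quoted stat name in a monkit.lock line.
	lockEntry = regexp.MustCompile(`"([^"]+)"`)
)

func main() {
	lockData, err := os.ReadFile("monkit.lock")
	if err != nil {
		panic(err)
	}
	locked := map[string]bool{}
	for _, line := range strings.Split(string(lockData), "\n") {
		if m := lockEntry.FindStringSubmatch(line); m != nil {
			locked[m[1]] = true
		}
	}

	filepath.Walk(".", func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() || !strings.HasSuffix(path, ".go") {
			return nil
		}
		src, err := os.ReadFile(path)
		if err != nil {
			return err
		}
		for _, line := range strings.Split(string(src), "\n") {
			if m := lockedCall.FindStringSubmatch(line); m != nil && !locked[m[1]] {
				fmt.Printf("%s: stat %q tagged //locked but missing from monkit.lock\n", path, m[1])
			}
		}
		return nil
	})
}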
@@ -351,10 +351,10 @@ func (ec *ECRepairer) Repair(ctx context.Context, limits []*pb.AddressedOrderLim
 		zap.Int32("Success Count", atomic.LoadInt32(&successfulCount)),
 	)

-	mon.IntVal("repair_segment_pieces_total").Observe(int64(pieceCount))
-	mon.IntVal("repair_segment_pieces_successful").Observe(int64(successfulCount))
-	mon.IntVal("repair_segment_pieces_failed").Observe(int64(failureCount))
-	mon.IntVal("repair_segment_pieces_canceled").Observe(int64(cancellationCount))
+	mon.IntVal("repair_segment_pieces_total").Observe(int64(pieceCount)) //locked
+	mon.IntVal("repair_segment_pieces_successful").Observe(int64(successfulCount)) //locked
+	mon.IntVal("repair_segment_pieces_failed").Observe(int64(failureCount)) //locked
+	mon.IntVal("repair_segment_pieces_canceled").Observe(int64(cancellationCount)) //locked

 	return successfulNodes, successfulHashes, nil
 }
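A note on the four IntVals above: ECRepairer.Repair evidently tallies per-piece download outcomes in shared counters (the atomic.LoadInt32 in the preceding log call suggests they are updated concurrently from per-piece goroutines), then observes the totals once per repair after all downloads settle, with successfulCount, failureCount, and cancellationCount presumably partitioning pieceCount.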
@@ -129,13 +129,13 @@ func (service *Service) worker(ctx context.Context, seg *pb.InjuredSegment) (err

 	repairedTime := time.Now().UTC()
 	timeForRepair := repairedTime.Sub(workerStartTime)
-	mon.FloatVal("time_for_repair").Observe(timeForRepair.Seconds())
+	mon.FloatVal("time_for_repair").Observe(timeForRepair.Seconds()) //locked

 	insertedTime := seg.GetInsertedTime()
 	// do not send metrics if segment was added before the InsertedTime field was added
 	if !insertedTime.IsZero() {
 		timeSinceQueued := workerStartTime.Sub(insertedTime)
-		mon.FloatVal("time_since_checker_queue").Observe(timeSinceQueued.Seconds())
+		mon.FloatVal("time_since_checker_queue").Observe(timeSinceQueued.Seconds()) //locked
 	}

 	return nil
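These two FloatVals measure different legs of the repair pipeline: time_since_checker_queue is queue latency, from the checker stamping InsertedTime on the injured segment until a worker picks it up, while time_for_repair is the worker's service time for the repair itself. Both observe seconds as float distributions, and the zero-value guard skips segments queued before the InsertedTime field existed.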
@@ -81,7 +81,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
 	pointer, err := repairer.metainfo.Get(ctx, path)
 	if err != nil {
 		if storj.ErrObjectNotFound.Has(err) {
-			mon.Meter("repair_unnecessary").Mark(1)
+			mon.Meter("repair_unnecessary").Mark(1) //locked
 			repairer.log.Debug("segment was deleted", zap.Binary("Segment", []byte(path)))
 			return true, nil
 		}
@@ -92,8 +92,8 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
 		return true, Error.New("cannot repair inline segment")
 	}

-	mon.Meter("repair_attempts").Mark(1)
-	mon.IntVal("repair_segment_size").Observe(pointer.GetSegmentSize())
+	mon.Meter("repair_attempts").Mark(1) //locked
+	mon.IntVal("repair_segment_size").Observe(pointer.GetSegmentSize()) //locked

 	redundancy, err := eestream.NewRedundancyStrategyFromProto(pointer.GetRemote().GetRedundancy())
 	if err != nil {
@@ -114,7 +114,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
 	numHealthy := len(pieces) - len(missingPieces)
 	// irreparable piece
 	if int32(numHealthy) < pointer.Remote.Redundancy.MinReq {
-		mon.Meter("repair_nodes_unavailable").Mark(1)
+		mon.Meter("repair_nodes_unavailable").Mark(1) //locked
 		return true, Error.Wrap(IrreparableError.New("segment cannot be repaired: only %d healthy pieces, %d required", numHealthy, pointer.Remote.Redundancy.MinReq+1))
 	}

@@ -125,7 +125,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s

 	// repair not needed
 	if int32(numHealthy) > repairThreshold {
-		mon.Meter("repair_unnecessary").Mark(1)
+		mon.Meter("repair_unnecessary").Mark(1) //locked
 		repairer.log.Debug("segment above repair threshold", zap.Int("numHealthy", numHealthy), zap.Int32("repairThreshold", repairThreshold))
 		return true, nil
 	}
@@ -134,7 +134,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
 	if pointer.Remote.Redundancy.Total != 0 {
 		healthyRatioBeforeRepair = float64(numHealthy) / float64(pointer.Remote.Redundancy.Total)
 	}
-	mon.FloatVal("healthy_ratio_before_repair").Observe(healthyRatioBeforeRepair)
+	mon.FloatVal("healthy_ratio_before_repair").Observe(healthyRatioBeforeRepair) //locked

 	lostPiecesSet := sliceToSet(missingPieces)

@@ -232,18 +232,18 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
 	healthyAfterRepair := int32(len(healthyPieces) + len(repairedPieces))
 	switch {
 	case healthyAfterRepair <= pointer.Remote.Redundancy.RepairThreshold:
-		mon.Meter("repair_failed").Mark(1)
+		mon.Meter("repair_failed").Mark(1) //locked
 	case healthyAfterRepair < pointer.Remote.Redundancy.SuccessThreshold:
-		mon.Meter("repair_partial").Mark(1)
+		mon.Meter("repair_partial").Mark(1) //locked
 	default:
-		mon.Meter("repair_success").Mark(1)
+		mon.Meter("repair_success").Mark(1) //locked
 	}

 	healthyRatioAfterRepair := 0.0
 	if pointer.Remote.Redundancy.Total != 0 {
 		healthyRatioAfterRepair = float64(healthyAfterRepair) / float64(pointer.Remote.Redundancy.Total)
 	}
-	mon.FloatVal("healthy_ratio_after_repair").Observe(healthyRatioAfterRepair)
+	mon.FloatVal("healthy_ratio_after_repair").Observe(healthyRatioAfterRepair) //locked

 	var toRemove []*pb.RemotePiece
 	if healthyAfterRepair >= pointer.Remote.Redundancy.SuccessThreshold {
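A worked example of the switch above, using illustrative redundancy parameters (not necessarily the production values): MinReq=29, RepairThreshold=35, SuccessThreshold=80, Total=130. A segment with numHealthy=33 is still repairable (33 >= 29) and needs repair (33 <= 35). If the repair brings it to healthyAfterRepair=50, the switch marks repair_partial (35 < 50 < 80) and healthy_ratio_after_repair records 50/130 ≈ 0.38; only at 80 or more healthy pieces would repair_success be marked.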
@@ -279,8 +279,8 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, path storj.Path) (s
 		return false, err
 	}

-	mon.IntVal("segment_time_until_repair").Observe(int64(segmentAge.Seconds()))
-	mon.IntVal("segment_repair_count").Observe(int64(pointer.RepairCount))
+	mon.IntVal("segment_time_until_repair").Observe(int64(segmentAge.Seconds())) //locked
+	mon.IntVal("segment_repair_count").Observe(int64(pointer.RepairCount)) //locked

 	return true, nil
 }
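Once a stat is locked, its name becomes a stable identifier: dashboards and alerts can key on it, and monkit typically exposes the live values of all registered stats through the process's debug HTTP endpoint under the same names recorded here.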