Add checker metrics (#2487)
checker_segment_total_count - Number of total segments in pointer during checker iteration; checker_segment_healthy_count - Number of healthy segments in pointer during checker iteration; time_since_checker_queue - Seconds elapsed between checker queue and beginning repair; time_for_repair - Seconds elapsed between beginning repair and ending repair/dequeueing.
This commit is contained in:
parent
0e463dccfd
commit
de85d17069
@ -193,6 +193,9 @@ func (checker *Checker) updateSegmentStatus(ctx context.Context, pointer *pb.Poi
|
||||
|
||||
numHealthy := int32(len(pieces) - len(missingPieces))
|
||||
redundancy := pointer.Remote.Redundancy
|
||||
mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces)))
|
||||
mon.IntVal("checker_segment_healthy_count").Observe(int64(numHealthy))
|
||||
|
||||
// we repair when the number of healthy pieces is less than or equal to the repair threshold
|
||||
// except for the case when the repair and success thresholds are the same (a case usually seen during testing)
|
||||
if numHealthy > redundancy.MinReq && numHealthy <= redundancy.RepairThreshold && numHealthy < redundancy.SuccessThreshold {
|
||||
@ -202,8 +205,9 @@ func (checker *Checker) updateSegmentStatus(ctx context.Context, pointer *pb.Poi
|
||||
}
|
||||
monStats.remoteSegmentsNeedingRepair++
|
||||
err = checker.repairQueue.Insert(ctx, &pb.InjuredSegment{
|
||||
Path: path,
|
||||
LostPieces: missingPieces,
|
||||
Path: path,
|
||||
LostPieces: missingPieces,
|
||||
InsertedTime: time.Now().UTC(),
|
||||
})
|
||||
if err != nil {
|
||||
return Error.New("error adding injured segment to queue %s", err)
|
||||
|
@ -140,6 +140,9 @@ func (service *Service) process(ctx context.Context) (err error) {
|
||||
|
||||
func (service *Service) worker(ctx context.Context, seg *pb.InjuredSegment) (err error) {
|
||||
defer mon.Task()(&ctx)(&err)
|
||||
|
||||
workerStartTime := time.Now().UTC()
|
||||
|
||||
zap.L().Info("Limiter running repair on segment", zap.String("segment", seg.GetPath()))
|
||||
err = service.repairer.Repair(ctx, seg.GetPath())
|
||||
if err != nil {
|
||||
@ -150,5 +153,16 @@ func (service *Service) worker(ctx context.Context, seg *pb.InjuredSegment) (err
|
||||
if err != nil {
|
||||
return Error.New("repair delete failed: %v", err)
|
||||
}
|
||||
|
||||
insertedTime := seg.GetInsertedTime()
|
||||
// do not send metrics if segment was added before the InsertedTime field was added
|
||||
if !insertedTime.IsZero() {
|
||||
timeSinceQueued := workerStartTime.Sub(insertedTime)
|
||||
repairedTime := time.Now().UTC()
|
||||
timeForRepair := repairedTime.Sub(workerStartTime)
|
||||
mon.FloatVal("time_since_checker_queue").Observe(timeSinceQueued.Seconds())
|
||||
mon.FloatVal("time_for_repair").Observe(timeForRepair.Seconds())
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
@ -5,14 +5,18 @@ package pb
|
||||
|
||||
import (
|
||||
fmt "fmt"
|
||||
_ "github.com/gogo/protobuf/gogoproto"
|
||||
proto "github.com/gogo/protobuf/proto"
|
||||
_ "github.com/golang/protobuf/ptypes/timestamp"
|
||||
math "math"
|
||||
time "time"
|
||||
)
|
||||
|
||||
// Reference imports to suppress errors if they are not otherwise used.
|
||||
var _ = proto.Marshal
|
||||
var _ = fmt.Errorf
|
||||
var _ = math.Inf
|
||||
var _ = time.Kitchen
|
||||
|
||||
// This is a compile-time assertion to ensure that this generated file
|
||||
// is compatible with the proto package it is being compiled against.
|
||||
@ -22,11 +26,12 @@ const _ = proto.GoGoProtoPackageIsVersion2 // please upgrade the proto package
|
||||
|
||||
// InjuredSegment is the queue item used for the data repair queue
|
||||
type InjuredSegment struct {
|
||||
Path string `protobuf:"bytes,1,opt,name=path,proto3" json:"path,omitempty"`
|
||||
LostPieces []int32 `protobuf:"varint,2,rep,packed,name=lost_pieces,json=lostPieces,proto3" json:"lost_pieces,omitempty"`
|
||||
XXX_NoUnkeyedLiteral struct{} `json:"-"`
|
||||
XXX_unrecognized []byte `json:"-"`
|
||||
XXX_sizecache int32 `json:"-"`
|
||||
Path string `protobuf:"bytes,1,opt,name=path,proto3" json:"path,omitempty"`
|
||||
LostPieces []int32 `protobuf:"varint,2,rep,packed,name=lost_pieces,json=lostPieces,proto3" json:"lost_pieces,omitempty"`
|
||||
InsertedTime time.Time `protobuf:"bytes,3,opt,name=inserted_time,json=insertedTime,proto3,stdtime" json:"inserted_time"`
|
||||
XXX_NoUnkeyedLiteral struct{} `json:"-"`
|
||||
XXX_unrecognized []byte `json:"-"`
|
||||
XXX_sizecache int32 `json:"-"`
|
||||
}
|
||||
|
||||
func (m *InjuredSegment) Reset() { *m = InjuredSegment{} }
|
||||
@ -67,6 +72,13 @@ func (m *InjuredSegment) GetLostPieces() []int32 {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *InjuredSegment) GetInsertedTime() time.Time {
|
||||
if m != nil {
|
||||
return m.InsertedTime
|
||||
}
|
||||
return time.Time{}
|
||||
}
|
||||
|
||||
func init() {
|
||||
proto.RegisterType((*InjuredSegment)(nil), "repair.InjuredSegment")
|
||||
}
|
||||
@ -74,13 +86,18 @@ func init() {
|
||||
func init() { proto.RegisterFile("datarepair.proto", fileDescriptor_b1b08e6fe9398aa6) }
|
||||
|
||||
var fileDescriptor_b1b08e6fe9398aa6 = []byte{
|
||||
// 119 bytes of a gzipped FileDescriptorProto
|
||||
// 204 bytes of a gzipped FileDescriptorProto
|
||||
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x12, 0x48, 0x49, 0x2c, 0x49,
|
||||
0x2c, 0x4a, 0x2d, 0x48, 0xcc, 0x2c, 0xd2, 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0x62, 0x83, 0xf0,
|
||||
0x94, 0x5c, 0xb9, 0xf8, 0x3c, 0xf3, 0xb2, 0x4a, 0x8b, 0x52, 0x53, 0x82, 0x53, 0xd3, 0x73, 0x53,
|
||||
0xf3, 0x4a, 0x84, 0x84, 0xb8, 0x58, 0x0a, 0x12, 0x4b, 0x32, 0x24, 0x18, 0x15, 0x18, 0x35, 0x38,
|
||||
0x83, 0xc0, 0x6c, 0x21, 0x79, 0x2e, 0xee, 0x9c, 0xfc, 0xe2, 0x92, 0xf8, 0x82, 0xcc, 0xd4, 0xe4,
|
||||
0xd4, 0x62, 0x09, 0x26, 0x05, 0x66, 0x0d, 0xd6, 0x20, 0x2e, 0x90, 0x50, 0x00, 0x58, 0xc4, 0x89,
|
||||
0x25, 0x8a, 0xa9, 0x20, 0x29, 0x89, 0x0d, 0x6c, 0xb6, 0x31, 0x20, 0x00, 0x00, 0xff, 0xff, 0x13,
|
||||
0xff, 0xff, 0x1e, 0x6f, 0x00, 0x00, 0x00,
|
||||
0xa4, 0xb8, 0xd2, 0xf3, 0xd3, 0xf3, 0x21, 0x62, 0x52, 0xf2, 0xe9, 0xf9, 0xf9, 0xe9, 0x39, 0xa9,
|
||||
0xfa, 0x60, 0x5e, 0x52, 0x69, 0x9a, 0x7e, 0x49, 0x66, 0x6e, 0x6a, 0x71, 0x49, 0x62, 0x6e, 0x01,
|
||||
0x44, 0x81, 0xd2, 0x04, 0x46, 0x2e, 0x3e, 0xcf, 0xbc, 0xac, 0xd2, 0xa2, 0xd4, 0x94, 0xe0, 0xd4,
|
||||
0xf4, 0xdc, 0xd4, 0xbc, 0x12, 0x21, 0x21, 0x2e, 0x96, 0x82, 0xc4, 0x92, 0x0c, 0x09, 0x46, 0x05,
|
||||
0x46, 0x0d, 0xce, 0x20, 0x30, 0x5b, 0x48, 0x9e, 0x8b, 0x3b, 0x27, 0xbf, 0xb8, 0x24, 0xbe, 0x20,
|
||||
0x33, 0x35, 0x39, 0xb5, 0x58, 0x82, 0x49, 0x81, 0x59, 0x83, 0x35, 0x88, 0x0b, 0x24, 0x14, 0x00,
|
||||
0x16, 0x11, 0xf2, 0xe4, 0xe2, 0xcd, 0xcc, 0x2b, 0x4e, 0x2d, 0x2a, 0x49, 0x4d, 0x89, 0x07, 0xd9,
|
||||
0x21, 0xc1, 0xac, 0xc0, 0xa8, 0xc1, 0x6d, 0x24, 0xa5, 0x07, 0x71, 0x80, 0x1e, 0xcc, 0x01, 0x7a,
|
||||
0x21, 0x30, 0x07, 0x38, 0x71, 0x9c, 0xb8, 0x27, 0xcf, 0x30, 0xe1, 0xbe, 0x3c, 0x63, 0x10, 0x0f,
|
||||
0x4c, 0x2b, 0x48, 0xd2, 0x89, 0x25, 0x8a, 0xa9, 0x20, 0x29, 0x89, 0x0d, 0xac, 0xc3, 0x18, 0x10,
|
||||
0x00, 0x00, 0xff, 0xff, 0xca, 0x5a, 0x32, 0x32, 0xe8, 0x00, 0x00, 0x00,
|
||||
}
|
||||
|
@ -3,6 +3,8 @@
|
||||
|
||||
syntax = "proto3";
|
||||
option go_package = "pb";
|
||||
import "gogo.proto";
|
||||
import "google/protobuf/timestamp.proto";
|
||||
|
||||
package repair;
|
||||
|
||||
@ -10,4 +12,5 @@ package repair;
|
||||
message InjuredSegment {
|
||||
string path = 1;
|
||||
repeated int32 lost_pieces = 2;
|
||||
google.protobuf.Timestamp inserted_time = 3 [(gogoproto.stdtime) = true, (gogoproto.nullable) = false];
|
||||
}
|
||||
|
23
proto.lock
23
proto.lock
@ -169,10 +169,33 @@
|
||||
"name": "lost_pieces",
|
||||
"type": "int32",
|
||||
"is_repeated": true
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"name": "inserted_time",
|
||||
"type": "google.protobuf.Timestamp",
|
||||
"options": [
|
||||
{
|
||||
"name": "(gogoproto.stdtime)",
|
||||
"value": "true"
|
||||
},
|
||||
{
|
||||
"name": "(gogoproto.nullable)",
|
||||
"value": "false"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"imports": [
|
||||
{
|
||||
"path": "gogo.proto"
|
||||
},
|
||||
{
|
||||
"path": "google/protobuf/timestamp.proto"
|
||||
}
|
||||
],
|
||||
"package": {
|
||||
"name": "repair"
|
||||
},
|
||||
|
Loading…
Reference in New Issue
Block a user