Add checker metrics (#2487)

checker_segment_total_count - Number of total segments in pointer during checker iteration
checker_segment_healthy_count - Number of healthy segments in pointer during checker iteration
time_since_checker_queue - Seconds elapsed between checker queue and beginning repair
time_for_repair - Seconds elapsed between beginning repair and ending repair/dequeueing
This commit is contained in:
Maximillian von Briesen 2019-07-10 17:27:46 -04:00 committed by GitHub
parent 0e463dccfd
commit de85d17069
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 75 additions and 14 deletions

View File

@ -193,6 +193,9 @@ func (checker *Checker) updateSegmentStatus(ctx context.Context, pointer *pb.Poi
numHealthy := int32(len(pieces) - len(missingPieces))
redundancy := pointer.Remote.Redundancy
mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces)))
mon.IntVal("checker_segment_healthy_count").Observe(int64(numHealthy))
// we repair when the number of healthy pieces is less than or equal to the repair threshold
// except for the case when the repair and success thresholds are the same (a case usually seen during testing)
if numHealthy > redundancy.MinReq && numHealthy <= redundancy.RepairThreshold && numHealthy < redundancy.SuccessThreshold {
@ -202,8 +205,9 @@ func (checker *Checker) updateSegmentStatus(ctx context.Context, pointer *pb.Poi
}
monStats.remoteSegmentsNeedingRepair++
err = checker.repairQueue.Insert(ctx, &pb.InjuredSegment{
Path: path,
LostPieces: missingPieces,
Path: path,
LostPieces: missingPieces,
InsertedTime: time.Now().UTC(),
})
if err != nil {
return Error.New("error adding injured segment to queue %s", err)

View File

@ -140,6 +140,9 @@ func (service *Service) process(ctx context.Context) (err error) {
func (service *Service) worker(ctx context.Context, seg *pb.InjuredSegment) (err error) {
defer mon.Task()(&ctx)(&err)
workerStartTime := time.Now().UTC()
zap.L().Info("Limiter running repair on segment", zap.String("segment", seg.GetPath()))
err = service.repairer.Repair(ctx, seg.GetPath())
if err != nil {
@ -150,5 +153,16 @@ func (service *Service) worker(ctx context.Context, seg *pb.InjuredSegment) (err
if err != nil {
return Error.New("repair delete failed: %v", err)
}
insertedTime := seg.GetInsertedTime()
// do not send metrics if segment was added before the InsertedTime field was added
if !insertedTime.IsZero() {
timeSinceQueued := workerStartTime.Sub(insertedTime)
repairedTime := time.Now().UTC()
timeForRepair := repairedTime.Sub(workerStartTime)
mon.FloatVal("time_since_checker_queue").Observe(timeSinceQueued.Seconds())
mon.FloatVal("time_for_repair").Observe(timeForRepair.Seconds())
}
return nil
}

View File

@ -5,14 +5,18 @@ package pb
import (
fmt "fmt"
_ "github.com/gogo/protobuf/gogoproto"
proto "github.com/gogo/protobuf/proto"
_ "github.com/golang/protobuf/ptypes/timestamp"
math "math"
time "time"
)
// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf
var _ = time.Kitchen
// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
@ -22,11 +26,12 @@ const _ = proto.GoGoProtoPackageIsVersion2 // please upgrade the proto package
// InjuredSegment is the queue item used for the data repair queue
type InjuredSegment struct {
Path string `protobuf:"bytes,1,opt,name=path,proto3" json:"path,omitempty"`
LostPieces []int32 `protobuf:"varint,2,rep,packed,name=lost_pieces,json=lostPieces,proto3" json:"lost_pieces,omitempty"`
XXX_NoUnkeyedLiteral struct{} `json:"-"`
XXX_unrecognized []byte `json:"-"`
XXX_sizecache int32 `json:"-"`
Path string `protobuf:"bytes,1,opt,name=path,proto3" json:"path,omitempty"`
LostPieces []int32 `protobuf:"varint,2,rep,packed,name=lost_pieces,json=lostPieces,proto3" json:"lost_pieces,omitempty"`
InsertedTime time.Time `protobuf:"bytes,3,opt,name=inserted_time,json=insertedTime,proto3,stdtime" json:"inserted_time"`
XXX_NoUnkeyedLiteral struct{} `json:"-"`
XXX_unrecognized []byte `json:"-"`
XXX_sizecache int32 `json:"-"`
}
func (m *InjuredSegment) Reset() { *m = InjuredSegment{} }
@ -67,6 +72,13 @@ func (m *InjuredSegment) GetLostPieces() []int32 {
return nil
}
// GetInsertedTime returns the InsertedTime stored on the segment.
// It is safe to call on a nil receiver, in which case the zero time.Time
// is returned.
func (m *InjuredSegment) GetInsertedTime() time.Time {
	if m == nil {
		return time.Time{}
	}
	return m.InsertedTime
}
// init registers the InjuredSegment message type with the proto package
// under the fully-qualified name "repair.InjuredSegment".
func init() {
proto.RegisterType((*InjuredSegment)(nil), "repair.InjuredSegment")
}
@ -74,13 +86,18 @@ func init() {
// init registers the gzipped file descriptor for "datarepair.proto" with the proto package.
func init() { proto.RegisterFile("datarepair.proto", fileDescriptor_b1b08e6fe9398aa6) }
var fileDescriptor_b1b08e6fe9398aa6 = []byte{
// 119 bytes of a gzipped FileDescriptorProto
// 204 bytes of a gzipped FileDescriptorProto
0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x12, 0x48, 0x49, 0x2c, 0x49,
0x2c, 0x4a, 0x2d, 0x48, 0xcc, 0x2c, 0xd2, 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0x62, 0x83, 0xf0,
0x94, 0x5c, 0xb9, 0xf8, 0x3c, 0xf3, 0xb2, 0x4a, 0x8b, 0x52, 0x53, 0x82, 0x53, 0xd3, 0x73, 0x53,
0xf3, 0x4a, 0x84, 0x84, 0xb8, 0x58, 0x0a, 0x12, 0x4b, 0x32, 0x24, 0x18, 0x15, 0x18, 0x35, 0x38,
0x83, 0xc0, 0x6c, 0x21, 0x79, 0x2e, 0xee, 0x9c, 0xfc, 0xe2, 0x92, 0xf8, 0x82, 0xcc, 0xd4, 0xe4,
0xd4, 0x62, 0x09, 0x26, 0x05, 0x66, 0x0d, 0xd6, 0x20, 0x2e, 0x90, 0x50, 0x00, 0x58, 0xc4, 0x89,
0x25, 0x8a, 0xa9, 0x20, 0x29, 0x89, 0x0d, 0x6c, 0xb6, 0x31, 0x20, 0x00, 0x00, 0xff, 0xff, 0x13,
0xff, 0xff, 0x1e, 0x6f, 0x00, 0x00, 0x00,
0xa4, 0xb8, 0xd2, 0xf3, 0xd3, 0xf3, 0x21, 0x62, 0x52, 0xf2, 0xe9, 0xf9, 0xf9, 0xe9, 0x39, 0xa9,
0xfa, 0x60, 0x5e, 0x52, 0x69, 0x9a, 0x7e, 0x49, 0x66, 0x6e, 0x6a, 0x71, 0x49, 0x62, 0x6e, 0x01,
0x44, 0x81, 0xd2, 0x04, 0x46, 0x2e, 0x3e, 0xcf, 0xbc, 0xac, 0xd2, 0xa2, 0xd4, 0x94, 0xe0, 0xd4,
0xf4, 0xdc, 0xd4, 0xbc, 0x12, 0x21, 0x21, 0x2e, 0x96, 0x82, 0xc4, 0x92, 0x0c, 0x09, 0x46, 0x05,
0x46, 0x0d, 0xce, 0x20, 0x30, 0x5b, 0x48, 0x9e, 0x8b, 0x3b, 0x27, 0xbf, 0xb8, 0x24, 0xbe, 0x20,
0x33, 0x35, 0x39, 0xb5, 0x58, 0x82, 0x49, 0x81, 0x59, 0x83, 0x35, 0x88, 0x0b, 0x24, 0x14, 0x00,
0x16, 0x11, 0xf2, 0xe4, 0xe2, 0xcd, 0xcc, 0x2b, 0x4e, 0x2d, 0x2a, 0x49, 0x4d, 0x89, 0x07, 0xd9,
0x21, 0xc1, 0xac, 0xc0, 0xa8, 0xc1, 0x6d, 0x24, 0xa5, 0x07, 0x71, 0x80, 0x1e, 0xcc, 0x01, 0x7a,
0x21, 0x30, 0x07, 0x38, 0x71, 0x9c, 0xb8, 0x27, 0xcf, 0x30, 0xe1, 0xbe, 0x3c, 0x63, 0x10, 0x0f,
0x4c, 0x2b, 0x48, 0xd2, 0x89, 0x25, 0x8a, 0xa9, 0x20, 0x29, 0x89, 0x0d, 0xac, 0xc3, 0x18, 0x10,
0x00, 0x00, 0xff, 0xff, 0xca, 0x5a, 0x32, 0x32, 0xe8, 0x00, 0x00, 0x00,
}

View File

@ -3,6 +3,8 @@
syntax = "proto3";
option go_package = "pb";
import "gogo.proto";
import "google/protobuf/timestamp.proto";
package repair;
@ -10,4 +12,5 @@ package repair;
// InjuredSegment is the queue item used for the data repair queue.
message InjuredSegment {
// path identifies the segment to be repaired.
string path = 1;
// lost_pieces lists the missing pieces of the segment
// (presumably piece numbers; verify against the checker).
repeated int32 lost_pieces = 2;
// inserted_time is generated as a non-nullable Go time.Time
// (gogoproto stdtime, nullable = false). Items queued before this field
// existed decode to the zero time.
google.protobuf.Timestamp inserted_time = 3 [(gogoproto.stdtime) = true, (gogoproto.nullable) = false];
}

View File

@ -169,10 +169,33 @@
"name": "lost_pieces",
"type": "int32",
"is_repeated": true
},
{
"id": 3,
"name": "inserted_time",
"type": "google.protobuf.Timestamp",
"options": [
{
"name": "(gogoproto.stdtime)",
"value": "true"
},
{
"name": "(gogoproto.nullable)",
"value": "false"
}
]
}
]
}
],
"imports": [
{
"path": "gogo.proto"
},
{
"path": "google/protobuf/timestamp.proto"
}
],
"package": {
"name": "repair"
},