satellite/metabase/rangedloop: few additions for monitoring
Additional elements added:
* monkit metrics for observer methods (Start/Fork/Join/Finish), to measure how much time those methods take
* a few more logs, e.g. entries with the processed range
* a segmentsProcessed metric, to track loop progress

Change-Id: I65dd51f7f5c4bdbb4014fbf04e5b6b10bdb035ec
parent 9ac314e482
commit aba2f14595
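For illustration, below is a minimal, self-contained sketch of the two monkit patterns this change applies across the ranged loop observers: a named error return with defer mon.Task()(&ctx)(&err) to time methods such as Start/Fork/Join/Finish, and an IntVal observation to report progress like segmentsProcessed. The exampleObserver type is hypothetical and not part of the satellite code; only the instrumentation pattern is taken from this commit.

package example

import (
	"context"
	"sync/atomic"

	"github.com/spacemonkeygo/monkit/v3"
)

var mon = monkit.Package()

// exampleObserver is a stand-in for a ranged loop observer; only the
// instrumentation pattern matters here.
type exampleObserver struct {
	numSegments int64
}

// Start shows the timing pattern: the named err return lets the deferred
// mon.Task call record both the method duration and any returned error.
func (o *exampleObserver) Start(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)
	return nil
}

// Process shows the progress pattern: the running segment count is
// observed as an IntVal metric, mirroring segmentsProcessed.
func (o *exampleObserver) Process(ctx context.Context, segments []int) error {
	processed := atomic.AddInt64(&o.numSegments, int64(len(segments)))
	mon.IntVal("segmentsProcessed").Observe(processed)
	return nil
}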
@@ -44,6 +44,8 @@ func NewRangedLoopObserver(log *zap.Logger, accounting accounting.StoragenodeAcc
 
 // Start implements ranged loop observer start method.
 func (observer *RangedLoopObserver) Start(ctx context.Context, time time.Time) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	observer.Node = map[storj.NodeID]float64{}
 	observer.lastTallyTime, err = observer.accounting.LastTimestamp(ctx, accounting.LastAtRestTally)
 	if err != nil {
@@ -56,12 +58,16 @@ func (observer *RangedLoopObserver) Start(ctx context.Context, time time.Time) (
 }
 
 // Fork forks new node tally ranged loop partial.
-func (observer *RangedLoopObserver) Fork(ctx context.Context) (rangedloop.Partial, error) {
+func (observer *RangedLoopObserver) Fork(ctx context.Context) (_ rangedloop.Partial, err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	return NewRangedLoopPartial(observer.log, observer.nowFn), nil
 }
 
 // Join joins node tally ranged loop partial to main observer updating main per node usage map.
-func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) error {
+func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	tallyPartial, ok := partial.(*RangedLoopPartial)
 	if !ok {
 		return Error.New("expected partial type %T but got %T", tallyPartial, partial)
@@ -78,7 +84,9 @@ func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop
 var monRangedTally = monkit.ScopeNamed("storj.io/storj/satellite/accounting/tally")
 
 // Finish calculates byte*hours from per node storage usage and save tallies to DB.
-func (observer *RangedLoopObserver) Finish(ctx context.Context) error {
+func (observer *RangedLoopObserver) Finish(ctx context.Context) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	finishTime := observer.nowFn()
 
 	// calculate byte hours, not just bytes
@@ -92,7 +100,7 @@ func (observer *RangedLoopObserver) Finish(ctx context.Context) error {
 
 	monRangedTally.IntVal("nodetallies.totalsum").Observe(int64(totalSum)) //mon:locked
 
-	err := observer.accounting.SaveTallies(ctx, finishTime, byteHours)
+	err = observer.accounting.SaveTallies(ctx, finishTime, byteHours)
 	if err != nil {
 		return Error.New("StorageNodeAccounting.SaveTallies failed: %v", err)
 	}
@@ -43,13 +43,17 @@ func NewObserver(log *zap.Logger, queue VerifyQueue, config Config) *Observer {
 }
 
 // Start prepares the observer for audit segment collection.
-func (obs *Observer) Start(ctx context.Context, startTime time.Time) error {
+func (obs *Observer) Start(ctx context.Context, startTime time.Time) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	obs.reservoirs = make(map[storj.NodeID]*Reservoir)
 	return nil
 }
 
 // Fork returns a new audit reservoir collector for the range.
-func (obs *Observer) Fork(ctx context.Context) (rangedloop.Partial, error) {
+func (obs *Observer) Fork(ctx context.Context) (_ rangedloop.Partial, err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	// Each collector needs an RNG for sampling. On systems where time
 	// resolution is low (e.g. windows is 15ms), seeding an RNG using the
 	// current time (even with nanosecond precision) may end up reusing a seed
@@ -60,7 +64,9 @@ func (obs *Observer) Fork(ctx context.Context) (rangedloop.Partial, error) {
 }
 
 // Join merges the audit reservoir collector into the per-node reservoirs.
-func (obs *Observer) Join(ctx context.Context, partial rangedloop.Partial) error {
+func (obs *Observer) Join(ctx context.Context, partial rangedloop.Partial) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	collector, ok := partial.(*Collector)
 	if !ok {
 		return errs.New("expected partial type %T but got %T", collector, partial)
@@ -80,7 +86,9 @@ func (obs *Observer) Join(ctx context.Context, partial rangedloop.Partial) error
 }
 
 // Finish builds and dedups an audit queue from the merged per-node reservoirs.
-func (obs *Observer) Finish(ctx context.Context) error {
+func (obs *Observer) Finish(ctx context.Context) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	type SegmentKey struct {
 		StreamID uuid.UUID
 		Position uint64
@@ -55,7 +55,9 @@ func (o *LiveCountObserver) Finish(ctx context.Context) error {
 
 // Process increments the counter.
 func (o *LiveCountObserver) Process(ctx context.Context, segments []segmentloop.Segment) error {
-	atomic.AddInt64(&o.numSegments, int64(len(segments)))
+	processed := atomic.AddInt64(&o.numSegments, int64(len(segments)))
+
+	mon.IntVal("segmentsProcessed").Observe(processed)
 	return nil
 }
 
@@ -19,5 +19,6 @@ type RangeSplitter interface {
 
 // SegmentProvider iterates through a range of segments.
 type SegmentProvider interface {
+	Range() UUIDRange
 	Iterate(ctx context.Context, fn func([]segmentloop.Segment) error) error
 }
@@ -62,6 +62,11 @@ func (provider *MetabaseRangeSplitter) CreateRanges(nRanges int, batchSize int)
 	return rangeProviders, err
 }
 
+// Range returns range which is processed by this provider.
+func (provider *MetabaseSegmentProvider) Range() UUIDRange {
+	return provider.uuidRange
+}
+
 // Iterate loops over a part of the segment table.
 func (provider *MetabaseSegmentProvider) Iterate(ctx context.Context, fn func([]segmentloop.Segment) error) error {
 	var startStreamID uuid.UUID
@@ -25,6 +25,11 @@ func (m *InfiniteSegmentProvider) CreateRanges(nRanges int, batchSize int) (segm
 	return segmentsProviders, nil
 }
 
+// Range returns range which is processed by this provider.
+func (m *InfiniteSegmentProvider) Range() rangedloop.UUIDRange {
+	return rangedloop.UUIDRange{}
+}
+
 // Iterate allows to loop over the segments stored in the provider.
 func (m *InfiniteSegmentProvider) Iterate(ctx context.Context, fn func([]segmentloop.Segment) error) error {
 	for {
@@ -50,6 +50,11 @@ func (m *RangeSplitter) CreateRanges(nRanges int, batchSize int) ([]rangedloop.S
 	return rangeProviders, nil
 }
 
+// Range returns range which is processed by this provider.
+func (m *SegmentProvider) Range() rangedloop.UUIDRange {
+	return rangedloop.UUIDRange{}
+}
+
 // Iterate allows to loop over the segments stored in the provider.
 func (m *SegmentProvider) Iterate(ctx context.Context, fn func([]segmentloop.Segment) error) error {
 	for offset := 0; offset < len(m.Segments); offset += m.batchSize {
@@ -90,7 +90,10 @@ func (service *Service) Run(ctx context.Context) (err error) {
 	service.log.Info("ranged loop initialized")
 
 	return service.Loop.Run(ctx, func(ctx context.Context) error {
-		service.log.Info("ranged loop started")
+		service.log.Info("ranged loop started",
+			zap.Int("parallelism", service.config.Parallelism),
+			zap.Int("batchSize", service.config.BatchSize),
+		)
 		_, err := service.RunOnce(ctx)
 		if err != nil {
 			service.log.Error("ranged loop failure", zap.Error(err))
@@ -126,7 +129,10 @@ func (service *Service) RunOnce(ctx context.Context) (observerDurations []Observ
 	}
 
 	group := errs2.Group{}
-	for _, rangeProvider := range rangeProviders {
+	for index, rangeProvider := range rangeProviders {
+		uuidRange := rangeProvider.Range()
+		service.log.Debug("creating range", zap.Int("index", index), zap.Stringer("start", uuidRange.Start), zap.Stringer("end", uuidRange.End))
+
 		rangeObservers := []*rangeObserverState{}
 		for i, observerState := range observerStates {
 			if observerState.err != nil {
@@ -114,7 +114,9 @@ func (observer *RangedLoopObserver) TestingCompareInjuredSegmentIDs(ctx context.
 }
 
 // Start starts parallel segments loop.
-func (observer *RangedLoopObserver) Start(ctx context.Context, startTime time.Time) error {
+func (observer *RangedLoopObserver) Start(ctx context.Context, startTime time.Time) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	observer.startTime = startTime
 	observer.TotalStats = aggregateStats{}
 
@@ -122,13 +124,17 @@ func (observer *RangedLoopObserver) Start(ctx context.Context, startTime time.Ti
 }
 
 // Fork creates a Partial to process a chunk of all the segments.
-func (observer *RangedLoopObserver) Fork(ctx context.Context) (rangedloop.Partial, error) {
+func (observer *RangedLoopObserver) Fork(ctx context.Context) (_ rangedloop.Partial, err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	return newRangedLoopCheckerPartial(observer), nil
 }
 
 // Join is called after the chunk for Partial is done.
 // This gives the opportunity to merge the output like in a reduce step.
-func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) error {
+func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	repPartial, ok := partial.(*repairPartial)
 	if !ok {
 		return Error.New("expected partial type %T but got %T", repPartial, partial)
@@ -148,7 +154,9 @@ func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop
 }
 
 // Finish is called after all segments are processed by all observers.
-func (observer *RangedLoopObserver) Finish(ctx context.Context) error {
+func (observer *RangedLoopObserver) Finish(ctx context.Context) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	// remove all segments which were not seen as unhealthy by this checker iteration
 	healthyDeleted, err := observer.repairQueue.Clean(ctx, observer.startTime)
 	if err != nil {