satellite/metabase/rangedloop: few additions for monitoring
Additional elements added:
* monkit metric for observer methods like Start/Fork/Join/Finish, to be able to check how much time those methods are taking
* a few more logs, e.g. entries with the processed range
* segmentsProcessed metric to be able to check loop progress

Change-Id: I65dd51f7f5c4bdbb4014fbf04e5b6b10bdb035ec
commit aba2f14595
parent 9ac314e482
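For orientation before the diff: the change instruments the ranged loop observer methods with monkit tasks and adds a segmentsProcessed gauge. The following is a minimal, self-contained sketch of that pattern, assuming monkit v3 (github.com/spacemonkeygo/monkit/v3); the exampleObserver type and its methods are illustrative and not part of this commit, only the mon.Task / mon.IntVal calls mirror what the diff adds.

package main

import (
	"context"

	"github.com/spacemonkeygo/monkit/v3"
)

// mon is the package-level monkit scope, the same idiom the satellite code uses.
var mon = monkit.Package()

// exampleObserver is a hypothetical stand-in for a ranged loop observer.
type exampleObserver struct {
	numSegments int64
}

// Start shows the instrumentation added to Start/Fork/Join/Finish in this
// change: a named err return plus defer mon.Task()(&ctx)(&err), so monkit
// records how long the method takes and whether it returned an error.
func (o *exampleObserver) Start(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)
	return nil
}

// Process shows the segmentsProcessed-style metric: observe the running
// total after each batch so loop progress is visible in metrics.
func (o *exampleObserver) Process(ctx context.Context, batch int) (err error) {
	defer mon.Task()(&ctx)(&err)
	o.numSegments += int64(batch)
	mon.IntVal("segmentsProcessed").Observe(o.numSegments)
	return nil
}

func main() {
	ctx := context.Background()
	obs := &exampleObserver{}
	_ = obs.Start(ctx)
	_ = obs.Process(ctx, 128)
}

The diff below applies this same pattern to the existing observers, extends the SegmentProvider interface with a Range() accessor, and logs the UUID range each provider covers in RunOnce.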
@@ -44,6 +44,8 @@ func NewRangedLoopObserver(log *zap.Logger, accounting accounting.StoragenodeAcc
 
 // Start implements ranged loop observer start method.
 func (observer *RangedLoopObserver) Start(ctx context.Context, time time.Time) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	observer.Node = map[storj.NodeID]float64{}
 	observer.lastTallyTime, err = observer.accounting.LastTimestamp(ctx, accounting.LastAtRestTally)
 	if err != nil {
@@ -56,12 +58,16 @@ func (observer *RangedLoopObserver) Start(ctx context.Context, time time.Time) (
 }
 
 // Fork forks new node tally ranged loop partial.
-func (observer *RangedLoopObserver) Fork(ctx context.Context) (rangedloop.Partial, error) {
+func (observer *RangedLoopObserver) Fork(ctx context.Context) (_ rangedloop.Partial, err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	return NewRangedLoopPartial(observer.log, observer.nowFn), nil
 }
 
 // Join joins node tally ranged loop partial to main observer updating main per node usage map.
-func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) error {
+func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	tallyPartial, ok := partial.(*RangedLoopPartial)
 	if !ok {
 		return Error.New("expected partial type %T but got %T", tallyPartial, partial)
@@ -78,7 +84,9 @@ func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop
 var monRangedTally = monkit.ScopeNamed("storj.io/storj/satellite/accounting/tally")
 
 // Finish calculates byte*hours from per node storage usage and save tallies to DB.
-func (observer *RangedLoopObserver) Finish(ctx context.Context) error {
+func (observer *RangedLoopObserver) Finish(ctx context.Context) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	finishTime := observer.nowFn()
 
 	// calculate byte hours, not just bytes
@@ -92,7 +100,7 @@ func (observer *RangedLoopObserver) Finish(ctx context.Context) error {
 
 	monRangedTally.IntVal("nodetallies.totalsum").Observe(int64(totalSum)) //mon:locked
 
-	err := observer.accounting.SaveTallies(ctx, finishTime, byteHours)
+	err = observer.accounting.SaveTallies(ctx, finishTime, byteHours)
 	if err != nil {
 		return Error.New("StorageNodeAccounting.SaveTallies failed: %v", err)
 	}
@@ -43,13 +43,17 @@ func NewObserver(log *zap.Logger, queue VerifyQueue, config Config) *Observer {
 }
 
 // Start prepares the observer for audit segment collection.
-func (obs *Observer) Start(ctx context.Context, startTime time.Time) error {
+func (obs *Observer) Start(ctx context.Context, startTime time.Time) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	obs.reservoirs = make(map[storj.NodeID]*Reservoir)
 	return nil
 }
 
 // Fork returns a new audit reservoir collector for the range.
-func (obs *Observer) Fork(ctx context.Context) (rangedloop.Partial, error) {
+func (obs *Observer) Fork(ctx context.Context) (_ rangedloop.Partial, err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	// Each collector needs an RNG for sampling. On systems where time
 	// resolution is low (e.g. windows is 15ms), seeding an RNG using the
 	// current time (even with nanosecond precision) may end up reusing a seed
@@ -60,7 +64,9 @@ func (obs *Observer) Fork(ctx context.Context) (rangedloop.Partial, error) {
 }
 
 // Join merges the audit reservoir collector into the per-node reservoirs.
-func (obs *Observer) Join(ctx context.Context, partial rangedloop.Partial) error {
+func (obs *Observer) Join(ctx context.Context, partial rangedloop.Partial) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	collector, ok := partial.(*Collector)
 	if !ok {
 		return errs.New("expected partial type %T but got %T", collector, partial)
@@ -80,7 +86,9 @@ func (obs *Observer) Join(ctx context.Context, partial rangedloop.Partial) error
 }
 
 // Finish builds and dedups an audit queue from the merged per-node reservoirs.
-func (obs *Observer) Finish(ctx context.Context) error {
+func (obs *Observer) Finish(ctx context.Context) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	type SegmentKey struct {
 		StreamID uuid.UUID
 		Position uint64
@@ -55,7 +55,9 @@ func (o *LiveCountObserver) Finish(ctx context.Context) error {
 
 // Process increments the counter.
 func (o *LiveCountObserver) Process(ctx context.Context, segments []segmentloop.Segment) error {
-	atomic.AddInt64(&o.numSegments, int64(len(segments)))
+	processed := atomic.AddInt64(&o.numSegments, int64(len(segments)))
+
+	mon.IntVal("segmentsProcessed").Observe(processed)
 	return nil
 }
 
@@ -19,5 +19,6 @@ type RangeSplitter interface {
 
 // SegmentProvider iterates through a range of segments.
 type SegmentProvider interface {
+	Range() UUIDRange
 	Iterate(ctx context.Context, fn func([]segmentloop.Segment) error) error
 }
@@ -62,6 +62,11 @@ func (provider *MetabaseRangeSplitter) CreateRanges(nRanges int, batchSize int)
 	return rangeProviders, err
 }
 
+// Range returns range which is processed by this provider.
+func (provider *MetabaseSegmentProvider) Range() UUIDRange {
+	return provider.uuidRange
+}
+
 // Iterate loops over a part of the segment table.
 func (provider *MetabaseSegmentProvider) Iterate(ctx context.Context, fn func([]segmentloop.Segment) error) error {
 	var startStreamID uuid.UUID
@@ -25,6 +25,11 @@ func (m *InfiniteSegmentProvider) CreateRanges(nRanges int, batchSize int) (segm
 	return segmentsProviders, nil
 }
 
+// Range returns range which is processed by this provider.
+func (m *InfiniteSegmentProvider) Range() rangedloop.UUIDRange {
+	return rangedloop.UUIDRange{}
+}
+
 // Iterate allows to loop over the segments stored in the provider.
 func (m *InfiniteSegmentProvider) Iterate(ctx context.Context, fn func([]segmentloop.Segment) error) error {
 	for {
@@ -50,6 +50,11 @@ func (m *RangeSplitter) CreateRanges(nRanges int, batchSize int) ([]rangedloop.S
 	return rangeProviders, nil
 }
 
+// Range returns range which is processed by this provider.
+func (m *SegmentProvider) Range() rangedloop.UUIDRange {
+	return rangedloop.UUIDRange{}
+}
+
 // Iterate allows to loop over the segments stored in the provider.
 func (m *SegmentProvider) Iterate(ctx context.Context, fn func([]segmentloop.Segment) error) error {
 	for offset := 0; offset < len(m.Segments); offset += m.batchSize {
@@ -90,7 +90,10 @@ func (service *Service) Run(ctx context.Context) (err error) {
 	service.log.Info("ranged loop initialized")
 
 	return service.Loop.Run(ctx, func(ctx context.Context) error {
-		service.log.Info("ranged loop started")
+		service.log.Info("ranged loop started",
+			zap.Int("parallelism", service.config.Parallelism),
+			zap.Int("batchSize", service.config.BatchSize),
+		)
 		_, err := service.RunOnce(ctx)
 		if err != nil {
 			service.log.Error("ranged loop failure", zap.Error(err))
@@ -126,7 +129,10 @@ func (service *Service) RunOnce(ctx context.Context) (observerDurations []Observ
 	}
 
 	group := errs2.Group{}
-	for _, rangeProvider := range rangeProviders {
+	for index, rangeProvider := range rangeProviders {
+		uuidRange := rangeProvider.Range()
+		service.log.Debug("creating range", zap.Int("index", index), zap.Stringer("start", uuidRange.Start), zap.Stringer("end", uuidRange.End))
+
 		rangeObservers := []*rangeObserverState{}
 		for i, observerState := range observerStates {
 			if observerState.err != nil {
@@ -114,7 +114,9 @@ func (observer *RangedLoopObserver) TestingCompareInjuredSegmentIDs(ctx context.
 }
 
 // Start starts parallel segments loop.
-func (observer *RangedLoopObserver) Start(ctx context.Context, startTime time.Time) error {
+func (observer *RangedLoopObserver) Start(ctx context.Context, startTime time.Time) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	observer.startTime = startTime
 	observer.TotalStats = aggregateStats{}
 
@@ -122,13 +124,17 @@ func (observer *RangedLoopObserver) Start(ctx context.Context, startTime time.Ti
 }
 
 // Fork creates a Partial to process a chunk of all the segments.
-func (observer *RangedLoopObserver) Fork(ctx context.Context) (rangedloop.Partial, error) {
+func (observer *RangedLoopObserver) Fork(ctx context.Context) (_ rangedloop.Partial, err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	return newRangedLoopCheckerPartial(observer), nil
 }
 
 // Join is called after the chunk for Partial is done.
 // This gives the opportunity to merge the output like in a reduce step.
-func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) error {
+func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	repPartial, ok := partial.(*repairPartial)
 	if !ok {
 		return Error.New("expected partial type %T but got %T", repPartial, partial)
@@ -148,7 +154,9 @@ func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop
 }
 
 // Finish is called after all segments are processed by all observers.
-func (observer *RangedLoopObserver) Finish(ctx context.Context) error {
+func (observer *RangedLoopObserver) Finish(ctx context.Context) (err error) {
+	defer mon.Task()(&ctx)(&err)
+
 	// remove all segments which were not seen as unhealthy by this checker iteration
 	healthyDeleted, err := observer.repairQueue.Clean(ctx, observer.startTime)
 	if err != nil {