satellite/metabase/rangedloop: a few additions for monitoring

Additions in this change:
* a monkit task metric on the observer methods Start/Fork/Join/Finish, so we
  can see how much time those methods take (a minimal sketch of the pattern
  follows the commit metadata below)
* a few more logs, e.g. entries with the processed range
* a segmentsProcessed metric to track loop progress

Change-Id: I65dd51f7f5c4bdbb4014fbf04e5b6b10bdb035ec
Michal Niewrzal 2023-02-03 11:04:53 +01:00 committed by Storj Robot
parent 9ac314e482
commit aba2f14595
9 changed files with 63 additions and 15 deletions
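All of the observer changes below follow the same monkit instrumentation pattern: the method gets a named err return and defers a task that records the call's duration and outcome. A minimal, self-contained sketch of that pattern (the Observer type and doWork helper are illustrative, not code from this repo):

package example

import (
	"context"

	"github.com/spacemonkeygo/monkit/v3"
)

var mon = monkit.Package()

// Observer is an illustrative stand-in for a ranged loop observer.
type Observer struct{}

// Start records its own duration and error via monkit. The deferred task
// observes how long the method took and whether err ended up set.
func (o *Observer) Start(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	return doWork(ctx) // assigned to the named return, so the task also sees failures
}

func doWork(ctx context.Context) error { return nil }

Because the deferred call receives &err, monkit can record failures as well as durations for each of the Start/Fork/Join/Finish methods.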

View File

@@ -44,6 +44,8 @@ func NewRangedLoopObserver(log *zap.Logger, accounting accounting.StoragenodeAcc
 // Start implements ranged loop observer start method.
 func (observer *RangedLoopObserver) Start(ctx context.Context, time time.Time) (err error) {
+	defer mon.Task()(&ctx)(&err)
 	observer.Node = map[storj.NodeID]float64{}
 	observer.lastTallyTime, err = observer.accounting.LastTimestamp(ctx, accounting.LastAtRestTally)
 	if err != nil {
@@ -56,12 +58,16 @@ func (observer *RangedLoopObserver) Start(ctx context.Context, time time.Time) (
 }
 // Fork forks new node tally ranged loop partial.
-func (observer *RangedLoopObserver) Fork(ctx context.Context) (rangedloop.Partial, error) {
+func (observer *RangedLoopObserver) Fork(ctx context.Context) (_ rangedloop.Partial, err error) {
+	defer mon.Task()(&ctx)(&err)
 	return NewRangedLoopPartial(observer.log, observer.nowFn), nil
 }
 // Join joins node tally ranged loop partial to main observer updating main per node usage map.
-func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) error {
+func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) (err error) {
+	defer mon.Task()(&ctx)(&err)
 	tallyPartial, ok := partial.(*RangedLoopPartial)
 	if !ok {
 		return Error.New("expected partial type %T but got %T", tallyPartial, partial)
@@ -78,7 +84,9 @@ func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop
 var monRangedTally = monkit.ScopeNamed("storj.io/storj/satellite/accounting/tally")
 // Finish calculates byte*hours from per node storage usage and save tallies to DB.
-func (observer *RangedLoopObserver) Finish(ctx context.Context) error {
+func (observer *RangedLoopObserver) Finish(ctx context.Context) (err error) {
+	defer mon.Task()(&ctx)(&err)
 	finishTime := observer.nowFn()
 	// calculate byte hours, not just bytes
@@ -92,7 +100,7 @@ func (observer *RangedLoopObserver) Finish(ctx context.Context) error {
 	monRangedTally.IntVal("nodetallies.totalsum").Observe(int64(totalSum)) //mon:locked
-	err := observer.accounting.SaveTallies(ctx, finishTime, byteHours)
+	err = observer.accounting.SaveTallies(ctx, finishTime, byteHours)
 	if err != nil {
 		return Error.New("StorageNodeAccounting.SaveTallies failed: %v", err)
 	}
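For context on what Finish persists: the observer accumulates bytes per node during the loop, and Finish converts those into byte*hours over the elapsed window before calling SaveTallies. A rough sketch of that conversion, assuming the window is simply the time between the previous tally and finishTime (function and parameter names here are illustrative):

package example

import (
	"time"

	"storj.io/common/storj"
)

// byteHoursSince converts per-node byte counts into byte*hours over the
// window between the previous tally and the current finish time.
func byteHoursSince(lastTally, finish time.Time, bytesPerNode map[storj.NodeID]float64) map[storj.NodeID]float64 {
	hours := finish.Sub(lastTally).Hours()

	byteHours := make(map[storj.NodeID]float64, len(bytesPerNode))
	for id, bytes := range bytesPerNode {
		byteHours[id] = bytes * hours
	}
	return byteHours
}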

View File

@@ -43,13 +43,17 @@ func NewObserver(log *zap.Logger, queue VerifyQueue, config Config) *Observer {
 }
 // Start prepares the observer for audit segment collection.
-func (obs *Observer) Start(ctx context.Context, startTime time.Time) error {
+func (obs *Observer) Start(ctx context.Context, startTime time.Time) (err error) {
+	defer mon.Task()(&ctx)(&err)
 	obs.reservoirs = make(map[storj.NodeID]*Reservoir)
 	return nil
 }
 // Fork returns a new audit reservoir collector for the range.
-func (obs *Observer) Fork(ctx context.Context) (rangedloop.Partial, error) {
+func (obs *Observer) Fork(ctx context.Context) (_ rangedloop.Partial, err error) {
+	defer mon.Task()(&ctx)(&err)
 	// Each collector needs an RNG for sampling. On systems where time
 	// resolution is low (e.g. windows is 15ms), seeding an RNG using the
 	// current time (even with nanosecond precision) may end up reusing a seed
@@ -60,7 +64,9 @@ func (obs *Observer) Fork(ctx context.Context) (rangedloop.Partial, error) {
 }
 // Join merges the audit reservoir collector into the per-node reservoirs.
-func (obs *Observer) Join(ctx context.Context, partial rangedloop.Partial) error {
+func (obs *Observer) Join(ctx context.Context, partial rangedloop.Partial) (err error) {
+	defer mon.Task()(&ctx)(&err)
 	collector, ok := partial.(*Collector)
 	if !ok {
 		return errs.New("expected partial type %T but got %T", collector, partial)
@@ -80,7 +86,9 @@ func (obs *Observer) Join(ctx context.Context, partial rangedloop.Partial) error
 }
 // Finish builds and dedups an audit queue from the merged per-node reservoirs.
-func (obs *Observer) Finish(ctx context.Context) error {
+func (obs *Observer) Finish(ctx context.Context) (err error) {
+	defer mon.Task()(&ctx)(&err)
 	type SegmentKey struct {
 		StreamID uuid.UUID
 		Position uint64
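Finish's doc comment says it builds and dedups an audit queue, since several node reservoirs can sample the same segment. Assuming a segment is identified by its (StreamID, Position) pair, as the SegmentKey type above suggests, the dedup step could look roughly like this (Segment is a simplified stand-in type):

package example

import "storj.io/common/uuid"

// SegmentKey identifies a segment by stream and position, mirroring the
// key type used when deduplicating reservoir contents.
type SegmentKey struct {
	StreamID uuid.UUID
	Position uint64
}

// Segment is a simplified stand-in for the reservoir segment type.
type Segment struct {
	StreamID uuid.UUID
	Position uint64
}

// dedupe keeps the first occurrence of each (StreamID, Position) pair.
func dedupe(segments []Segment) []Segment {
	seen := make(map[SegmentKey]struct{}, len(segments))
	queue := make([]Segment, 0, len(segments))
	for _, seg := range segments {
		key := SegmentKey{StreamID: seg.StreamID, Position: seg.Position}
		if _, ok := seen[key]; ok {
			continue
		}
		seen[key] = struct{}{}
		queue = append(queue, seg)
	}
	return queue
}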

View File

@@ -55,7 +55,9 @@ func (o *LiveCountObserver) Finish(ctx context.Context) error {
 // Process increments the counter.
 func (o *LiveCountObserver) Process(ctx context.Context, segments []segmentloop.Segment) error {
-	atomic.AddInt64(&o.numSegments, int64(len(segments)))
+	processed := atomic.AddInt64(&o.numSegments, int64(len(segments)))
+	mon.IntVal("segmentsProcessed").Observe(processed)
 	return nil
 }
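Process can be called concurrently from many range partials, so the running total is kept with atomic.AddInt64 and the post-add value is what gets observed as segmentsProcessed. A stripped-down sketch of that pattern, reusing the metric and counter names from the diff but with an otherwise illustrative type:

package example

import (
	"sync/atomic"

	"github.com/spacemonkeygo/monkit/v3"
)

var mon = monkit.Package()

// progressCounter tracks how many segments have been handled so far.
type progressCounter struct {
	numSegments int64
}

// add records a processed batch and reports the running total, so the
// segmentsProcessed metric shows loop progress while the loop is still running.
func (p *progressCounter) add(batch int) int64 {
	processed := atomic.AddInt64(&p.numSegments, int64(batch))
	mon.IntVal("segmentsProcessed").Observe(processed)
	return processed
}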

View File

@@ -19,5 +19,6 @@ type RangeSplitter interface {
 // SegmentProvider iterates through a range of segments.
 type SegmentProvider interface {
+	Range() UUIDRange
 	Iterate(ctx context.Context, fn func([]segmentloop.Segment) error) error
 }
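With Range() on the interface, code that only holds a SegmentProvider can label its logging and metrics with the UUID range it is working on. A hedged usage sketch against this interface, assuming (as the service logging further down suggests) that UUIDRange's Start and End satisfy fmt.Stringer and that the import paths match this repo:

package example

import (
	"context"

	"go.uber.org/zap"

	"storj.io/storj/satellite/metabase/rangedloop"
	"storj.io/storj/satellite/metabase/segmentloop"
)

// countRange iterates one provider's range and logs which range it covered.
func countRange(ctx context.Context, log *zap.Logger, provider rangedloop.SegmentProvider) (int64, error) {
	uuidRange := provider.Range()
	log.Debug("processing range",
		zap.Stringer("start", uuidRange.Start),
		zap.Stringer("end", uuidRange.End),
	)

	var count int64
	err := provider.Iterate(ctx, func(segments []segmentloop.Segment) error {
		count += int64(len(segments))
		return nil
	})
	return count, err
}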

View File

@@ -62,6 +62,11 @@ func (provider *MetabaseRangeSplitter) CreateRanges(nRanges int, batchSize int)
 	return rangeProviders, err
 }
+// Range returns range which is processed by this provider.
+func (provider *MetabaseSegmentProvider) Range() UUIDRange {
+	return provider.uuidRange
+}
 // Iterate loops over a part of the segment table.
 func (provider *MetabaseSegmentProvider) Iterate(ctx context.Context, fn func([]segmentloop.Segment) error) error {
 	var startStreamID uuid.UUID

View File

@@ -25,6 +25,11 @@ func (m *InfiniteSegmentProvider) CreateRanges(nRanges int, batchSize int) (segm
 	return segmentsProviders, nil
 }
+// Range returns range which is processed by this provider.
+func (m *InfiniteSegmentProvider) Range() rangedloop.UUIDRange {
+	return rangedloop.UUIDRange{}
+}
 // Iterate allows to loop over the segments stored in the provider.
 func (m *InfiniteSegmentProvider) Iterate(ctx context.Context, fn func([]segmentloop.Segment) error) error {
 	for {

View File

@@ -50,6 +50,11 @@ func (m *RangeSplitter) CreateRanges(nRanges int, batchSize int) ([]rangedloop.S
 	return rangeProviders, nil
 }
+// Range returns range which is processed by this provider.
+func (m *SegmentProvider) Range() rangedloop.UUIDRange {
+	return rangedloop.UUIDRange{}
+}
 // Iterate allows to loop over the segments stored in the provider.
 func (m *SegmentProvider) Iterate(ctx context.Context, fn func([]segmentloop.Segment) error) error {
 	for offset := 0; offset < len(m.Segments); offset += m.batchSize {
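The test provider's Iterate hands segments to the callback in batchSize-sized chunks; the only subtlety is clamping the final chunk so it does not run past the end of the slice. A standalone sketch of that batching loop (generic helper, not code from this repo):

package example

// iterateBatches calls fn with consecutive slices of at most batchSize items.
func iterateBatches[T any](items []T, batchSize int, fn func([]T) error) error {
	if batchSize <= 0 {
		batchSize = 1 // guard against a non-positive batch size
	}
	for offset := 0; offset < len(items); offset += batchSize {
		end := offset + batchSize
		if end > len(items) {
			end = len(items) // clamp the last, possibly shorter, batch
		}
		if err := fn(items[offset:end]); err != nil {
			return err
		}
	}
	return nil
}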

View File

@@ -90,7 +90,10 @@ func (service *Service) Run(ctx context.Context) (err error) {
 	service.log.Info("ranged loop initialized")
 	return service.Loop.Run(ctx, func(ctx context.Context) error {
-		service.log.Info("ranged loop started")
+		service.log.Info("ranged loop started",
+			zap.Int("parallelism", service.config.Parallelism),
+			zap.Int("batchSize", service.config.BatchSize),
+		)
 		_, err := service.RunOnce(ctx)
 		if err != nil {
 			service.log.Error("ranged loop failure", zap.Error(err))
@@ -126,7 +129,10 @@ func (service *Service) RunOnce(ctx context.Context) (observerDurations []Observ
 	}
 	group := errs2.Group{}
-	for _, rangeProvider := range rangeProviders {
+	for index, rangeProvider := range rangeProviders {
+		uuidRange := rangeProvider.Range()
+		service.log.Debug("creating range", zap.Int("index", index), zap.Stringer("start", uuidRange.Start), zap.Stringer("end", uuidRange.End))
 		rangeObservers := []*rangeObserverState{}
 		for i, observerState := range observerStates {
 			if observerState.err != nil {
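For context, RunOnce splits the segment table into ranges and processes each range in its own goroutine before joining results; the diff only adds the per-range debug log. A rough sketch of that fan-out shape, using the standard errgroup package as a stand-in for the errs2.Group used in the real code and with the per-range work reduced to a callback:

package example

import (
	"context"

	"golang.org/x/sync/errgroup"
)

// processRanges runs one worker per range and waits for all of them.
// Each worker would normally Fork a partial per observer, iterate its
// range, and Join the partial back into the observer.
func processRanges(ctx context.Context, ranges []func(context.Context) error) error {
	group, ctx := errgroup.WithContext(ctx)
	for _, processRange := range ranges {
		processRange := processRange // capture loop variable (pre-Go 1.22)
		group.Go(func() error {
			return processRange(ctx)
		})
	}
	return group.Wait() // first error, if any, cancels the shared context
}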

View File

@@ -114,7 +114,9 @@ func (observer *RangedLoopObserver) TestingCompareInjuredSegmentIDs(ctx context.
 }
 // Start starts parallel segments loop.
-func (observer *RangedLoopObserver) Start(ctx context.Context, startTime time.Time) error {
+func (observer *RangedLoopObserver) Start(ctx context.Context, startTime time.Time) (err error) {
+	defer mon.Task()(&ctx)(&err)
 	observer.startTime = startTime
 	observer.TotalStats = aggregateStats{}
@@ -122,13 +124,17 @@ func (observer *RangedLoopObserver) Start(ctx context.Context, startTime time.Ti
 }
 // Fork creates a Partial to process a chunk of all the segments.
-func (observer *RangedLoopObserver) Fork(ctx context.Context) (rangedloop.Partial, error) {
+func (observer *RangedLoopObserver) Fork(ctx context.Context) (_ rangedloop.Partial, err error) {
+	defer mon.Task()(&ctx)(&err)
 	return newRangedLoopCheckerPartial(observer), nil
 }
 // Join is called after the chunk for Partial is done.
 // This gives the opportunity to merge the output like in a reduce step.
-func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) error {
+func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) (err error) {
+	defer mon.Task()(&ctx)(&err)
 	repPartial, ok := partial.(*repairPartial)
 	if !ok {
 		return Error.New("expected partial type %T but got %T", repPartial, partial)
@@ -148,7 +154,9 @@ func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop
 }
 // Finish is called after all segments are processed by all observers.
-func (observer *RangedLoopObserver) Finish(ctx context.Context) error {
+func (observer *RangedLoopObserver) Finish(ctx context.Context) (err error) {
+	defer mon.Task()(&ctx)(&err)
 	// remove all segments which were not seen as unhealthy by this checker iteration
 	healthyDeleted, err := observer.repairQueue.Clean(ctx, observer.startTime)
 	if err != nil {
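The Finish step relies on Clean removing every repair queue entry that this iteration did not re-insert: anything last touched before observer.startTime is treated as healthy again. A hedged, in-memory illustration of that idea (the real Clean runs against the repair queue database and returns the number of deleted rows as healthyDeleted):

package example

import "time"

// queueEntry is a simplified repair queue row with the time it was last
// inserted or updated by the checker.
type queueEntry struct {
	segmentID string
	updatedAt time.Time
}

// cleanBefore drops entries not touched since the given iteration start time
// and returns how many were removed, mirroring the healthyDeleted count.
func cleanBefore(queue []queueEntry, startTime time.Time) (kept []queueEntry, deleted int64) {
	for _, entry := range queue {
		if entry.updatedAt.Before(startTime) {
			deleted++ // not seen as unhealthy during this iteration
			continue
		}
		kept = append(kept, entry)
	}
	return kept, deleted
}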