storj/satellite/metabase/rangedloop/service.go
Michal Niewrzal aba2f14595 satellite/metabase/rangedloop: few additions for monitoring
Additional elements added:
* monkit metrics for observer methods like Start/Fork/Join/Finish, to
be able to check how much time those methods take
* few more logs e.g. entries with processed range
* segmentsProcessed metric to be able to check loop progress

Change-Id: I65dd51f7f5c4bdbb4014fbf04e5b6b10bdb035ec
2023-02-17 08:46:00 +00:00

294 lines
8.1 KiB
Go

// Copyright (C) 2022 Storj Labs, Inc.
// See LICENSE for copying information.
package rangedloop
import (
"context"
"fmt"
"time"
"github.com/spacemonkeygo/monkit/v3"
"github.com/zeebo/errs"
"go.uber.org/zap"
"storj.io/common/errs2"
"storj.io/common/sync2"
"storj.io/storj/satellite/metabase/segmentloop"
)
// mon is the package-level monkit handle; the tasks and events recorded by the
// ranged loop in this file go through it.
var mon = monkit.Package()
// Config holds the tunable settings of the shared ranged loop.
type Config struct {
	// Parallelism is handed to the range splitter when the ranges are created.
	Parallelism int `help:"how many chunks of segments to process in parallel" default:"2"`

	// BatchSize is handed to the range splitter together with Parallelism.
	BatchSize int `help:"how many items to query in a batch" default:"2500"`

	// AsOfSystemInterval is not read by the code in this section.
	// NOTE(review): presumably consumed by the segment provider - confirm.
	AsOfSystemInterval time.Duration `help:"as of system interval" releaseDefault:"-5m" devDefault:"-1us" testDefault:"-1us"`

	// Interval is the period given to the service's loop cycle.
	Interval time.Duration `help:"how often to run the loop" releaseDefault:"2h" devDefault:"10s" testDefault:"10s"`
}
// Service walks over all segments and hands every segment to each of the
// attached observers.
//
// architecture: Service
type Service struct {
	// log receives the progress and failure messages of the loop.
	log *zap.Logger
	// config carries the parallelism, batch size and interval settings.
	config Config
	// provider creates the ranges that are iterated during a run.
	provider RangeSplitter
	// observers are started, forked per range, joined and finished on every run.
	observers []Observer

	// Loop is the cycle that drives repeated runs of the service.
	Loop *sync2.Cycle
}
// NewService builds a ranged loop service from the given logger,
// configuration, range splitter and observers. The Loop cycle of the returned
// service is created with config.Interval.
func NewService(log *zap.Logger, config Config, provider RangeSplitter, observers []Observer) *Service {
	service := new(Service)
	service.log = log
	service.config = config
	service.provider = provider
	service.observers = observers
	service.Loop = sync2.NewCycle(config.Interval)
	return service
}
// observerState is the per-iteration bookkeeping kept for a single observer.
type observerState struct {
	// observer is the observer being tracked.
	observer Observer
	// rangeObservers collects the state of every partial forked from observer,
	// one per range.
	rangeObservers []*rangeObserverState

	// err holds the error returned by the observer's Start method, if any.
	// An observer with err set is skipped for the rest of the loop iteration.
	err error
}
// rangeObserverState is the bookkeeping kept for one forked partial observer
// working on a single range.
type rangeObserverState struct {
	// rangeObserver is the partial returned by the observer's Fork method.
	rangeObserver Partial
	// duration accumulates the time spent inside Process for this partial.
	duration time.Duration

	// err holds the error returned by the observer's Fork or Process method.
	// A range observer with err set is skipped for the rest of the loop iteration.
	err error
}
// ObserverDuration tells how long an observer needed to process all the
// segments of one loop run.
type ObserverDuration struct {
	// Observer is the observer the measurement belongs to.
	Observer Observer

	// Duration is the summed processing time of the observer. It is set to
	// -1 second when the observer has errored out, so that someone watching
	// the metrics can tell that something went wrong.
	Duration time.Duration
}
// Close shuts down the loop cycle of the ranged loop. It always returns nil.
func (service *Service) Close() error {
	cycle := service.Loop
	cycle.Close()
	return nil
}
// Run starts the looping service and executes RunOnce on every cycle of Loop.
//
// A failing RunOnce is always logged. If the failure is a cancellation, or the
// context is already done, the error is returned to the cycle. Any other
// failure is recorded as a "rangedloop_error" event and nil is returned to the
// cycle.
func (service *Service) Run(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	service.log.Info("ranged loop initialized")

	iteration := func(ctx context.Context) error {
		service.log.Info("ranged loop started",
			zap.Int("parallelism", service.config.Parallelism),
			zap.Int("batchSize", service.config.BatchSize),
		)

		if _, runErr := service.RunOnce(ctx); runErr != nil {
			service.log.Error("ranged loop failure", zap.Error(runErr))

			switch {
			case errs2.IsCanceled(runErr):
				return runErr
			case ctx.Err() != nil:
				return errs.Combine(runErr, ctx.Err())
			}

			mon.Event("rangedloop_error") //mon:locked
		}

		service.log.Info("ranged loop finished")
		return nil
	}

	return service.Loop.Run(ctx, iteration)
}
// RunOnce performs a single pass over all ranges and feeds the segments to the
// observers.
//
// It starts every observer, asks the provider for the ranges, forks each
// healthy observer once per range, and iterates all ranges through one
// goroutine each. When every range is done without error, the observers are
// joined and finished and their durations are returned. An error from creating
// the ranges or from iterating any range aborts the run.
func (service *Service) RunOnce(ctx context.Context) (observerDurations []ObserverDuration, err error) {
	defer mon.Task()(&ctx)(&err)

	observerStates, err := startObservers(ctx, service.log, service.observers)
	if err != nil {
		return nil, err
	}

	rangeProviders, err := service.provider.CreateRanges(service.config.Parallelism, service.config.BatchSize)
	if err != nil {
		return nil, err
	}

	var group errs2.Group
	for index, rangeProvider := range rangeProviders {
		uuidRange := rangeProvider.Range()
		service.log.Debug("creating range",
			zap.Int("index", index),
			zap.Stringer("start", uuidRange.Start),
			zap.Stringer("end", uuidRange.End),
		)

		forked := make([]*rangeObserverState, 0, len(observerStates))
		for i := range observerStates {
			parent := &observerStates[i]
			if parent.err != nil {
				// the observer failed to start, leave it out of this run
				continue
			}

			partial, forkErr := parent.observer.Fork(ctx)
			rangeState := &rangeObserverState{
				rangeObserver: partial,
				err:           forkErr,
			}

			// the same state is shared between the range and its parent observer
			forked = append(forked, rangeState)
			parent.rangeObservers = append(parent.rangeObservers, rangeState)
		}

		// The helper builds the closure so that it captures this iteration's values.
		group.Go(createGoroutineClosure(ctx, rangeProvider, forked))
	}

	// Improvement: stop all ranges when one has an error.
	if errList := group.Wait(); errList != nil {
		return nil, errs.Combine(errList...)
	}

	return finishObservers(ctx, service.log, observerStates)
}
// createGoroutineClosure returns the function that iterates one range: every
// batch of segments produced by rangeProvider is passed to processBatch with
// the given range observer states. Iteration stops with the context's error
// as soon as the context is done.
func createGoroutineClosure(ctx context.Context, rangeProvider SegmentProvider, states []*rangeObserverState) func() error {
	return func() (err error) {
		defer mon.Task()(&ctx)(&err)

		handleBatch := func(segments []segmentloop.Segment) error {
			// check for cancellation once per segment batch
			select {
			case <-ctx.Done():
				return ctx.Err()
			default:
			}
			return processBatch(ctx, states, segments)
		}

		return rangeProvider.Iterate(ctx, handleBatch)
	}
}
// startObservers starts every observer with one shared start time and returns
// a state entry per observer, in the same order. The returned error is always
// nil; a failure to start is stored in the observer's own state.
func startObservers(ctx context.Context, log *zap.Logger, observers []Observer) (observerStates []observerState, err error) {
	startTime := time.Now()

	for i := range observers {
		state := startObserver(ctx, log, startTime, observers[i])
		observerStates = append(observerStates, state)
	}

	return observerStates, nil
}
// startObserver calls Start on the observer and wraps the outcome in an
// observerState. A Start error is logged and kept in the state's err field.
func startObserver(ctx context.Context, log *zap.Logger, startTime time.Time, observer Observer) observerState {
	state := observerState{observer: observer}

	state.err = observer.Start(ctx, startTime)
	if state.err != nil {
		log.Error(
			"Starting observer failed. This observer will be excluded from this run of the ranged segment loop.",
			zap.String("observer", fmt.Sprintf("%T", observer)),
			zap.Error(state.err),
		)
	}

	return state
}
// finishObservers finalizes every observer state in order, reports the
// collected durations through sendObserverDurations and returns them. The
// returned error is always nil.
func finishObservers(ctx context.Context, log *zap.Logger, observerStates []observerState) (observerDurations []ObserverDuration, err error) {
	for i := range observerStates {
		result := finishObserver(ctx, log, observerStates[i])
		observerDurations = append(observerDurations, result)
	}

	sendObserverDurations(observerDurations)

	return observerDurations, nil
}
// finishObserver is the reduce step that runs once iterating over the
// segments is done.
//
// For an observer that started and whose range observers all succeeded, every
// range observer is joined back into the observer and Finish is called. The
// result then carries the summed processing time of the joined range
// observers.
//
// If the observer failed to start, any range observer recorded an error, or
// Join or Finish fails, the observer is not finalized any further and the
// result carries a Duration of -1 second. Except for the start failure, the
// cause is logged here.
func finishObserver(ctx context.Context, log *zap.Logger, state observerState) ObserverDuration {
	// failed is the result reported whenever the observer cannot be finalized.
	failed := ObserverDuration{
		Observer: state.observer,
		Duration: -1 * time.Second,
	}

	if state.err != nil {
		return failed
	}

	for _, rangeObserver := range state.rangeObservers {
		if rangeObserver.err != nil {
			log.Error(
				"Observer failed during Process(), it will not be finalized in this run of the ranged segment loop",
				zap.String("observer", fmt.Sprintf("%T", state.observer)),
				zap.Error(rangeObserver.err),
			)
			return failed
		}
	}

	var duration time.Duration
	for _, rangeObserver := range state.rangeObservers {
		err := state.observer.Join(ctx, rangeObserver.rangeObserver)
		if err != nil {
			log.Error(
				"Observer failed during Join(), it will not be finalized in this run of the ranged segment loop",
				zap.String("observer", fmt.Sprintf("%T", state.observer)),
				// Log the Join error itself: rangeObserver.err is known to be
				// nil at this point, so logging it would hide the cause.
				zap.Error(err),
			)
			return failed
		}
		duration += rangeObserver.duration
	}

	err := state.observer.Finish(ctx)
	if err != nil {
		log.Error(
			"Observer failed during Finish()",
			zap.String("observer", fmt.Sprintf("%T", state.observer)),
			zap.Error(err),
		)
		return failed
	}

	return ObserverDuration{
		Duration: duration,
		Observer: state.observer,
	}
}
// processBatch passes one batch of segments to every range observer that has
// not failed yet and adds the time spent in Process to that observer's
// duration. A Process error is stored on the range observer's state, which
// excludes it from later batches. Only a cancellation error is returned to
// the caller, and it is returned immediately.
func processBatch(ctx context.Context, states []*rangeObserverState, segments []segmentloop.Segment) (err error) {
	for i := range states {
		current := states[i]
		if current.err != nil {
			// this observer has already errored, in Fork or in an earlier batch
			continue
		}

		began := time.Now()
		processErr := current.rangeObserver.Process(ctx, segments)
		current.duration += time.Since(began)
		if processErr == nil {
			continue
		}

		// a cancellation stops the whole batch instead of being recorded
		// against the observer; unsure if this is necessary here
		if errs2.IsCanceled(processErr) {
			return processErr
		}
		current.err = processErr
	}

	return nil
}