satellite/metabase/segmentloop: don't shutdown satellite on loop error

We decided to avoid shutting down the satellite when the segment loop
returns an error. The loop can still return an error, but it will be
logged and we will add monitoring/alerting around that error.
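
In practice the change boils down to the pattern below. This is a minimal
standalone sketch with assumed names (runForever and its parameters are
illustrative, not satellite code); the actual change is in the Run hunk
further down.

package segmentloopsketch

import (
	"context"

	"github.com/spacemonkeygo/monkit/v3"
	"go.uber.org/zap"

	"storj.io/common/errs2"
)

var mon = monkit.Package()

// runForever keeps iterating even when a single pass fails: the error is
// logged and counted, and only a cancellation (shutdown) stops the loop.
func runForever(ctx context.Context, log *zap.Logger, runOnce func(context.Context) error) error {
	for {
		if err := runOnce(ctx); err != nil {
			log.Error("segment loop failure", zap.Error(err))
			if errs2.IsCanceled(err) {
				return err // satellite is shutting down; propagate the error
			}
			mon.Event("segmentloop_error") // feeds the monitoring/alerting mentioned above
		}
	}
}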

Change-Id: I6aa8e284406edf644a09d6b1fe00c3155c5430c9
Michał Niewrzał 2021-07-29 10:28:23 +02:00 committed by Michal Niewrzal
parent 615aae6bdd
commit 55d7bcc59b
6 changed files with 19 additions and 5 deletions

@@ -42,7 +42,7 @@ func New(log *zap.Logger, mdb segmentloop.MetabaseDB, config Config) *Chore {
 // RunOnce creates a new segmentloop and runs the verifications.
 func (chore *Chore) RunOnce(ctx context.Context) error {
-	loop := segmentloop.New(chore.Config.Loop, chore.DB)
+	loop := segmentloop.New(chore.Log, chore.Config.Loop, chore.DB)
 	var group errs2.Group
 	group.Go(func() error {

@@ -55,6 +55,7 @@ storj.io/storj/satellite/gracefulexit."graceful_exit_success" Meter
 storj.io/storj/satellite/gracefulexit."graceful_exit_successful_pieces_transfer_ratio" IntVal
 storj.io/storj/satellite/gracefulexit."graceful_exit_transfer_piece_fail" Meter
 storj.io/storj/satellite/gracefulexit."graceful_exit_transfer_piece_success" Meter
+storj.io/storj/satellite/metabase/segmentloop."segmentloop_error" Event
 storj.io/storj/satellite/metabase/segmentloop."segmentsProcessed" IntVal
 storj.io/storj/satellite/metabase/segmentloop.*Service.RunOnce Task
 storj.io/storj/satellite/metainfo."metainfo_rate_limit_exceeded" Event

@@ -252,6 +252,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB,
 			peer.Metainfo.Metabase,
 		)
 		peer.Metainfo.SegmentLoop = segmentloop.New(
+			peer.Log.Named("metainfo:segmentloop"),
 			config.Metainfo.SegmentLoop,
 			peer.Metainfo.Metabase,
 		)

@@ -133,6 +133,7 @@ func NewGarbageCollection(log *zap.Logger, full *identity.FullIdentity, db DB,
 		// As long as garbage collection is the only observer joining the loop, then by default
 		// the loop will only run when the garbage collection joins (which happens every GarbageCollection.Interval)
 		peer.Metainfo.SegmentLoop = segmentloop.New(
+			log.Named("segmentloop"),
 			config.Metainfo.SegmentLoop,
 			metabaseDB,
 		)

@@ -11,8 +11,10 @@ import (
 	"github.com/spacemonkeygo/monkit/v3"
 	"github.com/zeebo/errs"
+	"go.uber.org/zap"
 	"golang.org/x/time/rate"

+	"storj.io/common/errs2"
 	"storj.io/storj/satellite/metabase"
 )
@@ -160,6 +162,7 @@ type MetabaseDB interface {
 //
 // architecture: Service
 type Service struct {
+	log        *zap.Logger
 	config     Config
 	metabaseDB MetabaseDB
 	join       chan *observerContext
@@ -167,8 +170,9 @@ type Service struct {
 }

 // New creates a new segments loop service.
-func New(config Config, metabaseDB MetabaseDB) *Service {
+func New(log *zap.Logger, config Config, metabaseDB MetabaseDB) *Service {
 	return &Service{
+		log:        log,
 		metabaseDB: metabaseDB,
 		config:     config,
 		join:       make(chan *observerContext),
@@ -223,7 +227,13 @@ func (loop *Service) Run(ctx context.Context) (err error) {
 	for {
 		err := loop.RunOnce(ctx)
 		if err != nil {
-			return err
+			loop.log.Error("segment loop failure", zap.Error(err))
+
+			if errs2.IsCanceled(err) {
+				return err
+			}
+
+			mon.Event("segmentloop_error") //mon:locked
 		}
 	}
 }
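
For context on the errs2.IsCanceled check above: it reports whether an error
stems from context cancellation, which is how a deliberate shutdown is told
apart from an ordinary failure that should only be logged and counted. A small
illustration (the error message is made up):

package main

import (
	"context"
	"errors"
	"fmt"

	"storj.io/common/errs2"
)

func main() {
	// A canceled context is the "satellite is shutting down" case: Run returns the error.
	ctx, cancel := context.WithCancel(context.Background())
	cancel()
	fmt.Println(errs2.IsCanceled(ctx.Err())) // true

	// Any other failure (for example a transient DB error) is logged, counted via the
	// "segmentloop_error" event, and the loop continues with the next iteration.
	fmt.Println(errs2.IsCanceled(errors.New("db: connection reset"))) // false
}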

@@ -15,6 +15,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	"go.uber.org/zap"
+	"go.uber.org/zap/zaptest"
 	"golang.org/x/sync/errgroup"

 	"storj.io/common/errs2"
@@ -250,7 +251,7 @@ func TestSegmentsLoopCancel(t *testing.T) {
 			require.NoError(t, err)
 		}

-		loop := segmentloop.New(segmentloop.Config{
+		loop := segmentloop.New(zaptest.NewLogger(t), segmentloop.Config{
 			CoalesceDuration: 1 * time.Second,
 			ListLimit:        10000,
 		}, satellite.Metainfo.Metabase)
@@ -322,7 +323,7 @@ func TestSegmentsLoop_MonitorCancel(t *testing.T) {
 	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
 		satellite := planet.Satellites[0]

-		loop := segmentloop.New(segmentloop.Config{
+		loop := segmentloop.New(zaptest.NewLogger(t), segmentloop.Config{
 			CoalesceDuration: time.Nanosecond,
 			ListLimit:        10000,
 		}, satellite.Metainfo.Metabase)