From 55d7bcc59b3a748dafba6800f47bb7672fb9bfa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Niewrza=C5=82?= Date: Thu, 29 Jul 2021 10:28:23 +0200 Subject: [PATCH] satellite/metabase/segmentloop: don't shutdown satellite on loop error We made a decision to avoid satellite shutdown when the segment loop returns an error. The loop can still return an error, but it will be logged and we will add monitoring/alerts around that error. Change-Id: I6aa8e284406edf644a09d6b1fe00c3155c5430c9 --- cmd/metabase-verify/verify/verify.go | 2 +- monkit.lock | 1 + satellite/core.go | 1 + satellite/gc.go | 1 + satellite/metabase/segmentloop/service.go | 14 ++++++++++++-- satellite/metabase/segmentloop/service_test.go | 5 +++-- 6 files changed, 19 insertions(+), 5 deletions(-) diff --git a/cmd/metabase-verify/verify/verify.go b/cmd/metabase-verify/verify/verify.go index 017bfde50..13f4a1395 100644 --- a/cmd/metabase-verify/verify/verify.go +++ b/cmd/metabase-verify/verify/verify.go @@ -42,7 +42,7 @@ func New(log *zap.Logger, mdb segmentloop.MetabaseDB, config Config) *Chore { // RunOnce creates a new segmentloop and runs the verifications. 
func (chore *Chore) RunOnce(ctx context.Context) error { - loop := segmentloop.New(chore.Config.Loop, chore.DB) + loop := segmentloop.New(chore.Log, chore.Config.Loop, chore.DB) var group errs2.Group group.Go(func() error { diff --git a/monkit.lock b/monkit.lock index 36a4f6def..541d93472 100644 --- a/monkit.lock +++ b/monkit.lock @@ -55,6 +55,7 @@ storj.io/storj/satellite/gracefulexit."graceful_exit_success" Meter storj.io/storj/satellite/gracefulexit."graceful_exit_successful_pieces_transfer_ratio" IntVal storj.io/storj/satellite/gracefulexit."graceful_exit_transfer_piece_fail" Meter storj.io/storj/satellite/gracefulexit."graceful_exit_transfer_piece_success" Meter +storj.io/storj/satellite/metabase/segmentloop."segmentloop_error" Event storj.io/storj/satellite/metabase/segmentloop."segmentsProcessed" IntVal storj.io/storj/satellite/metabase/segmentloop.*Service.RunOnce Task storj.io/storj/satellite/metainfo."metainfo_rate_limit_exceeded" Event diff --git a/satellite/core.go b/satellite/core.go index 865b2190e..03b30d779 100644 --- a/satellite/core.go +++ b/satellite/core.go @@ -252,6 +252,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB, peer.Metainfo.Metabase, ) peer.Metainfo.SegmentLoop = segmentloop.New( + peer.Log.Named("metainfo:segmentloop"), config.Metainfo.SegmentLoop, peer.Metainfo.Metabase, ) diff --git a/satellite/gc.go b/satellite/gc.go index f19abb91b..1b7e0d1c9 100644 --- a/satellite/gc.go +++ b/satellite/gc.go @@ -133,6 +133,7 @@ func NewGarbageCollection(log *zap.Logger, full *identity.FullIdentity, db DB, // As long as garbage collection is the only observer joining the loop, then by default // the loop will only run when the garbage collection joins (which happens every GarbageCollection.Interval) peer.Metainfo.SegmentLoop = segmentloop.New( + log.Named("segmentloop"), config.Metainfo.SegmentLoop, metabaseDB, ) diff --git a/satellite/metabase/segmentloop/service.go b/satellite/metabase/segmentloop/service.go index 
4cd990a99..2eb7d4f2d 100644 --- a/satellite/metabase/segmentloop/service.go +++ b/satellite/metabase/segmentloop/service.go @@ -11,8 +11,10 @@ import ( "github.com/spacemonkeygo/monkit/v3" "github.com/zeebo/errs" + "go.uber.org/zap" "golang.org/x/time/rate" + "storj.io/common/errs2" "storj.io/storj/satellite/metabase" ) @@ -160,6 +162,7 @@ type MetabaseDB interface { // // architecture: Service type Service struct { + log *zap.Logger config Config metabaseDB MetabaseDB join chan *observerContext @@ -167,8 +170,9 @@ type Service struct { } // New creates a new segments loop service. -func New(config Config, metabaseDB MetabaseDB) *Service { +func New(log *zap.Logger, config Config, metabaseDB MetabaseDB) *Service { return &Service{ + log: log, metabaseDB: metabaseDB, config: config, join: make(chan *observerContext), @@ -223,7 +227,13 @@ func (loop *Service) Run(ctx context.Context) (err error) { for { err := loop.RunOnce(ctx) if err != nil { - return err + loop.log.Error("segment loop failure", zap.Error(err)) + + if errs2.IsCanceled(err) { + return err + } + + mon.Event("segmentloop_error") //mon:locked } } } diff --git a/satellite/metabase/segmentloop/service_test.go b/satellite/metabase/segmentloop/service_test.go index 259316bcb..7d1c7fab8 100644 --- a/satellite/metabase/segmentloop/service_test.go +++ b/satellite/metabase/segmentloop/service_test.go @@ -15,6 +15,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/zap" + "go.uber.org/zap/zaptest" "golang.org/x/sync/errgroup" "storj.io/common/errs2" @@ -250,7 +251,7 @@ func TestSegmentsLoopCancel(t *testing.T) { require.NoError(t, err) } - loop := segmentloop.New(segmentloop.Config{ + loop := segmentloop.New(zaptest.NewLogger(t), segmentloop.Config{ CoalesceDuration: 1 * time.Second, ListLimit: 10000, }, satellite.Metainfo.Metabase) @@ -322,7 +323,7 @@ func TestSegmentsLoop_MonitorCancel(t *testing.T) { }, func(t *testing.T, ctx *testcontext.Context, planet 
*testplanet.Planet) { satellite := planet.Satellites[0] - loop := segmentloop.New(segmentloop.Config{ + loop := segmentloop.New(zaptest.NewLogger(t), segmentloop.Config{ CoalesceDuration: time.Nanosecond, ListLimit: 10000, }, satellite.Metainfo.Metabase)