storj/cmd/satellite/repairer.go
paul cannon 2b59640f18 cmd/satellite: ignore Canceled in exit from repair worker
Firstly, this changes the repair functionality to return Canceled errors
when a repair is canceled during the Get phase. Previously, because we
do not track individual errors per piece, this would just show up as a
failure to download enough pieces to repair the segment, which would
cause the segment to be added to the IrreparableDB, which is entirely
unhelpful.

Then, ignore Canceled errors in the return value of the repair worker.
Apparently, when the worker returns an error, that makes Cobra exit the
program with a nonzero exit code, which causes some piece of our
deployment automation to freak out and page people. And when we ask the
repair worker to shut down, "canceled" errors are what we _expect_, not
an error case.

Change-Id: Ia3eb1c60a8d6ec5d09e7cef55dea523be28e8435
2020-11-17 21:37:59 +00:00

105 lines
2.7 KiB
Go

// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.
package main
import (
"github.com/spf13/cobra"
"github.com/zeebo/errs"
"go.uber.org/zap"
"storj.io/common/context2"
"storj.io/common/errs2"
"storj.io/private/process"
"storj.io/private/version"
"storj.io/storj/pkg/revocation"
"storj.io/storj/satellite"
"storj.io/storj/satellite/metainfo"
"storj.io/storj/satellite/orders"
"storj.io/storj/satellite/satellitedb"
)
func cmdRepairerRun(cmd *cobra.Command, args []string) (err error) {
ctx, _ := process.Ctx(cmd)
log := zap.L()
runCfg.Debug.Address = *process.DebugAddrFlag
identity, err := runCfg.Identity.Load()
if err != nil {
log.Error("Failed to load identity.", zap.Error(err))
return errs.New("Failed to load identity: %+v", err)
}
db, err := satellitedb.Open(ctx, log.Named("db"), runCfg.Database, satellitedb.Options{})
if err != nil {
return errs.New("Error starting master database: %+v", err)
}
defer func() {
err = errs.Combine(err, db.Close())
}()
pointerDB, err := metainfo.OpenStore(ctx, log.Named("pointerdb"), runCfg.Metainfo.DatabaseURL)
if err != nil {
return errs.New("Error creating metainfo database connection: %+v", err)
}
defer func() {
err = errs.Combine(err, pointerDB.Close())
}()
revocationDB, err := revocation.OpenDBFromCfg(ctx, runCfg.Server.Config)
if err != nil {
return errs.New("Error creating revocation database: %+v", err)
}
defer func() {
err = errs.Combine(err, revocationDB.Close())
}()
rollupsWriteCache := orders.NewRollupsWriteCache(log.Named("orders-write-cache"), db.Orders(), runCfg.Orders.FlushBatchSize)
defer func() {
err = errs.Combine(err, rollupsWriteCache.CloseAndFlush(context2.WithoutCancellation(ctx)))
}()
peer, err := satellite.NewRepairer(
log,
identity,
pointerDB,
revocationDB,
db.RepairQueue(),
db.Buckets(),
db.OverlayCache(),
rollupsWriteCache,
db.Irreparable(),
version.Build,
&runCfg.Config,
process.AtomicLevel(cmd),
)
if err != nil {
return err
}
_, err = peer.Version.Service.CheckVersion(ctx)
if err != nil {
return err
}
if err := process.InitMetricsWithHostname(ctx, log, nil); err != nil {
log.Warn("Failed to initialize telemetry batcher on repairer", zap.Error(err))
}
err = pointerDB.MigrateToLatest(ctx)
if err != nil {
return errs.New("Error creating tables for metainfo database: %+v", err)
}
err = db.CheckVersion(ctx)
if err != nil {
log.Error("Failed satellite database version check.", zap.Error(err))
return errs.New("Error checking version for satellitedb: %+v", err)
}
runError := peer.Run(ctx)
closeError := peer.Close()
return errs2.IgnoreCanceled(errs.Combine(runError, closeError))
}