// Copyright (C) 2019 Storj Labs, Inc.
// See LICENSE for copying information.

package main

import (
	"github.com/spf13/cobra"
	"github.com/zeebo/errs"
	"go.uber.org/zap"

	"storj.io/common/context2"
	"storj.io/common/errs2"
	"storj.io/private/process"
	"storj.io/private/version"
	"storj.io/storj/pkg/revocation"
	"storj.io/storj/satellite"
	"storj.io/storj/satellite/metainfo"
	"storj.io/storj/satellite/orders"
	"storj.io/storj/satellite/satellitedb"
)

func cmdRepairerRun(cmd *cobra.Command, args []string) (err error) {
	ctx, _ := process.Ctx(cmd)
	log := zap.L()

	runCfg.Debug.Address = *process.DebugAddrFlag
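
	// Load the identity for this repairer process.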
	identity, err := runCfg.Identity.Load()
	if err != nil {
		log.Error("Failed to load identity.", zap.Error(err))
		return errs.New("Failed to load identity: %+v", err)
	}
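
	// Open the satellite's master database and ensure it is closed on exit.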
	db, err := satellitedb.Open(ctx, log.Named("db"), runCfg.Database, satellitedb.Options{ApplicationName: "satellite-repairer"})
	if err != nil {
		return errs.New("Error starting master database: %+v", err)
	}
	defer func() {
		err = errs.Combine(err, db.Close())
	}()
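
	// Open the metainfo (pointer) database.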
	pointerDB, err := metainfo.OpenStore(ctx, log.Named("pointerdb"), runCfg.Metainfo.DatabaseURL, "satellite-repairer")
	if err != nil {
		return errs.New("Error creating metainfo database connection: %+v", err)
	}
	defer func() {
		err = errs.Combine(err, pointerDB.Close())
	}()
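
	// Open the metabase, using the same database URL as the metainfo store.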
	metabaseDB, err := metainfo.OpenMetabase(ctx, log.Named("metabase"), runCfg.Metainfo.DatabaseURL)
	if err != nil {
		return errs.New("Error creating metabase connection: %+v", err)
	}
	defer func() {
		err = errs.Combine(err, metabaseDB.Close())
	}()
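
	// Open the certificate revocation database from the server configuration.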
	revocationDB, err := revocation.OpenDBFromCfg(ctx, runCfg.Server.Config)
	if err != nil {
		return errs.New("Error creating revocation database: %+v", err)
	}
	defer func() {
		err = errs.Combine(err, revocationDB.Close())
	}()
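
	// Buffer order rollups in memory; on shutdown, flush them using a context
	// that survives cancellation of ctx.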
	rollupsWriteCache := orders.NewRollupsWriteCache(log.Named("orders-write-cache"), db.Orders(), runCfg.Orders.FlushBatchSize)
	defer func() {
		err = errs.Combine(err, rollupsWriteCache.CloseAndFlush(context2.WithoutCancellation(ctx)))
	}()
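
	// Assemble the repairer peer from the databases, caches, and configuration opened above.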
	peer, err := satellite.NewRepairer(
		log,
		identity,
		pointerDB,
		metabaseDB,
		revocationDB,
		db.RepairQueue(),
		db.Buckets(),
		db.OverlayCache(),
		rollupsWriteCache,
		db.Irreparable(),
		version.Build,
		&runCfg.Config,
		process.AtomicLevel(cmd),
	)
	if err != nil {
		return err
	}
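
	// Check this binary's version with the version service before running.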
	_, err = peer.Version.Service.CheckVersion(ctx)
	if err != nil {
		return err
	}
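
	// Initialize telemetry; failure here is logged as a warning and is not fatal.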
	if err := process.InitMetricsWithHostname(ctx, log, nil); err != nil {
		log.Warn("Failed to initialize telemetry batcher on repairer", zap.Error(err))
	}
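
	// Migrate the metainfo database schema to the latest version.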
	err = pointerDB.MigrateToLatest(ctx)
	if err != nil {
		return errs.New("Error creating tables for metainfo database: %+v", err)
	}
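
	// Migrate the metabase schema to the latest version.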
	err = metabaseDB.MigrateToLatest(ctx)
	if err != nil {
		return errs.New("Error creating tables for metabase: %+v", err)
	}
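
	// Verify the satellite database schema version before running.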
	err = db.CheckVersion(ctx)
	if err != nil {
		log.Error("Failed satellite database version check.", zap.Error(err))
		return errs.New("Error checking version for satellitedb: %+v", err)
	}
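
	// Run the repair worker until it is told to stop, then close the peer.
	// Cancellation is the expected way to shut the worker down, so Canceled
	// errors are ignored here: returning them would make Cobra exit with a
	// nonzero code and needlessly trip deployment alerting.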
	runError := peer.Run(ctx)
	closeError := peer.Close()
	return errs2.IgnoreCanceled(errs.Combine(runError, closeError))
}