2b59640f18
Firstly, this changes the repair functionality to return Canceled errors when a repair is canceled during the Get phase. Previously, because we do not track individual errors per piece, this would just show up as a failure to download enough pieces to repair the segment, which would cause the segment to be added to the IrreparableDB, which is entirely unhelpful. Then, ignore Canceled errors in the return value of the repair worker. Apparently, when the worker returns an error, that makes Cobra exit the program with a nonzero exit code, which causes some piece of our deployment automation to freak out and page people. And when we ask the repair worker to shut down, "canceled" errors are what we _expect_, not an error case. Change-Id: Ia3eb1c60a8d6ec5d09e7cef55dea523be28e8435
105 lines
2.7 KiB
Go
105 lines
2.7 KiB
Go
// Copyright (C) 2019 Storj Labs, Inc.
|
|
// See LICENSE for copying information.
|
|
|
|
package main
|
|
|
|
import (
|
|
"github.com/spf13/cobra"
|
|
"github.com/zeebo/errs"
|
|
"go.uber.org/zap"
|
|
|
|
"storj.io/common/context2"
|
|
"storj.io/common/errs2"
|
|
"storj.io/private/process"
|
|
"storj.io/private/version"
|
|
"storj.io/storj/pkg/revocation"
|
|
"storj.io/storj/satellite"
|
|
"storj.io/storj/satellite/metainfo"
|
|
"storj.io/storj/satellite/orders"
|
|
"storj.io/storj/satellite/satellitedb"
|
|
)
|
|
|
|
func cmdRepairerRun(cmd *cobra.Command, args []string) (err error) {
|
|
ctx, _ := process.Ctx(cmd)
|
|
log := zap.L()
|
|
|
|
runCfg.Debug.Address = *process.DebugAddrFlag
|
|
|
|
identity, err := runCfg.Identity.Load()
|
|
if err != nil {
|
|
log.Error("Failed to load identity.", zap.Error(err))
|
|
return errs.New("Failed to load identity: %+v", err)
|
|
}
|
|
|
|
db, err := satellitedb.Open(ctx, log.Named("db"), runCfg.Database, satellitedb.Options{})
|
|
if err != nil {
|
|
return errs.New("Error starting master database: %+v", err)
|
|
}
|
|
defer func() {
|
|
err = errs.Combine(err, db.Close())
|
|
}()
|
|
|
|
pointerDB, err := metainfo.OpenStore(ctx, log.Named("pointerdb"), runCfg.Metainfo.DatabaseURL)
|
|
if err != nil {
|
|
return errs.New("Error creating metainfo database connection: %+v", err)
|
|
}
|
|
defer func() {
|
|
err = errs.Combine(err, pointerDB.Close())
|
|
}()
|
|
|
|
revocationDB, err := revocation.OpenDBFromCfg(ctx, runCfg.Server.Config)
|
|
if err != nil {
|
|
return errs.New("Error creating revocation database: %+v", err)
|
|
}
|
|
defer func() {
|
|
err = errs.Combine(err, revocationDB.Close())
|
|
}()
|
|
|
|
rollupsWriteCache := orders.NewRollupsWriteCache(log.Named("orders-write-cache"), db.Orders(), runCfg.Orders.FlushBatchSize)
|
|
defer func() {
|
|
err = errs.Combine(err, rollupsWriteCache.CloseAndFlush(context2.WithoutCancellation(ctx)))
|
|
}()
|
|
|
|
peer, err := satellite.NewRepairer(
|
|
log,
|
|
identity,
|
|
pointerDB,
|
|
revocationDB,
|
|
db.RepairQueue(),
|
|
db.Buckets(),
|
|
db.OverlayCache(),
|
|
rollupsWriteCache,
|
|
db.Irreparable(),
|
|
version.Build,
|
|
&runCfg.Config,
|
|
process.AtomicLevel(cmd),
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
_, err = peer.Version.Service.CheckVersion(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := process.InitMetricsWithHostname(ctx, log, nil); err != nil {
|
|
log.Warn("Failed to initialize telemetry batcher on repairer", zap.Error(err))
|
|
}
|
|
|
|
err = pointerDB.MigrateToLatest(ctx)
|
|
if err != nil {
|
|
return errs.New("Error creating tables for metainfo database: %+v", err)
|
|
}
|
|
|
|
err = db.CheckVersion(ctx)
|
|
if err != nil {
|
|
log.Error("Failed satellite database version check.", zap.Error(err))
|
|
return errs.New("Error checking version for satellitedb: %+v", err)
|
|
}
|
|
|
|
runError := peer.Run(ctx)
|
|
closeError := peer.Close()
|
|
return errs2.IgnoreCanceled(errs.Combine(runError, closeError))
|
|
}
|