2019-01-24 20:15:10 +00:00
|
|
|
// Copyright (C) 2019 Storj Labs, Inc.
|
2018-10-02 20:46:29 +01:00
|
|
|
// See LICENSE for copying information.
|
|
|
|
|
|
|
|
package checker
|
2018-10-03 19:35:56 +01:00
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2018-10-30 19:16:40 +00:00
|
|
|
"time"
|
2018-10-04 22:40:34 +01:00
|
|
|
|
2019-11-08 20:40:39 +00:00
|
|
|
"github.com/spacemonkeygo/monkit/v3"
|
2019-01-23 19:58:44 +00:00
|
|
|
"github.com/zeebo/errs"
|
2018-10-04 22:40:34 +01:00
|
|
|
"go.uber.org/zap"
|
2019-05-31 15:12:49 +01:00
|
|
|
"golang.org/x/sync/errgroup"
|
2018-10-04 22:40:34 +01:00
|
|
|
|
2019-12-27 11:48:47 +00:00
|
|
|
"storj.io/common/errs2"
|
|
|
|
"storj.io/common/pb"
|
2020-11-20 22:20:03 +00:00
|
|
|
"storj.io/common/storj"
|
2019-12-27 11:48:47 +00:00
|
|
|
"storj.io/common/sync2"
|
2020-10-30 10:41:22 +00:00
|
|
|
"storj.io/storj/satellite/internalpb"
|
2019-04-25 09:46:32 +01:00
|
|
|
"storj.io/storj/satellite/metainfo"
|
2020-08-31 11:14:20 +01:00
|
|
|
"storj.io/storj/satellite/metainfo/metabase"
|
2019-07-28 06:55:36 +01:00
|
|
|
"storj.io/storj/satellite/overlay"
|
2020-10-21 23:02:54 +01:00
|
|
|
"storj.io/storj/satellite/repair"
|
2019-07-28 06:55:36 +01:00
|
|
|
"storj.io/storj/satellite/repair/irreparable"
|
|
|
|
"storj.io/storj/satellite/repair/queue"
|
2018-10-03 19:35:56 +01:00
|
|
|
)
|
|
|
|
|
2019-01-23 19:58:44 +00:00
|
|
|
// Error is a standard error class for this package.
|
|
|
|
var (
|
|
|
|
Error = errs.Class("checker error")
|
|
|
|
mon = monkit.Package()
|
|
|
|
)
|
|
|
|
|
2020-12-05 16:01:42 +00:00
|
|
|
// Checker contains the information needed to do checks for missing pieces.
|
2019-09-10 14:24:16 +01:00
|
|
|
//
|
|
|
|
// architecture: Chore
|
2019-02-11 21:06:39 +00:00
|
|
|
type Checker struct {
|
2019-08-01 19:44:32 +01:00
|
|
|
logger *zap.Logger
|
2019-05-22 22:17:52 +01:00
|
|
|
repairQueue queue.RepairQueue
|
|
|
|
irrdb irreparable.DB
|
2019-08-01 19:44:32 +01:00
|
|
|
metaLoop *metainfo.Loop
|
|
|
|
nodestate *ReliabilityCache
|
2020-11-20 22:20:03 +00:00
|
|
|
statsCollector *statsCollector
|
2020-10-27 18:26:46 +00:00
|
|
|
repairOverrides RepairOverridesMap
|
2020-10-21 23:02:54 +01:00
|
|
|
nodeFailureRate float64
|
2020-01-30 13:06:43 +00:00
|
|
|
Loop *sync2.Cycle
|
|
|
|
IrreparableLoop *sync2.Cycle
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
|
|
|
|
2020-07-16 15:18:02 +01:00
|
|
|
// NewChecker creates a new instance of checker.
|
2020-12-14 14:29:48 +00:00
|
|
|
func NewChecker(logger *zap.Logger, repairQueue queue.RepairQueue, irrdb irreparable.DB, metaLoop *metainfo.Loop, overlay *overlay.Service, config Config) *Checker {
|
2019-07-08 23:04:35 +01:00
|
|
|
return &Checker{
|
2019-08-01 19:44:32 +01:00
|
|
|
logger: logger,
|
|
|
|
|
2020-10-21 23:02:54 +01:00
|
|
|
repairQueue: repairQueue,
|
|
|
|
irrdb: irrdb,
|
|
|
|
metaLoop: metaLoop,
|
|
|
|
nodestate: NewReliabilityCache(overlay, config.ReliabilityCacheStaleness),
|
2020-11-20 22:20:03 +00:00
|
|
|
statsCollector: newStatsCollector(),
|
2020-10-27 18:26:46 +00:00
|
|
|
repairOverrides: config.RepairOverrides.GetMap(),
|
2020-10-21 23:02:54 +01:00
|
|
|
nodeFailureRate: config.NodeFailureRate,
|
2019-08-01 19:44:32 +01:00
|
|
|
|
2020-01-30 13:06:43 +00:00
|
|
|
Loop: sync2.NewCycle(config.Interval),
|
|
|
|
IrreparableLoop: sync2.NewCycle(config.IrreparableInterval),
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-07-16 15:18:02 +01:00
|
|
|
// Run the checker loop.
|
2019-02-11 21:06:39 +00:00
|
|
|
func (checker *Checker) Run(ctx context.Context) (err error) {
|
2018-11-01 14:03:45 +00:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
2019-05-31 15:12:49 +01:00
|
|
|
group, ctx := errgroup.WithContext(ctx)
|
2019-05-30 16:18:20 +01:00
|
|
|
|
2019-05-31 15:12:49 +01:00
|
|
|
group.Go(func() error {
|
|
|
|
return checker.Loop.Run(ctx, checker.IdentifyInjuredSegments)
|
|
|
|
})
|
2019-05-30 16:18:20 +01:00
|
|
|
|
2019-05-31 15:12:49 +01:00
|
|
|
group.Go(func() error {
|
|
|
|
return checker.IrreparableLoop.Run(ctx, checker.IrreparableProcess)
|
|
|
|
})
|
2019-05-30 16:18:20 +01:00
|
|
|
|
2019-05-31 15:12:49 +01:00
|
|
|
return group.Wait()
|
2018-11-01 14:03:45 +00:00
|
|
|
}
|
|
|
|
|
2019-07-08 23:04:35 +01:00
|
|
|
// RefreshReliabilityCache forces refreshing node online status cache.
|
|
|
|
func (checker *Checker) RefreshReliabilityCache(ctx context.Context) error {
|
|
|
|
return checker.nodestate.Refresh(ctx)
|
|
|
|
}
|
|
|
|
|
2020-07-16 15:18:02 +01:00
|
|
|
// Close halts the Checker loop.
|
2019-02-14 12:33:41 +00:00
|
|
|
func (checker *Checker) Close() error {
|
|
|
|
checker.Loop.Close()
|
|
|
|
return nil
|
|
|
|
}
|
2019-01-18 13:54:08 +00:00
|
|
|
|
2019-08-06 17:35:59 +01:00
|
|
|
// IdentifyInjuredSegments checks for missing pieces off of the metainfo and overlay.
|
2019-02-11 21:06:39 +00:00
|
|
|
func (checker *Checker) IdentifyInjuredSegments(ctx context.Context) (err error) {
|
2018-10-09 17:09:33 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
2020-09-09 21:52:22 +01:00
|
|
|
startTime := time.Now()
|
|
|
|
|
2019-08-01 19:44:32 +01:00
|
|
|
observer := &checkerObserver{
|
2020-10-21 23:02:54 +01:00
|
|
|
repairQueue: checker.repairQueue,
|
|
|
|
irrdb: checker.irrdb,
|
|
|
|
nodestate: checker.nodestate,
|
2020-11-20 22:20:03 +00:00
|
|
|
statsCollector: checker.statsCollector,
|
|
|
|
monStats: aggregateStats{},
|
2020-10-27 18:26:46 +00:00
|
|
|
repairOverrides: checker.repairOverrides,
|
2020-10-21 23:02:54 +01:00
|
|
|
nodeFailureRate: checker.nodeFailureRate,
|
|
|
|
log: checker.logger,
|
2019-08-01 19:44:32 +01:00
|
|
|
}
|
|
|
|
err = checker.metaLoop.Join(ctx, observer)
|
2019-02-26 15:17:51 +00:00
|
|
|
if err != nil {
|
2019-08-01 19:44:32 +01:00
|
|
|
if !errs2.IsCanceled(err) {
|
|
|
|
checker.logger.Error("IdentifyInjuredSegments error", zap.Error(err))
|
|
|
|
}
|
2019-02-26 15:17:51 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2020-09-09 21:52:22 +01:00
|
|
|
// remove all segments which were not seen as unhealthy by this checker iteration
|
|
|
|
healthyDeleted, err := checker.repairQueue.Clean(ctx, startTime)
|
|
|
|
if err != nil {
|
|
|
|
return Error.Wrap(err)
|
|
|
|
}
|
|
|
|
|
2020-11-20 22:20:03 +00:00
|
|
|
checker.statsCollector.collectAggregates()
|
|
|
|
|
2020-10-13 13:13:41 +01:00
|
|
|
mon.IntVal("remote_files_checked").Observe(observer.monStats.objectsChecked) //mon:locked
|
|
|
|
mon.IntVal("remote_segments_checked").Observe(observer.monStats.remoteSegmentsChecked) //mon:locked
|
|
|
|
mon.IntVal("remote_segments_failed_to_check").Observe(observer.monStats.remoteSegmentsFailedToCheck) //mon:locked
|
|
|
|
mon.IntVal("remote_segments_needing_repair").Observe(observer.monStats.remoteSegmentsNeedingRepair) //mon:locked
|
|
|
|
mon.IntVal("new_remote_segments_needing_repair").Observe(observer.monStats.newRemoteSegmentsNeedingRepair) //mon:locked
|
|
|
|
mon.IntVal("remote_segments_lost").Observe(observer.monStats.remoteSegmentsLost) //mon:locked
|
|
|
|
mon.IntVal("remote_files_lost").Observe(int64(len(observer.monStats.remoteSegmentInfo))) //mon:locked
|
|
|
|
mon.IntVal("remote_segments_over_threshold_1").Observe(observer.monStats.remoteSegmentsOverThreshold[0]) //mon:locked
|
|
|
|
mon.IntVal("remote_segments_over_threshold_2").Observe(observer.monStats.remoteSegmentsOverThreshold[1]) //mon:locked
|
|
|
|
mon.IntVal("remote_segments_over_threshold_3").Observe(observer.monStats.remoteSegmentsOverThreshold[2]) //mon:locked
|
|
|
|
mon.IntVal("remote_segments_over_threshold_4").Observe(observer.monStats.remoteSegmentsOverThreshold[3]) //mon:locked
|
|
|
|
mon.IntVal("remote_segments_over_threshold_5").Observe(observer.monStats.remoteSegmentsOverThreshold[4]) //mon:locked
|
|
|
|
mon.IntVal("healthy_segments_removed_from_queue").Observe(healthyDeleted) //mon:locked
|
2019-08-01 19:44:32 +01:00
|
|
|
|
2020-03-10 13:59:29 +00:00
|
|
|
allUnhealthy := observer.monStats.remoteSegmentsNeedingRepair + observer.monStats.remoteSegmentsFailedToCheck
|
|
|
|
allChecked := observer.monStats.remoteSegmentsChecked
|
|
|
|
allHealthy := allChecked - allUnhealthy
|
2020-10-13 13:13:41 +01:00
|
|
|
mon.FloatVal("remote_segments_healthy_percentage").Observe(100 * float64(allHealthy) / float64(allChecked)) //mon:locked
|
2020-03-10 13:59:29 +00:00
|
|
|
|
2019-02-26 15:17:51 +00:00
|
|
|
return nil
|
2018-10-09 17:09:33 +01:00
|
|
|
}
|
|
|
|
|
2020-08-31 11:14:20 +01:00
|
|
|
// checks for a object location in slice.
|
|
|
|
func containsObjectLocation(a []metabase.ObjectLocation, x metabase.ObjectLocation) bool {
|
2019-02-26 15:17:51 +00:00
|
|
|
for _, n := range a {
|
|
|
|
if x == n {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
2019-05-30 16:18:20 +01:00
|
|
|
|
2020-09-08 11:13:18 +01:00
|
|
|
func (checker *Checker) updateIrreparableSegmentStatus(ctx context.Context, pointer *pb.Pointer, key metabase.SegmentKey) (err error) {
|
2019-08-01 19:44:32 +01:00
|
|
|
// TODO figure out how to reduce duplicate code between here and checkerObs.RemoteSegment
|
2019-06-04 12:36:27 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-05-30 16:18:20 +01:00
|
|
|
remote := pointer.GetRemote()
|
2019-09-19 00:18:14 +01:00
|
|
|
if pointer.GetType() == pb.Pointer_INLINE || remote == nil {
|
2019-05-30 16:18:20 +01:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
pieces := remote.GetRemotePieces()
|
|
|
|
if pieces == nil {
|
|
|
|
checker.logger.Debug("no pieces on remote segment")
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-07-08 23:16:50 +01:00
|
|
|
missingPieces, err := checker.nodestate.MissingPieces(ctx, pointer.CreationDate, pieces)
|
2019-05-30 16:18:20 +01:00
|
|
|
if err != nil {
|
2019-08-01 19:44:32 +01:00
|
|
|
return errs.Combine(Error.New("error getting missing pieces"), err)
|
2019-05-30 16:18:20 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
numHealthy := int32(len(pieces) - len(missingPieces))
|
|
|
|
redundancy := pointer.Remote.Redundancy
|
2019-07-10 22:27:46 +01:00
|
|
|
|
2020-03-06 20:39:53 +00:00
|
|
|
repairThreshold := redundancy.RepairThreshold
|
2020-10-27 18:26:46 +00:00
|
|
|
overrideValue := checker.repairOverrides.GetOverrideValuePB(redundancy)
|
|
|
|
if overrideValue != 0 {
|
|
|
|
repairThreshold = overrideValue
|
2020-03-06 20:39:53 +00:00
|
|
|
}
|
|
|
|
|
2019-09-06 20:20:36 +01:00
|
|
|
// we repair when the number of healthy pieces is less than or equal to the repair threshold and is greater or equal to
|
|
|
|
// minimum required pieces in redundancy
|
2019-05-30 16:18:20 +01:00
|
|
|
// except for the case when the repair and success thresholds are the same (a case usually seen during testing)
|
2020-06-24 19:56:15 +01:00
|
|
|
//
|
|
|
|
// If the segment is suddenly entirely healthy again, we don't need to repair and we don't need to
|
|
|
|
// keep it in the irreparabledb queue either.
|
2020-03-06 20:39:53 +00:00
|
|
|
if numHealthy >= redundancy.MinReq && numHealthy <= repairThreshold && numHealthy < redundancy.SuccessThreshold {
|
2020-11-28 17:10:28 +00:00
|
|
|
segmentHealth := float64(numHealthy)
|
2020-10-30 10:41:22 +00:00
|
|
|
_, err = checker.repairQueue.Insert(ctx, &internalpb.InjuredSegment{
|
2020-09-08 11:13:18 +01:00
|
|
|
Path: key,
|
2019-07-10 22:27:46 +01:00
|
|
|
LostPieces: missingPieces,
|
|
|
|
InsertedTime: time.Now().UTC(),
|
2020-10-21 23:02:54 +01:00
|
|
|
}, segmentHealth)
|
2019-05-30 16:18:20 +01:00
|
|
|
if err != nil {
|
2019-08-01 19:44:32 +01:00
|
|
|
return errs.Combine(Error.New("error adding injured segment to queue"), err)
|
2019-05-30 16:18:20 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// delete always returns nil when something was deleted and also when element didn't exists
|
2020-09-08 11:13:18 +01:00
|
|
|
err = checker.irrdb.Delete(ctx, key)
|
2019-05-30 16:18:20 +01:00
|
|
|
if err != nil {
|
|
|
|
checker.logger.Error("error deleting entry from irreparable db: ", zap.Error(err))
|
|
|
|
}
|
2020-03-06 20:39:53 +00:00
|
|
|
} else if numHealthy < redundancy.MinReq && numHealthy < repairThreshold {
|
2019-08-01 19:44:32 +01:00
|
|
|
|
|
|
|
// make an entry into the irreparable table
|
2020-10-30 11:12:01 +00:00
|
|
|
segmentInfo := &internalpb.IrreparableSegment{
|
2020-09-08 11:13:18 +01:00
|
|
|
Path: key,
|
2019-08-01 19:44:32 +01:00
|
|
|
SegmentDetail: pointer,
|
|
|
|
LostPieces: int32(len(missingPieces)),
|
|
|
|
LastRepairAttempt: time.Now().Unix(),
|
|
|
|
RepairAttemptCount: int64(1),
|
|
|
|
}
|
|
|
|
|
|
|
|
// add the entry if new or update attempt count if already exists
|
|
|
|
err := checker.irrdb.IncrementRepairAttempts(ctx, segmentInfo)
|
|
|
|
if err != nil {
|
|
|
|
return errs.Combine(Error.New("error handling irreparable segment to queue"), err)
|
|
|
|
}
|
2020-06-24 19:56:15 +01:00
|
|
|
} else if numHealthy > repairThreshold || numHealthy >= redundancy.SuccessThreshold {
|
2020-09-08 11:13:18 +01:00
|
|
|
err = checker.irrdb.Delete(ctx, key)
|
2020-06-24 19:56:15 +01:00
|
|
|
if err != nil {
|
|
|
|
return Error.New("error removing segment from irreparable queue: %v", err)
|
|
|
|
}
|
2019-08-01 19:44:32 +01:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-09-10 14:24:16 +01:00
|
|
|
var _ metainfo.Observer = (*checkerObserver)(nil)
|
|
|
|
|
2020-12-05 16:01:42 +00:00
|
|
|
// checkerObserver implements the metainfo loop Observer interface.
|
2019-09-10 14:24:16 +01:00
|
|
|
//
|
|
|
|
// architecture: Observer
|
2019-08-01 19:44:32 +01:00
|
|
|
type checkerObserver struct {
|
2020-10-21 23:02:54 +01:00
|
|
|
repairQueue queue.RepairQueue
|
|
|
|
irrdb irreparable.DB
|
|
|
|
nodestate *ReliabilityCache
|
2020-11-20 22:20:03 +00:00
|
|
|
statsCollector *statsCollector
|
|
|
|
monStats aggregateStats // TODO(cam): once we verify statsCollector reports data correctly, remove this
|
2020-10-27 18:26:46 +00:00
|
|
|
repairOverrides RepairOverridesMap
|
2020-10-21 23:02:54 +01:00
|
|
|
nodeFailureRate float64
|
|
|
|
log *zap.Logger
|
2019-08-01 19:44:32 +01:00
|
|
|
}
|
|
|
|
|
2020-11-20 22:20:03 +00:00
|
|
|
func (obs *checkerObserver) getStatsByRS(redundancy storj.RedundancyScheme) *stats {
|
|
|
|
rsString := getRSString(obs.loadRedundancy(redundancy))
|
|
|
|
return obs.statsCollector.getStatsByRS(rsString)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (obs *checkerObserver) loadRedundancy(redundancy storj.RedundancyScheme) (int, int, int, int) {
|
|
|
|
repair := int(redundancy.RepairShares)
|
|
|
|
overrideValue := obs.repairOverrides.GetOverrideValue(redundancy)
|
|
|
|
if overrideValue != 0 {
|
|
|
|
repair = int(overrideValue)
|
|
|
|
}
|
|
|
|
return int(redundancy.RequiredShares), repair, int(redundancy.OptimalShares), int(redundancy.TotalShares)
|
|
|
|
}
|
|
|
|
|
2020-10-27 06:59:14 +00:00
|
|
|
func (obs *checkerObserver) RemoteSegment(ctx context.Context, segment *metainfo.Segment) (err error) {
|
2019-08-01 19:44:32 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
2020-04-15 20:20:16 +01:00
|
|
|
// ignore pointer if expired
|
2020-10-27 06:59:14 +00:00
|
|
|
if segment.Expired(time.Now()) {
|
2020-04-15 20:20:16 +01:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2020-11-20 22:20:03 +00:00
|
|
|
stats := obs.getStatsByRS(segment.Redundancy)
|
|
|
|
|
2019-08-01 19:44:32 +01:00
|
|
|
obs.monStats.remoteSegmentsChecked++
|
2020-11-20 22:20:03 +00:00
|
|
|
stats.iterationAggregates.remoteSegmentsChecked++
|
2019-08-01 19:44:32 +01:00
|
|
|
|
2020-11-10 14:49:19 +00:00
|
|
|
// ensure we get values, even if only zero values, so that redash can have an alert based on this
|
|
|
|
mon.Counter("checker_segments_below_min_req").Inc(0) //mon:locked
|
2020-11-20 22:20:03 +00:00
|
|
|
stats.segmentsBelowMinReq.Inc(0)
|
2020-11-10 14:49:19 +00:00
|
|
|
|
2020-10-27 06:59:14 +00:00
|
|
|
pieces := segment.Pieces
|
|
|
|
if len(pieces) == 0 {
|
2019-08-01 19:44:32 +01:00
|
|
|
obs.log.Debug("no pieces on remote segment")
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2020-10-27 06:59:14 +00:00
|
|
|
pbPieces := make([]*pb.RemotePiece, len(pieces))
|
|
|
|
for i, piece := range pieces {
|
|
|
|
pbPieces[i] = &pb.RemotePiece{
|
|
|
|
PieceNum: int32(piece.Number),
|
|
|
|
NodeId: piece.StorageNode,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: update MissingPieces to accept metabase.Pieces
|
|
|
|
missingPieces, err := obs.nodestate.MissingPieces(ctx, segment.CreationDate, pbPieces)
|
2019-08-01 19:44:32 +01:00
|
|
|
if err != nil {
|
2020-03-10 13:59:29 +00:00
|
|
|
obs.monStats.remoteSegmentsFailedToCheck++
|
2020-11-20 22:20:03 +00:00
|
|
|
stats.iterationAggregates.remoteSegmentsFailedToCheck++
|
2019-08-01 19:44:32 +01:00
|
|
|
return errs.Combine(Error.New("error getting missing pieces"), err)
|
|
|
|
}
|
|
|
|
|
2020-10-27 06:59:14 +00:00
|
|
|
numHealthy := len(pieces) - len(missingPieces)
|
2020-11-20 22:20:03 +00:00
|
|
|
mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces))) //mon:locked
|
|
|
|
stats.segmentTotalCount.Observe(int64(len(pieces)))
|
2020-10-13 13:13:41 +01:00
|
|
|
mon.IntVal("checker_segment_healthy_count").Observe(int64(numHealthy)) //mon:locked
|
2020-11-20 22:20:03 +00:00
|
|
|
stats.segmentHealthyCount.Observe(int64(numHealthy))
|
2019-08-01 19:44:32 +01:00
|
|
|
|
2020-10-27 06:59:14 +00:00
|
|
|
segmentAge := time.Since(segment.CreationDate)
|
2020-10-13 13:13:41 +01:00
|
|
|
mon.IntVal("checker_segment_age").Observe(int64(segmentAge.Seconds())) //mon:locked
|
2020-11-20 22:20:03 +00:00
|
|
|
stats.segmentAge.Observe(int64(segmentAge.Seconds()))
|
|
|
|
|
|
|
|
required, repairThreshold, successThreshold, _ := obs.loadRedundancy(segment.Redundancy)
|
2019-09-17 20:18:48 +01:00
|
|
|
|
2020-10-21 23:02:54 +01:00
|
|
|
segmentHealth := repair.SegmentHealth(numHealthy, required, obs.nodeFailureRate)
|
|
|
|
mon.FloatVal("checker_segment_health").Observe(segmentHealth) //mon:locked
|
2020-11-20 22:20:03 +00:00
|
|
|
stats.segmentHealth.Observe(segmentHealth)
|
2019-10-02 13:58:37 +01:00
|
|
|
|
2020-10-27 06:59:14 +00:00
|
|
|
key := segment.Location.Encode()
|
2019-09-06 20:20:36 +01:00
|
|
|
// we repair when the number of healthy pieces is less than or equal to the repair threshold and is greater or equal to
|
|
|
|
// minimum required pieces in redundancy
|
2019-08-01 19:44:32 +01:00
|
|
|
// except for the case when the repair and success thresholds are the same (a case usually seen during testing)
|
2020-10-27 06:59:14 +00:00
|
|
|
if numHealthy >= required && numHealthy <= repairThreshold && numHealthy < successThreshold {
|
2020-10-21 23:02:54 +01:00
|
|
|
mon.FloatVal("checker_injured_segment_health").Observe(segmentHealth) //mon:locked
|
2020-11-20 22:20:03 +00:00
|
|
|
stats.injuredSegmentHealth.Observe(segmentHealth)
|
2019-08-01 19:44:32 +01:00
|
|
|
obs.monStats.remoteSegmentsNeedingRepair++
|
2020-11-20 22:20:03 +00:00
|
|
|
stats.iterationAggregates.remoteSegmentsNeedingRepair++
|
2020-10-30 10:41:22 +00:00
|
|
|
alreadyInserted, err := obs.repairQueue.Insert(ctx, &internalpb.InjuredSegment{
|
2020-09-02 08:16:58 +01:00
|
|
|
Path: key,
|
2019-08-01 19:44:32 +01:00
|
|
|
LostPieces: missingPieces,
|
|
|
|
InsertedTime: time.Now().UTC(),
|
2020-12-15 22:16:54 +00:00
|
|
|
}, float64(numHealthy))
|
2019-08-01 19:44:32 +01:00
|
|
|
if err != nil {
|
|
|
|
obs.log.Error("error adding injured segment to queue", zap.Error(err))
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2020-05-22 20:54:05 +01:00
|
|
|
if !alreadyInserted {
|
|
|
|
obs.monStats.newRemoteSegmentsNeedingRepair++
|
2020-11-20 22:20:03 +00:00
|
|
|
stats.iterationAggregates.newRemoteSegmentsNeedingRepair++
|
2020-05-22 20:54:05 +01:00
|
|
|
}
|
|
|
|
|
2019-08-01 19:44:32 +01:00
|
|
|
// delete always returns nil when something was deleted and also when element didn't exists
|
2020-09-02 08:16:58 +01:00
|
|
|
err = obs.irrdb.Delete(ctx, key)
|
2019-08-01 19:44:32 +01:00
|
|
|
if err != nil {
|
|
|
|
obs.log.Error("error deleting entry from irreparable db", zap.Error(err))
|
|
|
|
return nil
|
|
|
|
}
|
2020-10-27 06:59:14 +00:00
|
|
|
} else if numHealthy < required && numHealthy < repairThreshold {
|
|
|
|
lostSegInfo := segment.Location.Object()
|
2020-08-31 11:14:20 +01:00
|
|
|
if !containsObjectLocation(obs.monStats.remoteSegmentInfo, lostSegInfo) {
|
|
|
|
obs.monStats.remoteSegmentInfo = append(obs.monStats.remoteSegmentInfo, lostSegInfo)
|
2019-05-30 16:18:20 +01:00
|
|
|
}
|
2020-11-20 22:20:03 +00:00
|
|
|
if !containsObjectLocation(stats.iterationAggregates.remoteSegmentInfo, lostSegInfo) {
|
|
|
|
stats.iterationAggregates.remoteSegmentInfo = append(stats.iterationAggregates.remoteSegmentInfo, lostSegInfo)
|
|
|
|
}
|
2019-05-30 16:18:20 +01:00
|
|
|
|
2019-09-17 20:18:48 +01:00
|
|
|
var segmentAge time.Duration
|
2020-10-27 06:59:14 +00:00
|
|
|
if segment.CreationDate.Before(segment.LastRepaired) {
|
|
|
|
segmentAge = time.Since(segment.LastRepaired)
|
2019-09-17 20:18:48 +01:00
|
|
|
} else {
|
2020-10-27 06:59:14 +00:00
|
|
|
segmentAge = time.Since(segment.CreationDate)
|
2019-09-17 20:18:48 +01:00
|
|
|
}
|
2020-10-13 13:13:41 +01:00
|
|
|
mon.IntVal("checker_segment_time_until_irreparable").Observe(int64(segmentAge.Seconds())) //mon:locked
|
2020-11-20 22:20:03 +00:00
|
|
|
stats.segmentTimeUntilIrreparable.Observe(int64(segmentAge.Seconds()))
|
2019-09-17 20:18:48 +01:00
|
|
|
|
2019-08-01 19:44:32 +01:00
|
|
|
obs.monStats.remoteSegmentsLost++
|
2020-11-20 22:20:03 +00:00
|
|
|
stats.iterationAggregates.remoteSegmentsLost++
|
|
|
|
|
2020-11-10 14:49:19 +00:00
|
|
|
mon.Counter("checker_segments_below_min_req").Inc(1) //mon:locked
|
2020-11-20 22:20:03 +00:00
|
|
|
stats.segmentsBelowMinReq.Inc(1)
|
|
|
|
|
2019-08-01 19:44:32 +01:00
|
|
|
// make an entry into the irreparable table
|
2020-10-30 11:12:01 +00:00
|
|
|
segmentInfo := &internalpb.IrreparableSegment{
|
2020-09-02 08:16:58 +01:00
|
|
|
Path: key,
|
2020-10-27 06:59:14 +00:00
|
|
|
SegmentDetail: segment.Pointer, // TODO: replace with something better than pb.Pointer
|
2019-05-30 16:18:20 +01:00
|
|
|
LostPieces: int32(len(missingPieces)),
|
|
|
|
LastRepairAttempt: time.Now().Unix(),
|
|
|
|
RepairAttemptCount: int64(1),
|
|
|
|
}
|
|
|
|
|
|
|
|
// add the entry if new or update attempt count if already exists
|
2019-08-01 19:44:32 +01:00
|
|
|
err := obs.irrdb.IncrementRepairAttempts(ctx, segmentInfo)
|
2019-05-30 16:18:20 +01:00
|
|
|
if err != nil {
|
2019-08-01 19:44:32 +01:00
|
|
|
obs.log.Error("error handling irreparable segment to queue", zap.Error(err))
|
|
|
|
return nil
|
2019-05-30 16:18:20 +01:00
|
|
|
}
|
2020-11-20 22:20:03 +00:00
|
|
|
} else {
|
|
|
|
if numHealthy > repairThreshold && numHealthy <= (repairThreshold+len(obs.monStats.remoteSegmentsOverThreshold)) {
|
|
|
|
// record metrics for segments right above repair threshold
|
|
|
|
// numHealthy=repairThreshold+1 through numHealthy=repairThreshold+5
|
|
|
|
for i := range obs.monStats.remoteSegmentsOverThreshold {
|
|
|
|
if numHealthy == (repairThreshold + i + 1) {
|
|
|
|
obs.monStats.remoteSegmentsOverThreshold[i]++
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if numHealthy > repairThreshold && numHealthy <= (repairThreshold+len(stats.iterationAggregates.remoteSegmentsOverThreshold)) {
|
|
|
|
// record metrics for segments right above repair threshold
|
|
|
|
// numHealthy=repairThreshold+1 through numHealthy=repairThreshold+5
|
|
|
|
for i := range stats.iterationAggregates.remoteSegmentsOverThreshold {
|
|
|
|
if numHealthy == (repairThreshold + i + 1) {
|
|
|
|
stats.iterationAggregates.remoteSegmentsOverThreshold[i]++
|
|
|
|
break
|
|
|
|
}
|
2020-02-24 23:06:52 +00:00
|
|
|
}
|
|
|
|
}
|
2019-05-30 16:18:20 +01:00
|
|
|
}
|
2019-08-01 19:44:32 +01:00
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2020-10-27 06:59:14 +00:00
|
|
|
func (obs *checkerObserver) Object(ctx context.Context, object *metainfo.Object) (err error) {
|
2019-08-01 19:44:32 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
|
2019-09-13 14:51:41 +01:00
|
|
|
obs.monStats.objectsChecked++
|
2019-08-01 19:44:32 +01:00
|
|
|
|
2020-11-20 22:20:03 +00:00
|
|
|
stats := obs.getStatsByRS(object.LastSegment.Redundancy)
|
|
|
|
stats.iterationAggregates.objectsChecked++
|
|
|
|
|
2019-08-01 19:44:32 +01:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2020-10-27 06:59:14 +00:00
|
|
|
func (obs *checkerObserver) InlineSegment(ctx context.Context, segment *metainfo.Segment) (err error) {
|
2019-08-01 19:44:32 +01:00
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-05-30 16:18:20 +01:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-07-18 17:21:21 +01:00
|
|
|
// IrreparableProcess iterates over all items in the irreparabledb. If an item can
|
|
|
|
// now be repaired then it is added to a worker queue.
|
2019-05-30 16:18:20 +01:00
|
|
|
func (checker *Checker) IrreparableProcess(ctx context.Context) (err error) {
|
|
|
|
defer mon.Task()(&ctx)(&err)
|
2019-07-18 17:21:21 +01:00
|
|
|
const limit = 1000
|
2020-11-03 17:09:50 +00:00
|
|
|
lastSeenSegmentKey := metabase.SegmentKey{}
|
2019-05-30 16:18:20 +01:00
|
|
|
|
|
|
|
for {
|
2020-09-08 11:13:18 +01:00
|
|
|
segments, err := checker.irrdb.GetLimited(ctx, limit, lastSeenSegmentKey)
|
2019-05-30 16:18:20 +01:00
|
|
|
if err != nil {
|
2019-08-01 19:44:32 +01:00
|
|
|
return errs.Combine(Error.New("error reading segment from the queue"), err)
|
2019-05-30 16:18:20 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// zero segments returned with nil err
|
2019-07-18 17:21:21 +01:00
|
|
|
if len(segments) == 0 {
|
2019-05-30 16:18:20 +01:00
|
|
|
break
|
|
|
|
}
|
|
|
|
|
2020-09-08 11:13:18 +01:00
|
|
|
lastSeenSegmentKey = metabase.SegmentKey(segments[len(segments)-1].Path)
|
2019-07-18 17:21:21 +01:00
|
|
|
|
|
|
|
for _, segment := range segments {
|
2020-09-08 11:13:18 +01:00
|
|
|
err = checker.updateIrreparableSegmentStatus(ctx, segment.GetSegmentDetail(), metabase.SegmentKey(segment.GetPath()))
|
2019-07-18 17:21:21 +01:00
|
|
|
if err != nil {
|
|
|
|
checker.logger.Error("irrepair segment checker failed: ", zap.Error(err))
|
|
|
|
}
|
2019-05-30 16:18:20 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|