storj/satellite/repair/checker/observer.go
paul cannon 1f4f79b6b3 satellite/repair: don't mark clumped segments as irreparable
Clumped segments (segments with multiple pieces on the same subnet) may
need repair, but the clumped pieces are considered retrievable and we
don't need to call such segments irreparable.

We do want to know where they're coming from, though, if we can, because
we are seeing more than expected.

Change-Id: I41863b243f4bb007ef8929191a3fde1562565ef9
2023-05-17 16:24:15 +00:00

493 lines
18 KiB
Go

// Copyright (C) 2023 Storj Labs, Inc.
// See LICENSE for copying information.
package checker
import (
"context"
"fmt"
"reflect"
"sort"
"strings"
"sync"
"time"
"github.com/zeebo/errs"
"go.uber.org/zap"
"golang.org/x/exp/slices"
"storj.io/common/storj"
"storj.io/common/uuid"
"storj.io/storj/satellite/metabase"
"storj.io/storj/satellite/metabase/rangedloop"
"storj.io/storj/satellite/overlay"
"storj.io/storj/satellite/repair"
"storj.io/storj/satellite/repair/queue"
)
var _ rangedloop.Observer = (*Observer)(nil)
var _ rangedloop.Partial = (*observerFork)(nil)
// Observer implements the ranged loop Observer interface.
//
// architecture: Observer
type Observer struct {
logger *zap.Logger
repairQueue queue.RepairQueue
nodestate *ReliabilityCache
overlayService *overlay.Service
repairOverrides RepairOverridesMap
nodeFailureRate float64
repairQueueBatchSize int
// the following are reset on each iteration
startTime time.Time
TotalStats aggregateStats
mu sync.Mutex
statsCollector map[string]*observerRSStats
}
// NewObserver creates new checker observer instance.
func NewObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *overlay.Service, config Config) *Observer {
return &Observer{
logger: logger,
repairQueue: repairQueue,
nodestate: NewReliabilityCache(overlay, config.ReliabilityCacheStaleness),
overlayService: overlay,
repairOverrides: config.RepairOverrides.GetMap(),
nodeFailureRate: config.NodeFailureRate,
repairQueueBatchSize: config.RepairQueueInsertBatchSize,
statsCollector: make(map[string]*observerRSStats),
}
}
// getNodesEstimate updates the estimate of the total number of nodes. It is guaranteed
// to return a number greater than 0 when the error is nil.
//
// We can't calculate this upon first starting a Ranged Loop Observer, because there may not be any
// nodes yet. We expect that there will be nodes before there are segments, though.
func (observer *Observer) getNodesEstimate(ctx context.Context) (int, error) {
// this should be safe to call frequently; it is an efficient caching lookup.
totalNumNodes, err := observer.nodestate.NumNodes(ctx)
if err != nil {
// We could proceed here by returning the last good value, or by returning a fallback
// constant estimate, like "20000", and we'd probably be fine, but it would be better
// not to have that happen silently for too long. Also, if we can't get this from the
// database, we probably can't modify the injured segments queue, so it won't help to
// proceed with this repair operation.
return 0, err
}
if totalNumNodes == 0 {
return 0, Error.New("segment health is meaningless: there are no nodes")
}
return totalNumNodes, nil
}
func (observer *Observer) createInsertBuffer() *queue.InsertBuffer {
return queue.NewInsertBuffer(observer.repairQueue, observer.repairQueueBatchSize)
}
// TestingCompareInjuredSegmentIDs compares stream id of injured segment.
func (observer *Observer) TestingCompareInjuredSegmentIDs(ctx context.Context, streamIDs []uuid.UUID) error {
injuredSegments, err := observer.repairQueue.SelectN(ctx, 100)
if err != nil {
return err
}
var injuredSegmentsIds []uuid.UUID
for _, segment := range injuredSegments {
injuredSegmentsIds = append(injuredSegmentsIds, segment.StreamID)
}
sort.Slice(injuredSegmentsIds, func(i, j int) bool {
return injuredSegmentsIds[i].Less(injuredSegmentsIds[j])
})
sort.Slice(streamIDs, func(i, j int) bool {
return streamIDs[i].Less(streamIDs[j])
})
if !reflect.DeepEqual(streamIDs, injuredSegmentsIds) {
return errs.New("injured objects ids are different")
}
return nil
}
// Start starts parallel segments loop.
func (observer *Observer) Start(ctx context.Context, startTime time.Time) (err error) {
defer mon.Task()(&ctx)(&err)
observer.startTime = startTime
observer.TotalStats = aggregateStats{}
return nil
}
// Fork creates a Partial to process a chunk of all the segments.
func (observer *Observer) Fork(ctx context.Context) (_ rangedloop.Partial, err error) {
defer mon.Task()(&ctx)(&err)
return newObserverFork(observer), nil
}
// Join is called after the chunk for Partial is done.
// This gives the opportunity to merge the output like in a reduce step.
func (observer *Observer) Join(ctx context.Context, partial rangedloop.Partial) (err error) {
defer mon.Task()(&ctx)(&err)
repPartial, ok := partial.(*observerFork)
if !ok {
return Error.New("expected partial type %T but got %T", repPartial, partial)
}
if err := repPartial.repairQueue.Flush(ctx); err != nil {
return Error.Wrap(err)
}
for rs, partialStats := range repPartial.rsStats {
observer.statsCollector[rs].iterationAggregates.combine(partialStats.iterationAggregates)
}
observer.TotalStats.combine(repPartial.totalStats)
return nil
}
// Finish is called after all segments are processed by all observers.
func (observer *Observer) Finish(ctx context.Context) (err error) {
defer mon.Task()(&ctx)(&err)
// remove all segments which were not seen as unhealthy by this checker iteration
healthyDeleted, err := observer.repairQueue.Clean(ctx, observer.startTime)
if err != nil {
return Error.Wrap(err)
}
observer.collectAggregates()
mon.IntVal("remote_files_checked").Observe(observer.TotalStats.objectsChecked) //mon:locked
mon.IntVal("remote_segments_checked").Observe(observer.TotalStats.remoteSegmentsChecked) //mon:locked
mon.IntVal("remote_segments_failed_to_check").Observe(observer.TotalStats.remoteSegmentsFailedToCheck) //mon:locked
mon.IntVal("remote_segments_needing_repair").Observe(observer.TotalStats.remoteSegmentsNeedingRepair) //mon:locked
mon.IntVal("new_remote_segments_needing_repair").Observe(observer.TotalStats.newRemoteSegmentsNeedingRepair) //mon:locked
mon.IntVal("remote_segments_lost").Observe(observer.TotalStats.remoteSegmentsLost) //mon:locked
mon.IntVal("remote_files_lost").Observe(int64(len(observer.TotalStats.objectsLost))) //mon:locked
mon.IntVal("remote_segments_over_threshold_1").Observe(observer.TotalStats.remoteSegmentsOverThreshold[0]) //mon:locked
mon.IntVal("remote_segments_over_threshold_2").Observe(observer.TotalStats.remoteSegmentsOverThreshold[1]) //mon:locked
mon.IntVal("remote_segments_over_threshold_3").Observe(observer.TotalStats.remoteSegmentsOverThreshold[2]) //mon:locked
mon.IntVal("remote_segments_over_threshold_4").Observe(observer.TotalStats.remoteSegmentsOverThreshold[3]) //mon:locked
mon.IntVal("remote_segments_over_threshold_5").Observe(observer.TotalStats.remoteSegmentsOverThreshold[4]) //mon:locked
mon.IntVal("healthy_segments_removed_from_queue").Observe(healthyDeleted) //mon:locked
allUnhealthy := observer.TotalStats.remoteSegmentsNeedingRepair + observer.TotalStats.remoteSegmentsFailedToCheck
allChecked := observer.TotalStats.remoteSegmentsChecked
allHealthy := allChecked - allUnhealthy
mon.FloatVal("remote_segments_healthy_percentage").Observe(100 * float64(allHealthy) / float64(allChecked)) //mon:locked
return nil
}
func (observer *Observer) collectAggregates() {
for _, stats := range observer.statsCollector {
stats.collectAggregates()
}
}
func (observer *Observer) getObserverStats(rsString string) *observerRSStats {
observer.mu.Lock()
defer observer.mu.Unlock()
observerStats, exists := observer.statsCollector[rsString]
if !exists {
observerStats = &observerRSStats{aggregateStats{}, newIterationRSStats(rsString), newSegmentRSStats(rsString)}
mon.Chain(observerStats)
observer.statsCollector[rsString] = observerStats
}
return observerStats
}
// RefreshReliabilityCache forces refreshing node online status cache.
func (observer *Observer) RefreshReliabilityCache(ctx context.Context) error {
return observer.nodestate.Refresh(ctx)
}
// observerFork implements the ranged loop Partial interface.
type observerFork struct {
repairQueue *queue.InsertBuffer
nodestate *ReliabilityCache
overlayService *overlay.Service
rsStats map[string]*partialRSStats
repairOverrides RepairOverridesMap
nodeFailureRate float64
getNodesEstimate func(ctx context.Context) (int, error)
log *zap.Logger
lastStreamID uuid.UUID
totalStats aggregateStats
getObserverStats func(string) *observerRSStats
}
// newObserverFork creates new observer partial instance.
func newObserverFork(observer *Observer) rangedloop.Partial {
// we can only share thread-safe objects.
return &observerFork{
repairQueue: observer.createInsertBuffer(),
nodestate: observer.nodestate,
overlayService: observer.overlayService,
rsStats: make(map[string]*partialRSStats),
repairOverrides: observer.repairOverrides,
nodeFailureRate: observer.nodeFailureRate,
getNodesEstimate: observer.getNodesEstimate,
log: observer.logger,
getObserverStats: observer.getObserverStats,
}
}
func (fork *observerFork) getStatsByRS(redundancy storj.RedundancyScheme) *partialRSStats {
rsString := getRSString(fork.loadRedundancy(redundancy))
stats, ok := fork.rsStats[rsString]
if !ok {
observerStats := fork.getObserverStats(rsString)
fork.rsStats[rsString] = &partialRSStats{
iterationAggregates: aggregateStats{},
segmentStats: observerStats.segmentStats,
}
return fork.rsStats[rsString]
}
return stats
}
func (fork *observerFork) loadRedundancy(redundancy storj.RedundancyScheme) (int, int, int, int) {
repair := int(redundancy.RepairShares)
overrideValue := fork.repairOverrides.GetOverrideValue(redundancy)
if overrideValue != 0 {
repair = int(overrideValue)
}
return int(redundancy.RequiredShares), repair, int(redundancy.OptimalShares), int(redundancy.TotalShares)
}
// Process repair implementation of partial's Process.
func (fork *observerFork) Process(ctx context.Context, segments []rangedloop.Segment) (err error) {
for _, segment := range segments {
if err := fork.process(ctx, &segment); err != nil {
return err
}
}
return nil
}
func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segment) (err error) {
if segment.Inline() {
if fork.lastStreamID.Compare(segment.StreamID) != 0 {
fork.lastStreamID = segment.StreamID
fork.totalStats.objectsChecked++
}
return nil
}
// ignore segment if expired
if segment.Expired(time.Now()) {
return nil
}
stats := fork.getStatsByRS(segment.Redundancy)
if fork.lastStreamID.Compare(segment.StreamID) != 0 {
fork.lastStreamID = segment.StreamID
stats.iterationAggregates.objectsChecked++
fork.totalStats.objectsChecked++
}
fork.totalStats.remoteSegmentsChecked++
stats.iterationAggregates.remoteSegmentsChecked++
// ensure we get values, even if only zero values, so that redash can have an alert based on this
mon.Counter("checker_segments_below_min_req").Inc(0) //mon:locked
pieces := segment.Pieces
if len(pieces) == 0 {
fork.log.Debug("no pieces on remote segment")
return nil
}
totalNumNodes, err := fork.getNodesEstimate(ctx)
if err != nil {
return Error.New("could not get estimate of total number of nodes: %w", err)
}
missingPieces, err := fork.nodestate.MissingPieces(ctx, segment.CreatedAt, segment.Pieces)
if err != nil {
fork.totalStats.remoteSegmentsFailedToCheck++
stats.iterationAggregates.remoteSegmentsFailedToCheck++
return Error.New("error getting missing pieces: %w", err)
}
// if multiple pieces are on the same last_net, keep only the first one. The rest are
// to be considered retrievable but unhealthy.
nodeIDs := make([]storj.NodeID, len(pieces))
for i, p := range pieces {
nodeIDs[i] = p.StorageNode
}
lastNets, err := fork.overlayService.GetNodesNetworkInOrder(ctx, nodeIDs)
if err != nil {
fork.totalStats.remoteSegmentsFailedToCheck++
stats.iterationAggregates.remoteSegmentsFailedToCheck++
return errs.Combine(Error.New("error determining node last_nets"), err)
}
clumpedPieces := repair.FindClumpedPieces(segment.Pieces, lastNets)
numHealthy := len(pieces) - len(missingPieces) - len(clumpedPieces)
mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces))) //mon:locked
stats.segmentStats.segmentTotalCount.Observe(int64(len(pieces)))
mon.IntVal("checker_segment_healthy_count").Observe(int64(numHealthy)) //mon:locked
stats.segmentStats.segmentHealthyCount.Observe(int64(numHealthy))
mon.IntVal("checker_segment_clumped_count").Observe(int64(len(clumpedPieces))) //mon:locked
stats.segmentStats.segmentClumpedCount.Observe(int64(len(clumpedPieces)))
segmentAge := time.Since(segment.CreatedAt)
mon.IntVal("checker_segment_age").Observe(int64(segmentAge.Seconds())) //mon:locked
stats.segmentStats.segmentAge.Observe(int64(segmentAge.Seconds()))
required, repairThreshold, successThreshold, _ := fork.loadRedundancy(segment.Redundancy)
segmentHealth := repair.SegmentHealth(numHealthy, required, totalNumNodes, fork.nodeFailureRate)
mon.FloatVal("checker_segment_health").Observe(segmentHealth) //mon:locked
stats.segmentStats.segmentHealth.Observe(segmentHealth)
// we repair when the number of healthy pieces is less than or equal to the repair threshold and is greater or equal to
// minimum required pieces in redundancy
// except for the case when the repair and success thresholds are the same (a case usually seen during testing)
if numHealthy <= repairThreshold && numHealthy < successThreshold {
mon.FloatVal("checker_injured_segment_health").Observe(segmentHealth) //mon:locked
stats.segmentStats.injuredSegmentHealth.Observe(segmentHealth)
fork.totalStats.remoteSegmentsNeedingRepair++
stats.iterationAggregates.remoteSegmentsNeedingRepair++
err := fork.repairQueue.Insert(ctx, &queue.InjuredSegment{
StreamID: segment.StreamID,
Position: segment.Position,
UpdatedAt: time.Now().UTC(),
SegmentHealth: segmentHealth,
}, func() {
// Counters are increased after the queue has determined
// that the segment wasn't already queued for repair.
fork.totalStats.newRemoteSegmentsNeedingRepair++
stats.iterationAggregates.newRemoteSegmentsNeedingRepair++
})
if err != nil {
fork.log.Error("error adding injured segment to queue", zap.Error(err))
return nil
}
// monitor irreparable segments
numRetrievable := len(pieces) - len(missingPieces)
if numRetrievable < required {
if !slices.Contains(fork.totalStats.objectsLost, segment.StreamID) {
fork.totalStats.objectsLost = append(fork.totalStats.objectsLost, segment.StreamID)
}
if !slices.Contains(stats.iterationAggregates.objectsLost, segment.StreamID) {
stats.iterationAggregates.objectsLost = append(stats.iterationAggregates.objectsLost, segment.StreamID)
}
repairedAt := time.Time{}
if segment.RepairedAt != nil {
repairedAt = *segment.RepairedAt
}
var segmentAge time.Duration
if segment.CreatedAt.Before(repairedAt) {
segmentAge = time.Since(repairedAt)
} else {
segmentAge = time.Since(segment.CreatedAt)
}
mon.IntVal("checker_segment_time_until_irreparable").Observe(int64(segmentAge.Seconds())) //mon:locked
stats.segmentStats.segmentTimeUntilIrreparable.Observe(int64(segmentAge.Seconds()))
fork.totalStats.remoteSegmentsLost++
stats.iterationAggregates.remoteSegmentsLost++
mon.Counter("checker_segments_below_min_req").Inc(1) //mon:locked
stats.segmentStats.segmentsBelowMinReq.Inc(1)
var unhealthyNodes []string
for _, p := range missingPieces {
unhealthyNodes = append(unhealthyNodes, p.StorageNode.String())
}
fork.log.Warn("checker found irreparable segment", zap.String("Segment StreamID", segment.StreamID.String()), zap.Int("Segment Position",
int(segment.Position.Encode())), zap.Int("total pieces", len(pieces)), zap.Int("min required", required), zap.String("unhealthy node IDs", strings.Join(unhealthyNodes, ",")))
} else if numRetrievable > repairThreshold {
// This segment is to be repaired because of clumping (it wouldn't need repair yet
// otherwise). Produce a brief report of where the clumping occurred so that we have
// a better understanding of the cause.
clumpedNets := clumpingReport{
clumpedPieces: clumpedPieces,
allPieces: segment.Pieces,
lastNets: lastNets,
}
fork.log.Info("segment needs repair because of clumping", zap.Stringer("Segment StreamID", segment.StreamID), zap.Uint64("Segment Position", segment.Position.Encode()), zap.Int("total pieces", len(pieces)), zap.Int("min required", required), zap.Stringer("clumping", &clumpedNets))
}
} else {
if numHealthy > repairThreshold && numHealthy <= (repairThreshold+len(fork.totalStats.remoteSegmentsOverThreshold)) {
// record metrics for segments right above repair threshold
// numHealthy=repairThreshold+1 through numHealthy=repairThreshold+5
for i := range fork.totalStats.remoteSegmentsOverThreshold {
if numHealthy == (repairThreshold + i + 1) {
fork.totalStats.remoteSegmentsOverThreshold[i]++
break
}
}
}
if numHealthy > repairThreshold && numHealthy <= (repairThreshold+len(stats.iterationAggregates.remoteSegmentsOverThreshold)) {
// record metrics for segments right above repair threshold
// numHealthy=repairThreshold+1 through numHealthy=repairThreshold+5
for i := range stats.iterationAggregates.remoteSegmentsOverThreshold {
if numHealthy == (repairThreshold + i + 1) {
stats.iterationAggregates.remoteSegmentsOverThreshold[i]++
break
}
}
}
}
return nil
}
type clumpingReport struct {
clumpedPieces metabase.Pieces
allPieces metabase.Pieces
lastNets []string
}
// String produces the clumping report. In case the satellite isn't logging at the required level,
// we avoid doing the work of building the report until String() is called.
func (cr *clumpingReport) String() string {
clumpedNets := make(map[string]int)
for _, clumpedPiece := range cr.clumpedPieces {
lastNet := ""
for i, piece := range cr.allPieces {
if piece.Number == clumpedPiece.Number && piece.StorageNode.Compare(clumpedPiece.StorageNode) == 0 {
lastNet = cr.lastNets[i]
break
}
}
clumpedNets[lastNet]++
}
counts := make([]string, 0, len(clumpedNets))
for clumpedNet, count := range clumpedNets {
counts = append(counts, fmt.Sprintf("[%s]: %d", clumpedNet, count))
}
return strings.Join(counts, ", ")
}