// Copyright (C) 2023 Storj Labs, Inc.
// See LICENSE for copying information.

package checker

import (
	"context"
	"fmt"
	"reflect"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/zeebo/errs"
	"go.uber.org/zap"
	"golang.org/x/exp/slices"

	"storj.io/common/storj"
	"storj.io/common/uuid"
	"storj.io/storj/satellite/metabase"
	"storj.io/storj/satellite/metabase/rangedloop"
	"storj.io/storj/satellite/overlay"
	"storj.io/storj/satellite/repair"
	"storj.io/storj/satellite/repair/queue"
)

var _ rangedloop.Observer = (*Observer)(nil)
var _ rangedloop.Partial = (*observerFork)(nil)

// Observer implements the ranged loop Observer interface.
//
// architecture: Observer
type Observer struct {
	logger               *zap.Logger
	repairQueue          queue.RepairQueue
	nodesCache           *ReliabilityCache
	overlayService       *overlay.Service
	repairOverrides      RepairOverridesMap
	nodeFailureRate      float64
	repairQueueBatchSize int
	doDeclumping         bool
	doPlacementCheck     bool

	// the following are reset on each iteration
	startTime      time.Time
	TotalStats     aggregateStats
	mu             sync.Mutex
	statsCollector map[string]*observerRSStats
}

// NewObserver creates a new checker observer instance.
// TODO: move excludedCountries into config but share it somehow with segment repairer.
func NewObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *overlay.Service, config Config, excludedCountries []string) *Observer {
	return &Observer{
		logger:               logger,
		repairQueue:          repairQueue,
		nodesCache:           NewReliabilityCache(overlay, config.ReliabilityCacheStaleness, excludedCountries),
		overlayService:       overlay,
		repairOverrides:      config.RepairOverrides.GetMap(),
		nodeFailureRate:      config.NodeFailureRate,
		repairQueueBatchSize: config.RepairQueueInsertBatchSize,
		doDeclumping:         config.DoDeclumping,
		doPlacementCheck:     config.DoPlacementCheck,
		statsCollector:       make(map[string]*observerRSStats),
	}
}

// getNodesEstimate updates the estimate of the total number of nodes. It is guaranteed
// to return a number greater than 0 when the error is nil.
//
// We can't calculate this upon first starting a Ranged Loop Observer, because there may not be any
// nodes yet. We expect that there will be nodes before there are segments, though.
func (observer *Observer) getNodesEstimate(ctx context.Context) (int, error) {
	// this should be safe to call frequently; it is an efficient caching lookup.
	totalNumNodes, err := observer.nodesCache.NumNodes(ctx)
	if err != nil {
		// We could proceed here by returning the last good value, or by returning a fallback
		// constant estimate, like "20000", and we'd probably be fine, but it would be better
		// not to have that happen silently for too long. Also, if we can't get this from the
		// database, we probably can't modify the injured segments queue, so it won't help to
		// proceed with this repair operation.
		return 0, err
	}
	if totalNumNodes == 0 {
		return 0, Error.New("segment health is meaningless: there are no nodes")
	}

	return totalNumNodes, nil
}
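
// createInsertBuffer returns an insert buffer that batches writes to the repair queue
// using the configured batch size.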
func (observer *Observer) createInsertBuffer() *queue.InsertBuffer {
	return queue.NewInsertBuffer(observer.repairQueue, observer.repairQueueBatchSize)
}

// TestingCompareInjuredSegmentIDs compares the stream IDs of the injured segments in the
// repair queue with the provided stream IDs.
func (observer *Observer) TestingCompareInjuredSegmentIDs(ctx context.Context, streamIDs []uuid.UUID) error {
	injuredSegments, err := observer.repairQueue.SelectN(ctx, 100)
	if err != nil {
		return err
	}

	var injuredSegmentsIds []uuid.UUID
	for _, segment := range injuredSegments {
		injuredSegmentsIds = append(injuredSegmentsIds, segment.StreamID)
	}

	sort.Slice(injuredSegmentsIds, func(i, j int) bool {
		return injuredSegmentsIds[i].Less(injuredSegmentsIds[j])
	})
	sort.Slice(streamIDs, func(i, j int) bool {
		return streamIDs[i].Less(streamIDs[j])
	})

	if !reflect.DeepEqual(streamIDs, injuredSegmentsIds) {
		return errs.New("injured objects ids are different")
	}

	return nil
}

// Start starts parallel segments loop.
func (observer *Observer) Start(ctx context.Context, startTime time.Time) (err error) {
	defer mon.Task()(&ctx)(&err)

	observer.startTime = startTime
	observer.TotalStats = aggregateStats{}

	return nil
}

// Fork creates a Partial to process a chunk of all the segments.
func (observer *Observer) Fork(ctx context.Context) (_ rangedloop.Partial, err error) {
	defer mon.Task()(&ctx)(&err)

	return newObserverFork(observer), nil
}

// Join is called after the chunk for Partial is done.
// This gives the opportunity to merge the output like in a reduce step.
func (observer *Observer) Join(ctx context.Context, partial rangedloop.Partial) (err error) {
	defer mon.Task()(&ctx)(&err)

	repPartial, ok := partial.(*observerFork)
	if !ok {
		return Error.New("expected partial type %T but got %T", repPartial, partial)
	}

	if err := repPartial.repairQueue.Flush(ctx); err != nil {
		return Error.Wrap(err)
	}

	for rs, partialStats := range repPartial.rsStats {
		observer.statsCollector[rs].iterationAggregates.combine(partialStats.iterationAggregates)
	}

	observer.TotalStats.combine(repPartial.totalStats)

	return nil
}

// Finish is called after all segments are processed by all observers.
func (observer *Observer) Finish(ctx context.Context) (err error) {
	defer mon.Task()(&ctx)(&err)

	// remove all segments which were not seen as unhealthy by this checker iteration
	healthyDeleted, err := observer.repairQueue.Clean(ctx, observer.startTime)
	if err != nil {
		return Error.Wrap(err)
	}

	observer.collectAggregates()

	mon.IntVal("remote_files_checked").Observe(observer.TotalStats.objectsChecked)                               //mon:locked
	mon.IntVal("remote_segments_checked").Observe(observer.TotalStats.remoteSegmentsChecked)                     //mon:locked
	mon.IntVal("remote_segments_failed_to_check").Observe(observer.TotalStats.remoteSegmentsFailedToCheck)       //mon:locked
	mon.IntVal("remote_segments_needing_repair").Observe(observer.TotalStats.remoteSegmentsNeedingRepair)        //mon:locked
	mon.IntVal("new_remote_segments_needing_repair").Observe(observer.TotalStats.newRemoteSegmentsNeedingRepair) //mon:locked
	mon.IntVal("remote_segments_lost").Observe(observer.TotalStats.remoteSegmentsLost)                           //mon:locked
	mon.IntVal("remote_files_lost").Observe(int64(len(observer.TotalStats.objectsLost)))                         //mon:locked
	mon.IntVal("remote_segments_over_threshold_1").Observe(observer.TotalStats.remoteSegmentsOverThreshold[0])   //mon:locked
	mon.IntVal("remote_segments_over_threshold_2").Observe(observer.TotalStats.remoteSegmentsOverThreshold[1])   //mon:locked
	mon.IntVal("remote_segments_over_threshold_3").Observe(observer.TotalStats.remoteSegmentsOverThreshold[2])   //mon:locked
	mon.IntVal("remote_segments_over_threshold_4").Observe(observer.TotalStats.remoteSegmentsOverThreshold[3])   //mon:locked
	mon.IntVal("remote_segments_over_threshold_5").Observe(observer.TotalStats.remoteSegmentsOverThreshold[4])   //mon:locked
	mon.IntVal("healthy_segments_removed_from_queue").Observe(healthyDeleted)                                    //mon:locked

	allUnhealthy := observer.TotalStats.remoteSegmentsNeedingRepair + observer.TotalStats.remoteSegmentsFailedToCheck
	allChecked := observer.TotalStats.remoteSegmentsChecked
	allHealthy := allChecked - allUnhealthy
	mon.FloatVal("remote_segments_healthy_percentage").Observe(100 * float64(allHealthy) / float64(allChecked)) //mon:locked

	return nil
}
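
// runCheckerOnce is an illustrative sketch (not part of the original checker code) showing the
// ranged loop lifecycle that Observer implements: Start once, then Fork/Process/Join per range,
// and Finish at the end. In production the rangedloop service drives these calls, typically with
// several forks processing ranges concurrently. It assumes the rangedloop.Partial interface
// exposes the Process method implemented below, and that segments come from a range provider.
func runCheckerOnce(ctx context.Context, observer *Observer, segments []rangedloop.Segment) error {
	if err := observer.Start(ctx, time.Now()); err != nil {
		return err
	}

	partial, err := observer.Fork(ctx)
	if err != nil {
		return err
	}
	if err := partial.Process(ctx, segments); err != nil {
		return err
	}
	if err := observer.Join(ctx, partial); err != nil {
		return err
	}

	return observer.Finish(ctx)
}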

func (observer *Observer) collectAggregates() {
	for _, stats := range observer.statsCollector {
		stats.collectAggregates()
	}
}
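
// getObserverStats returns the observerRSStats for the given RS string, lazily creating it
// and registering it as a monkit stat source (mon.Chain) on first use.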
func (observer *Observer) getObserverStats(rsString string) *observerRSStats {
	observer.mu.Lock()
	defer observer.mu.Unlock()

	observerStats, exists := observer.statsCollector[rsString]
	if !exists {
		observerStats = &observerRSStats{aggregateStats{}, newIterationRSStats(rsString), newSegmentRSStats(rsString)}
		mon.Chain(observerStats)
		observer.statsCollector[rsString] = observerStats
	}

	return observerStats
}

// RefreshReliabilityCache forces refreshing node online status cache.
func (observer *Observer) RefreshReliabilityCache(ctx context.Context) error {
	return observer.nodesCache.Refresh(ctx)
}

// observerFork implements the ranged loop Partial interface.
type observerFork struct {
	repairQueue      *queue.InsertBuffer
	nodesCache       *ReliabilityCache
	overlayService   *overlay.Service
	rsStats          map[string]*partialRSStats
	repairOverrides  RepairOverridesMap
	nodeFailureRate  float64
	getNodesEstimate func(ctx context.Context) (int, error)
	log              *zap.Logger
	doDeclumping     bool
	doPlacementCheck bool
	lastStreamID     uuid.UUID
	totalStats       aggregateStats

	getObserverStats func(string) *observerRSStats
}

// newObserverFork creates a new observer partial instance.
func newObserverFork(observer *Observer) rangedloop.Partial {
	// we can only share thread-safe objects.
	return &observerFork{
		repairQueue:      observer.createInsertBuffer(),
		nodesCache:       observer.nodesCache,
		overlayService:   observer.overlayService,
		rsStats:          make(map[string]*partialRSStats),
		repairOverrides:  observer.repairOverrides,
		nodeFailureRate:  observer.nodeFailureRate,
		getNodesEstimate: observer.getNodesEstimate,
		log:              observer.logger,
		doDeclumping:     observer.doDeclumping,
		doPlacementCheck: observer.doPlacementCheck,
		getObserverStats: observer.getObserverStats,
	}
}

func (fork *observerFork) getStatsByRS(redundancy storj.RedundancyScheme) *partialRSStats {
	rsString := getRSString(fork.loadRedundancy(redundancy))

	stats, ok := fork.rsStats[rsString]
	if !ok {
		observerStats := fork.getObserverStats(rsString)

		fork.rsStats[rsString] = &partialRSStats{
			iterationAggregates: aggregateStats{},
			segmentStats:        observerStats.segmentStats,
		}
		return fork.rsStats[rsString]
	}

	return stats
}
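
// loadRedundancy returns the RS parameters (required, repair, optimal, total) for a segment,
// applying any configured repair override to the repair threshold. For example (illustrative
// values, not taken from any real config): with RequiredShares=29, RepairShares=35,
// OptimalShares=65, TotalShares=80 and an override of 52, it returns (29, 52, 65, 80);
// without an override it returns (29, 35, 65, 80).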
func (fork *observerFork) loadRedundancy(redundancy storj.RedundancyScheme) (int, int, int, int) {
	repair := int(redundancy.RepairShares)

	overrideValue := fork.repairOverrides.GetOverrideValue(redundancy)
	if overrideValue != 0 {
		repair = int(overrideValue)
	}

	return int(redundancy.RequiredShares), repair, int(redundancy.OptimalShares), int(redundancy.TotalShares)
}

// Process is the repair checker's implementation of the Partial's Process method.
func (fork *observerFork) Process(ctx context.Context, segments []rangedloop.Segment) (err error) {
	for _, segment := range segments {
		if err := fork.process(ctx, &segment); err != nil {
			return err
		}
	}

	return nil
}

func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segment) (err error) {
	if segment.Inline() {
		if fork.lastStreamID.Compare(segment.StreamID) != 0 {
			fork.lastStreamID = segment.StreamID
			fork.totalStats.objectsChecked++
		}

		return nil
	}

	// ignore segment if expired
	if segment.Expired(time.Now()) {
		return nil
	}

	stats := fork.getStatsByRS(segment.Redundancy)
	if fork.lastStreamID.Compare(segment.StreamID) != 0 {
		fork.lastStreamID = segment.StreamID
		stats.iterationAggregates.objectsChecked++
		fork.totalStats.objectsChecked++
	}

	fork.totalStats.remoteSegmentsChecked++
	stats.iterationAggregates.remoteSegmentsChecked++

	// ensure we get values, even if only zero values, so that redash can have an alert based on this
	mon.Counter("checker_segments_below_min_req").Inc(0) //mon:locked

	pieces := segment.Pieces
	if len(pieces) == 0 {
		fork.log.Debug("no pieces on remote segment")
		return nil
	}

	totalNumNodes, err := fork.getNodesEstimate(ctx)
	if err != nil {
		return Error.New("could not get estimate of total number of nodes: %w", err)
	}

	missingPieces, err := fork.nodesCache.MissingPieces(ctx, segment.CreatedAt, segment.Pieces)
	if err != nil {
		fork.totalStats.remoteSegmentsFailedToCheck++
		stats.iterationAggregates.remoteSegmentsFailedToCheck++
		return Error.New("error getting missing pieces: %w", err)
	}

	var clumpedPieces metabase.Pieces
	var lastNets []string
	if fork.doDeclumping {
		// if multiple pieces are on the same last_net, keep only the first one. The rest are
		// to be considered retrievable but unhealthy.
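		// For example (hypothetical values): if the pieces' last_nets are
		// ["10.0.0.0", "10.0.0.0", "10.1.0.0"], the second piece is reported as clumped,
		// while the first and third remain counted as healthy.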
		lastNets, err = fork.nodesCache.PiecesNodesLastNetsInOrder(ctx, segment.CreatedAt, pieces)
		if err != nil {
			fork.totalStats.remoteSegmentsFailedToCheck++
			stats.iterationAggregates.remoteSegmentsFailedToCheck++
			return errs.Combine(Error.New("error determining node last_nets"), err)
		}
		clumpedPieces = repair.FindClumpedPieces(segment.Pieces, lastNets)
	}

	numOutOfPlacementPieces := 0
	if fork.doPlacementCheck && segment.Placement != storj.EveryCountry {
		outOfPlacementPieces, err := fork.nodesCache.OutOfPlacementPieces(ctx, segment.CreatedAt, segment.Pieces, segment.Placement)
		if err != nil {
			fork.totalStats.remoteSegmentsFailedToCheck++
			stats.iterationAggregates.remoteSegmentsFailedToCheck++
			return errs.Combine(Error.New("error determining nodes placement"), err)
		}

		numOutOfPlacementPieces = len(outOfPlacementPieces)
	}

	numHealthy := len(pieces) - len(missingPieces) - len(clumpedPieces)
	mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces))) //mon:locked
	stats.segmentStats.segmentTotalCount.Observe(int64(len(pieces)))
	mon.IntVal("checker_segment_healthy_count").Observe(int64(numHealthy)) //mon:locked
	stats.segmentStats.segmentHealthyCount.Observe(int64(numHealthy))
	mon.IntVal("checker_segment_clumped_count").Observe(int64(len(clumpedPieces))) //mon:locked
	stats.segmentStats.segmentClumpedCount.Observe(int64(len(clumpedPieces)))
	mon.IntVal("checker_segment_off_placement_count").Observe(int64(numOutOfPlacementPieces)) //mon:locked
	stats.segmentStats.segmentOffPlacementCount.Observe(int64(numOutOfPlacementPieces))

	segmentAge := time.Since(segment.CreatedAt)
	mon.IntVal("checker_segment_age").Observe(int64(segmentAge.Seconds())) //mon:locked
	stats.segmentStats.segmentAge.Observe(int64(segmentAge.Seconds()))

	required, repairThreshold, successThreshold, _ := fork.loadRedundancy(segment.Redundancy)
	segmentHealth := repair.SegmentHealth(numHealthy, required, totalNumNodes, fork.nodeFailureRate)
	mon.FloatVal("checker_segment_health").Observe(segmentHealth) //mon:locked
	stats.segmentStats.segmentHealth.Observe(segmentHealth)

	// We repair when the number of healthy pieces is less than or equal to the repair threshold
	// and greater than or equal to the minimum required pieces in redundancy, except for the case
	// when the repair and success thresholds are the same (a case usually seen during testing).
	// A separate case is when we find pieces which are outside the segment placement; such
	// segments are put into the repair queue right away.
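	// For example (hypothetical RS values): with required=29, repairThreshold=52 and
	// successThreshold=65, a segment with 50 healthy pieces is queued for repair, while one
	// with 53 healthy pieces is not, unless it has out-of-placement pieces.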
	if (numHealthy <= repairThreshold && numHealthy < successThreshold) || numOutOfPlacementPieces > 0 {
		mon.FloatVal("checker_injured_segment_health").Observe(segmentHealth) //mon:locked
		stats.segmentStats.injuredSegmentHealth.Observe(segmentHealth)
		fork.totalStats.remoteSegmentsNeedingRepair++
		stats.iterationAggregates.remoteSegmentsNeedingRepair++
		err := fork.repairQueue.Insert(ctx, &queue.InjuredSegment{
			StreamID:      segment.StreamID,
			Position:      segment.Position,
			UpdatedAt:     time.Now().UTC(),
			SegmentHealth: segmentHealth,
		}, func() {
			// Counters are increased after the queue has determined
			// that the segment wasn't already queued for repair.
			fork.totalStats.newRemoteSegmentsNeedingRepair++
			stats.iterationAggregates.newRemoteSegmentsNeedingRepair++
		})
		if err != nil {
			fork.log.Error("error adding injured segment to queue", zap.Error(err))
			return nil
		}

		// monitor irreparable segments
		numRetrievable := len(pieces) - len(missingPieces)
		if numRetrievable < required {
			if !slices.Contains(fork.totalStats.objectsLost, segment.StreamID) {
				fork.totalStats.objectsLost = append(fork.totalStats.objectsLost, segment.StreamID)
			}

			if !slices.Contains(stats.iterationAggregates.objectsLost, segment.StreamID) {
				stats.iterationAggregates.objectsLost = append(stats.iterationAggregates.objectsLost, segment.StreamID)
			}

			repairedAt := time.Time{}
			if segment.RepairedAt != nil {
				repairedAt = *segment.RepairedAt
			}

			var segmentAge time.Duration
			if segment.CreatedAt.Before(repairedAt) {
				segmentAge = time.Since(repairedAt)
			} else {
				segmentAge = time.Since(segment.CreatedAt)
			}

			mon.IntVal("checker_segment_time_until_irreparable").Observe(int64(segmentAge.Seconds())) //mon:locked
			stats.segmentStats.segmentTimeUntilIrreparable.Observe(int64(segmentAge.Seconds()))

			fork.totalStats.remoteSegmentsLost++
			stats.iterationAggregates.remoteSegmentsLost++

			mon.Counter("checker_segments_below_min_req").Inc(1) //mon:locked
			stats.segmentStats.segmentsBelowMinReq.Inc(1)

			var unhealthyNodes []string
			for _, p := range missingPieces {
				unhealthyNodes = append(unhealthyNodes, p.StorageNode.String())
			}
			fork.log.Warn("checker found irreparable segment", zap.String("Segment StreamID", segment.StreamID.String()), zap.Int("Segment Position",
				int(segment.Position.Encode())), zap.Int("total pieces", len(pieces)), zap.Int("min required", required), zap.String("unhealthy node IDs", strings.Join(unhealthyNodes, ",")))
		} else if numRetrievable > repairThreshold {
			// This segment is to be repaired because of clumping (it wouldn't need repair yet
			// otherwise). Produce a brief report of where the clumping occurred so that we have
			// a better understanding of the cause.
			clumpedNets := clumpingReport{lastNets: lastNets}
			fork.log.Info("segment needs repair because of clumping", zap.Stringer("Segment StreamID", segment.StreamID), zap.Uint64("Segment Position", segment.Position.Encode()), zap.Int("total pieces", len(pieces)), zap.Int("min required", required), zap.Stringer("clumping", &clumpedNets))
		}
	} else {
		if numHealthy > repairThreshold && numHealthy <= (repairThreshold+len(fork.totalStats.remoteSegmentsOverThreshold)) {
			// record metrics for segments right above repair threshold
			// numHealthy=repairThreshold+1 through numHealthy=repairThreshold+5
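			// (For example, with a hypothetical repairThreshold of 52, a segment with
			// numHealthy=54 increments remoteSegmentsOverThreshold[1].)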
			for i := range fork.totalStats.remoteSegmentsOverThreshold {
				if numHealthy == (repairThreshold + i + 1) {
					fork.totalStats.remoteSegmentsOverThreshold[i]++
					break
				}
			}
		}

		if numHealthy > repairThreshold && numHealthy <= (repairThreshold+len(stats.iterationAggregates.remoteSegmentsOverThreshold)) {
			// record metrics for segments right above repair threshold
			// numHealthy=repairThreshold+1 through numHealthy=repairThreshold+5
			for i := range stats.iterationAggregates.remoteSegmentsOverThreshold {
				if numHealthy == (repairThreshold + i + 1) {
					stats.iterationAggregates.remoteSegmentsOverThreshold[i]++
					break
				}
			}
		}
	}

	return nil
}

type clumpingReport struct {
	lastNets []string
}

// String produces the clumping report. In case the satellite isn't logging at the required level,
// we avoid doing the work of building the report until String() is called.
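// For example (illustrative input), lastNets of ["a", "a", "b", "c", "c", "c"] produce a report
// like "[a]: 2, [c]: 3"; nets that appear only once are omitted, and ordering may vary.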
func (cr *clumpingReport) String() string {
	netCounts := make(map[string]int)
	for _, lastNet := range cr.lastNets {
		if lastNet == "" {
			lastNet = "unknown"
		}
		netCounts[lastNet]++
	}

	counts := make([]string, 0, len(netCounts))
	for lastNet, count := range netCounts {
		if count > 1 {
			counts = append(counts, fmt.Sprintf("[%s]: %d", lastNet, count))
		}
	}

	return strings.Join(counts, ", ")
}