satellite/repair: implemented ranged loop observer

Implemented the observer and partial, and created new structures to keep
monkit metrics reported the same way as in the segment loop.

Change-Id: I209c126096c84b94d4717332e56238266f6cd004
Qweder93 2023-01-19 18:17:01 +02:00 committed by Nikolai Siedov
parent c320835a38
commit d6a948f59d
10 changed files with 1125 additions and 22 deletions
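For context, the ranged loop contracts this commit implements can be read off the new methods below. A minimal sketch of those interfaces, reconstructed from this diff rather than copied from the rangedloop package, looks roughly like this:

// Sketch (assumption): Observer/Partial as inferred from the methods
// implemented by RangedLoopObserver and repairPartial in this commit.
package rangedloop

import (
	"context"
	"time"

	"storj.io/storj/satellite/metabase/segmentloop"
)

// Observer hooks into a ranged loop iteration: Start runs once per iteration,
// Fork creates a Partial for one segment range, Join merges a finished Partial
// back into the Observer, and Finish runs after all ranges are processed.
type Observer interface {
	Start(ctx context.Context, startTime time.Time) error
	Fork(ctx context.Context) (Partial, error)
	Join(ctx context.Context, partial Partial) error
	Finish(ctx context.Context) error
}

// Partial processes one range of segments, one batch at a time.
type Partial interface {
	Process(ctx context.Context, segments []segmentloop.Segment) error
}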

View File

@@ -353,22 +353,27 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB,
})
}
{ // setup datarepair
// TODO: simplify argument list somehow
peer.Repair.Checker = checker.NewChecker(
peer.Log.Named("repair:checker"),
peer.DB.RepairQueue(),
peer.Metainfo.Metabase,
peer.Metainfo.SegmentLoop,
peer.Overlay.Service,
config.Checker)
peer.Services.Add(lifecycle.Item{
Name: "repair:checker",
Run: peer.Repair.Checker.Run,
Close: peer.Repair.Checker.Close,
})
peer.Debug.Server.Panel.Add(
debug.Cycle("Repair Checker", peer.Repair.Checker.Loop))
{ // setup data repair
log := peer.Log.Named("repair:checker")
if config.Repairer.UseRangedLoop {
log.Info("using ranged loop")
} else {
peer.Repair.Checker = checker.NewChecker(
log,
peer.DB.RepairQueue(),
peer.Metainfo.Metabase,
peer.Metainfo.SegmentLoop,
peer.Overlay.Service,
config.Checker)
peer.Services.Add(lifecycle.Item{
Name: "repair:checker",
Run: peer.Repair.Checker.Run,
Close: peer.Repair.Checker.Close,
})
peer.Debug.Server.Panel.Add(
debug.Cycle("Repair Checker", peer.Repair.Checker.Loop))
}
}
{ // setup reputation

View File

@@ -82,7 +82,7 @@ func TestObserverGarbageCollectionBloomFilters(t *testing.T) {
// service instantiated for the testplanet.
rangedloopConfig := planet.Satellites[0].Config.RangedLoop
segments := rangedloop.NewMetabaseRangeSplitter(planet.Satellites[0].Metabase.DB, rangedloopConfig.AsOfSystemInterval, rangedloopConfig.BatchSize)
rangedLoop := rangedloop.NewService(zap.NewNop(), planet.Satellites[0].Config.RangedLoop, &segments,
rangedLoop := rangedloop.NewService(zap.NewNop(), planet.Satellites[0].Config.RangedLoop, segments,
[]rangedloop.Observer{observer})
_, err = rangedLoop.RunOnce(ctx)
@@ -200,7 +200,7 @@ func TestObserverGarbageCollectionBloomFilters_AllowNotEmptyBucket(t *testing.T)
// service instantiated for the testplanet.
rangedloopConfig := planet.Satellites[0].Config.RangedLoop
segments := rangedloop.NewMetabaseRangeSplitter(planet.Satellites[0].Metabase.DB, rangedloopConfig.AsOfSystemInterval, rangedloopConfig.BatchSize)
rangedLoop := rangedloop.NewService(zap.NewNop(), planet.Satellites[0].Config.RangedLoop, &segments,
rangedLoop := rangedloop.NewService(zap.NewNop(), planet.Satellites[0].Config.RangedLoop, segments,
[]rangedloop.Observer{observer})
_, err = rangedLoop.RunOnce(ctx)

View File

@@ -31,8 +31,8 @@ type MetabaseSegmentProvider struct {
}
// NewMetabaseRangeSplitter creates the segment provider.
func NewMetabaseRangeSplitter(db *metabase.DB, asOfSystemInterval time.Duration, batchSize int) MetabaseRangeSplitter {
return MetabaseRangeSplitter{
func NewMetabaseRangeSplitter(db *metabase.DB, asOfSystemInterval time.Duration, batchSize int) *MetabaseRangeSplitter {
return &MetabaseRangeSplitter{
db: db,
asOfSystemInterval: asOfSystemInterval,
batchSize: batchSize,

View File

@@ -20,9 +20,12 @@ import (
"storj.io/storj/satellite/audit"
"storj.io/storj/satellite/gc/bloomfilter"
"storj.io/storj/satellite/gracefulexit"
"storj.io/storj/satellite/mailservice"
"storj.io/storj/satellite/metabase"
"storj.io/storj/satellite/metabase/rangedloop"
"storj.io/storj/satellite/metrics"
"storj.io/storj/satellite/overlay"
"storj.io/storj/satellite/repair/checker"
)
// RangedLoop is the satellite ranged loop process.
@@ -48,6 +51,18 @@ type RangedLoop struct {
Observer rangedloop.Observer
}
Mail struct {
Service *mailservice.Service
}
Overlay struct {
Service *overlay.Service
}
Repair struct {
Observer rangedloop.Observer
}
GracefulExit struct {
Observer rangedloop.Observer
}
@@ -66,7 +81,7 @@ type RangedLoop struct {
}
// NewRangedLoop creates a new satellite ranged loop process.
func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Config, atomicLogLevel *zap.AtomicLevel) (*RangedLoop, error) {
func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Config, atomicLogLevel *zap.AtomicLevel) (_ *RangedLoop, err error) {
peer := &RangedLoop{
Log: log,
DB: db,
@@ -117,6 +132,39 @@ func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Conf
db.StoragenodeAccounting())
}
{ // setup mail service
peer.Mail.Service, err = setupMailService(peer.Log, *config)
if err != nil {
return nil, errs.Combine(err, peer.Close())
}
peer.Services.Add(lifecycle.Item{
Name: "mail:service",
Close: peer.Mail.Service.Close,
})
}
{ // setup overlay
peer.Overlay.Service, err = overlay.NewService(peer.Log.Named("overlay"), peer.DB.OverlayCache(), peer.DB.NodeEvents(), peer.Mail.Service, config.Console.ExternalAddress, config.Console.SatelliteName, config.Overlay)
if err != nil {
return nil, errs.Combine(err, peer.Close())
}
peer.Services.Add(lifecycle.Item{
Name: "overlay",
Run: peer.Overlay.Service.Run,
Close: peer.Overlay.Service.Close,
})
}
{ // setup repair
peer.Repair.Observer = checker.NewRangedLoopObserver(
peer.Log.Named("repair:checker"),
peer.DB.RepairQueue(),
peer.Overlay.Service,
config.Checker,
)
}
{ // setup garbage collection bloom filter observer
peer.GarbageCollectionBF.Observer = bloomfilter.NewObserver(log.Named("gc-bf"), config.GarbageCollectionBF, db.OverlayCache())
}
@@ -133,6 +181,7 @@ func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Conf
if config.Metrics.UseRangedLoop {
observers = append(observers, peer.Metrics.Observer)
}
if config.Tally.UseRangedLoop {
observers = append(observers, peer.Accounting.NodeTallyObserver)
}
@@ -145,8 +194,12 @@ func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Conf
observers = append(observers, peer.GarbageCollectionBF.Observer)
}
if config.Repairer.UseRangedLoop {
observers = append(observers, peer.Repair.Observer)
}
segments := rangedloop.NewMetabaseRangeSplitter(metabaseDB, config.RangedLoop.AsOfSystemInterval, config.RangedLoop.BatchSize)
peer.RangedLoop.Service = rangedloop.NewService(log.Named("rangedloop"), config.RangedLoop, &segments, observers)
peer.RangedLoop.Service = rangedloop.NewService(log.Named("rangedloop"), config.RangedLoop, segments, observers)
peer.Services.Add(lifecycle.Item{
Name: "rangeloop",

View File

@@ -90,6 +90,22 @@ type aggregateStats struct {
remoteSegmentsOverThreshold [5]int64
}
func (a *aggregateStats) combine(stats aggregateStats) {
a.objectsChecked += stats.objectsChecked
a.remoteSegmentsChecked += stats.remoteSegmentsChecked
a.remoteSegmentsNeedingRepair += stats.remoteSegmentsNeedingRepair
a.newRemoteSegmentsNeedingRepair += stats.newRemoteSegmentsNeedingRepair
a.remoteSegmentsLost += stats.remoteSegmentsLost
a.remoteSegmentsFailedToCheck += stats.remoteSegmentsFailedToCheck
a.objectsLost = append(a.objectsLost, stats.objectsLost...)
a.remoteSegmentsOverThreshold[0] += stats.remoteSegmentsOverThreshold[0]
a.remoteSegmentsOverThreshold[1] += stats.remoteSegmentsOverThreshold[1]
a.remoteSegmentsOverThreshold[2] += stats.remoteSegmentsOverThreshold[2]
a.remoteSegmentsOverThreshold[3] += stats.remoteSegmentsOverThreshold[3]
a.remoteSegmentsOverThreshold[4] += stats.remoteSegmentsOverThreshold[4]
}
func newStats(rs string) *stats {
return &stats{
iterationAggregates: new(aggregateStats),

View File

@@ -0,0 +1,425 @@
// Copyright (C) 2023 Storj Labs, Inc.
// See LICENSE for copying information.
package checker
import (
"context"
"reflect"
"sort"
"strings"
"sync"
"time"
"github.com/zeebo/errs"
"go.uber.org/zap"
"storj.io/common/storj"
"storj.io/common/uuid"
"storj.io/storj/satellite/metabase/rangedloop"
"storj.io/storj/satellite/metabase/segmentloop"
"storj.io/storj/satellite/overlay"
"storj.io/storj/satellite/repair"
"storj.io/storj/satellite/repair/queue"
)
var _ rangedloop.Observer = (*RangedLoopObserver)(nil)
// RangedLoopObserver implements the ranged loop Observer interface. Should be renamed to checkerObserver once rangedloop replaces segmentloop.
//
// architecture: Observer
type RangedLoopObserver struct {
logger *zap.Logger
repairQueue queue.RepairQueue
nodestate *ReliabilityCache
repairOverrides RepairOverridesMap
nodeFailureRate float64
repairQueueBatchSize int
// the following are reset on each iteration
startTime time.Time
TotalStats aggregateStats
mu sync.Mutex
statsCollector map[string]*observerRSStats
}
// NewRangedLoopObserver creates new checker observer instance.
func NewRangedLoopObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *overlay.Service, config Config) rangedloop.Observer {
return &RangedLoopObserver{
logger: logger,
repairQueue: repairQueue,
nodestate: NewReliabilityCache(overlay, config.ReliabilityCacheStaleness),
repairOverrides: config.RepairOverrides.GetMap(),
nodeFailureRate: config.NodeFailureRate,
repairQueueBatchSize: config.RepairQueueInsertBatchSize,
statsCollector: make(map[string]*observerRSStats),
}
}
// getNodesEstimate updates the estimate of the total number of nodes. It is guaranteed
// to return a number greater than 0 when the error is nil.
//
// We can't calculate this upon first starting a Ranged Loop Observer, because there may not be any
// nodes yet. We expect that there will be nodes before there are segments, though.
func (observer *RangedLoopObserver) getNodesEstimate(ctx context.Context) (int, error) {
// this should be safe to call frequently; it is an efficient caching lookup.
totalNumNodes, err := observer.nodestate.NumNodes(ctx)
if err != nil {
// We could proceed here by returning the last good value, or by returning a fallback
// constant estimate, like "20000", and we'd probably be fine, but it would be better
// not to have that happen silently for too long. Also, if we can't get this from the
// database, we probably can't modify the injured segments queue, so it won't help to
// proceed with this repair operation.
return 0, err
}
if totalNumNodes == 0 {
return 0, Error.New("segment health is meaningless: there are no nodes")
}
return totalNumNodes, nil
}
func (observer *RangedLoopObserver) createInsertBuffer() *queue.InsertBuffer {
return queue.NewInsertBuffer(observer.repairQueue, observer.repairQueueBatchSize)
}
// TestingCompareInjuredSegmentIDs compares the stream IDs of injured segments.
func (observer *RangedLoopObserver) TestingCompareInjuredSegmentIDs(ctx context.Context, streamIDs []uuid.UUID) error {
injuredSegments, err := observer.repairQueue.SelectN(ctx, 100)
if err != nil {
return err
}
var injuredSegmentsIds []uuid.UUID
for _, segment := range injuredSegments {
injuredSegmentsIds = append(injuredSegmentsIds, segment.StreamID)
}
sort.Slice(injuredSegmentsIds, func(i, j int) bool {
return injuredSegmentsIds[i].Less(injuredSegmentsIds[j])
})
sort.Slice(streamIDs, func(i, j int) bool {
return streamIDs[i].Less(streamIDs[j])
})
if !reflect.DeepEqual(streamIDs, injuredSegmentsIds) {
return errs.New("injured objects ids are different")
}
return nil
}
// Start starts parallel segments loop.
func (observer *RangedLoopObserver) Start(ctx context.Context, startTime time.Time) error {
observer.startTime = startTime
observer.TotalStats = aggregateStats{}
return nil
}
// Fork creates a Partial to process a chunk of all the segments.
func (observer *RangedLoopObserver) Fork(ctx context.Context) (rangedloop.Partial, error) {
return newRangedLoopCheckerPartial(observer), nil
}
// Join is called after the chunk for Partial is done.
// This gives the opportunity to merge the output like in a reduce step.
func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) error {
repPartial, ok := partial.(*repairPartial)
if !ok {
return Error.New("expected partial type %T but got %T", repPartial, partial)
}
if err := repPartial.repairQueue.Flush(ctx); err != nil {
return Error.Wrap(err)
}
for rs, partialStats := range repPartial.rsStats {
observer.statsCollector[rs].iterationAggregates.combine(partialStats.iterationAggregates)
}
observer.TotalStats.combine(repPartial.totalStats)
return nil
}
// Finish is called after all segments are processed by all observers.
func (observer *RangedLoopObserver) Finish(ctx context.Context) error {
// remove all segments which were not seen as unhealthy by this checker iteration
healthyDeleted, err := observer.repairQueue.Clean(ctx, observer.startTime)
if err != nil {
return Error.Wrap(err)
}
observer.collectAggregates()
mon.IntVal("remote_files_checked").Observe(observer.TotalStats.objectsChecked) //mon:locked
mon.IntVal("remote_segments_checked").Observe(observer.TotalStats.remoteSegmentsChecked) //mon:locked
mon.IntVal("remote_segments_failed_to_check").Observe(observer.TotalStats.remoteSegmentsFailedToCheck) //mon:locked
mon.IntVal("remote_segments_needing_repair").Observe(observer.TotalStats.remoteSegmentsNeedingRepair) //mon:locked
mon.IntVal("new_remote_segments_needing_repair").Observe(observer.TotalStats.newRemoteSegmentsNeedingRepair) //mon:locked
mon.IntVal("remote_segments_lost").Observe(observer.TotalStats.remoteSegmentsLost) //mon:locked
mon.IntVal("remote_files_lost").Observe(int64(len(observer.TotalStats.objectsLost))) //mon:locked
mon.IntVal("remote_segments_over_threshold_1").Observe(observer.TotalStats.remoteSegmentsOverThreshold[0]) //mon:locked
mon.IntVal("remote_segments_over_threshold_2").Observe(observer.TotalStats.remoteSegmentsOverThreshold[1]) //mon:locked
mon.IntVal("remote_segments_over_threshold_3").Observe(observer.TotalStats.remoteSegmentsOverThreshold[2]) //mon:locked
mon.IntVal("remote_segments_over_threshold_4").Observe(observer.TotalStats.remoteSegmentsOverThreshold[3]) //mon:locked
mon.IntVal("remote_segments_over_threshold_5").Observe(observer.TotalStats.remoteSegmentsOverThreshold[4]) //mon:locked
mon.IntVal("healthy_segments_removed_from_queue").Observe(healthyDeleted) //mon:locked
allUnhealthy := observer.TotalStats.remoteSegmentsNeedingRepair + observer.TotalStats.remoteSegmentsFailedToCheck
allChecked := observer.TotalStats.remoteSegmentsChecked
allHealthy := allChecked - allUnhealthy
mon.FloatVal("remote_segments_healthy_percentage").Observe(100 * float64(allHealthy) / float64(allChecked)) //mon:locked
return nil
}
func (observer *RangedLoopObserver) collectAggregates() {
for _, stats := range observer.statsCollector {
stats.collectAggregates()
}
}
func (observer *RangedLoopObserver) getObserverStats(rsString string) *observerRSStats {
observer.mu.Lock()
defer observer.mu.Unlock()
observerStats, exists := observer.statsCollector[rsString]
if !exists {
observerStats = &observerRSStats{aggregateStats{}, newIterationRSStats(rsString), newSegmentRSStats(rsString)}
mon.Chain(observerStats)
observer.statsCollector[rsString] = observerStats
}
return observerStats
}
// RefreshReliabilityCache forces refreshing node online status cache.
func (observer *RangedLoopObserver) RefreshReliabilityCache(ctx context.Context) error {
return observer.nodestate.Refresh(ctx)
}
// repairPartial implements the ranged loop Partial interface.
//
// architecture: Observer
type repairPartial struct {
repairQueue *queue.InsertBuffer
nodestate *ReliabilityCache
rsStats map[string]*partialRSStats
repairOverrides RepairOverridesMap
nodeFailureRate float64
getNodesEstimate func(ctx context.Context) (int, error)
log *zap.Logger
lastStreamID uuid.UUID
totalStats aggregateStats
getObserverStats func(string) *observerRSStats
}
// newRangedLoopCheckerPartial creates new checker partial instance.
func newRangedLoopCheckerPartial(observer *RangedLoopObserver) rangedloop.Partial {
// we can only share thread-safe objects.
return &repairPartial{
repairQueue: observer.createInsertBuffer(),
nodestate: observer.nodestate,
rsStats: make(map[string]*partialRSStats),
repairOverrides: observer.repairOverrides,
nodeFailureRate: observer.nodeFailureRate,
getNodesEstimate: observer.getNodesEstimate,
log: observer.logger,
getObserverStats: observer.getObserverStats,
}
}
func (rp *repairPartial) getStatsByRS(redundancy storj.RedundancyScheme) *partialRSStats {
rsString := getRSString(rp.loadRedundancy(redundancy))
stats, ok := rp.rsStats[rsString]
if !ok {
observerStats := rp.getObserverStats(rsString)
rp.rsStats[rsString] = &partialRSStats{
iterationAggregates: aggregateStats{},
segmentStats: observerStats.segmentStats,
}
return rp.rsStats[rsString]
}
return stats
}
func (rp *repairPartial) loadRedundancy(redundancy storj.RedundancyScheme) (int, int, int, int) {
repair := int(redundancy.RepairShares)
overrideValue := rp.repairOverrides.GetOverrideValue(redundancy)
if overrideValue != 0 {
repair = int(overrideValue)
}
return int(redundancy.RequiredShares), repair, int(redundancy.OptimalShares), int(redundancy.TotalShares)
}
// Process is the repair checker's implementation of the ranged loop Partial's Process method.
func (rp *repairPartial) Process(ctx context.Context, segments []segmentloop.Segment) (err error) {
for _, segment := range segments {
if err := rp.process(ctx, &segment); err != nil {
return err
}
}
return nil
}
func (rp *repairPartial) process(ctx context.Context, segment *segmentloop.Segment) (err error) {
if segment.Inline() {
if rp.lastStreamID.Compare(segment.StreamID) != 0 {
rp.lastStreamID = segment.StreamID
rp.totalStats.objectsChecked++
}
return nil
}
// ignore segment if expired
if segment.Expired(time.Now()) {
return nil
}
stats := rp.getStatsByRS(segment.Redundancy)
if rp.lastStreamID.Compare(segment.StreamID) != 0 {
rp.lastStreamID = segment.StreamID
stats.iterationAggregates.objectsChecked++
rp.totalStats.objectsChecked++
}
rp.totalStats.remoteSegmentsChecked++
stats.iterationAggregates.remoteSegmentsChecked++
// ensure we get values, even if only zero values, so that redash can have an alert based on this
mon.Counter("checker_segments_below_min_req").Inc(0) //mon:locked
pieces := segment.Pieces
if len(pieces) == 0 {
rp.log.Debug("no pieces on remote segment")
return nil
}
totalNumNodes, err := rp.getNodesEstimate(ctx)
if err != nil {
return Error.New("could not get estimate of total number of nodes: %w", err)
}
missingPieces, err := rp.nodestate.MissingPieces(ctx, segment.CreatedAt, segment.Pieces)
if err != nil {
rp.totalStats.remoteSegmentsFailedToCheck++
stats.iterationAggregates.remoteSegmentsFailedToCheck++
return Error.New("error getting missing pieces: %w", err)
}
numHealthy := len(pieces) - len(missingPieces)
mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces))) //mon:locked
stats.segmentStats.segmentTotalCount.Observe(int64(len(pieces)))
mon.IntVal("checker_segment_healthy_count").Observe(int64(numHealthy)) //mon:locked
stats.segmentStats.segmentHealthyCount.Observe(int64(numHealthy))
segmentAge := time.Since(segment.CreatedAt)
mon.IntVal("checker_segment_age").Observe(int64(segmentAge.Seconds())) //mon:locked
stats.segmentStats.segmentAge.Observe(int64(segmentAge.Seconds()))
required, repairThreshold, successThreshold, _ := rp.loadRedundancy(segment.Redundancy)
segmentHealth := repair.SegmentHealth(numHealthy, required, totalNumNodes, rp.nodeFailureRate)
mon.FloatVal("checker_segment_health").Observe(segmentHealth) //mon:locked
stats.segmentStats.segmentHealth.Observe(segmentHealth)
// we repair when the number of healthy pieces is less than or equal to the repair threshold and is greater than or equal to
// the minimum required pieces in redundancy,
// except for the case when the repair and success thresholds are the same (a case usually seen during testing)
if numHealthy <= repairThreshold && numHealthy < successThreshold {
mon.FloatVal("checker_injured_segment_health").Observe(segmentHealth) //mon:locked
stats.segmentStats.injuredSegmentHealth.Observe(segmentHealth)
rp.totalStats.remoteSegmentsNeedingRepair++
stats.iterationAggregates.remoteSegmentsNeedingRepair++
err := rp.repairQueue.Insert(ctx, &queue.InjuredSegment{
StreamID: segment.StreamID,
Position: segment.Position,
UpdatedAt: time.Now().UTC(),
SegmentHealth: segmentHealth,
}, func() {
// Counters are increased after the queue has determined
// that the segment wasn't already queued for repair.
rp.totalStats.newRemoteSegmentsNeedingRepair++
stats.iterationAggregates.newRemoteSegmentsNeedingRepair++
})
if err != nil {
rp.log.Error("error adding injured segment to queue", zap.Error(err))
return nil
}
// monitor irreparable segments
if numHealthy < required {
if !containsStreamID(rp.totalStats.objectsLost, segment.StreamID) {
rp.totalStats.objectsLost = append(rp.totalStats.objectsLost, segment.StreamID)
}
if !containsStreamID(stats.iterationAggregates.objectsLost, segment.StreamID) {
stats.iterationAggregates.objectsLost = append(stats.iterationAggregates.objectsLost, segment.StreamID)
}
repairedAt := time.Time{}
if segment.RepairedAt != nil {
repairedAt = *segment.RepairedAt
}
var segmentAge time.Duration
if segment.CreatedAt.Before(repairedAt) {
segmentAge = time.Since(repairedAt)
} else {
segmentAge = time.Since(segment.CreatedAt)
}
mon.IntVal("checker_segment_time_until_irreparable").Observe(int64(segmentAge.Seconds())) //mon:locked
stats.segmentStats.segmentTimeUntilIrreparable.Observe(int64(segmentAge.Seconds()))
rp.totalStats.remoteSegmentsLost++
stats.iterationAggregates.remoteSegmentsLost++
mon.Counter("checker_segments_below_min_req").Inc(1) //mon:locked
stats.segmentStats.segmentsBelowMinReq.Inc(1)
var unhealthyNodes []string
for _, p := range missingPieces {
unhealthyNodes = append(unhealthyNodes, p.StorageNode.String())
}
rp.log.Warn("checker found irreparable segment", zap.String("Segment StreamID", segment.StreamID.String()), zap.Int("Segment Position",
int(segment.Position.Encode())), zap.Int("total pieces", len(pieces)), zap.Int("min required", required), zap.String("unhealthy node IDs", strings.Join(unhealthyNodes, ",")))
}
} else {
if numHealthy > repairThreshold && numHealthy <= (repairThreshold+len(rp.totalStats.remoteSegmentsOverThreshold)) {
// record metrics for segments right above repair threshold
// numHealthy=repairThreshold+1 through numHealthy=repairThreshold+5
for i := range rp.totalStats.remoteSegmentsOverThreshold {
if numHealthy == (repairThreshold + i + 1) {
rp.totalStats.remoteSegmentsOverThreshold[i]++
break
}
}
}
if numHealthy > repairThreshold && numHealthy <= (repairThreshold+len(stats.iterationAggregates.remoteSegmentsOverThreshold)) {
// record metrics for segments right above repair threshold
// numHealthy=repairThreshold+1 through numHealthy=repairThreshold+5
for i := range stats.iterationAggregates.remoteSegmentsOverThreshold {
if numHealthy == (repairThreshold + i + 1) {
stats.iterationAggregates.remoteSegmentsOverThreshold[i]++
break
}
}
}
}
return nil
}
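Putting the observer, the partial, and the peer wiring together, end-to-end usage looks roughly like the sketch below; the helper name and its parameters are hypothetical, while the constructors, config fields, and RunOnce call come from this diff (the new tests in the next file exercise the same flow through testplanet):

// Sketch (assumption): wiring the new checker observer into a ranged loop
// service outside of the satellite peer; helper name and parameters are
// hypothetical.
package example

import (
	"context"

	"go.uber.org/zap"

	"storj.io/storj/satellite/metabase"
	"storj.io/storj/satellite/metabase/rangedloop"
	"storj.io/storj/satellite/overlay"
	"storj.io/storj/satellite/repair/checker"
	"storj.io/storj/satellite/repair/queue"
)

func runRepairCheckerOnce(
	ctx context.Context,
	log *zap.Logger,
	metabaseDB *metabase.DB,
	repairQueue queue.RepairQueue,
	overlayService *overlay.Service,
	checkerConfig checker.Config,
	loopConfig rangedloop.Config,
) error {
	// Observer state is shared across ranges; each Fork hands out a Partial
	// that only touches thread-safe members.
	observer := checker.NewRangedLoopObserver(
		log.Named("repair:checker"), repairQueue, overlayService, checkerConfig)

	// The range splitter divides metabase segments into ranges for parallel
	// processing; note it is now returned as a pointer.
	splitter := rangedloop.NewMetabaseRangeSplitter(
		metabaseDB, loopConfig.AsOfSystemInterval, loopConfig.BatchSize)

	service := rangedloop.NewService(
		log.Named("rangedloop"), loopConfig, splitter,
		[]rangedloop.Observer{observer})

	_, err := service.RunOnce(ctx)
	return err
}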

View File

@@ -0,0 +1,461 @@
// Copyright (C) 2023 Storj Labs, Inc.
// See LICENSE for copying information.
package checker_test
import (
"fmt"
"reflect"
"sort"
"testing"
"time"
"github.com/stretchr/testify/require"
"go.uber.org/zap"
"storj.io/common/memory"
"storj.io/common/storj"
"storj.io/common/testcontext"
"storj.io/common/testrand"
"storj.io/common/uuid"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite"
"storj.io/storj/satellite/metabase"
"storj.io/storj/satellite/metabase/rangedloop"
"storj.io/storj/satellite/repair/checker"
"storj.io/storj/satellite/repair/queue"
)
func TestIdentifyInjuredSegmentsObserver(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.UseRangedLoop = true
config.RangedLoop.Parallelism = 4
config.RangedLoop.BatchSize = 4
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
repairQueue := planet.Satellites[0].DB.RepairQueue()
rs := storj.RedundancyScheme{
RequiredShares: 2,
RepairShares: 3,
OptimalShares: 4,
TotalShares: 5,
ShareSize: 256,
}
projectID := planet.Uplinks[0].Projects[0].ID
err := planet.Uplinks[0].CreateBucket(ctx, planet.Satellites[0], "test-bucket")
require.NoError(t, err)
expectedLocation := metabase.SegmentLocation{
ProjectID: projectID,
BucketName: "test-bucket",
}
// add some valid pointers
for x := 0; x < 10; x++ {
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("a-%d", x))
insertSegment(ctx, t, planet, rs, expectedLocation, createPieces(planet, rs), nil)
}
// add pointer that needs repair
expectedLocation.ObjectKey = metabase.ObjectKey("b-0")
b0StreamID := insertSegment(ctx, t, planet, rs, expectedLocation, createLostPieces(planet, rs), nil)
// add pointer that is unhealthy, but is expired
expectedLocation.ObjectKey = metabase.ObjectKey("b-1")
expiresAt := time.Now().Add(-time.Hour)
insertSegment(ctx, t, planet, rs, expectedLocation, createLostPieces(planet, rs), &expiresAt)
// add some valid pointers
for x := 0; x < 10; x++ {
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("c-%d", x))
insertSegment(ctx, t, planet, rs, expectedLocation, createPieces(planet, rs), nil)
}
_, err = planet.Satellites[0].RangedLoop.RangedLoop.Service.RunOnce(ctx)
require.NoError(t, err)
// check that the unhealthy, non-expired segment was added to the queue
// and that the expired segment was ignored
injuredSegment, err := repairQueue.Select(ctx)
require.NoError(t, err)
err = repairQueue.Delete(ctx, injuredSegment)
require.NoError(t, err)
require.Equal(t, b0StreamID, injuredSegment.StreamID)
_, err = repairQueue.Select(ctx)
require.Error(t, err)
})
}
func TestIdentifyIrreparableSegmentsObserver(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 3, UplinkCount: 1,
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.UseRangedLoop = true
config.RangedLoop.Parallelism = 4
config.RangedLoop.BatchSize = 4
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
rangeLoopService := planet.Satellites[0].RangedLoop.RangedLoop.Service
const numberOfNodes = 10
pieces := make(metabase.Pieces, 0, numberOfNodes)
// use online nodes
for i, storagenode := range planet.StorageNodes {
pieces = append(pieces, metabase.Piece{
Number: uint16(i),
StorageNode: storagenode.ID(),
})
}
// simulate offline nodes
expectedLostPieces := make(map[int32]bool)
for i := len(pieces); i < numberOfNodes; i++ {
pieces = append(pieces, metabase.Piece{
Number: uint16(i),
StorageNode: storj.NodeID{byte(i)},
})
expectedLostPieces[int32(i)] = true
}
rs := storj.RedundancyScheme{
ShareSize: 256,
RequiredShares: 4,
RepairShares: 8,
OptimalShares: 9,
TotalShares: 10,
}
projectID := planet.Uplinks[0].Projects[0].ID
err := planet.Uplinks[0].CreateBucket(ctx, planet.Satellites[0], "test-bucket")
require.NoError(t, err)
expectedLocation := metabase.SegmentLocation{
ProjectID: projectID,
BucketName: "test-bucket",
}
// when the number of healthy pieces is less than the minimum required by the redundancy scheme,
// the segment is considered irreparable but will still be put into the repair queue
expectedLocation.ObjectKey = "piece"
insertSegment(ctx, t, planet, rs, expectedLocation, pieces, nil)
expectedLocation.ObjectKey = "piece-expired"
expiresAt := time.Now().Add(-time.Hour)
insertSegment(ctx, t, planet, rs, expectedLocation, pieces, &expiresAt)
_, err = rangeLoopService.RunOnce(ctx)
require.NoError(t, err)
// check that the single irreparable segment was added to the repair queue
repairQueue := planet.Satellites[0].DB.RepairQueue()
_, err = repairQueue.Select(ctx)
require.NoError(t, err)
count, err := repairQueue.Count(ctx)
require.NoError(t, err)
require.Equal(t, 1, count)
// check irreparable once again but wait a second
time.Sleep(1 * time.Second)
_, err = rangeLoopService.RunOnce(ctx)
require.NoError(t, err)
expectedLocation.ObjectKey = "piece"
_, err = planet.Satellites[0].Metabase.DB.DeleteObjectExactVersion(ctx, metabase.DeleteObjectExactVersion{
ObjectLocation: expectedLocation.Object(),
Version: metabase.DefaultVersion,
})
require.NoError(t, err)
_, err = rangeLoopService.RunOnce(ctx)
require.NoError(t, err)
count, err = repairQueue.Count(ctx)
require.NoError(t, err)
require.Equal(t, 0, count)
})
}
func TestIgnoringCopiedSegmentsObserver(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.UseRangedLoop = true
config.RangedLoop.Parallelism = 4
config.RangedLoop.BatchSize = 4
config.Metainfo.RS.Min = 2
config.Metainfo.RS.Repair = 3
config.Metainfo.RS.Success = 4
config.Metainfo.RS.Total = 4
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellite := planet.Satellites[0]
uplink := planet.Uplinks[0]
metabaseDB := satellite.Metabase.DB
rangedLoopService := planet.Satellites[0].RangedLoop.RangedLoop.Service
repairQueue := satellite.DB.RepairQueue()
err := uplink.CreateBucket(ctx, satellite, "test-bucket")
require.NoError(t, err)
testData := testrand.Bytes(8 * memory.KiB)
err = uplink.Upload(ctx, satellite, "testbucket", "test/path", testData)
require.NoError(t, err)
project, err := uplink.OpenProject(ctx, satellite)
require.NoError(t, err)
defer ctx.Check(project.Close)
segments, err := metabaseDB.TestingAllSegments(ctx)
require.NoError(t, err)
require.Len(t, segments, 1)
_, err = project.CopyObject(ctx, "testbucket", "test/path", "testbucket", "empty", nil)
require.NoError(t, err)
segmentsAfterCopy, err := metabaseDB.TestingAllSegments(ctx)
require.NoError(t, err)
require.Len(t, segmentsAfterCopy, 2)
err = planet.StopNodeAndUpdate(ctx, planet.FindNode(segments[0].Pieces[0].StorageNode))
require.NoError(t, err)
_, err = rangedLoopService.RunOnce(ctx)
require.NoError(t, err)
// check that the streamID of the injured segment in the repair queue is the same as in the original segment.
injuredSegment, err := repairQueue.Select(ctx)
require.NoError(t, err)
require.Equal(t, segments[0].StreamID, injuredSegment.StreamID)
// check that the repair queue has only the original segment, not the copied one.
injuredSegments, err := repairQueue.Count(ctx)
require.NoError(t, err)
require.Equal(t, 1, injuredSegments)
})
}
func TestCleanRepairQueueObserver(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.UseRangedLoop = true
config.RangedLoop.Parallelism = 4
config.RangedLoop.BatchSize = 4
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
rangedLoopService := planet.Satellites[0].RangedLoop.RangedLoop.Service
repairQueue := planet.Satellites[0].DB.RepairQueue()
observer := planet.Satellites[0].RangedLoop.Repair.Observer.(*checker.RangedLoopObserver)
planet.Satellites[0].Repair.Repairer.Loop.Pause()
rs := storj.RedundancyScheme{
RequiredShares: 2,
RepairShares: 3,
OptimalShares: 4,
TotalShares: 5,
ShareSize: 256,
}
projectID := planet.Uplinks[0].Projects[0].ID
err := planet.Uplinks[0].CreateBucket(ctx, planet.Satellites[0], "test-bucket")
require.NoError(t, err)
expectedLocation := metabase.SegmentLocation{
ProjectID: projectID,
BucketName: "test-bucket",
}
healthyCount := 5
for i := 0; i < healthyCount; i++ {
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("healthy-%d", i))
insertSegment(ctx, t, planet, rs, expectedLocation, createPieces(planet, rs), nil)
}
unhealthyCount := 5
unhealthyIDs := make(map[uuid.UUID]struct{})
for i := 0; i < unhealthyCount; i++ {
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("unhealthy-%d", i))
unhealthyStreamID := insertSegment(ctx, t, planet, rs, expectedLocation, createLostPieces(planet, rs), nil)
unhealthyIDs[unhealthyStreamID] = struct{}{}
}
// suspend enough nodes to make healthy pointers unhealthy
for i := rs.RequiredShares; i < rs.OptimalShares; i++ {
require.NoError(t, planet.Satellites[0].Overlay.DB.TestSuspendNodeUnknownAudit(ctx, planet.StorageNodes[i].ID(), time.Now()))
}
require.NoError(t, observer.RefreshReliabilityCache(ctx))
// check that repair queue is empty to avoid false positive
count, err := repairQueue.Count(ctx)
require.NoError(t, err)
require.Equal(t, 0, count)
_, err = rangedLoopService.RunOnce(ctx)
require.NoError(t, err)
// check that the pointers were put into the repair queue
// and not cleaned up at the end of the checker iteration
count, err = repairQueue.Count(ctx)
require.NoError(t, err)
require.Equal(t, healthyCount+unhealthyCount, count)
// unsuspend nodes to make the previously healthy pointers healthy again
for i := rs.RequiredShares; i < rs.OptimalShares; i++ {
require.NoError(t, planet.Satellites[0].Overlay.DB.TestUnsuspendNodeUnknownAudit(ctx, planet.StorageNodes[i].ID()))
}
require.NoError(t, observer.RefreshReliabilityCache(ctx))
// The checker will not insert/update the now healthy segments causing
// them to be removed from the queue at the end of the checker iteration
_, err = rangedLoopService.RunOnce(ctx)
require.NoError(t, err)
// only unhealthy segments should remain
count, err = repairQueue.Count(ctx)
require.NoError(t, err)
require.Equal(t, unhealthyCount, count)
segs, err := repairQueue.SelectN(ctx, count)
require.NoError(t, err)
require.Equal(t, len(unhealthyIDs), len(segs))
for _, s := range segs {
_, ok := unhealthyIDs[s.StreamID]
require.True(t, ok)
}
})
}
func TestRepairObserver(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
config.Repairer.UseRangedLoop = true
config.RangedLoop.Parallelism = 4
config.RangedLoop.BatchSize = 4
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
rs := storj.RedundancyScheme{
RequiredShares: 2,
RepairShares: 3,
OptimalShares: 4,
TotalShares: 5,
ShareSize: 256,
}
err := planet.Uplinks[0].CreateBucket(ctx, planet.Satellites[0], "test-bucket")
require.NoError(t, err)
expectedLocation := metabase.SegmentLocation{
ProjectID: planet.Uplinks[0].Projects[0].ID,
BucketName: "test-bucket",
}
// add some valid pointers
for x := 0; x < 20; x++ {
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("a-%d", x))
insertSegment(ctx, t, planet, rs, expectedLocation, createPieces(planet, rs), nil)
}
var injuredSegmentStreamIDs []uuid.UUID
// add pointers that need repair
for x := 0; x < 5; x++ {
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("b-%d", x))
injuredSegmentStreamID := insertSegment(ctx, t, planet, rs, expectedLocation, createLostPieces(planet, rs), nil)
injuredSegmentStreamIDs = append(injuredSegmentStreamIDs, injuredSegmentStreamID)
}
// add pointer that is unhealthy, but is expired
expectedLocation.ObjectKey = metabase.ObjectKey("d-1")
expiresAt := time.Now().Add(-time.Hour)
insertSegment(ctx, t, planet, rs, expectedLocation, createLostPieces(planet, rs), &expiresAt)
// add some valid pointers
for x := 0; x < 20; x++ {
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("c-%d", x))
insertSegment(ctx, t, planet, rs, expectedLocation, createPieces(planet, rs), nil)
}
compare := func(insertedSegmentsIDs []uuid.UUID, fromRepairQueue []queue.InjuredSegment) bool {
var repairQueueIDs []uuid.UUID
for _, v := range fromRepairQueue {
repairQueueIDs = append(repairQueueIDs, v.StreamID)
}
sort.Slice(insertedSegmentsIDs, func(i, j int) bool {
return insertedSegmentsIDs[i].Less(insertedSegmentsIDs[j])
})
sort.Slice(repairQueueIDs, func(i, j int) bool {
return repairQueueIDs[i].Less(repairQueueIDs[j])
})
return reflect.DeepEqual(insertedSegmentsIDs, repairQueueIDs)
}
type TestCase struct {
BatchSize int
Parallelism int
}
_, err = planet.Satellites[0].RangedLoop.RangedLoop.Service.RunOnce(ctx)
require.NoError(t, err)
injuredSegments, err := planet.Satellites[0].DB.RepairQueue().SelectN(ctx, 10)
require.NoError(t, err)
require.Len(t, injuredSegments, 5)
require.True(t, compare(injuredSegmentStreamIDs, injuredSegments))
_, err = planet.Satellites[0].DB.RepairQueue().Clean(ctx, time.Now())
require.NoError(t, err)
for _, tc := range []TestCase{
{1, 1},
{3, 1},
{5, 1},
{1, 3},
{3, 3},
{5, 3},
{1, 5},
{3, 5},
{5, 5},
} {
observer := planet.Satellites[0].RangedLoop.Repair.Observer
config := planet.Satellites[0].Config
service := rangedloop.NewService(planet.Log(), rangedloop.Config{
Parallelism: tc.Parallelism,
BatchSize: tc.BatchSize,
}, rangedloop.NewMetabaseRangeSplitter(planet.Satellites[0].Metabase.DB, config.RangedLoop.AsOfSystemInterval, config.RangedLoop.BatchSize), []rangedloop.Observer{observer})
_, err = service.RunOnce(ctx)
require.NoError(t, err)
injuredSegments, err = planet.Satellites[0].DB.RepairQueue().SelectN(ctx, 10)
require.NoError(t, err)
require.Len(t, injuredSegments, 5)
require.True(t, compare(injuredSegmentStreamIDs, injuredSegments))
_, err = planet.Satellites[0].DB.RepairQueue().Clean(ctx, time.Now())
require.NoError(t, err)
}
})
}

View File

@@ -0,0 +1,139 @@
// Copyright (C) 2023 Storj Labs, Inc.
// See LICENSE for copying information.
package checker
import "github.com/spacemonkeygo/monkit/v3"
type observerRSStats struct {
// iterationAggregates contains the aggregated counters across all partials.
// The values are observed by the distributions in iterationStats
iterationAggregates aggregateStats
// iterationStats are the distributions for per-iteration stats. The distributions
// are updated using iterationAggregates after each loop iteration completes.
iterationStats iterationRSStats
// segmentStats contains threadsafe distributions and is shared by all partials. The
// distributions are updated when processing the segment.
segmentStats *segmentRSStats
}
// Stats implements the monkit.StatSource interface.
func (stats *observerRSStats) Stats(cb func(key monkit.SeriesKey, field string, val float64)) {
stats.iterationStats.objectsChecked.Stats(cb)
stats.iterationStats.remoteSegmentsChecked.Stats(cb)
stats.iterationStats.remoteSegmentsNeedingRepair.Stats(cb)
stats.iterationStats.newRemoteSegmentsNeedingRepair.Stats(cb)
stats.iterationStats.remoteSegmentsLost.Stats(cb)
stats.iterationStats.objectsLost.Stats(cb)
stats.iterationStats.remoteSegmentsFailedToCheck.Stats(cb)
stats.iterationStats.remoteSegmentsHealthyPercentage.Stats(cb)
stats.iterationStats.remoteSegmentsOverThreshold1.Stats(cb)
stats.iterationStats.remoteSegmentsOverThreshold2.Stats(cb)
stats.iterationStats.remoteSegmentsOverThreshold3.Stats(cb)
stats.iterationStats.remoteSegmentsOverThreshold4.Stats(cb)
stats.iterationStats.remoteSegmentsOverThreshold5.Stats(cb)
stats.segmentStats.segmentsBelowMinReq.Stats(cb)
stats.segmentStats.segmentTotalCount.Stats(cb)
stats.segmentStats.segmentHealthyCount.Stats(cb)
stats.segmentStats.segmentAge.Stats(cb)
stats.segmentStats.segmentHealth.Stats(cb)
stats.segmentStats.injuredSegmentHealth.Stats(cb)
stats.segmentStats.segmentTimeUntilIrreparable.Stats(cb)
}
type iterationRSStats struct {
objectsChecked *monkit.IntVal
remoteSegmentsChecked *monkit.IntVal
remoteSegmentsNeedingRepair *monkit.IntVal
newRemoteSegmentsNeedingRepair *monkit.IntVal
remoteSegmentsLost *monkit.IntVal
objectsLost *monkit.IntVal
remoteSegmentsFailedToCheck *monkit.IntVal
remoteSegmentsHealthyPercentage *monkit.FloatVal
// remoteSegmentsOverThreshold[0]=# of healthy=rt+1, remoteSegmentsOverThreshold[1]=# of healthy=rt+2, etc...
remoteSegmentsOverThreshold1 *monkit.IntVal
remoteSegmentsOverThreshold2 *monkit.IntVal
remoteSegmentsOverThreshold3 *monkit.IntVal
remoteSegmentsOverThreshold4 *monkit.IntVal
remoteSegmentsOverThreshold5 *monkit.IntVal
}
func newIterationRSStats(rs string) iterationRSStats {
return iterationRSStats{
objectsChecked: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_objects_checked").WithTag("rs_scheme", rs)),
remoteSegmentsChecked: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_checked").WithTag("rs_scheme", rs)),
remoteSegmentsNeedingRepair: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_needing_repair").WithTag("rs_scheme", rs)),
newRemoteSegmentsNeedingRepair: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "new_remote_segments_needing_repair").WithTag("rs_scheme", rs)),
remoteSegmentsLost: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_lost").WithTag("rs_scheme", rs)),
objectsLost: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "objects_lost").WithTag("rs_scheme", rs)),
remoteSegmentsFailedToCheck: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_failed_to_check").WithTag("rs_scheme", rs)),
remoteSegmentsHealthyPercentage: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_healthy_percentage").WithTag("rs_scheme", rs)),
remoteSegmentsOverThreshold1: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_1").WithTag("rs_scheme", rs)),
remoteSegmentsOverThreshold2: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_2").WithTag("rs_scheme", rs)),
remoteSegmentsOverThreshold3: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_3").WithTag("rs_scheme", rs)),
remoteSegmentsOverThreshold4: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_4").WithTag("rs_scheme", rs)),
remoteSegmentsOverThreshold5: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_5").WithTag("rs_scheme", rs)),
}
}
type partialRSStats struct {
// iterationAggregates are counts aggregated by each partial for stats for the whole loop
// and are aggregated into the observer during join. These aggregated counters
// are tallied into distributions at the end of each loop.
iterationAggregates aggregateStats
// segmentStats contains thread-safe distributions and is shared by all partials. The
// distributions are updated when processing the segment.
segmentStats *segmentRSStats
}
type segmentRSStats struct {
segmentsBelowMinReq *monkit.Counter
segmentTotalCount *monkit.IntVal
segmentHealthyCount *monkit.IntVal
segmentAge *monkit.IntVal
segmentHealth *monkit.FloatVal
injuredSegmentHealth *monkit.FloatVal
segmentTimeUntilIrreparable *monkit.IntVal
}
func newSegmentRSStats(rs string) *segmentRSStats {
return &segmentRSStats{
segmentsBelowMinReq: monkit.NewCounter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segments_below_min_req").WithTag("rs_scheme", rs)),
segmentTotalCount: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_total_count").WithTag("rs_scheme", rs)),
segmentHealthyCount: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_healthy_count").WithTag("rs_scheme", rs)),
segmentAge: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_age").WithTag("rs_scheme", rs)),
segmentHealth: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_health").WithTag("rs_scheme", rs)),
injuredSegmentHealth: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_injured_segment_health").WithTag("rs_scheme", rs)),
segmentTimeUntilIrreparable: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_time_until_irreparable").WithTag("rs_scheme", rs)),
}
}
func (stats *observerRSStats) collectAggregates() {
stats.iterationStats.objectsChecked.Observe(stats.iterationAggregates.objectsChecked)
stats.iterationStats.remoteSegmentsChecked.Observe(stats.iterationAggregates.remoteSegmentsChecked)
stats.iterationStats.remoteSegmentsNeedingRepair.Observe(stats.iterationAggregates.remoteSegmentsNeedingRepair)
stats.iterationStats.newRemoteSegmentsNeedingRepair.Observe(stats.iterationAggregates.newRemoteSegmentsNeedingRepair)
stats.iterationStats.remoteSegmentsLost.Observe(stats.iterationAggregates.remoteSegmentsLost)
stats.iterationStats.objectsLost.Observe(int64(len(stats.iterationAggregates.objectsLost)))
stats.iterationStats.remoteSegmentsFailedToCheck.Observe(stats.iterationAggregates.remoteSegmentsFailedToCheck)
stats.iterationStats.remoteSegmentsOverThreshold1.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[0])
stats.iterationStats.remoteSegmentsOverThreshold2.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[1])
stats.iterationStats.remoteSegmentsOverThreshold3.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[2])
stats.iterationStats.remoteSegmentsOverThreshold4.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[3])
stats.iterationStats.remoteSegmentsOverThreshold5.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[4])
allUnhealthy := stats.iterationAggregates.remoteSegmentsNeedingRepair + stats.iterationAggregates.remoteSegmentsFailedToCheck
allChecked := stats.iterationAggregates.remoteSegmentsChecked
allHealthy := allChecked - allUnhealthy
stats.iterationStats.remoteSegmentsHealthyPercentage.Observe(100 * float64(allHealthy) / float64(allChecked))
// resetting iteration aggregates after loop run finished
stats.iterationAggregates = aggregateStats{}
}

View File

@@ -35,6 +35,7 @@ type Config struct {
MaxExcessRateOptimalThreshold float64 `help:"ratio applied to the optimal threshold to calculate the excess of the maximum number of repaired pieces to upload" default:"0.05"`
InMemoryRepair bool `help:"whether to download pieces for repair in memory (true) or download to disk (false)" default:"false"`
ReputationUpdateEnabled bool `help:"whether the audit score of nodes should be updated as a part of repair" default:"false"`
UseRangedLoop bool `help:"whether to use ranged loop instead of segment loop" default:"false"`
}
// Service contains the information needed to run the repair service.

View File

@@ -913,6 +913,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
# time limit for an entire repair job, from queue pop to upload completion
# repairer.total-timeout: 45m0s
# whether to use ranged loop instead of segment loop
# repairer.use-ranged-loop: false
# the number of times a node has been audited to not be considered a New Node
# reputation.audit-count: 100