satellite/repair: implemented ranged loop observer

Implemented the observer and partial, and created new structures so that the mon metrics keep being reported the same way as in the segment loop.

Change-Id: I209c126096c84b94d4717332e56238266f6cd004
parent c320835a38
commit d6a948f59d
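For orientation, the sketch below shows how the pieces introduced in this change fit together: the repair checker is constructed as a ranged loop observer, registered with the ranged loop service alongside the metabase range splitter, and driven by RunOnce. This is an illustrative sketch only; the function name, parameter list, and log names are assumptions, while the constructors, config fields, and types are the ones that appear in the diff below.

```go
package sketch

import (
	"context"

	"go.uber.org/zap"

	"storj.io/storj/satellite/metabase"
	"storj.io/storj/satellite/metabase/rangedloop"
	"storj.io/storj/satellite/overlay"
	"storj.io/storj/satellite/repair/checker"
	"storj.io/storj/satellite/repair/queue"
)

// runRepairCheckOnce wires the new repair checker observer into the ranged
// loop and runs a single pass, mirroring NewRangedLoop and the tests below.
// The signature is hypothetical; only the called constructors are real.
func runRepairCheckOnce(
	ctx context.Context,
	log *zap.Logger,
	metabaseDB *metabase.DB,
	repairQueue queue.RepairQueue,
	overlayService *overlay.Service,
	checkerConfig checker.Config,
	loopConfig rangedloop.Config,
) error {
	// The observer takes over the repair checker's job from the segment loop:
	// it queues injured segments and reports the same mon metrics.
	observer := checker.NewRangedLoopObserver(
		log.Named("repair:checker"), repairQueue, overlayService, checkerConfig)

	// NewMetabaseRangeSplitter now returns a pointer, so it is passed to the
	// service directly instead of taking its address.
	segments := rangedloop.NewMetabaseRangeSplitter(
		metabaseDB, loopConfig.AsOfSystemInterval, loopConfig.BatchSize)

	service := rangedloop.NewService(log.Named("rangedloop"), loopConfig, segments,
		[]rangedloop.Observer{observer})

	_, err := service.RunOnce(ctx)
	return err
}
```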
@ -353,22 +353,27 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB,
|
||||
})
|
||||
}
|
||||
|
||||
{ // setup datarepair
|
||||
// TODO: simplify argument list somehow
|
||||
peer.Repair.Checker = checker.NewChecker(
|
||||
peer.Log.Named("repair:checker"),
|
||||
peer.DB.RepairQueue(),
|
||||
peer.Metainfo.Metabase,
|
||||
peer.Metainfo.SegmentLoop,
|
||||
peer.Overlay.Service,
|
||||
config.Checker)
|
||||
peer.Services.Add(lifecycle.Item{
|
||||
Name: "repair:checker",
|
||||
Run: peer.Repair.Checker.Run,
|
||||
Close: peer.Repair.Checker.Close,
|
||||
})
|
||||
peer.Debug.Server.Panel.Add(
|
||||
debug.Cycle("Repair Checker", peer.Repair.Checker.Loop))
|
||||
{ // setup data repair
|
||||
log := peer.Log.Named("repair:checker")
|
||||
if config.Repairer.UseRangedLoop {
|
||||
log.Info("using ranged loop")
|
||||
} else {
|
||||
peer.Repair.Checker = checker.NewChecker(
|
||||
log,
|
||||
peer.DB.RepairQueue(),
|
||||
peer.Metainfo.Metabase,
|
||||
peer.Metainfo.SegmentLoop,
|
||||
peer.Overlay.Service,
|
||||
config.Checker)
|
||||
peer.Services.Add(lifecycle.Item{
|
||||
Name: "repair:checker",
|
||||
Run: peer.Repair.Checker.Run,
|
||||
Close: peer.Repair.Checker.Close,
|
||||
})
|
||||
|
||||
peer.Debug.Server.Panel.Add(
|
||||
debug.Cycle("Repair Checker", peer.Repair.Checker.Loop))
|
||||
}
|
||||
}
|
||||
|
||||
{ // setup reputation
|
||||
|
@ -82,7 +82,7 @@ func TestObserverGarbageCollectionBloomFilters(t *testing.T) {
|
||||
// service instantiated for the testplanet.
|
||||
rangedloopConfig := planet.Satellites[0].Config.RangedLoop
|
||||
segments := rangedloop.NewMetabaseRangeSplitter(planet.Satellites[0].Metabase.DB, rangedloopConfig.AsOfSystemInterval, rangedloopConfig.BatchSize)
|
||||
rangedLoop := rangedloop.NewService(zap.NewNop(), planet.Satellites[0].Config.RangedLoop, &segments,
|
||||
rangedLoop := rangedloop.NewService(zap.NewNop(), planet.Satellites[0].Config.RangedLoop, segments,
|
||||
[]rangedloop.Observer{observer})
|
||||
|
||||
_, err = rangedLoop.RunOnce(ctx)
|
||||
@ -200,7 +200,7 @@ func TestObserverGarbageCollectionBloomFilters_AllowNotEmptyBucket(t *testing.T)
|
||||
// service instantiated for the testplanet.
|
||||
rangedloopConfig := planet.Satellites[0].Config.RangedLoop
|
||||
segments := rangedloop.NewMetabaseRangeSplitter(planet.Satellites[0].Metabase.DB, rangedloopConfig.AsOfSystemInterval, rangedloopConfig.BatchSize)
|
||||
rangedLoop := rangedloop.NewService(zap.NewNop(), planet.Satellites[0].Config.RangedLoop, &segments,
|
||||
rangedLoop := rangedloop.NewService(zap.NewNop(), planet.Satellites[0].Config.RangedLoop, segments,
|
||||
[]rangedloop.Observer{observer})
|
||||
|
||||
_, err = rangedLoop.RunOnce(ctx)
|
||||
|
@ -31,8 +31,8 @@ type MetabaseSegmentProvider struct {
|
||||
}
|
||||
|
||||
// NewMetabaseRangeSplitter creates the segment provider.
|
||||
func NewMetabaseRangeSplitter(db *metabase.DB, asOfSystemInterval time.Duration, batchSize int) MetabaseRangeSplitter {
|
||||
return MetabaseRangeSplitter{
|
||||
func NewMetabaseRangeSplitter(db *metabase.DB, asOfSystemInterval time.Duration, batchSize int) *MetabaseRangeSplitter {
|
||||
return &MetabaseRangeSplitter{
|
||||
db: db,
|
||||
asOfSystemInterval: asOfSystemInterval,
|
||||
batchSize: batchSize,
|
||||
|
@ -20,9 +20,12 @@ import (
|
||||
"storj.io/storj/satellite/audit"
|
||||
"storj.io/storj/satellite/gc/bloomfilter"
|
||||
"storj.io/storj/satellite/gracefulexit"
|
||||
"storj.io/storj/satellite/mailservice"
|
||||
"storj.io/storj/satellite/metabase"
|
||||
"storj.io/storj/satellite/metabase/rangedloop"
|
||||
"storj.io/storj/satellite/metrics"
|
||||
"storj.io/storj/satellite/overlay"
|
||||
"storj.io/storj/satellite/repair/checker"
|
||||
)
|
||||
|
||||
// RangedLoop is the satellite ranged loop process.
|
||||
@ -48,6 +51,18 @@ type RangedLoop struct {
|
||||
Observer rangedloop.Observer
|
||||
}
|
||||
|
||||
Mail struct {
|
||||
Service *mailservice.Service
|
||||
}
|
||||
|
||||
Overlay struct {
|
||||
Service *overlay.Service
|
||||
}
|
||||
|
||||
Repair struct {
|
||||
Observer rangedloop.Observer
|
||||
}
|
||||
|
||||
GracefulExit struct {
|
||||
Observer rangedloop.Observer
|
||||
}
|
||||
@ -66,7 +81,7 @@ type RangedLoop struct {
|
||||
}
|
||||
|
||||
// NewRangedLoop creates a new satellite ranged loop process.
|
||||
func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Config, atomicLogLevel *zap.AtomicLevel) (*RangedLoop, error) {
|
||||
func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Config, atomicLogLevel *zap.AtomicLevel) (_ *RangedLoop, err error) {
|
||||
peer := &RangedLoop{
|
||||
Log: log,
|
||||
DB: db,
|
||||
@ -117,6 +132,39 @@ func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Conf
|
||||
db.StoragenodeAccounting())
|
||||
}
|
||||
|
||||
{ // setup mail service
|
||||
peer.Mail.Service, err = setupMailService(peer.Log, *config)
|
||||
if err != nil {
|
||||
return nil, errs.Combine(err, peer.Close())
|
||||
}
|
||||
|
||||
peer.Services.Add(lifecycle.Item{
|
||||
Name: "mail:service",
|
||||
Close: peer.Mail.Service.Close,
|
||||
})
|
||||
}
|
||||
|
||||
{ // setup overlay
|
||||
peer.Overlay.Service, err = overlay.NewService(peer.Log.Named("overlay"), peer.DB.OverlayCache(), peer.DB.NodeEvents(), peer.Mail.Service, config.Console.ExternalAddress, config.Console.SatelliteName, config.Overlay)
|
||||
if err != nil {
|
||||
return nil, errs.Combine(err, peer.Close())
|
||||
}
|
||||
peer.Services.Add(lifecycle.Item{
|
||||
Name: "overlay",
|
||||
Run: peer.Overlay.Service.Run,
|
||||
Close: peer.Overlay.Service.Close,
|
||||
})
|
||||
}
|
||||
|
||||
{ // setup repair
|
||||
peer.Repair.Observer = checker.NewRangedLoopObserver(
|
||||
peer.Log.Named("repair:checker"),
|
||||
peer.DB.RepairQueue(),
|
||||
peer.Overlay.Service,
|
||||
config.Checker,
|
||||
)
|
||||
}
|
||||
|
||||
{ // setup garbage collection bloom filter observer
|
||||
peer.GarbageCollectionBF.Observer = bloomfilter.NewObserver(log.Named("gc-bf"), config.GarbageCollectionBF, db.OverlayCache())
|
||||
}
|
||||
@ -133,6 +181,7 @@ func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Conf
|
||||
if config.Metrics.UseRangedLoop {
|
||||
observers = append(observers, peer.Metrics.Observer)
|
||||
}
|
||||
|
||||
if config.Tally.UseRangedLoop {
|
||||
observers = append(observers, peer.Accounting.NodeTallyObserver)
|
||||
}
|
||||
@ -145,8 +194,12 @@ func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Conf
|
||||
observers = append(observers, peer.GarbageCollectionBF.Observer)
|
||||
}
|
||||
|
||||
if config.Repairer.UseRangedLoop {
|
||||
observers = append(observers, peer.Repair.Observer)
|
||||
}
|
||||
|
||||
segments := rangedloop.NewMetabaseRangeSplitter(metabaseDB, config.RangedLoop.AsOfSystemInterval, config.RangedLoop.BatchSize)
|
||||
peer.RangedLoop.Service = rangedloop.NewService(log.Named("rangedloop"), config.RangedLoop, &segments, observers)
|
||||
peer.RangedLoop.Service = rangedloop.NewService(log.Named("rangedloop"), config.RangedLoop, segments, observers)
|
||||
|
||||
peer.Services.Add(lifecycle.Item{
|
||||
Name: "rangeloop",
|
||||
|
@ -90,6 +90,22 @@ type aggregateStats struct {
|
||||
remoteSegmentsOverThreshold [5]int64
|
||||
}
|
||||
|
||||
func (a *aggregateStats) combine(stats aggregateStats) {
|
||||
a.objectsChecked += stats.objectsChecked
|
||||
a.remoteSegmentsChecked += stats.remoteSegmentsChecked
|
||||
a.remoteSegmentsNeedingRepair += stats.remoteSegmentsNeedingRepair
|
||||
a.newRemoteSegmentsNeedingRepair += stats.newRemoteSegmentsNeedingRepair
|
||||
a.remoteSegmentsLost += stats.remoteSegmentsLost
|
||||
a.remoteSegmentsFailedToCheck += stats.remoteSegmentsFailedToCheck
|
||||
a.objectsLost = append(a.objectsLost, stats.objectsLost...)
|
||||
|
||||
a.remoteSegmentsOverThreshold[0] += stats.remoteSegmentsOverThreshold[0]
|
||||
a.remoteSegmentsOverThreshold[1] += stats.remoteSegmentsOverThreshold[1]
|
||||
a.remoteSegmentsOverThreshold[2] += stats.remoteSegmentsOverThreshold[2]
|
||||
a.remoteSegmentsOverThreshold[3] += stats.remoteSegmentsOverThreshold[3]
|
||||
a.remoteSegmentsOverThreshold[4] += stats.remoteSegmentsOverThreshold[4]
|
||||
}
|
||||
|
||||
func newStats(rs string) *stats {
|
||||
return &stats{
|
||||
iterationAggregates: new(aggregateStats),
|
||||
|
satellite/repair/checker/observer.go (new file, 425 lines)
@ -0,0 +1,425 @@
|
||||
// Copyright (C) 2023 Storj Labs, Inc.
|
||||
// See LICENSE for copying information.
|
||||
|
||||
package checker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"reflect"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/zeebo/errs"
|
||||
"go.uber.org/zap"
|
||||
|
||||
"storj.io/common/storj"
|
||||
"storj.io/common/uuid"
|
||||
"storj.io/storj/satellite/metabase/rangedloop"
|
||||
"storj.io/storj/satellite/metabase/segmentloop"
|
||||
"storj.io/storj/satellite/overlay"
|
||||
"storj.io/storj/satellite/repair"
|
||||
"storj.io/storj/satellite/repair/queue"
|
||||
)
|
||||
|
||||
var _ rangedloop.Observer = (*RangedLoopObserver)(nil)
|
||||
|
||||
// RangedLoopObserver implements the ranged loop Observer interface. It should be renamed to checkerObserver once the ranged loop replaces the segment loop.
|
||||
//
|
||||
// architecture: Observer
|
||||
type RangedLoopObserver struct {
|
||||
logger *zap.Logger
|
||||
repairQueue queue.RepairQueue
|
||||
nodestate *ReliabilityCache
|
||||
repairOverrides RepairOverridesMap
|
||||
nodeFailureRate float64
|
||||
repairQueueBatchSize int
|
||||
|
||||
// the following are reset on each iteration
|
||||
startTime time.Time
|
||||
TotalStats aggregateStats
|
||||
|
||||
mu sync.Mutex
|
||||
statsCollector map[string]*observerRSStats
|
||||
}
|
||||
|
||||
// NewRangedLoopObserver creates a new checker observer instance.
|
||||
func NewRangedLoopObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *overlay.Service, config Config) rangedloop.Observer {
|
||||
return &RangedLoopObserver{
|
||||
logger: logger,
|
||||
|
||||
repairQueue: repairQueue,
|
||||
nodestate: NewReliabilityCache(overlay, config.ReliabilityCacheStaleness),
|
||||
repairOverrides: config.RepairOverrides.GetMap(),
|
||||
nodeFailureRate: config.NodeFailureRate,
|
||||
repairQueueBatchSize: config.RepairQueueInsertBatchSize,
|
||||
statsCollector: make(map[string]*observerRSStats),
|
||||
}
|
||||
}
|
||||
|
||||
// getNodesEstimate updates the estimate of the total number of nodes. It is guaranteed
|
||||
// to return a number greater than 0 when the error is nil.
|
||||
//
|
||||
// We can't calculate this upon first starting a Ranged Loop Observer, because there may not be any
|
||||
// nodes yet. We expect that there will be nodes before there are segments, though.
|
||||
func (observer *RangedLoopObserver) getNodesEstimate(ctx context.Context) (int, error) {
|
||||
// this should be safe to call frequently; it is an efficient caching lookup.
|
||||
totalNumNodes, err := observer.nodestate.NumNodes(ctx)
|
||||
if err != nil {
|
||||
// We could proceed here by returning the last good value, or by returning a fallback
|
||||
// constant estimate, like "20000", and we'd probably be fine, but it would be better
|
||||
// not to have that happen silently for too long. Also, if we can't get this from the
|
||||
// database, we probably can't modify the injured segments queue, so it won't help to
|
||||
// proceed with this repair operation.
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if totalNumNodes == 0 {
|
||||
return 0, Error.New("segment health is meaningless: there are no nodes")
|
||||
}
|
||||
|
||||
return totalNumNodes, nil
|
||||
}
|
||||
|
||||
func (observer *RangedLoopObserver) createInsertBuffer() *queue.InsertBuffer {
|
||||
return queue.NewInsertBuffer(observer.repairQueue, observer.repairQueueBatchSize)
|
||||
}
|
||||
|
||||
// TestingCompareInjuredSegmentIDs compares the stream IDs of the injured segments in the repair queue with the given list.
|
||||
func (observer *RangedLoopObserver) TestingCompareInjuredSegmentIDs(ctx context.Context, streamIDs []uuid.UUID) error {
|
||||
injuredSegments, err := observer.repairQueue.SelectN(ctx, 100)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var injuredSegmentsIds []uuid.UUID
|
||||
for _, segment := range injuredSegments {
|
||||
injuredSegmentsIds = append(injuredSegmentsIds, segment.StreamID)
|
||||
}
|
||||
|
||||
sort.Slice(injuredSegmentsIds, func(i, j int) bool {
|
||||
return injuredSegmentsIds[i].Less(injuredSegmentsIds[j])
|
||||
})
|
||||
|
||||
sort.Slice(streamIDs, func(i, j int) bool {
|
||||
return streamIDs[i].Less(streamIDs[j])
|
||||
})
|
||||
|
||||
if !reflect.DeepEqual(streamIDs, injuredSegmentsIds) {
|
||||
return errs.New("injured objects ids are different")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Start marks the beginning of a loop iteration and resets the aggregated statistics.
|
||||
func (observer *RangedLoopObserver) Start(ctx context.Context, startTime time.Time) error {
|
||||
observer.startTime = startTime
|
||||
observer.TotalStats = aggregateStats{}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Fork creates a Partial to process a chunk of all the segments.
|
||||
func (observer *RangedLoopObserver) Fork(ctx context.Context) (rangedloop.Partial, error) {
|
||||
return newRangedLoopCheckerPartial(observer), nil
|
||||
}
|
||||
|
||||
// Join is called after the chunk for Partial is done.
|
||||
// This gives the opportunity to merge the output like in a reduce step.
|
||||
func (observer *RangedLoopObserver) Join(ctx context.Context, partial rangedloop.Partial) error {
|
||||
repPartial, ok := partial.(*repairPartial)
|
||||
if !ok {
|
||||
return Error.New("expected partial type %T but got %T", repPartial, partial)
|
||||
}
|
||||
|
||||
if err := repPartial.repairQueue.Flush(ctx); err != nil {
|
||||
return Error.Wrap(err)
|
||||
}
|
||||
|
||||
for rs, partialStats := range repPartial.rsStats {
|
||||
observer.statsCollector[rs].iterationAggregates.combine(partialStats.iterationAggregates)
|
||||
}
|
||||
|
||||
observer.TotalStats.combine(repPartial.totalStats)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Finish is called after all segments are processed by all observers.
|
||||
func (observer *RangedLoopObserver) Finish(ctx context.Context) error {
|
||||
// remove all segments which were not seen as unhealthy by this checker iteration
|
||||
healthyDeleted, err := observer.repairQueue.Clean(ctx, observer.startTime)
|
||||
if err != nil {
|
||||
return Error.Wrap(err)
|
||||
}
|
||||
|
||||
observer.collectAggregates()
|
||||
|
||||
mon.IntVal("remote_files_checked").Observe(observer.TotalStats.objectsChecked) //mon:locked
|
||||
mon.IntVal("remote_segments_checked").Observe(observer.TotalStats.remoteSegmentsChecked) //mon:locked
|
||||
mon.IntVal("remote_segments_failed_to_check").Observe(observer.TotalStats.remoteSegmentsFailedToCheck) //mon:locked
|
||||
mon.IntVal("remote_segments_needing_repair").Observe(observer.TotalStats.remoteSegmentsNeedingRepair) //mon:locked
|
||||
mon.IntVal("new_remote_segments_needing_repair").Observe(observer.TotalStats.newRemoteSegmentsNeedingRepair) //mon:locked
|
||||
mon.IntVal("remote_segments_lost").Observe(observer.TotalStats.remoteSegmentsLost) //mon:locked
|
||||
mon.IntVal("remote_files_lost").Observe(int64(len(observer.TotalStats.objectsLost))) //mon:locked
|
||||
mon.IntVal("remote_segments_over_threshold_1").Observe(observer.TotalStats.remoteSegmentsOverThreshold[0]) //mon:locked
|
||||
mon.IntVal("remote_segments_over_threshold_2").Observe(observer.TotalStats.remoteSegmentsOverThreshold[1]) //mon:locked
|
||||
mon.IntVal("remote_segments_over_threshold_3").Observe(observer.TotalStats.remoteSegmentsOverThreshold[2]) //mon:locked
|
||||
mon.IntVal("remote_segments_over_threshold_4").Observe(observer.TotalStats.remoteSegmentsOverThreshold[3]) //mon:locked
|
||||
mon.IntVal("remote_segments_over_threshold_5").Observe(observer.TotalStats.remoteSegmentsOverThreshold[4]) //mon:locked
|
||||
mon.IntVal("healthy_segments_removed_from_queue").Observe(healthyDeleted) //mon:locked
|
||||
allUnhealthy := observer.TotalStats.remoteSegmentsNeedingRepair + observer.TotalStats.remoteSegmentsFailedToCheck
|
||||
allChecked := observer.TotalStats.remoteSegmentsChecked
|
||||
allHealthy := allChecked - allUnhealthy
|
||||
mon.FloatVal("remote_segments_healthy_percentage").Observe(100 * float64(allHealthy) / float64(allChecked)) //mon:locked
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (observer *RangedLoopObserver) collectAggregates() {
|
||||
for _, stats := range observer.statsCollector {
|
||||
stats.collectAggregates()
|
||||
}
|
||||
}
|
||||
|
||||
func (observer *RangedLoopObserver) getObserverStats(rsString string) *observerRSStats {
|
||||
observer.mu.Lock()
|
||||
defer observer.mu.Unlock()
|
||||
|
||||
observerStats, exists := observer.statsCollector[rsString]
|
||||
if !exists {
|
||||
observerStats = &observerRSStats{aggregateStats{}, newIterationRSStats(rsString), newSegmentRSStats(rsString)}
|
||||
mon.Chain(observerStats)
|
||||
observer.statsCollector[rsString] = observerStats
|
||||
}
|
||||
|
||||
return observerStats
|
||||
}
|
||||
|
||||
// RefreshReliabilityCache forces refreshing node online status cache.
|
||||
func (observer *RangedLoopObserver) RefreshReliabilityCache(ctx context.Context) error {
|
||||
return observer.nodestate.Refresh(ctx)
|
||||
}
|
||||
|
||||
// repairPartial implements the ranged loop Partial interface.
|
||||
//
|
||||
// architecture: Observer
|
||||
type repairPartial struct {
|
||||
repairQueue *queue.InsertBuffer
|
||||
nodestate *ReliabilityCache
|
||||
rsStats map[string]*partialRSStats
|
||||
repairOverrides RepairOverridesMap
|
||||
nodeFailureRate float64
|
||||
getNodesEstimate func(ctx context.Context) (int, error)
|
||||
log *zap.Logger
|
||||
lastStreamID uuid.UUID
|
||||
totalStats aggregateStats
|
||||
|
||||
getObserverStats func(string) *observerRSStats
|
||||
}
|
||||
|
||||
// newRangedLoopCheckerPartial creates a new checker partial instance.
|
||||
func newRangedLoopCheckerPartial(observer *RangedLoopObserver) rangedloop.Partial {
|
||||
// we can only share thread-safe objects.
|
||||
return &repairPartial{
|
||||
repairQueue: observer.createInsertBuffer(),
|
||||
nodestate: observer.nodestate,
|
||||
rsStats: make(map[string]*partialRSStats),
|
||||
repairOverrides: observer.repairOverrides,
|
||||
nodeFailureRate: observer.nodeFailureRate,
|
||||
getNodesEstimate: observer.getNodesEstimate,
|
||||
log: observer.logger,
|
||||
getObserverStats: observer.getObserverStats,
|
||||
}
|
||||
}
|
||||
|
||||
func (rp *repairPartial) getStatsByRS(redundancy storj.RedundancyScheme) *partialRSStats {
|
||||
rsString := getRSString(rp.loadRedundancy(redundancy))
|
||||
|
||||
stats, ok := rp.rsStats[rsString]
|
||||
if !ok {
|
||||
observerStats := rp.getObserverStats(rsString)
|
||||
|
||||
rp.rsStats[rsString] = &partialRSStats{
|
||||
iterationAggregates: aggregateStats{},
|
||||
segmentStats: observerStats.segmentStats,
|
||||
}
|
||||
return rp.rsStats[rsString]
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
func (rp *repairPartial) loadRedundancy(redundancy storj.RedundancyScheme) (int, int, int, int) {
|
||||
repair := int(redundancy.RepairShares)
|
||||
|
||||
overrideValue := rp.repairOverrides.GetOverrideValue(redundancy)
|
||||
if overrideValue != 0 {
|
||||
repair = int(overrideValue)
|
||||
}
|
||||
|
||||
return int(redundancy.RequiredShares), repair, int(redundancy.OptimalShares), int(redundancy.TotalShares)
|
||||
}
|
||||
|
||||
// Process is the repair checker's implementation of the Partial's Process method.
|
||||
func (rp *repairPartial) Process(ctx context.Context, segments []segmentloop.Segment) (err error) {
|
||||
for _, segment := range segments {
|
||||
if err := rp.process(ctx, &segment); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (rp *repairPartial) process(ctx context.Context, segment *segmentloop.Segment) (err error) {
|
||||
if segment.Inline() {
|
||||
if rp.lastStreamID.Compare(segment.StreamID) != 0 {
|
||||
rp.lastStreamID = segment.StreamID
|
||||
rp.totalStats.objectsChecked++
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ignore segment if expired
|
||||
if segment.Expired(time.Now()) {
|
||||
return nil
|
||||
}
|
||||
|
||||
stats := rp.getStatsByRS(segment.Redundancy)
|
||||
if rp.lastStreamID.Compare(segment.StreamID) != 0 {
|
||||
rp.lastStreamID = segment.StreamID
|
||||
stats.iterationAggregates.objectsChecked++
|
||||
rp.totalStats.objectsChecked++
|
||||
}
|
||||
|
||||
rp.totalStats.remoteSegmentsChecked++
|
||||
stats.iterationAggregates.remoteSegmentsChecked++
|
||||
|
||||
// ensure we get values, even if only zero values, so that redash can have an alert based on this
|
||||
mon.Counter("checker_segments_below_min_req").Inc(0) //mon:locked
|
||||
pieces := segment.Pieces
|
||||
if len(pieces) == 0 {
|
||||
rp.log.Debug("no pieces on remote segment")
|
||||
return nil
|
||||
}
|
||||
|
||||
totalNumNodes, err := rp.getNodesEstimate(ctx)
|
||||
if err != nil {
|
||||
return Error.New("could not get estimate of total number of nodes: %w", err)
|
||||
}
|
||||
|
||||
missingPieces, err := rp.nodestate.MissingPieces(ctx, segment.CreatedAt, segment.Pieces)
|
||||
if err != nil {
|
||||
rp.totalStats.remoteSegmentsFailedToCheck++
|
||||
stats.iterationAggregates.remoteSegmentsFailedToCheck++
|
||||
return Error.New("error getting missing pieces: %w", err)
|
||||
}
|
||||
|
||||
numHealthy := len(pieces) - len(missingPieces)
|
||||
mon.IntVal("checker_segment_total_count").Observe(int64(len(pieces))) //mon:locked
|
||||
stats.segmentStats.segmentTotalCount.Observe(int64(len(pieces)))
|
||||
|
||||
mon.IntVal("checker_segment_healthy_count").Observe(int64(numHealthy)) //mon:locked
|
||||
stats.segmentStats.segmentHealthyCount.Observe(int64(numHealthy))
|
||||
|
||||
segmentAge := time.Since(segment.CreatedAt)
|
||||
mon.IntVal("checker_segment_age").Observe(int64(segmentAge.Seconds())) //mon:locked
|
||||
stats.segmentStats.segmentAge.Observe(int64(segmentAge.Seconds()))
|
||||
|
||||
required, repairThreshold, successThreshold, _ := rp.loadRedundancy(segment.Redundancy)
|
||||
segmentHealth := repair.SegmentHealth(numHealthy, required, totalNumNodes, rp.nodeFailureRate)
|
||||
mon.FloatVal("checker_segment_health").Observe(segmentHealth) //mon:locked
|
||||
stats.segmentStats.segmentHealth.Observe(segmentHealth)
|
||||
|
||||
// we repair when the number of healthy pieces is less than or equal to the repair threshold and is greater than or
// equal to the minimum required pieces in redundancy,
// except for the case when the repair and success thresholds are the same (a case usually seen during testing)
|
||||
if numHealthy <= repairThreshold && numHealthy < successThreshold {
|
||||
mon.FloatVal("checker_injured_segment_health").Observe(segmentHealth) //mon:locked
|
||||
stats.segmentStats.injuredSegmentHealth.Observe(segmentHealth)
|
||||
rp.totalStats.remoteSegmentsNeedingRepair++
|
||||
stats.iterationAggregates.remoteSegmentsNeedingRepair++
|
||||
err := rp.repairQueue.Insert(ctx, &queue.InjuredSegment{
|
||||
StreamID: segment.StreamID,
|
||||
Position: segment.Position,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
SegmentHealth: segmentHealth,
|
||||
}, func() {
|
||||
// Counters are increased after the queue has determined
|
||||
// that the segment wasn't already queued for repair.
|
||||
rp.totalStats.newRemoteSegmentsNeedingRepair++
|
||||
stats.iterationAggregates.newRemoteSegmentsNeedingRepair++
|
||||
})
|
||||
if err != nil {
|
||||
rp.log.Error("error adding injured segment to queue", zap.Error(err))
|
||||
return nil
|
||||
}
|
||||
|
||||
// monitor irreparable segments
|
||||
if numHealthy < required {
|
||||
if !containsStreamID(rp.totalStats.objectsLost, segment.StreamID) {
|
||||
rp.totalStats.objectsLost = append(rp.totalStats.objectsLost, segment.StreamID)
|
||||
}
|
||||
|
||||
if !containsStreamID(stats.iterationAggregates.objectsLost, segment.StreamID) {
|
||||
stats.iterationAggregates.objectsLost = append(stats.iterationAggregates.objectsLost, segment.StreamID)
|
||||
}
|
||||
|
||||
repairedAt := time.Time{}
|
||||
if segment.RepairedAt != nil {
|
||||
repairedAt = *segment.RepairedAt
|
||||
}
|
||||
|
||||
var segmentAge time.Duration
|
||||
if segment.CreatedAt.Before(repairedAt) {
|
||||
segmentAge = time.Since(repairedAt)
|
||||
} else {
|
||||
segmentAge = time.Since(segment.CreatedAt)
|
||||
}
|
||||
|
||||
mon.IntVal("checker_segment_time_until_irreparable").Observe(int64(segmentAge.Seconds())) //mon:locked
|
||||
stats.segmentStats.segmentTimeUntilIrreparable.Observe(int64(segmentAge.Seconds()))
|
||||
|
||||
rp.totalStats.remoteSegmentsLost++
|
||||
stats.iterationAggregates.remoteSegmentsLost++
|
||||
|
||||
mon.Counter("checker_segments_below_min_req").Inc(1) //mon:locked
|
||||
stats.segmentStats.segmentsBelowMinReq.Inc(1)
|
||||
|
||||
var unhealthyNodes []string
|
||||
for _, p := range missingPieces {
|
||||
unhealthyNodes = append(unhealthyNodes, p.StorageNode.String())
|
||||
}
|
||||
rp.log.Warn("checker found irreparable segment", zap.String("Segment StreamID", segment.StreamID.String()), zap.Int("Segment Position",
|
||||
int(segment.Position.Encode())), zap.Int("total pieces", len(pieces)), zap.Int("min required", required), zap.String("unhealthy node IDs", strings.Join(unhealthyNodes, ",")))
|
||||
}
|
||||
} else {
|
||||
if numHealthy > repairThreshold && numHealthy <= (repairThreshold+len(rp.totalStats.remoteSegmentsOverThreshold)) {
|
||||
// record metrics for segments right above repair threshold
|
||||
// numHealthy=repairThreshold+1 through numHealthy=repairThreshold+5
|
||||
for i := range rp.totalStats.remoteSegmentsOverThreshold {
|
||||
if numHealthy == (repairThreshold + i + 1) {
|
||||
rp.totalStats.remoteSegmentsOverThreshold[i]++
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if numHealthy > repairThreshold && numHealthy <= (repairThreshold+len(stats.iterationAggregates.remoteSegmentsOverThreshold)) {
|
||||
// record metrics for segments right above repair threshold
|
||||
// numHealthy=repairThreshold+1 through numHealthy=repairThreshold+5
|
||||
for i := range stats.iterationAggregates.remoteSegmentsOverThreshold {
|
||||
if numHealthy == (repairThreshold + i + 1) {
|
||||
stats.iterationAggregates.remoteSegmentsOverThreshold[i]++
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
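For readers new to the ranged loop interfaces implemented above, here is a minimal, self-contained observer/partial pair showing the same contract (Start/Fork/Join/Finish on the observer, Process on the partial). The counting logic and type names are invented purely for illustration and are not part of this change.

```go
package sketch

import (
	"context"
	"time"

	"storj.io/storj/satellite/metabase/rangedloop"
	"storj.io/storj/satellite/metabase/segmentloop"
)

var _ rangedloop.Observer = (*countObserver)(nil)

// countObserver counts remote segments across all ranges, aggregating its
// partials in Join the same way the repair observer above does.
type countObserver struct {
	remoteSegments int64
}

type countPartial struct {
	remoteSegments int64
}

// Start is called once per loop iteration; the aggregate is reset here.
func (obs *countObserver) Start(ctx context.Context, startTime time.Time) error {
	obs.remoteSegments = 0
	return nil
}

// Fork creates a Partial that will process one chunk of the segments.
func (obs *countObserver) Fork(ctx context.Context) (rangedloop.Partial, error) {
	return &countPartial{}, nil
}

// Join merges a finished Partial back into the observer (the reduce step).
func (obs *countObserver) Join(ctx context.Context, partial rangedloop.Partial) error {
	obs.remoteSegments += partial.(*countPartial).remoteSegments
	return nil
}

// Finish is called after all segments have been processed and joined.
func (obs *countObserver) Finish(ctx context.Context) error {
	return nil
}

// Process handles one batch of segments within the partial's range.
func (p *countPartial) Process(ctx context.Context, segments []segmentloop.Segment) error {
	for _, segment := range segments {
		if !segment.Inline() {
			p.remoteSegments++
		}
	}
	return nil
}
```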
satellite/repair/checker/observer_test.go (new file, 461 lines)
@ -0,0 +1,461 @@
|
||||
// Copyright (C) 2023 Storj Labs, Inc.
|
||||
// See LICENSE for copying information.
|
||||
|
||||
package checker_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sort"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/zap"
|
||||
|
||||
"storj.io/common/memory"
|
||||
"storj.io/common/storj"
|
||||
"storj.io/common/testcontext"
|
||||
"storj.io/common/testrand"
|
||||
"storj.io/common/uuid"
|
||||
"storj.io/storj/private/testplanet"
|
||||
"storj.io/storj/satellite"
|
||||
"storj.io/storj/satellite/metabase"
|
||||
"storj.io/storj/satellite/metabase/rangedloop"
|
||||
"storj.io/storj/satellite/repair/checker"
|
||||
"storj.io/storj/satellite/repair/queue"
|
||||
)
|
||||
|
||||
func TestIdentifyInjuredSegmentsObserver(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
||||
Reconfigure: testplanet.Reconfigure{
|
||||
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.UseRangedLoop = true
|
||||
config.RangedLoop.Parallelism = 4
|
||||
config.RangedLoop.BatchSize = 4
|
||||
},
|
||||
},
|
||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||
repairQueue := planet.Satellites[0].DB.RepairQueue()
|
||||
|
||||
rs := storj.RedundancyScheme{
|
||||
RequiredShares: 2,
|
||||
RepairShares: 3,
|
||||
OptimalShares: 4,
|
||||
TotalShares: 5,
|
||||
ShareSize: 256,
|
||||
}
|
||||
|
||||
projectID := planet.Uplinks[0].Projects[0].ID
|
||||
err := planet.Uplinks[0].CreateBucket(ctx, planet.Satellites[0], "test-bucket")
|
||||
require.NoError(t, err)
|
||||
|
||||
expectedLocation := metabase.SegmentLocation{
|
||||
ProjectID: projectID,
|
||||
BucketName: "test-bucket",
|
||||
}
|
||||
|
||||
// add some valid pointers
|
||||
for x := 0; x < 10; x++ {
|
||||
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("a-%d", x))
|
||||
insertSegment(ctx, t, planet, rs, expectedLocation, createPieces(planet, rs), nil)
|
||||
}
|
||||
|
||||
// add pointer that needs repair
|
||||
expectedLocation.ObjectKey = metabase.ObjectKey("b-0")
|
||||
b0StreamID := insertSegment(ctx, t, planet, rs, expectedLocation, createLostPieces(planet, rs), nil)
|
||||
|
||||
// add pointer that is unhealthy, but is expired
|
||||
expectedLocation.ObjectKey = metabase.ObjectKey("b-1")
|
||||
expiresAt := time.Now().Add(-time.Hour)
|
||||
insertSegment(ctx, t, planet, rs, expectedLocation, createLostPieces(planet, rs), &expiresAt)
|
||||
|
||||
// add some valid pointers
|
||||
for x := 0; x < 10; x++ {
|
||||
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("c-%d", x))
|
||||
insertSegment(ctx, t, planet, rs, expectedLocation, createPieces(planet, rs), nil)
|
||||
}
|
||||
|
||||
_, err = planet.Satellites[0].RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
// check that the unhealthy, non-expired segment was added to the queue
|
||||
// and that the expired segment was ignored
|
||||
injuredSegment, err := repairQueue.Select(ctx)
|
||||
require.NoError(t, err)
|
||||
err = repairQueue.Delete(ctx, injuredSegment)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Equal(t, b0StreamID, injuredSegment.StreamID)
|
||||
|
||||
_, err = repairQueue.Select(ctx)
|
||||
require.Error(t, err)
|
||||
})
|
||||
}
|
||||
|
||||
func TestIdentifyIrreparableSegmentsObserver(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1, StorageNodeCount: 3, UplinkCount: 1,
|
||||
Reconfigure: testplanet.Reconfigure{
|
||||
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.UseRangedLoop = true
|
||||
config.RangedLoop.Parallelism = 4
|
||||
config.RangedLoop.BatchSize = 4
|
||||
},
|
||||
},
|
||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||
rangeLoopService := planet.Satellites[0].RangedLoop.RangedLoop.Service
|
||||
|
||||
const numberOfNodes = 10
|
||||
pieces := make(metabase.Pieces, 0, numberOfNodes)
|
||||
// use online nodes
|
||||
for i, storagenode := range planet.StorageNodes {
|
||||
pieces = append(pieces, metabase.Piece{
|
||||
Number: uint16(i),
|
||||
StorageNode: storagenode.ID(),
|
||||
})
|
||||
}
|
||||
|
||||
// simulate offline nodes
|
||||
expectedLostPieces := make(map[int32]bool)
|
||||
for i := len(pieces); i < numberOfNodes; i++ {
|
||||
pieces = append(pieces, metabase.Piece{
|
||||
Number: uint16(i),
|
||||
StorageNode: storj.NodeID{byte(i)},
|
||||
})
|
||||
expectedLostPieces[int32(i)] = true
|
||||
}
|
||||
|
||||
rs := storj.RedundancyScheme{
|
||||
ShareSize: 256,
|
||||
RequiredShares: 4,
|
||||
RepairShares: 8,
|
||||
OptimalShares: 9,
|
||||
TotalShares: 10,
|
||||
}
|
||||
|
||||
projectID := planet.Uplinks[0].Projects[0].ID
|
||||
err := planet.Uplinks[0].CreateBucket(ctx, planet.Satellites[0], "test-bucket")
|
||||
require.NoError(t, err)
|
||||
|
||||
expectedLocation := metabase.SegmentLocation{
|
||||
ProjectID: projectID,
|
||||
BucketName: "test-bucket",
|
||||
}
|
||||
|
||||
// when the number of healthy pieces is less than the minimum required number of pieces in the redundancy scheme,
// the segment is considered irreparable but will still be put into the repair queue
|
||||
|
||||
expectedLocation.ObjectKey = "piece"
|
||||
insertSegment(ctx, t, planet, rs, expectedLocation, pieces, nil)
|
||||
|
||||
expectedLocation.ObjectKey = "piece-expired"
|
||||
expiresAt := time.Now().Add(-time.Hour)
|
||||
insertSegment(ctx, t, planet, rs, expectedLocation, pieces, &expiresAt)
|
||||
|
||||
_, err = rangeLoopService.RunOnce(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
// check that the single irreparable segment was added to the repair queue
|
||||
repairQueue := planet.Satellites[0].DB.RepairQueue()
|
||||
_, err = repairQueue.Select(ctx)
|
||||
require.NoError(t, err)
|
||||
count, err := repairQueue.Count(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, count)
|
||||
|
||||
// check irreparable once again but wait a second
|
||||
time.Sleep(1 * time.Second)
|
||||
_, err = rangeLoopService.RunOnce(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
expectedLocation.ObjectKey = "piece"
|
||||
_, err = planet.Satellites[0].Metabase.DB.DeleteObjectExactVersion(ctx, metabase.DeleteObjectExactVersion{
|
||||
ObjectLocation: expectedLocation.Object(),
|
||||
Version: metabase.DefaultVersion,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = rangeLoopService.RunOnce(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
count, err = repairQueue.Count(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, count)
|
||||
})
|
||||
}
|
||||
|
||||
func TestIgnoringCopiedSegmentsObserver(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
||||
Reconfigure: testplanet.Reconfigure{
|
||||
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.UseRangedLoop = true
|
||||
config.RangedLoop.Parallelism = 4
|
||||
config.RangedLoop.BatchSize = 4
|
||||
config.Metainfo.RS.Min = 2
|
||||
config.Metainfo.RS.Repair = 3
|
||||
config.Metainfo.RS.Success = 4
|
||||
config.Metainfo.RS.Total = 4
|
||||
},
|
||||
},
|
||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||
satellite := planet.Satellites[0]
|
||||
uplink := planet.Uplinks[0]
|
||||
metabaseDB := satellite.Metabase.DB
|
||||
|
||||
rangedLoopService := planet.Satellites[0].RangedLoop.RangedLoop.Service
|
||||
repairQueue := satellite.DB.RepairQueue()
|
||||
|
||||
err := uplink.CreateBucket(ctx, satellite, "test-bucket")
|
||||
require.NoError(t, err)
|
||||
|
||||
testData := testrand.Bytes(8 * memory.KiB)
|
||||
err = uplink.Upload(ctx, satellite, "testbucket", "test/path", testData)
|
||||
require.NoError(t, err)
|
||||
|
||||
project, err := uplink.OpenProject(ctx, satellite)
|
||||
require.NoError(t, err)
|
||||
defer ctx.Check(project.Close)
|
||||
|
||||
segments, err := metabaseDB.TestingAllSegments(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, segments, 1)
|
||||
|
||||
_, err = project.CopyObject(ctx, "testbucket", "test/path", "testbucket", "empty", nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
segmentsAfterCopy, err := metabaseDB.TestingAllSegments(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, segmentsAfterCopy, 2)
|
||||
|
||||
err = planet.StopNodeAndUpdate(ctx, planet.FindNode(segments[0].Pieces[0].StorageNode))
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = rangedLoopService.RunOnce(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
// check that the injured segment in the repair queue has the same streamID as the original segment.
|
||||
injuredSegment, err := repairQueue.Select(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, segments[0].StreamID, injuredSegment.StreamID)
|
||||
|
||||
// check that the repair queue contains only the original segment, not the copied one.
|
||||
injuredSegments, err := repairQueue.Count(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, injuredSegments)
|
||||
})
|
||||
}
|
||||
|
||||
func TestCleanRepairQueueObserver(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
||||
Reconfigure: testplanet.Reconfigure{
|
||||
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.UseRangedLoop = true
|
||||
config.RangedLoop.Parallelism = 4
|
||||
config.RangedLoop.BatchSize = 4
|
||||
},
|
||||
},
|
||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||
rangedLoopService := planet.Satellites[0].RangedLoop.RangedLoop.Service
|
||||
repairQueue := planet.Satellites[0].DB.RepairQueue()
|
||||
observer := planet.Satellites[0].RangedLoop.Repair.Observer.(*checker.RangedLoopObserver)
|
||||
planet.Satellites[0].Repair.Repairer.Loop.Pause()
|
||||
|
||||
rs := storj.RedundancyScheme{
|
||||
RequiredShares: 2,
|
||||
RepairShares: 3,
|
||||
OptimalShares: 4,
|
||||
TotalShares: 5,
|
||||
ShareSize: 256,
|
||||
}
|
||||
|
||||
projectID := planet.Uplinks[0].Projects[0].ID
|
||||
err := planet.Uplinks[0].CreateBucket(ctx, planet.Satellites[0], "test-bucket")
|
||||
require.NoError(t, err)
|
||||
|
||||
expectedLocation := metabase.SegmentLocation{
|
||||
ProjectID: projectID,
|
||||
BucketName: "test-bucket",
|
||||
}
|
||||
|
||||
healthyCount := 5
|
||||
for i := 0; i < healthyCount; i++ {
|
||||
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("healthy-%d", i))
|
||||
insertSegment(ctx, t, planet, rs, expectedLocation, createPieces(planet, rs), nil)
|
||||
}
|
||||
unhealthyCount := 5
|
||||
unhealthyIDs := make(map[uuid.UUID]struct{})
|
||||
for i := 0; i < unhealthyCount; i++ {
|
||||
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("unhealthy-%d", i))
|
||||
unhealthyStreamID := insertSegment(ctx, t, planet, rs, expectedLocation, createLostPieces(planet, rs), nil)
|
||||
unhealthyIDs[unhealthyStreamID] = struct{}{}
|
||||
}
|
||||
|
||||
// suspend enough nodes to make healthy pointers unhealthy
|
||||
for i := rs.RequiredShares; i < rs.OptimalShares; i++ {
|
||||
require.NoError(t, planet.Satellites[0].Overlay.DB.TestSuspendNodeUnknownAudit(ctx, planet.StorageNodes[i].ID(), time.Now()))
|
||||
}
|
||||
|
||||
require.NoError(t, observer.RefreshReliabilityCache(ctx))
|
||||
|
||||
// check that repair queue is empty to avoid false positive
|
||||
count, err := repairQueue.Count(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, count)
|
||||
|
||||
_, err = rangedLoopService.RunOnce(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
// check that the pointers were put into the repair queue
|
||||
// and not cleaned up at the end of the checker iteration
|
||||
count, err = repairQueue.Count(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, healthyCount+unhealthyCount, count)
|
||||
|
||||
// unsuspend nodes to make the previously healthy pointers healthy again
|
||||
for i := rs.RequiredShares; i < rs.OptimalShares; i++ {
|
||||
require.NoError(t, planet.Satellites[0].Overlay.DB.TestUnsuspendNodeUnknownAudit(ctx, planet.StorageNodes[i].ID()))
|
||||
}
|
||||
|
||||
require.NoError(t, observer.RefreshReliabilityCache(ctx))
|
||||
|
||||
// The checker will not insert/update the now healthy segments causing
|
||||
// them to be removed from the queue at the end of the checker iteration
|
||||
_, err = rangedLoopService.RunOnce(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
// only unhealthy segments should remain
|
||||
count, err = repairQueue.Count(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, unhealthyCount, count)
|
||||
|
||||
segs, err := repairQueue.SelectN(ctx, count)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, len(unhealthyIDs), len(segs))
|
||||
|
||||
for _, s := range segs {
|
||||
_, ok := unhealthyIDs[s.StreamID]
|
||||
require.True(t, ok)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestRepairObserver(t *testing.T) {
|
||||
testplanet.Run(t, testplanet.Config{
|
||||
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
||||
Reconfigure: testplanet.Reconfigure{
|
||||
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
|
||||
config.Repairer.UseRangedLoop = true
|
||||
config.RangedLoop.Parallelism = 4
|
||||
config.RangedLoop.BatchSize = 4
|
||||
},
|
||||
},
|
||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||
rs := storj.RedundancyScheme{
|
||||
RequiredShares: 2,
|
||||
RepairShares: 3,
|
||||
OptimalShares: 4,
|
||||
TotalShares: 5,
|
||||
ShareSize: 256,
|
||||
}
|
||||
|
||||
err := planet.Uplinks[0].CreateBucket(ctx, planet.Satellites[0], "test-bucket")
|
||||
require.NoError(t, err)
|
||||
|
||||
expectedLocation := metabase.SegmentLocation{
|
||||
ProjectID: planet.Uplinks[0].Projects[0].ID,
|
||||
BucketName: "test-bucket",
|
||||
}
|
||||
|
||||
// add some valid pointers
|
||||
for x := 0; x < 20; x++ {
|
||||
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("a-%d", x))
|
||||
insertSegment(ctx, t, planet, rs, expectedLocation, createPieces(planet, rs), nil)
|
||||
}
|
||||
|
||||
var injuredSegmentStreamIDs []uuid.UUID
|
||||
|
||||
// add pointer that needs repair
|
||||
for x := 0; x < 5; x++ {
|
||||
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("b-%d", x))
|
||||
injuredSegmentStreamID := insertSegment(ctx, t, planet, rs, expectedLocation, createLostPieces(planet, rs), nil)
|
||||
injuredSegmentStreamIDs = append(injuredSegmentStreamIDs, injuredSegmentStreamID)
|
||||
}
|
||||
|
||||
// add pointer that is unhealthy, but is expired
|
||||
expectedLocation.ObjectKey = metabase.ObjectKey("d-1")
|
||||
expiresAt := time.Now().Add(-time.Hour)
|
||||
insertSegment(ctx, t, planet, rs, expectedLocation, createLostPieces(planet, rs), &expiresAt)
|
||||
|
||||
// add some valid pointers
|
||||
for x := 0; x < 20; x++ {
|
||||
expectedLocation.ObjectKey = metabase.ObjectKey(fmt.Sprintf("c-%d", x))
|
||||
insertSegment(ctx, t, planet, rs, expectedLocation, createPieces(planet, rs), nil)
|
||||
}
|
||||
|
||||
compare := func(insertedSegmentsIDs []uuid.UUID, fromRepairQueue []queue.InjuredSegment) bool {
|
||||
var repairQueueIDs []uuid.UUID
|
||||
for _, v := range fromRepairQueue {
|
||||
repairQueueIDs = append(repairQueueIDs, v.StreamID)
|
||||
}
|
||||
|
||||
sort.Slice(insertedSegmentsIDs, func(i, j int) bool {
|
||||
return insertedSegmentsIDs[i].Less(insertedSegmentsIDs[j])
|
||||
})
|
||||
sort.Slice(repairQueueIDs, func(i, j int) bool {
|
||||
return repairQueueIDs[i].Less(repairQueueIDs[j])
|
||||
})
|
||||
|
||||
return reflect.DeepEqual(insertedSegmentsIDs, repairQueueIDs)
|
||||
}
|
||||
|
||||
type TestCase struct {
|
||||
BatchSize int
|
||||
Parallelism int
|
||||
}
|
||||
|
||||
_, err = planet.Satellites[0].RangedLoop.RangedLoop.Service.RunOnce(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
injuredSegments, err := planet.Satellites[0].DB.RepairQueue().SelectN(ctx, 10)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, injuredSegments, 5)
|
||||
require.True(t, compare(injuredSegmentStreamIDs, injuredSegments))
|
||||
|
||||
_, err = planet.Satellites[0].DB.RepairQueue().Clean(ctx, time.Now())
|
||||
require.NoError(t, err)
|
||||
|
||||
for _, tc := range []TestCase{
|
||||
{1, 1},
|
||||
{3, 1},
|
||||
{5, 1},
|
||||
{1, 3},
|
||||
{3, 3},
|
||||
{5, 3},
|
||||
{1, 5},
|
||||
{3, 5},
|
||||
{5, 5},
|
||||
} {
|
||||
observer := planet.Satellites[0].RangedLoop.Repair.Observer
|
||||
config := planet.Satellites[0].Config
|
||||
service := rangedloop.NewService(planet.Log(), rangedloop.Config{
|
||||
Parallelism: tc.Parallelism,
|
||||
BatchSize: tc.BatchSize,
|
||||
}, rangedloop.NewMetabaseRangeSplitter(planet.Satellites[0].Metabase.DB, config.RangedLoop.AsOfSystemInterval, config.RangedLoop.BatchSize), []rangedloop.Observer{observer})
|
||||
|
||||
_, err = service.RunOnce(ctx)
|
||||
require.NoError(t, err)
|
||||
|
||||
injuredSegments, err = planet.Satellites[0].DB.RepairQueue().SelectN(ctx, 10)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, injuredSegments, 5)
|
||||
require.True(t, compare(injuredSegmentStreamIDs, injuredSegments))
|
||||
|
||||
_, err = planet.Satellites[0].DB.RepairQueue().Clean(ctx, time.Now())
|
||||
require.NoError(t, err)
|
||||
}
|
||||
})
|
||||
}
|
satellite/repair/checker/observerstats.go (new file, 139 lines)
@ -0,0 +1,139 @@
|
||||
// Copyright (C) 2023 Storj Labs, Inc.
|
||||
// See LICENSE for copying information.
|
||||
|
||||
package checker
|
||||
|
||||
import "github.com/spacemonkeygo/monkit/v3"
|
||||
|
||||
type observerRSStats struct {
|
||||
// iterationAggregates contains the aggregated counters across all partials.
|
||||
// The values are observed by the distributions in iterationStats
|
||||
iterationAggregates aggregateStats
|
||||
|
||||
// iterationStats are the distributions for per-iteration stats. The distributions
|
||||
// are updated using iterationAggregates after each loop iteration completes.
|
||||
iterationStats iterationRSStats
|
||||
|
||||
// segmentStats contains threadsafe distributions and is shared by all partials. The
|
||||
// distributions are updated when processing the segment.
|
||||
segmentStats *segmentRSStats
|
||||
}
|
||||
|
||||
// Stats implements the monkit.StatSource interface.
|
||||
func (stats *observerRSStats) Stats(cb func(key monkit.SeriesKey, field string, val float64)) {
|
||||
stats.iterationStats.objectsChecked.Stats(cb)
|
||||
stats.iterationStats.remoteSegmentsChecked.Stats(cb)
|
||||
stats.iterationStats.remoteSegmentsNeedingRepair.Stats(cb)
|
||||
stats.iterationStats.newRemoteSegmentsNeedingRepair.Stats(cb)
|
||||
stats.iterationStats.remoteSegmentsLost.Stats(cb)
|
||||
stats.iterationStats.objectsLost.Stats(cb)
|
||||
stats.iterationStats.remoteSegmentsFailedToCheck.Stats(cb)
|
||||
stats.iterationStats.remoteSegmentsHealthyPercentage.Stats(cb)
|
||||
|
||||
stats.iterationStats.remoteSegmentsOverThreshold1.Stats(cb)
|
||||
stats.iterationStats.remoteSegmentsOverThreshold2.Stats(cb)
|
||||
stats.iterationStats.remoteSegmentsOverThreshold3.Stats(cb)
|
||||
stats.iterationStats.remoteSegmentsOverThreshold4.Stats(cb)
|
||||
stats.iterationStats.remoteSegmentsOverThreshold5.Stats(cb)
|
||||
|
||||
stats.segmentStats.segmentsBelowMinReq.Stats(cb)
|
||||
stats.segmentStats.segmentTotalCount.Stats(cb)
|
||||
stats.segmentStats.segmentHealthyCount.Stats(cb)
|
||||
stats.segmentStats.segmentAge.Stats(cb)
|
||||
stats.segmentStats.segmentHealth.Stats(cb)
|
||||
stats.segmentStats.injuredSegmentHealth.Stats(cb)
|
||||
stats.segmentStats.segmentTimeUntilIrreparable.Stats(cb)
|
||||
}
|
||||
|
||||
type iterationRSStats struct {
|
||||
objectsChecked *monkit.IntVal
|
||||
remoteSegmentsChecked *monkit.IntVal
|
||||
remoteSegmentsNeedingRepair *monkit.IntVal
|
||||
newRemoteSegmentsNeedingRepair *monkit.IntVal
|
||||
remoteSegmentsLost *monkit.IntVal
|
||||
objectsLost *monkit.IntVal
|
||||
remoteSegmentsFailedToCheck *monkit.IntVal
|
||||
remoteSegmentsHealthyPercentage *monkit.FloatVal
|
||||
|
||||
// remoteSegmentsOverThreshold[0]=# of healthy=rt+1, remoteSegmentsOverThreshold[1]=# of healthy=rt+2, etc...
|
||||
remoteSegmentsOverThreshold1 *monkit.IntVal
|
||||
remoteSegmentsOverThreshold2 *monkit.IntVal
|
||||
remoteSegmentsOverThreshold3 *monkit.IntVal
|
||||
remoteSegmentsOverThreshold4 *monkit.IntVal
|
||||
remoteSegmentsOverThreshold5 *monkit.IntVal
|
||||
}
|
||||
|
||||
func newIterationRSStats(rs string) iterationRSStats {
|
||||
return iterationRSStats{
|
||||
objectsChecked: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_objects_checked").WithTag("rs_scheme", rs)),
|
||||
remoteSegmentsChecked: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_checked").WithTag("rs_scheme", rs)),
|
||||
remoteSegmentsNeedingRepair: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_needing_repair").WithTag("rs_scheme", rs)),
|
||||
newRemoteSegmentsNeedingRepair: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "new_remote_segments_needing_repair").WithTag("rs_scheme", rs)),
|
||||
remoteSegmentsLost: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_lost").WithTag("rs_scheme", rs)),
|
||||
objectsLost: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "objects_lost").WithTag("rs_scheme", rs)),
|
||||
remoteSegmentsFailedToCheck: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_failed_to_check").WithTag("rs_scheme", rs)),
|
||||
remoteSegmentsHealthyPercentage: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_healthy_percentage").WithTag("rs_scheme", rs)),
|
||||
remoteSegmentsOverThreshold1: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_1").WithTag("rs_scheme", rs)),
|
||||
remoteSegmentsOverThreshold2: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_2").WithTag("rs_scheme", rs)),
|
||||
remoteSegmentsOverThreshold3: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_3").WithTag("rs_scheme", rs)),
|
||||
remoteSegmentsOverThreshold4: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_4").WithTag("rs_scheme", rs)),
|
||||
remoteSegmentsOverThreshold5: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "remote_segments_over_threshold_5").WithTag("rs_scheme", rs)),
|
||||
}
|
||||
}
|
||||
|
||||
type partialRSStats struct {
|
||||
// iterationAggregates are counts aggregated by each partial for stats for the whole loop
|
||||
// and are aggregated into the observer during join. These aggregated counters
|
||||
// are tallied into distributions at the end of each loop.
|
||||
iterationAggregates aggregateStats
|
||||
|
||||
// segmentStats contains thread-safe distributions and is shared by all partials. The
|
||||
// distributions are updated when processing the segment.
|
||||
segmentStats *segmentRSStats
|
||||
}
|
||||
|
||||
type segmentRSStats struct {
|
||||
segmentsBelowMinReq *monkit.Counter
|
||||
segmentTotalCount *monkit.IntVal
|
||||
segmentHealthyCount *monkit.IntVal
|
||||
segmentAge *monkit.IntVal
|
||||
segmentHealth *monkit.FloatVal
|
||||
injuredSegmentHealth *monkit.FloatVal
|
||||
segmentTimeUntilIrreparable *monkit.IntVal
|
||||
}
|
||||
|
||||
func newSegmentRSStats(rs string) *segmentRSStats {
|
||||
return &segmentRSStats{
|
||||
segmentsBelowMinReq: monkit.NewCounter(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segments_below_min_req").WithTag("rs_scheme", rs)),
|
||||
segmentTotalCount: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_total_count").WithTag("rs_scheme", rs)),
|
||||
segmentHealthyCount: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_healthy_count").WithTag("rs_scheme", rs)),
|
||||
segmentAge: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_age").WithTag("rs_scheme", rs)),
|
||||
segmentHealth: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_health").WithTag("rs_scheme", rs)),
|
||||
injuredSegmentHealth: monkit.NewFloatVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_injured_segment_health").WithTag("rs_scheme", rs)),
|
||||
segmentTimeUntilIrreparable: monkit.NewIntVal(monkit.NewSeriesKey("tagged_repair_stats").WithTag("name", "checker_segment_time_until_irreparable").WithTag("rs_scheme", rs)),
|
||||
}
|
||||
}
|
||||
|
||||
func (stats *observerRSStats) collectAggregates() {
|
||||
stats.iterationStats.objectsChecked.Observe(stats.iterationAggregates.objectsChecked)
|
||||
stats.iterationStats.remoteSegmentsChecked.Observe(stats.iterationAggregates.remoteSegmentsChecked)
|
||||
stats.iterationStats.remoteSegmentsNeedingRepair.Observe(stats.iterationAggregates.remoteSegmentsNeedingRepair)
|
||||
stats.iterationStats.newRemoteSegmentsNeedingRepair.Observe(stats.iterationAggregates.newRemoteSegmentsNeedingRepair)
|
||||
stats.iterationStats.remoteSegmentsLost.Observe(stats.iterationAggregates.remoteSegmentsLost)
|
||||
stats.iterationStats.objectsLost.Observe(int64(len(stats.iterationAggregates.objectsLost)))
|
||||
stats.iterationStats.remoteSegmentsFailedToCheck.Observe(stats.iterationAggregates.remoteSegmentsFailedToCheck)
|
||||
stats.iterationStats.remoteSegmentsOverThreshold1.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[0])
|
||||
stats.iterationStats.remoteSegmentsOverThreshold2.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[1])
|
||||
stats.iterationStats.remoteSegmentsOverThreshold3.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[2])
|
||||
stats.iterationStats.remoteSegmentsOverThreshold4.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[3])
|
||||
stats.iterationStats.remoteSegmentsOverThreshold5.Observe(stats.iterationAggregates.remoteSegmentsOverThreshold[4])
|
||||
|
||||
allUnhealthy := stats.iterationAggregates.remoteSegmentsNeedingRepair + stats.iterationAggregates.remoteSegmentsFailedToCheck
|
||||
allChecked := stats.iterationAggregates.remoteSegmentsChecked
|
||||
allHealthy := allChecked - allUnhealthy
|
||||
|
||||
stats.iterationStats.remoteSegmentsHealthyPercentage.Observe(100 * float64(allHealthy) / float64(allChecked))
|
||||
|
||||
// reset the iteration aggregates after the loop run finishes
|
||||
stats.iterationAggregates = aggregateStats{}
|
||||
}
|
@ -35,6 +35,7 @@ type Config struct {
|
||||
MaxExcessRateOptimalThreshold float64 `help:"ratio applied to the optimal threshold to calculate the excess of the maximum number of repaired pieces to upload" default:"0.05"`
|
||||
InMemoryRepair bool `help:"whether to download pieces for repair in memory (true) or download to disk (false)" default:"false"`
|
||||
ReputationUpdateEnabled bool `help:"whether the audit score of nodes should be updated as a part of repair" default:"false"`
|
||||
UseRangedLoop bool `help:"whether to use ranged loop instead of segment loop" default:"false"`
|
||||
}
|
||||
|
||||
// Service contains the information needed to run the repair service.
|
||||
|
scripts/testdata/satellite-config.yaml.lock (vendored, 3 lines changed)
@ -913,6 +913,9 @@ identity.key-path: /root/.local/share/storj/identity/satellite/identity.key
|
||||
# time limit for an entire repair job, from queue pop to upload completion
|
||||
# repairer.total-timeout: 45m0s
|
||||
|
||||
# whether to use ranged loop instead of segment loop
|
||||
# repairer.use-ranged-loop: false
|
||||
|
||||
# the number of times a node has been audited to not be considered a New Node
|
||||
# reputation.audit-count: 100
|
||||
|
||||
|