satellite/repair/repairer: fix NumHealthyInExcludedCountries calculation
Currently, we have issue were while counting unhealthy pieces we are counting twice piece which is in excluded country and is outside segment placement. This can cause unnecessary repair. This change is also doing another step to move RepairExcludedCountryCodes from overlay config into repair package. Change-Id: I3692f6e0ddb9982af925db42be23d644aec1963f
This commit is contained in:
parent
05f30740f5
commit
1d62dc63f5
@ -426,9 +426,8 @@ func TestAllInOne(t *testing.T) {
|
|||||||
log.Named("repair:checker"),
|
log.Named("repair:checker"),
|
||||||
satellite.DB.RepairQueue(),
|
satellite.DB.RepairQueue(),
|
||||||
satellite.Overlay.Service,
|
satellite.Overlay.Service,
|
||||||
satellite.Config.Checker,
|
|
||||||
overlay.NewPlacementRules().CreateFilters,
|
overlay.NewPlacementRules().CreateFilters,
|
||||||
[]string{},
|
satellite.Config.Checker,
|
||||||
),
|
),
|
||||||
})
|
})
|
||||||
|
|
||||||
|
@ -62,8 +62,6 @@ type DB interface {
|
|||||||
|
|
||||||
// Get looks up the node by nodeID
|
// Get looks up the node by nodeID
|
||||||
Get(ctx context.Context, nodeID storj.NodeID) (*NodeDossier, error)
|
Get(ctx context.Context, nodeID storj.NodeID) (*NodeDossier, error)
|
||||||
// KnownReliableInExcludedCountries filters healthy nodes that are in excluded countries.
|
|
||||||
KnownReliableInExcludedCountries(context.Context, *NodeCriteria, storj.NodeIDList) (storj.NodeIDList, error)
|
|
||||||
// KnownReliable filters a set of nodes to reliable (online and qualified) nodes.
|
// KnownReliable filters a set of nodes to reliable (online and qualified) nodes.
|
||||||
KnownReliable(ctx context.Context, nodeIDs storj.NodeIDList, onlineWindow, asOfSystemInterval time.Duration) (online []nodeselection.SelectedNode, offline []nodeselection.SelectedNode, err error)
|
KnownReliable(ctx context.Context, nodeIDs storj.NodeIDList, onlineWindow, asOfSystemInterval time.Duration) (online []nodeselection.SelectedNode, offline []nodeselection.SelectedNode, err error)
|
||||||
// Reliable returns all nodes that are reliable (separated by whether they are currently online or offline).
|
// Reliable returns all nodes that are reliable (separated by whether they are currently online or offline).
|
||||||
@ -199,7 +197,6 @@ type NodeCriteria struct {
|
|||||||
MinimumVersion string // semver or empty
|
MinimumVersion string // semver or empty
|
||||||
OnlineWindow time.Duration
|
OnlineWindow time.Duration
|
||||||
AsOfSystemInterval time.Duration // only used for CRDB queries
|
AsOfSystemInterval time.Duration // only used for CRDB queries
|
||||||
ExcludedCountries []string
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ReputationStatus indicates current reputation status for a node.
|
// ReputationStatus indicates current reputation status for a node.
|
||||||
@ -540,17 +537,6 @@ func (service *Service) InsertOfflineNodeEvents(ctx context.Context, cooldown ti
|
|||||||
return count, err
|
return count, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// KnownReliableInExcludedCountries filters healthy nodes that are in excluded countries.
|
|
||||||
func (service *Service) KnownReliableInExcludedCountries(ctx context.Context, nodeIds storj.NodeIDList) (reliableInExcluded storj.NodeIDList, err error) {
|
|
||||||
defer mon.Task()(&ctx)(&err)
|
|
||||||
|
|
||||||
criteria := &NodeCriteria{
|
|
||||||
OnlineWindow: service.config.Node.OnlineWindow,
|
|
||||||
ExcludedCountries: service.config.RepairExcludedCountryCodes,
|
|
||||||
}
|
|
||||||
return service.db.KnownReliableInExcludedCountries(ctx, criteria, nodeIds)
|
|
||||||
}
|
|
||||||
|
|
||||||
// KnownReliable filters a set of nodes to reliable (online and qualified) nodes.
|
// KnownReliable filters a set of nodes to reliable (online and qualified) nodes.
|
||||||
func (service *Service) KnownReliable(ctx context.Context, nodeIDs storj.NodeIDList) (onlineNodes []nodeselection.SelectedNode, offlineNodes []nodeselection.SelectedNode, err error) {
|
func (service *Service) KnownReliable(ctx context.Context, nodeIDs storj.NodeIDList) (onlineNodes []nodeselection.SelectedNode, offlineNodes []nodeselection.SelectedNode, err error) {
|
||||||
defer mon.Task()(&ctx)(&err)
|
defer mon.Task()(&ctx)(&err)
|
||||||
@ -747,28 +733,6 @@ func (service *Service) GetMissingPieces(ctx context.Context, pieces metabase.Pi
|
|||||||
return maps.Values(missingPiecesMap), nil
|
return maps.Values(missingPiecesMap), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetReliablePiecesInExcludedCountries returns the list of pieces held by nodes located in excluded countries.
|
|
||||||
func (service *Service) GetReliablePiecesInExcludedCountries(ctx context.Context, pieces metabase.Pieces) (piecesInExcluded []uint16, err error) {
|
|
||||||
defer mon.Task()(&ctx)(&err)
|
|
||||||
var nodeIDs storj.NodeIDList
|
|
||||||
for _, p := range pieces {
|
|
||||||
nodeIDs = append(nodeIDs, p.StorageNode)
|
|
||||||
}
|
|
||||||
inExcluded, err := service.KnownReliableInExcludedCountries(ctx, nodeIDs)
|
|
||||||
if err != nil {
|
|
||||||
return nil, Error.New("error getting nodes %s", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, p := range pieces {
|
|
||||||
for _, nodeID := range inExcluded {
|
|
||||||
if nodeID == p.StorageNode {
|
|
||||||
piecesInExcluded = append(piecesInExcluded, p.Number)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return piecesInExcluded, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// DQNodesLastSeenBefore disqualifies nodes who have not been contacted since the cutoff time.
|
// DQNodesLastSeenBefore disqualifies nodes who have not been contacted since the cutoff time.
|
||||||
func (service *Service) DQNodesLastSeenBefore(ctx context.Context, cutoff time.Time, limit int) (count int, err error) {
|
func (service *Service) DQNodesLastSeenBefore(ctx context.Context, cutoff time.Time, limit int) (count int, err error) {
|
||||||
defer mon.Task()(&ctx)(&err)
|
defer mon.Task()(&ctx)(&err)
|
||||||
|
@ -816,28 +816,6 @@ func TestVetAndUnvetNode(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestKnownReliableInExcludedCountries(t *testing.T) {
|
|
||||||
testplanet.Run(t, testplanet.Config{
|
|
||||||
SatelliteCount: 1, StorageNodeCount: 2, UplinkCount: 0,
|
|
||||||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
|
||||||
service := planet.Satellites[0].Overlay.Service
|
|
||||||
node := planet.StorageNodes[0]
|
|
||||||
|
|
||||||
onlineNodes, _, err := service.Reliable(ctx)
|
|
||||||
require.NoError(t, err)
|
|
||||||
require.Len(t, onlineNodes, 2)
|
|
||||||
|
|
||||||
err = planet.Satellites[0].Overlay.Service.TestNodeCountryCode(ctx, node.ID(), "FR")
|
|
||||||
require.NoError(t, err)
|
|
||||||
|
|
||||||
// first node should be excluded from Reliable result because of country code
|
|
||||||
nodes, err := service.KnownReliableInExcludedCountries(ctx, storj.NodeIDList{onlineNodes[0].ID, onlineNodes[1].ID})
|
|
||||||
require.NoError(t, err)
|
|
||||||
require.Len(t, nodes, 1)
|
|
||||||
require.Equal(t, node.ID(), nodes[0])
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUpdateReputationNodeEvents(t *testing.T) {
|
func TestUpdateReputationNodeEvents(t *testing.T) {
|
||||||
testplanet.Run(t, testplanet.Config{
|
testplanet.Run(t, testplanet.Config{
|
||||||
SatelliteCount: 1, StorageNodeCount: 2, UplinkCount: 0,
|
SatelliteCount: 1, StorageNodeCount: 2, UplinkCount: 0,
|
||||||
|
@ -151,13 +151,16 @@ func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Conf
|
|||||||
}
|
}
|
||||||
|
|
||||||
{ // setup repair
|
{ // setup repair
|
||||||
|
if len(config.Checker.RepairExcludedCountryCodes) == 0 {
|
||||||
|
config.Checker.RepairExcludedCountryCodes = config.Overlay.RepairExcludedCountryCodes
|
||||||
|
}
|
||||||
|
|
||||||
peer.Repair.Observer = checker.NewObserver(
|
peer.Repair.Observer = checker.NewObserver(
|
||||||
peer.Log.Named("repair:checker"),
|
peer.Log.Named("repair:checker"),
|
||||||
peer.DB.RepairQueue(),
|
peer.DB.RepairQueue(),
|
||||||
peer.Overlay.Service,
|
peer.Overlay.Service,
|
||||||
config.Checker,
|
|
||||||
config.Placement.CreateFilters,
|
config.Placement.CreateFilters,
|
||||||
config.Overlay.RepairExcludedCountryCodes,
|
config.Checker,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,6 +23,7 @@ type Config struct {
|
|||||||
// This results in `2/9200/4 = 0.00005435` being the probability of any single node going down in the interval of one checker iteration.
|
// This results in `2/9200/4 = 0.00005435` being the probability of any single node going down in the interval of one checker iteration.
|
||||||
NodeFailureRate float64 `help:"the probability of a single node going down within the next checker iteration" default:"0.00005435" `
|
NodeFailureRate float64 `help:"the probability of a single node going down within the next checker iteration" default:"0.00005435" `
|
||||||
RepairQueueInsertBatchSize int `help:"Number of damaged segments to buffer in-memory before flushing to the repair queue" default:"100" `
|
RepairQueueInsertBatchSize int `help:"Number of damaged segments to buffer in-memory before flushing to the repair queue" default:"100" `
|
||||||
|
RepairExcludedCountryCodes []string `help:"list of country codes to treat node from this country as offline " default:"" hidden:"true"`
|
||||||
DoDeclumping bool `help:"Treat pieces on the same network as in need of repair" default:"false"`
|
DoDeclumping bool `help:"Treat pieces on the same network as in need of repair" default:"false"`
|
||||||
DoPlacementCheck bool `help:"Treat pieces out of segment placement as in need of repair" default:"true"`
|
DoPlacementCheck bool `help:"Treat pieces out of segment placement as in need of repair" default:"true"`
|
||||||
}
|
}
|
||||||
|
@ -51,13 +51,12 @@ type Observer struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// NewObserver creates new checker observer instance.
|
// NewObserver creates new checker observer instance.
|
||||||
// TODO move excludedCountries into config but share it somehow with segment repairer.
|
func NewObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *overlay.Service, placementRules overlay.PlacementRules, config Config) *Observer {
|
||||||
func NewObserver(logger *zap.Logger, repairQueue queue.RepairQueue, overlay *overlay.Service, config Config, placementRules overlay.PlacementRules, excludedCountries []string) *Observer {
|
|
||||||
return &Observer{
|
return &Observer{
|
||||||
logger: logger,
|
logger: logger,
|
||||||
|
|
||||||
repairQueue: repairQueue,
|
repairQueue: repairQueue,
|
||||||
nodesCache: NewReliabilityCache(overlay, config.ReliabilityCacheStaleness, placementRules, excludedCountries),
|
nodesCache: NewReliabilityCache(overlay, config.ReliabilityCacheStaleness, placementRules, config.RepairExcludedCountryCodes),
|
||||||
overlayService: overlay,
|
overlayService: overlay,
|
||||||
repairOverrides: config.RepairOverrides.GetMap(),
|
repairOverrides: config.RepairOverrides.GetMap(),
|
||||||
nodeFailureRate: config.NodeFailureRate,
|
nodeFailureRate: config.NodeFailureRate,
|
||||||
|
@ -555,7 +555,7 @@ func BenchmarkRemoteSegment(b *testing.B) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
observer := checker.NewObserver(zap.NewNop(), planet.Satellites[0].DB.RepairQueue(),
|
observer := checker.NewObserver(zap.NewNop(), planet.Satellites[0].DB.RepairQueue(),
|
||||||
planet.Satellites[0].Auditor.Overlay, planet.Satellites[0].Config.Checker, overlay.NewPlacementRules().CreateFilters, []string{})
|
planet.Satellites[0].Auditor.Overlay, overlay.NewPlacementRules().CreateFilters, planet.Satellites[0].Config.Checker)
|
||||||
segments, err := planet.Satellites[0].Metabase.DB.TestingAllSegments(ctx)
|
segments, err := planet.Satellites[0].Metabase.DB.TestingAllSegments(ctx)
|
||||||
require.NoError(b, err)
|
require.NoError(b, err)
|
||||||
|
|
||||||
|
@ -36,6 +36,7 @@ type Config struct {
|
|||||||
InMemoryRepair bool `help:"whether to download pieces for repair in memory (true) or download to disk (false)" default:"false"`
|
InMemoryRepair bool `help:"whether to download pieces for repair in memory (true) or download to disk (false)" default:"false"`
|
||||||
ReputationUpdateEnabled bool `help:"whether the audit score of nodes should be updated as a part of repair" default:"false"`
|
ReputationUpdateEnabled bool `help:"whether the audit score of nodes should be updated as a part of repair" default:"false"`
|
||||||
UseRangedLoop bool `help:"whether to enable repair checker observer with ranged loop" default:"true"`
|
UseRangedLoop bool `help:"whether to enable repair checker observer with ranged loop" default:"true"`
|
||||||
|
RepairExcludedCountryCodes []string `help:"list of country codes to treat node from this country as offline" default:"" hidden:"true"`
|
||||||
DoDeclumping bool `help:"repair pieces on the same network to other nodes" default:"false"`
|
DoDeclumping bool `help:"repair pieces on the same network to other nodes" default:"false"`
|
||||||
DoPlacementCheck bool `help:"repair pieces out of segment placement" default:"true"`
|
DoPlacementCheck bool `help:"repair pieces out of segment placement" default:"true"`
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,7 @@ import (
|
|||||||
|
|
||||||
"storj.io/common/pb"
|
"storj.io/common/pb"
|
||||||
"storj.io/common/storj"
|
"storj.io/common/storj"
|
||||||
|
"storj.io/common/storj/location"
|
||||||
"storj.io/common/sync2"
|
"storj.io/common/sync2"
|
||||||
"storj.io/storj/satellite/audit"
|
"storj.io/storj/satellite/audit"
|
||||||
"storj.io/storj/satellite/metabase"
|
"storj.io/storj/satellite/metabase"
|
||||||
@ -99,6 +100,8 @@ type SegmentRepairer struct {
|
|||||||
// repairOverrides is the set of values configured by the checker to override the repair threshold for various RS schemes.
|
// repairOverrides is the set of values configured by the checker to override the repair threshold for various RS schemes.
|
||||||
repairOverrides checker.RepairOverridesMap
|
repairOverrides checker.RepairOverridesMap
|
||||||
|
|
||||||
|
excludedCountryCodes map[location.CountryCode]struct{}
|
||||||
|
|
||||||
nowFn func() time.Time
|
nowFn func() time.Time
|
||||||
OnTestingCheckSegmentAlteredHook func()
|
OnTestingCheckSegmentAlteredHook func()
|
||||||
OnTestingPiecesReportHook func(pieces FetchResultReport)
|
OnTestingPiecesReportHook func(pieces FetchResultReport)
|
||||||
@ -127,6 +130,13 @@ func NewSegmentRepairer(
|
|||||||
excessOptimalThreshold = 0
|
excessOptimalThreshold = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
excludedCountryCodes := make(map[location.CountryCode]struct{})
|
||||||
|
for _, countryCode := range config.RepairExcludedCountryCodes {
|
||||||
|
if cc := location.ToCountryCode(countryCode); cc != location.None {
|
||||||
|
excludedCountryCodes[cc] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return &SegmentRepairer{
|
return &SegmentRepairer{
|
||||||
log: log,
|
log: log,
|
||||||
statsCollector: newStatsCollector(),
|
statsCollector: newStatsCollector(),
|
||||||
@ -137,6 +147,7 @@ func NewSegmentRepairer(
|
|||||||
timeout: config.Timeout,
|
timeout: config.Timeout,
|
||||||
multiplierOptimalThreshold: 1 + excessOptimalThreshold,
|
multiplierOptimalThreshold: 1 + excessOptimalThreshold,
|
||||||
repairOverrides: repairOverrides.GetMap(),
|
repairOverrides: repairOverrides.GetMap(),
|
||||||
|
excludedCountryCodes: excludedCountryCodes,
|
||||||
reporter: reporter,
|
reporter: reporter,
|
||||||
reputationUpdateEnabled: config.ReputationUpdateEnabled,
|
reputationUpdateEnabled: config.ReputationUpdateEnabled,
|
||||||
doDeclumping: config.DoDeclumping,
|
doDeclumping: config.DoDeclumping,
|
||||||
@ -223,13 +234,6 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
|
|||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
piecesInExcludedCountries, err := repairer.overlay.GetReliablePiecesInExcludedCountries(ctx, pieces)
|
|
||||||
if err != nil {
|
|
||||||
return false, overlayQueryError.New("error identifying pieces in excluded countries: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
numHealthyInExcludedCountries := len(piecesInExcludedCountries)
|
|
||||||
|
|
||||||
// ensure we get values, even if only zero values, so that redash can have an alert based on this
|
// ensure we get values, even if only zero values, so that redash can have an alert based on this
|
||||||
mon.Counter("repairer_segments_below_min_req").Inc(0) //mon:locked
|
mon.Counter("repairer_segments_below_min_req").Inc(0) //mon:locked
|
||||||
stats.repairerSegmentsBelowMinReq.Inc(0)
|
stats.repairerSegmentsBelowMinReq.Inc(0)
|
||||||
@ -248,7 +252,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
|
|||||||
}
|
}
|
||||||
|
|
||||||
// repair not needed
|
// repair not needed
|
||||||
if numHealthy-numHealthyInExcludedCountries > int(repairThreshold) {
|
if numHealthy-piecesCheck.NumHealthyInExcludedCountries > int(repairThreshold) {
|
||||||
// remove pieces out of placement without repairing as we are above repair threshold
|
// remove pieces out of placement without repairing as we are above repair threshold
|
||||||
if len(piecesCheck.OutOfPlacementPiecesSet) > 0 {
|
if len(piecesCheck.OutOfPlacementPiecesSet) > 0 {
|
||||||
|
|
||||||
@ -348,12 +352,12 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
|
|||||||
var minSuccessfulNeeded int
|
var minSuccessfulNeeded int
|
||||||
{
|
{
|
||||||
totalNeeded := math.Ceil(float64(redundancy.OptimalThreshold()) * repairer.multiplierOptimalThreshold)
|
totalNeeded := math.Ceil(float64(redundancy.OptimalThreshold()) * repairer.multiplierOptimalThreshold)
|
||||||
requestCount = int(totalNeeded) + numHealthyInExcludedCountries
|
requestCount = int(totalNeeded) + piecesCheck.NumHealthyInExcludedCountries
|
||||||
if requestCount > redundancy.TotalCount() {
|
if requestCount > redundancy.TotalCount() {
|
||||||
requestCount = redundancy.TotalCount()
|
requestCount = redundancy.TotalCount()
|
||||||
}
|
}
|
||||||
requestCount -= numHealthy
|
requestCount -= numHealthy
|
||||||
minSuccessfulNeeded = redundancy.OptimalThreshold() - numHealthy + numHealthyInExcludedCountries
|
minSuccessfulNeeded = redundancy.OptimalThreshold() - numHealthy + piecesCheck.NumHealthyInExcludedCountries
|
||||||
}
|
}
|
||||||
|
|
||||||
// Request Overlay for n-h new storage nodes
|
// Request Overlay for n-h new storage nodes
|
||||||
@ -368,7 +372,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Create the order limits for the PUT_REPAIR action
|
// Create the order limits for the PUT_REPAIR action
|
||||||
putLimits, putPrivateKey, err := repairer.orders.CreatePutRepairOrderLimits(ctx, segment, getOrderLimits, healthySet, newNodes, repairer.multiplierOptimalThreshold, numHealthyInExcludedCountries)
|
putLimits, putPrivateKey, err := repairer.orders.CreatePutRepairOrderLimits(ctx, segment, getOrderLimits, healthySet, newNodes, repairer.multiplierOptimalThreshold, piecesCheck.NumHealthyInExcludedCountries)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, orderLimitFailureError.New("could not create PUT_REPAIR order limits: %w", err)
|
return false, orderLimitFailureError.New("could not create PUT_REPAIR order limits: %w", err)
|
||||||
}
|
}
|
||||||
@ -633,7 +637,7 @@ func (repairer *SegmentRepairer) Repair(ctx context.Context, queueSegment *queue
|
|||||||
zap.Uint64("Position", segment.Position.Encode()),
|
zap.Uint64("Position", segment.Position.Encode()),
|
||||||
zap.Int("clumped pieces", len(piecesCheck.ClumpedPiecesSet)),
|
zap.Int("clumped pieces", len(piecesCheck.ClumpedPiecesSet)),
|
||||||
zap.Int("out of placement pieces", len(piecesCheck.OutOfPlacementPiecesSet)),
|
zap.Int("out of placement pieces", len(piecesCheck.OutOfPlacementPiecesSet)),
|
||||||
zap.Int("in excluded countries", numHealthyInExcludedCountries),
|
zap.Int("in excluded countries", piecesCheck.NumHealthyInExcludedCountries),
|
||||||
zap.Int("removed pieces", len(toRemove)),
|
zap.Int("removed pieces", len(toRemove)),
|
||||||
zap.Int("repaired pieces", len(repairedPieces)),
|
zap.Int("repaired pieces", len(repairedPieces)),
|
||||||
zap.Int("healthy before repair", numHealthy),
|
zap.Int("healthy before repair", numHealthy),
|
||||||
@ -649,12 +653,14 @@ type piecesCheckResult struct {
|
|||||||
OutOfPlacementPiecesSet map[uint16]bool
|
OutOfPlacementPiecesSet map[uint16]bool
|
||||||
|
|
||||||
NumUnhealthyRetrievable int
|
NumUnhealthyRetrievable int
|
||||||
|
NumHealthyInExcludedCountries int
|
||||||
}
|
}
|
||||||
|
|
||||||
func (repairer *SegmentRepairer) classifySegmentPieces(ctx context.Context, segment metabase.Segment) (result piecesCheckResult, err error) {
|
func (repairer *SegmentRepairer) classifySegmentPieces(ctx context.Context, segment metabase.Segment) (result piecesCheckResult, err error) {
|
||||||
defer mon.Task()(&ctx)(&err)
|
defer mon.Task()(&ctx)(&err)
|
||||||
|
|
||||||
pieces := segment.Pieces
|
pieces := segment.Pieces
|
||||||
|
placement := segment.Placement
|
||||||
|
|
||||||
allNodeIDs := make([]storj.NodeID, len(pieces))
|
allNodeIDs := make([]storj.NodeID, len(pieces))
|
||||||
nodeIDPieceMap := map[storj.NodeID]uint16{}
|
nodeIDPieceMap := map[storj.NodeID]uint16{}
|
||||||
@ -674,6 +680,12 @@ func (repairer *SegmentRepairer) classifySegmentPieces(ctx context.Context, segm
|
|||||||
|
|
||||||
// remove online nodes from missing pieces
|
// remove online nodes from missing pieces
|
||||||
for _, onlineNode := range online {
|
for _, onlineNode := range online {
|
||||||
|
// count online nodes in excluded countries only if country is not excluded by segment
|
||||||
|
// placement, those nodes will be counted with out of placement check
|
||||||
|
if _, excluded := repairer.excludedCountryCodes[onlineNode.CountryCode]; excluded && placement.AllowedCountry(onlineNode.CountryCode) {
|
||||||
|
result.NumHealthyInExcludedCountries++
|
||||||
|
}
|
||||||
|
|
||||||
pieceNum := nodeIDPieceMap[onlineNode.ID]
|
pieceNum := nodeIDPieceMap[onlineNode.ID]
|
||||||
delete(result.MissingPiecesSet, pieceNum)
|
delete(result.MissingPiecesSet, pieceNum)
|
||||||
}
|
}
|
||||||
@ -705,7 +717,7 @@ func (repairer *SegmentRepairer) classifySegmentPieces(ctx context.Context, segm
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if repairer.doPlacementCheck && segment.Placement != storj.EveryCountry {
|
if repairer.doPlacementCheck && placement != storj.EveryCountry {
|
||||||
result.OutOfPlacementPiecesSet = map[uint16]bool{}
|
result.OutOfPlacementPiecesSet = map[uint16]bool{}
|
||||||
|
|
||||||
nodeFilters := repairer.placementRules(segment.Placement)
|
nodeFilters := repairer.placementRules(segment.Placement)
|
||||||
|
@ -216,7 +216,7 @@ func TestSegmentRepairPlacementAndClumped(t *testing.T) {
|
|||||||
|
|
||||||
func TestSegmentRepairPlacementNotEnoughNodes(t *testing.T) {
|
func TestSegmentRepairPlacementNotEnoughNodes(t *testing.T) {
|
||||||
testplanet.Run(t, testplanet.Config{
|
testplanet.Run(t, testplanet.Config{
|
||||||
SatelliteCount: 1, StorageNodeCount: 8, UplinkCount: 1,
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
||||||
Reconfigure: testplanet.Reconfigure{
|
Reconfigure: testplanet.Reconfigure{
|
||||||
Satellite: testplanet.ReconfigureRS(1, 2, 4, 4),
|
Satellite: testplanet.ReconfigureRS(1, 2, 4, 4),
|
||||||
},
|
},
|
||||||
@ -255,11 +255,65 @@ func TestSegmentRepairPlacementNotEnoughNodes(t *testing.T) {
|
|||||||
StreamID: segments[0].StreamID,
|
StreamID: segments[0].StreamID,
|
||||||
Position: segments[0].Position,
|
Position: segments[0].Position,
|
||||||
})
|
})
|
||||||
require.Error(t, err)
|
require.True(t, overlay.ErrNotEnoughNodes.Has(err))
|
||||||
require.False(t, shouldDelete)
|
require.False(t, shouldDelete)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSegmentRepairPlacementAndExcludedCountries(t *testing.T) {
|
||||||
|
testplanet.Run(t, testplanet.Config{
|
||||||
|
SatelliteCount: 1, StorageNodeCount: 4, UplinkCount: 1,
|
||||||
|
Reconfigure: testplanet.Reconfigure{
|
||||||
|
Satellite: testplanet.Combine(
|
||||||
|
testplanet.ReconfigureRS(1, 2, 4, 4),
|
||||||
|
func(log *zap.Logger, index int, config *satellite.Config) {
|
||||||
|
config.Overlay.RepairExcludedCountryCodes = []string{"US"}
|
||||||
|
},
|
||||||
|
),
|
||||||
|
},
|
||||||
|
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
|
||||||
|
require.NoError(t, planet.Uplinks[0].CreateBucket(ctx, planet.Satellites[0], "testbucket"))
|
||||||
|
|
||||||
|
_, err := planet.Satellites[0].API.Buckets.Service.UpdateBucket(ctx, buckets.Bucket{
|
||||||
|
ProjectID: planet.Uplinks[0].Projects[0].ID,
|
||||||
|
Name: "testbucket",
|
||||||
|
Placement: storj.EU,
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
for _, node := range planet.StorageNodes {
|
||||||
|
require.NoError(t, planet.Satellites[0].Overlay.Service.TestNodeCountryCode(ctx, node.ID(), "PL"))
|
||||||
|
}
|
||||||
|
|
||||||
|
err = planet.Uplinks[0].Upload(ctx, planet.Satellites[0], "testbucket", "object", testrand.Bytes(5*memory.KiB))
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
segments, err := planet.Satellites[0].Metabase.DB.TestingAllSegments(ctx)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Len(t, segments, 1)
|
||||||
|
require.Len(t, segments[0].Pieces, 4)
|
||||||
|
|
||||||
|
// change single node to location outside bucket placement and location which is part of RepairExcludedCountryCodes
|
||||||
|
require.NoError(t, planet.Satellites[0].Overlay.Service.TestNodeCountryCode(ctx, segments[0].Pieces[0].StorageNode, "US"))
|
||||||
|
|
||||||
|
require.NoError(t, planet.Satellites[0].Repairer.Overlay.DownloadSelectionCache.Refresh(ctx))
|
||||||
|
|
||||||
|
shouldDelete, err := planet.Satellites[0].Repairer.SegmentRepairer.Repair(ctx, &queue.InjuredSegment{
|
||||||
|
StreamID: segments[0].StreamID,
|
||||||
|
Position: segments[0].Position,
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.True(t, shouldDelete)
|
||||||
|
|
||||||
|
// we are checking that repairer counted only single piece as out of placement and didn't count this piece
|
||||||
|
// also as from excluded country. That would cause full repair because repairer would count single pieces
|
||||||
|
// as unhealthy two times. Full repair would restore number of pieces to 4 but we just removed single pieces.
|
||||||
|
segmentsAfter, err := planet.Satellites[0].Metabase.DB.TestingAllSegments(ctx)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.ElementsMatch(t, segments[0].Pieces[1:], segmentsAfter[0].Pieces)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func allPiecesInPlacement(ctx context.Context, overaly *overlay.Service, pieces metabase.Pieces, placement storj.PlacementConstraint) (bool, error) {
|
func allPiecesInPlacement(ctx context.Context, overaly *overlay.Service, pieces metabase.Pieces, placement storj.PlacementConstraint) (bool, error) {
|
||||||
for _, piece := range pieces {
|
for _, piece := range pieces {
|
||||||
nodeDossier, err := overaly.Get(ctx, piece.StorageNode)
|
nodeDossier, err := overaly.Get(ctx, piece.StorageNode)
|
||||||
|
@ -211,6 +211,10 @@ func NewRepairer(log *zap.Logger, full *identity.FullIdentity,
|
|||||||
config.Repairer.DownloadTimeout,
|
config.Repairer.DownloadTimeout,
|
||||||
config.Repairer.InMemoryRepair)
|
config.Repairer.InMemoryRepair)
|
||||||
|
|
||||||
|
if len(config.Repairer.RepairExcludedCountryCodes) == 0 {
|
||||||
|
config.Repairer.RepairExcludedCountryCodes = config.Overlay.RepairExcludedCountryCodes
|
||||||
|
}
|
||||||
|
|
||||||
peer.SegmentRepairer = repairer.NewSegmentRepairer(
|
peer.SegmentRepairer = repairer.NewSegmentRepairer(
|
||||||
log.Named("segment-repair"),
|
log.Named("segment-repair"),
|
||||||
metabaseDB,
|
metabaseDB,
|
||||||
|
@ -386,80 +386,6 @@ func (cache *overlaycache) UpdateLastOfflineEmail(ctx context.Context, nodeIDs s
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// KnownReliableInExcludedCountries filters healthy nodes that are in excluded countries.
|
|
||||||
func (cache *overlaycache) KnownReliableInExcludedCountries(ctx context.Context, criteria *overlay.NodeCriteria, nodeIDs storj.NodeIDList) (reliableInExcluded storj.NodeIDList, err error) {
|
|
||||||
for {
|
|
||||||
reliableInExcluded, err = cache.knownReliableInExcludedCountries(ctx, criteria, nodeIDs)
|
|
||||||
if err != nil {
|
|
||||||
if cockroachutil.NeedsRetry(err) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
return reliableInExcluded, err
|
|
||||||
}
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
return reliableInExcluded, err
|
|
||||||
}
|
|
||||||
|
|
||||||
func (cache *overlaycache) knownReliableInExcludedCountries(ctx context.Context, criteria *overlay.NodeCriteria, nodeIDs storj.NodeIDList) (reliableInExcluded storj.NodeIDList, err error) {
|
|
||||||
defer mon.Task()(&ctx)(&err)
|
|
||||||
|
|
||||||
if len(nodeIDs) == 0 {
|
|
||||||
return nil, Error.New("no ids provided")
|
|
||||||
}
|
|
||||||
|
|
||||||
args := []interface{}{
|
|
||||||
pgutil.NodeIDArray(nodeIDs),
|
|
||||||
time.Now().Add(-criteria.OnlineWindow),
|
|
||||||
}
|
|
||||||
|
|
||||||
// When this config is not set, it's a string slice with one empty string. This is a sanity check just
|
|
||||||
// in case for some reason it's nil or has no elements.
|
|
||||||
if criteria.ExcludedCountries == nil || len(criteria.ExcludedCountries) == 0 {
|
|
||||||
return reliableInExcluded, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
var excludedCountriesCondition string
|
|
||||||
if criteria.ExcludedCountries[0] == "" {
|
|
||||||
return reliableInExcluded, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
excludedCountriesCondition = "AND country_code IN (SELECT UNNEST($3::TEXT[]))"
|
|
||||||
args = append(args, pgutil.TextArray(criteria.ExcludedCountries))
|
|
||||||
|
|
||||||
// get reliable and online nodes
|
|
||||||
var rows tagsql.Rows
|
|
||||||
rows, err = cache.db.Query(ctx, cache.db.Rebind(`
|
|
||||||
SELECT id
|
|
||||||
FROM nodes
|
|
||||||
`+cache.db.impl.AsOfSystemInterval(criteria.AsOfSystemInterval)+`
|
|
||||||
WHERE id = any($1::bytea[])
|
|
||||||
AND disqualified IS NULL
|
|
||||||
AND unknown_audit_suspended IS NULL
|
|
||||||
AND offline_suspended IS NULL
|
|
||||||
AND exit_finished_at IS NULL
|
|
||||||
AND last_contact_success > $2
|
|
||||||
`+excludedCountriesCondition+`
|
|
||||||
`), args...,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
defer func() { err = errs.Combine(err, rows.Close()) }()
|
|
||||||
|
|
||||||
for rows.Next() {
|
|
||||||
var id storj.NodeID
|
|
||||||
err = rows.Scan(&id)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
reliableInExcluded = append(reliableInExcluded, id)
|
|
||||||
}
|
|
||||||
|
|
||||||
return reliableInExcluded, Error.Wrap(rows.Err())
|
|
||||||
}
|
|
||||||
|
|
||||||
// KnownReliable filters a set of nodes to reliable nodes. List is split into online and offline nodes.
|
// KnownReliable filters a set of nodes to reliable nodes. List is split into online and offline nodes.
|
||||||
func (cache *overlaycache) KnownReliable(ctx context.Context, nodeIDs storj.NodeIDList, onlineWindow, asOfSystemInterval time.Duration) (online []nodeselection.SelectedNode, offline []nodeselection.SelectedNode, err error) {
|
func (cache *overlaycache) KnownReliable(ctx context.Context, nodeIDs storj.NodeIDList, onlineWindow, asOfSystemInterval time.Duration) (online []nodeselection.SelectedNode, offline []nodeselection.SelectedNode, err error) {
|
||||||
for {
|
for {
|
||||||
|
Loading…
Reference in New Issue
Block a user