satellite/durability: use single classifier per observer instance

the new bus_factor calculation doesn't make sense when mixing different classes, as the classes can overlap.

For example: it can report a risk if we lose one country and one unrelated subnet (which may overlap).

It's better to calculate the stat and bus_factor per class (net, country, ...).

It also makes it easier to measure execution time per class.

Change-Id: I7d4d5f7cb811cd50c5831077b43e001908aab96b
This commit is contained in:
Márton Elek 2023-11-20 09:58:58 +01:00 committed by Storj Robot
parent 7c25e3733a
commit 0f4f1ddde8
5 changed files with 88 additions and 72 deletions

View File

@ -75,12 +75,12 @@ type ReportConfig struct {
type Report struct { type Report struct {
healthStat map[string]*HealthStat healthStat map[string]*HealthStat
busFactor HealthStat busFactor HealthStat
classifiers []NodeClassifier classifier NodeClassifier
aliasMap *metabase.NodeAliasMap aliasMap *metabase.NodeAliasMap
nodes map[storj.NodeID]*nodeselection.SelectedNode nodes map[storj.NodeID]*nodeselection.SelectedNode
db overlay.DB db overlay.DB
metabaseDB *metabase.DB metabaseDB *metabase.DB
reporter func(n time.Time, name string, stat *HealthStat) reporter func(n time.Time, class string, value string, stat *HealthStat)
reportThreshold int reportThreshold int
busFactorThreshold int busFactorThreshold int
asOfSystemInterval time.Duration asOfSystemInterval time.Duration
@ -90,17 +90,19 @@ type Report struct {
className map[classID]string className map[classID]string
// contains the available classes for each node alias. // contains the available classes for each node alias.
classified [][]classID classified []classID
maxPieceCount int maxPieceCount int
class string
} }
// NewDurability creates the new instance. // NewDurability creates the new instance.
func NewDurability(db overlay.DB, metabaseDB *metabase.DB, classifiers []NodeClassifier, maxPieceCount int, reportThreshold int, busFactorThreshold int, asOfSystemInterval time.Duration) *Report { func NewDurability(db overlay.DB, metabaseDB *metabase.DB, class string, classifier NodeClassifier, maxPieceCount int, reportThreshold int, busFactorThreshold int, asOfSystemInterval time.Duration) *Report {
return &Report{ return &Report{
class: class,
db: db, db: db,
metabaseDB: metabaseDB, metabaseDB: metabaseDB,
classifiers: classifiers, classifier: classifier,
reportThreshold: reportThreshold, reportThreshold: reportThreshold,
busFactorThreshold: busFactorThreshold, busFactorThreshold: busFactorThreshold,
asOfSystemInterval: asOfSystemInterval, asOfSystemInterval: asOfSystemInterval,
@ -137,35 +139,33 @@ func (c *Report) resetStat() {
} }
func (c *Report) classifyNodeAliases() { func (c *Report) classifyNodeAliases() {
c.classID = make(map[string]classID, len(c.classifiers)) c.classID = make(map[string]classID)
c.className = make(map[classID]string, len(c.classifiers)) c.className = make(map[classID]string)
c.classified = make([][]classID, c.aliasMap.Max()+1) c.classID["unclassified"] = 1
c.className[0] = "unclassified"
c.classified = make([]classID, c.aliasMap.Max()+1)
for _, node := range c.nodes { for _, node := range c.nodes {
alias, ok := c.aliasMap.Alias(node.ID) alias, ok := c.aliasMap.Alias(node.ID)
if !ok { if !ok {
continue continue
} }
classes := make([]classID, len(c.classifiers)) class := c.classifier(node)
for i, group := range c.classifiers {
class := group(node)
id, ok := c.classID[class] id, ok := c.classID[class]
if !ok { if !ok {
id = classID(len(c.classID)) id = classID(len(c.classID))
c.className[id] = class c.className[id] = class
c.classID[class] = id c.classID[class] = id
} }
classes[i] = id c.classified[alias] = id
}
c.classified[alias] = classes
} }
} }
// Fork implements rangedloop.Observer. // Fork implements rangedloop.Observer.
func (c *Report) Fork(ctx context.Context) (rangedloop.Partial, error) { func (c *Report) Fork(ctx context.Context) (rangedloop.Partial, error) {
d := &ObserverFork{ d := &ObserverFork{
classifiers: c.classifiers,
aliasMap: c.aliasMap, aliasMap: c.aliasMap,
nodes: c.nodes, nodes: c.nodes,
classifierCache: make([][]string, c.aliasMap.Max()+1), classifierCache: make([][]string, c.aliasMap.Max()+1),
@ -206,16 +206,21 @@ func (c *Report) Join(ctx context.Context, partial rangedloop.Partial) (err erro
func (c *Report) Finish(ctx context.Context) error { func (c *Report) Finish(ctx context.Context) error {
reportTime := time.Now() reportTime := time.Now()
for name, stat := range c.healthStat { for name, stat := range c.healthStat {
c.reporter(reportTime, name, stat) c.reporter(reportTime, c.class, name, stat)
} }
return nil return nil
} }
// TestChangeReporter modifies the reporter for unit tests. // TestChangeReporter modifies the reporter for unit tests.
func (c *Report) TestChangeReporter(r func(n time.Time, name string, stat *HealthStat)) { func (c *Report) TestChangeReporter(r func(n time.Time, class string, name string, stat *HealthStat)) {
c.reporter = r c.reporter = r
} }
// GetClass return with the class instance name (like last_net or country).
func (c *Report) GetClass() string {
return c.class
}
// classID is a fork level short identifier for each class. // classID is a fork level short identifier for each class.
type classID int32 type classID int32
@ -225,7 +230,6 @@ type ObserverFork struct {
healthStat []HealthStat healthStat []HealthStat
busFactor HealthStat busFactor HealthStat
classifiers []NodeClassifier
aliasMap *metabase.NodeAliasMap aliasMap *metabase.NodeAliasMap
nodes map[storj.NodeID]*nodeselection.SelectedNode nodes map[storj.NodeID]*nodeselection.SelectedNode
classifierCache [][]string classifierCache [][]string
@ -234,7 +238,7 @@ type ObserverFork struct {
reportThreshold int reportThreshold int
busFactorThreshold int busFactorThreshold int
classified [][]classID classified []classID
} }
// Process implements rangedloop.Partial. // Process implements rangedloop.Partial.
@ -254,16 +258,15 @@ func (c *ObserverFork) Process(ctx context.Context, segments []rangedloop.Segmen
continue continue
} }
classes := c.classified[piece.Alias] class := c.classified[piece.Alias]
// unavailable/offline nodes were not classified // unavailable/offline nodes were not classified
if len(classes) > 0 { if class > 0 {
healthyPieceCount++ healthyPieceCount++
} }
for _, class := range classes {
controlledByClass[class]++ controlledByClass[class]++
}
} }
busFactorGroups := c.busFactorCache busFactorGroups := c.busFactorCache
@ -307,9 +310,10 @@ func (c *ObserverFork) Process(ctx context.Context, segments []rangedloop.Segmen
return nil return nil
} }
func reportToEventkit(n time.Time, name string, stat *HealthStat) { func reportToEventkit(n time.Time, class string, name string, stat *HealthStat) {
ek.Event("durability", ek.Event("durability",
eventkit.String("name", name), eventkit.String("class", class),
eventkit.String("value", name),
eventkit.String("exemplar", stat.Exemplar), eventkit.String("exemplar", stat.Exemplar),
eventkit.Timestamp("report_time", n), eventkit.Timestamp("report_time", n),
eventkit.Int64("min", int64(stat.Min())), eventkit.Int64("min", int64(stat.Min())),

View File

@ -64,17 +64,19 @@ func TestDurabilityIntegration(t *testing.T) {
} }
result := make(map[string]*durability.HealthStat) result := make(map[string]*durability.HealthStat)
planet.Satellites[0].RangedLoop.DurabilityReport.Observer.TestChangeReporter(func(n time.Time, name string, stat *durability.HealthStat) { for _, observer := range planet.Satellites[0].RangedLoop.DurabilityReport.Observer {
result[name] = stat observer.TestChangeReporter(func(n time.Time, class string, value string, stat *durability.HealthStat) {
result[value] = stat
}) })
}
rangedLoopService := planet.Satellites[0].RangedLoop.RangedLoop.Service rangedLoopService := planet.Satellites[0].RangedLoop.RangedLoop.Service
_, err := rangedLoopService.RunOnce(ctx) _, err := rangedLoopService.RunOnce(ctx)
require.Len(t, result, 15) require.Len(t, result, 15)
// one or two pieces are controlled out of the 5-6 --> 3 or 4 pieces are available without HU nodes // one or two pieces are controlled out of the 5-6 --> 3 or 4 pieces are available without HU nodes
require.True(t, result["c:HU"].Min() > 2) require.True(t, result["HU"].Min() > 2)
require.True(t, result["c:HU"].Min() < 5) require.True(t, result["HU"].Min() < 5)
require.NoError(t, err) require.NoError(t, err)
}) })
} }

View File

@ -64,10 +64,9 @@ func TestDurability(t *testing.T) {
} }
ctx := testcontext.New(t) ctx := testcontext.New(t)
c := NewDurability(nil, nil, []NodeClassifier{ c := NewDurability(nil, nil, "net", func(node *nodeselection.SelectedNode) string {
func(node *nodeselection.SelectedNode) string { return node.LastNet
return "net:" + node.LastNet }, 110, 0, 0, 0)
}}, 110, 0, 0, 0)
c.aliasMap = metabase.NewNodeAliasMap(aliases) c.aliasMap = metabase.NewNodeAliasMap(aliases)
for _, node := range storageNodes { for _, node := range storageNodes {
@ -98,11 +97,11 @@ func TestDurability(t *testing.T) {
require.NoError(t, err) require.NoError(t, err)
} }
require.NotNil(t, c.healthStat["net:127.0.0.0"]) require.NotNil(t, c.healthStat["127.0.0.0"])
require.Equal(t, 1, c.healthStat["net:127.0.0.0"].Min()) require.Equal(t, 1, c.healthStat["127.0.0.0"].Min())
require.Equal(t, segment1.StreamID.String()+"/0", c.healthStat["net:127.0.0.0"].Exemplar) require.Equal(t, segment1.StreamID.String()+"/0", c.healthStat["127.0.0.0"].Exemplar)
require.Equal(t, 2, c.healthStat["net:127.0.1.0"].Min()) require.Equal(t, 2, c.healthStat["127.0.1.0"].Min())
require.Equal(t, 3, c.healthStat["net:127.0.2.0"].Min()) require.Equal(t, 3, c.healthStat["127.0.2.0"].Min())
// usually called with c.Start() // usually called with c.Start()
c.resetStat() c.resetStat()
@ -113,8 +112,7 @@ func TestDurability(t *testing.T) {
require.NoError(t, err) require.NoError(t, err)
// second run supposed to have zero stat. // second run supposed to have zero stat.
require.Nil(t, c.healthStat["net:127.0.0.0"]) require.Nil(t, c.healthStat["127.0.0.0"])
} }
func TestDurabilityUnknownNode(t *testing.T) { func TestDurabilityUnknownNode(t *testing.T) {
@ -132,10 +130,9 @@ func TestDurabilityUnknownNode(t *testing.T) {
}) })
ctx := testcontext.New(t) ctx := testcontext.New(t)
c := NewDurability(nil, nil, []NodeClassifier{ c := NewDurability(nil, nil, "net", func(node *nodeselection.SelectedNode) string {
func(node *nodeselection.SelectedNode) string { return node.LastNet
return "net:" + node.LastNet }, 110, 0, 0, 0)
}}, 110, 0, 0, 0)
c.aliasMap = metabase.NewNodeAliasMap(aliases) c.aliasMap = metabase.NewNodeAliasMap(aliases)
for _, node := range storageNodes { for _, node := range storageNodes {
@ -174,7 +171,7 @@ func TestDurabilityUnknownNode(t *testing.T) {
err = c.Join(ctx, fork) err = c.Join(ctx, fork)
require.NoError(t, err) require.NoError(t, err)
// note: the newly created node (alias 9999) is not considered. // note: the newly created node (alias 9999) is not considered.
require.Equal(t, 0, c.healthStat["net:127.0.0.1"].Min()) require.Equal(t, 0, c.healthStat["127.0.0.1"].Min())
} }
func TestBusFactor(t *testing.T) { func TestBusFactor(t *testing.T) {
@ -182,7 +179,7 @@ func TestBusFactor(t *testing.T) {
f := ObserverFork{} f := ObserverFork{}
for i := 0; i < 100; i++ { for i := 0; i < 100; i++ {
f.classified = append(f.classified, []classID{classID((i))}) f.classified = append(f.classified, classID(i))
} }
f.controlledByClassCache = make([]int32, 100) f.controlledByClassCache = make([]int32, 100)
f.busFactorCache = make([]int32, 300) f.busFactorCache = make([]int32, 300)
@ -295,11 +292,6 @@ func BenchmarkDurabilityProcess(b *testing.B) {
aliasMap: aliases, aliasMap: aliases,
nodes: nodeMap, nodes: nodeMap,
classifierCache: make([][]string, aliases.Max()), classifierCache: make([][]string, aliases.Max()),
classifiers: []NodeClassifier{
func(node *nodeselection.SelectedNode) string {
return "email:" + node.Email
},
},
} }
b.ResetTimer() b.ResetTimer()

View File

@ -19,7 +19,7 @@ var (
func sendObserverDurations(observerDurations []ObserverDuration) { func sendObserverDurations(observerDurations []ObserverDuration) {
for _, od := range observerDurations { for _, od := range observerDurations {
ev.Event("rangedloop", ev.Event("rangedloop",
eventkit.String("observer", fmt.Sprintf("%T", od.Observer)), eventkit.String("observer", observerName(od.Observer)),
eventkit.Duration("duration", od.Duration)) eventkit.Duration("duration", od.Duration))
} }
@ -44,7 +44,7 @@ func (o *completedObserverStats) Stats(cb func(key monkit.SeriesKey, field strin
// if there are no completed observers yet, no statistics will be sent // if there are no completed observers yet, no statistics will be sent
for _, observerDuration := range o.observerDurations { for _, observerDuration := range o.observerDurations {
key := monkit.NewSeriesKey("completed-observer-duration") key := monkit.NewSeriesKey("completed-observer-duration")
key = key.WithTag("observer", fmt.Sprintf("%T", observerDuration.Observer)) key = key.WithTag("observer", observerName(observerDuration.Observer))
cb(key, "duration", observerDuration.Duration.Seconds()) cb(key, "duration", observerDuration.Duration.Seconds())
} }
@ -57,3 +57,16 @@ func (o *completedObserverStats) setObserverDurations(observerDurations []Observ
o.observerDurations = observerDurations o.observerDurations = observerDurations
} }
type withClass interface {
GetClass() string
}
func observerName(o Observer) string {
name := fmt.Sprintf("%T", o)
// durability observers are per class instances.
if dr, ok := o.(withClass); ok {
name += fmt.Sprintf("[%s]", dr.GetClass())
}
return name
}

View File

@ -73,7 +73,7 @@ type RangedLoop struct {
} }
DurabilityReport struct { DurabilityReport struct {
Observer *durability.Report Observer []*durability.Report
} }
RangedLoop struct { RangedLoop struct {
@ -147,20 +147,23 @@ func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Conf
} }
{ // setup { // setup
peer.DurabilityReport.Observer = durability.NewDurability(db.OverlayCache(), metabaseDB, []durability.NodeClassifier{ classes := map[string]func(node *nodeselection.SelectedNode) string{
func(node *nodeselection.SelectedNode) string { "email": func(node *nodeselection.SelectedNode) string {
return "e:" + node.Email return node.Email
}, },
func(node *nodeselection.SelectedNode) string { "wallet": func(node *nodeselection.SelectedNode) string {
return "w:" + node.Wallet return node.Wallet
}, },
func(node *nodeselection.SelectedNode) string { "net": func(node *nodeselection.SelectedNode) string {
return "n:" + node.LastNet return node.LastNet
}, },
func(node *nodeselection.SelectedNode) string { "country": func(node *nodeselection.SelectedNode) string {
return "c:" + node.CountryCode.String() return node.CountryCode.String()
}, },
}, config.Metainfo.RS.Total, config.Metainfo.RS.Repair, config.Metainfo.RS.Repair-config.Metainfo.RS.Min, config.RangedLoop.AsOfSystemInterval) }
for class, f := range classes {
peer.DurabilityReport.Observer = append(peer.DurabilityReport.Observer, durability.NewDurability(db.OverlayCache(), metabaseDB, class, f, config.Metainfo.RS.Total, config.Metainfo.RS.Repair, config.Metainfo.RS.Repair-config.Metainfo.RS.Min, config.RangedLoop.AsOfSystemInterval))
}
} }
{ // setup overlay { // setup overlay
@ -226,7 +229,9 @@ func NewRangedLoop(log *zap.Logger, db DB, metabaseDB *metabase.DB, config *Conf
} }
if config.DurabilityReport.Enabled { if config.DurabilityReport.Enabled {
observers = append(observers, peer.DurabilityReport.Observer) for _, observer := range peer.DurabilityReport.Observer {
observers = append(observers, observer)
}
} }
segments := rangedloop.NewMetabaseRangeSplitter(metabaseDB, config.RangedLoop.AsOfSystemInterval, config.RangedLoop.BatchSize) segments := rangedloop.NewMetabaseRangeSplitter(metabaseDB, config.RangedLoop.AsOfSystemInterval, config.RangedLoop.BatchSize)