da9ca0c650
Satellites set their configuration values to default values using cfgstruct, however, it turns out our tests don't test these values at all! Instead, they have a completely separate definition system that is easy to forget about. As is to be expected, these values have drifted, and it appears in a few cases testplanet is testing unreasonable values that we won't see in production, or perhaps worse, features enabled in production were missed and weren't enabled in testplanet. This change makes it so all values are configured the same, systematic way, so it's easy to see when test values are different than dev values or release values, and it's harder to forget to enable features in testplanet. In terms of reviewing, this change should actually be fairly easy to review, considering private/testplanet/satellite.go keeps the current config system and the new one and confirms that they result in identical configurations, so you can be certain that nothing was missed and the config is all correct. You can also check the config lock to see what actual config values changed. Change-Id: I6715d0794887f577e21742afcf56fd2b9d12170e
75 lines
2.4 KiB
Go
75 lines
2.4 KiB
Go
// Copyright (C) 2020 Storj Labs, Inc.
|
|
// See LICENSE for copying information.
|
|
|
|
package straynodes
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
|
|
"github.com/spacemonkeygo/monkit/v3"
|
|
"go.uber.org/zap"
|
|
|
|
"storj.io/common/sync2"
|
|
"storj.io/storj/satellite/overlay"
|
|
)
|
|
|
|
// mon is the package-level monkit scope. Run uses it to instrument itself as
// a task and to record the "stray_nodes_dq_count" value.
var mon = monkit.Package()
|
|
|
|
// Config holds the tunable settings of the stray nodes chore.
//
// The struct tags carry the help text and the per-environment defaults
// (release, dev and, where present, test) for every setting.
type Config struct {
	// EnableDQ toggles disqualification of nodes that have not been contacted
	// in some time. NewChore and Run in this file do not read it.
	// NOTE(review): presumably whoever constructs the chore checks it — confirm.
	EnableDQ bool `help:"whether nodes will be disqualified if they have not been contacted in some time" releaseDefault:"true" devDefault:"true"`

	// Interval is how often the chore checks for stray nodes; NewChore uses it
	// as the period of the chore's Loop.
	Interval time.Duration `help:"how often to check for and DQ stray nodes" releaseDefault:"168h" devDefault:"5m" testDefault:"1m"`

	// MaxDurationWithoutContact is how long a node may go without contacting
	// the satellite before it is disqualified.
	MaxDurationWithoutContact time.Duration `help:"length of time a node can go without contacting satellite before being disqualified" releaseDefault:"720h" devDefault:"7200h" testDefault:"30s"`

	// Limit is the batch size handed to every disqualification query; the
	// chore keeps querying until a query affects fewer nodes than Limit.
	Limit int `help:"Max number of nodes to return in a single query. Chore will iterate until rows returned is less than limit" releaseDefault:"1000" devDefault:"1000"`
}
|
|
|
|
// Chore disqualifies stray nodes.
|
|
type Chore struct {
|
|
log *zap.Logger
|
|
cache overlay.DB
|
|
maxDurationWithoutContact time.Duration
|
|
limit int
|
|
Loop *sync2.Cycle
|
|
}
|
|
|
|
// NewChore creates a new stray nodes Chore.
|
|
func NewChore(log *zap.Logger, cache overlay.DB, config Config) *Chore {
|
|
return &Chore{
|
|
log: log,
|
|
cache: cache,
|
|
maxDurationWithoutContact: config.MaxDurationWithoutContact,
|
|
limit: config.Limit,
|
|
Loop: sync2.NewCycle(config.Interval),
|
|
}
|
|
}
|
|
|
|
// Run runs the chore.
|
|
func (chore *Chore) Run(ctx context.Context) (err error) {
|
|
defer mon.Task()(&ctx)(&err)
|
|
|
|
return chore.Loop.Run(ctx, func(ctx context.Context) error {
|
|
var total int
|
|
for {
|
|
n, err := chore.cache.DQNodesLastSeenBefore(ctx, time.Now().UTC().Add(-chore.maxDurationWithoutContact), chore.limit)
|
|
if err != nil {
|
|
chore.log.Error("error disqualifying stray nodes", zap.Error(err))
|
|
mon.IntVal("stray_nodes_dq_count").Observe(int64(total))
|
|
return nil
|
|
}
|
|
total += n
|
|
if n < chore.limit {
|
|
break
|
|
}
|
|
}
|
|
mon.IntVal("stray_nodes_dq_count").Observe(int64(total))
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// Close closes chore.
|
|
func (chore *Chore) Close() error {
|
|
chore.Loop.Close()
|
|
return nil
|
|
}
|