satellite/overlay: insert DQ node events for stray nodes

Change-Id: I99da11e506ab7f6bcebdb08a5815078a3297c932
This commit is contained in:
Cameron 2022-10-12 13:56:15 -04:00 committed by Storj Robot
parent 74ddfab810
commit cb0c359b81
6 changed files with 47 additions and 16 deletions

View File

@ -263,7 +263,7 @@ func New(log *zap.Logger, full *identity.FullIdentity, db DB,
})
if config.StrayNodes.EnableDQ {
peer.Overlay.DQStrayNodes = straynodes.NewChore(peer.Log.Named("overlay:dq-stray-nodes"), peer.Overlay.DB, config.StrayNodes)
peer.Overlay.DQStrayNodes = straynodes.NewChore(peer.Log.Named("overlay:dq-stray-nodes"), peer.Overlay.Service, config.StrayNodes)
peer.Services.Add(lifecycle.Item{
Name: "overlay:dq-stray-nodes",
Run: peer.Overlay.DQStrayNodes.Run,

View File

@ -63,7 +63,7 @@ func TestDQNodesLastSeenBefore(t *testing.T) {
// check that limit works, set limit = 1
// run twice to DQ nodes 1 and 2
for i := 0; i < 2; i++ {
n, err := cache.DQNodesLastSeenBefore(ctx, time.Now(), 1)
_, n, err := cache.DQNodesLastSeenBefore(ctx, time.Now(), 1)
require.NoError(t, err)
require.Equal(t, n, 1)
}
@ -81,7 +81,7 @@ func TestDQNodesLastSeenBefore(t *testing.T) {
// there should be no more nodes for DQ
// use higher limit to double check that DQ times
// for nodes 1 and 2 have not changed
n, err := cache.DQNodesLastSeenBefore(ctx, time.Now(), 100)
_, n, err := cache.DQNodesLastSeenBefore(ctx, time.Now(), 100)
require.NoError(t, err)
require.Equal(t, n, 0)

View File

@ -100,7 +100,7 @@ type DB interface {
// DQNodesLastSeenBefore disqualifies a limited number of nodes where last_contact_success < cutoff except those already disqualified
// or gracefully exited or where last_contact_success = '0001-01-01 00:00:00+00'.
DQNodesLastSeenBefore(ctx context.Context, cutoff time.Time, limit int) (count int, err error)
DQNodesLastSeenBefore(ctx context.Context, cutoff time.Time, limit int) (nodeEmails map[storj.NodeID]string, count int, err error)
// TestSuspendNodeUnknownAudit suspends a storage node for unknown audits.
TestSuspendNodeUnknownAudit(ctx context.Context, nodeID storj.NodeID, suspendedAt time.Time) (err error)
@ -700,6 +700,25 @@ func (service *Service) GetReliablePiecesInExcludedCountries(ctx context.Context
return piecesInExcluded, nil
}
// DQNodesLastSeenBefore disqualifies nodes who have not been contacted since the cutoff time.
func (service *Service) DQNodesLastSeenBefore(ctx context.Context, cutoff time.Time, limit int) (count int, err error) {
defer mon.Task()(&ctx)(&err)
nodes, count, err := service.db.DQNodesLastSeenBefore(ctx, cutoff, limit)
if err != nil {
return 0, err
}
if service.config.SendNodeEmails {
for nodeID, email := range nodes {
_, err = service.nodeEvents.Insert(ctx, email, nodeID, nodeevents.Disqualified)
if err != nil {
service.log.Error("could not insert node disqualified into node events", zap.Error(err))
}
}
}
return count, err
}
// DisqualifyNode disqualifies a storage node.
func (service *Service) DisqualifyNode(ctx context.Context, nodeID storj.NodeID, reason DisqualificationReason) (err error) {
defer mon.Task()(&ctx)(&err)

View File

@ -27,14 +27,14 @@ type Config struct {
// Chore disqualifies stray nodes.
type Chore struct {
log *zap.Logger
cache overlay.DB
cache *overlay.Service
maxDurationWithoutContact time.Duration
limit int
Loop *sync2.Cycle
}
// NewChore creates a new stray nodes Chore.
func NewChore(log *zap.Logger, cache overlay.DB, config Config) *Chore {
func NewChore(log *zap.Logger, cache *overlay.Service, config Config) *Chore {
return &Chore{
log: log,
cache: cache,

View File

@ -14,6 +14,7 @@ import (
"storj.io/common/testcontext"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite"
"storj.io/storj/satellite/nodeevents"
"storj.io/storj/satellite/overlay"
)
@ -23,6 +24,7 @@ func TestDQStrayNodes(t *testing.T) {
Reconfigure: testplanet.Reconfigure{
Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
config.StrayNodes.MaxDurationWithoutContact = 24 * time.Hour
config.Overlay.SendNodeEmails = true
},
},
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
@ -33,7 +35,7 @@ func TestDQStrayNodes(t *testing.T) {
sat.Overlay.DQStrayNodes.Loop.Pause()
cache := planet.Satellites[0].Overlay.DB
email := "test@storj.test"
strayInfo, err := cache.Get(ctx, strayNode.ID())
require.NoError(t, err)
require.Nil(t, strayInfo.Disqualified)
@ -50,6 +52,9 @@ func TestDQStrayNodes(t *testing.T) {
Timestamp: time.Time{},
Release: false,
},
Operator: &pb.NodeOperator{
Email: email,
},
}
// set strayNode last_contact_success to 48 hours ago
@ -57,6 +62,11 @@ func TestDQStrayNodes(t *testing.T) {
sat.Overlay.DQStrayNodes.Loop.TriggerWait()
ne, err := sat.DB.NodeEvents().GetLatestByEmailAndEvent(ctx, email, nodeevents.Disqualified)
require.NoError(t, err)
require.Equal(t, email, ne.Email)
require.Equal(t, strayNode.ID(), ne.NodeID)
strayInfo, err = cache.Get(ctx, strayNode.ID())
require.NoError(t, err)
require.NotNil(t, strayInfo.Disqualified)

View File

@ -1131,20 +1131,21 @@ func getNodeStats(dbNode *dbx.Node) *overlay.NodeStats {
// DQNodesLastSeenBefore disqualifies a limited number of nodes where last_contact_success < cutoff except those already disqualified
// or gracefully exited or where last_contact_success = '0001-01-01 00:00:00+00'.
func (cache *overlaycache) DQNodesLastSeenBefore(ctx context.Context, cutoff time.Time, limit int) (count int, err error) {
func (cache *overlaycache) DQNodesLastSeenBefore(ctx context.Context, cutoff time.Time, limit int) (nodeEmails map[storj.NodeID]string, count int, err error) {
defer mon.Task()(&ctx)(&err)
var nodeIDs []storj.NodeID
nodeEmails = make(map[storj.NodeID]string)
for {
nodeIDs, err = cache.getNodesForDQLastSeenBefore(ctx, cutoff, limit)
if err != nil {
if cockroachutil.NeedsRetry(err) {
continue
}
return 0, err
return nil, 0, err
}
if len(nodeIDs) == 0 {
return 0, nil
return nil, 0, nil
}
break
}
@ -1159,28 +1160,29 @@ func (cache *overlaycache) DQNodesLastSeenBefore(ctx context.Context, cutoff tim
AND exit_finished_at IS NULL
AND last_contact_success < $2
AND last_contact_success != '0001-01-01 00:00:00+00'::timestamptz
RETURNING id, last_contact_success;
RETURNING id, email, last_contact_success;
`), pgutil.NodeIDArray(nodeIDs), cutoff, overlay.DisqualificationReasonNodeOffline)
if err != nil {
return 0, err
return nil, 0, err
}
defer func() { err = errs.Combine(err, rows.Close()) }()
for rows.Next() {
var id storj.NodeID
var email string
var lcs time.Time
err = rows.Scan(&id, &lcs)
err = rows.Scan(&id, &email, &lcs)
if err != nil {
return count, err
return nil, count, err
}
cache.db.log.Info("Disqualified",
zap.String("DQ type", "stray node"),
zap.Stringer("Node ID", id),
zap.Stringer("Last contacted", lcs))
nodeEmails[id] = email
count++
}
return count, rows.Err()
return nodeEmails, count, rows.Err()
}
func (cache *overlaycache) getNodesForDQLastSeenBefore(ctx context.Context, cutoff time.Time, limit int) (nodes []storj.NodeID, err error) {