storj/satellite/nodeevents/chore_test.go
Cameron 08c9d745f1 satellite/nodeevents: take last_attempted into account when selecting
Previously, the node events chore would select based on the earliest
created_at. However, if for some reason this batch fails, it would still
be the next item to select. If there is a consistent error, the chore
would be stuck retrying the same batch over and over. Now instead
GetNextBatch orders by `last_attempted NULLS FIRST ASC, created_at ASC`.
If a batch fails during Notify, last_attempted is updated so we can move
on to a new batch if one exists.

Change-Id: Ia8458e05ac358d85b2f2c6d690f3d607d631be61
2022-12-01 20:08:38 +00:00

157 lines
4.7 KiB
Go

// Copyright (C) 2022 Storj Labs, Inc.
// See LICENSE for copying information.
package nodeevents_test
import (
"context"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/zeebo/errs"
"go.uber.org/zap"
"storj.io/common/testcontext"
"storj.io/common/uuid"
"storj.io/storj/private/testplanet"
"storj.io/storj/satellite"
"storj.io/storj/satellite/nodeevents"
"storj.io/storj/satellite/overlay"
"storj.io/storj/storagenode"
)
// TestNotifier is a notifier for tests. Instead of sending anything it records
// the events it is handed, grouped under the email address of the first event
// of each batch.
//
// The zero value is ready to use; the backing map is allocated on first use.
type TestNotifier struct {
	// notifications maps an email address to every event recorded for it.
	notifications map[string][]nodeevents.NodeEvent
}

// Notify appends events to the notifications recorded under events[0].Email.
// An empty batch is ignored. The ctx and satellite arguments are not used.
// It always returns nil.
func (tn *TestNotifier) Notify(ctx context.Context, satellite string, events []nodeevents.NodeEvent) error {
	if len(events) == 0 {
		return nil
	}
	// Assigning into a nil map panics, so allocate it lazily for callers that
	// construct the notifier as a bare &TestNotifier{}.
	if tn.notifications == nil {
		tn.notifications = make(map[string][]nodeevents.NodeEvent)
	}
	email := events[0].Email
	tn.notifications[email] = append(tn.notifications[email], events...)
	return nil
}
// ErrorNotifier is a notifier whose Notify never succeeds. It keeps enough
// state for a test to check that it was called and with which event.
type ErrorNotifier struct {
	// errCount is the number of non-empty batches passed to Notify.
	errCount int
	// errID is the ID of the first event of the most recent non-empty batch.
	errID uuid.UUID
}

// Notify always returns an error. For a non-empty batch it first bumps
// errCount and remembers the ID of the first event; an empty batch is reported
// with a distinct error and leaves the recorded state untouched.
func (errN *ErrorNotifier) Notify(ctx context.Context, satellite string, events []nodeevents.NodeEvent) error {
	if len(events) > 0 {
		errN.errCount++
		errN.errID = events[0].ID
		return errs.New("test error")
	}
	return errs.New("This shouldn't happen")
}
// TestNodeEventsChore checks that the node events chore does not notify about
// freshly inserted events, and that it does notify about them once the chore's
// clock has been moved forward by the selection wait period.
func TestNodeEventsChore(t *testing.T) {
	// Used both as the configured selection wait period and as the offset
	// applied to the chore's clock further down.
	const waitPeriod = 5 * time.Minute

	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 2,
		UplinkCount:      0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Overlay.SendNodeEmails = true
				config.NodeEvents.SelectionWaitPeriod = waitPeriod
			},
			StorageNode: func(index int, config *storagenode.Config) {
				config.Operator.Email = "test@storj.test"
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		sat := planet.Satellites[0]
		firstNode := planet.StorageNodes[0]
		secondNode := planet.StorageNodes[1]

		// Every storage node was reconfigured to use the same operator email.
		email := firstNode.Config.Operator.Email

		chore := sat.NodeEvents.Chore
		chore.Loop.Pause()

		recorder := &TestNotifier{
			notifications: make(map[string][]nodeevents.NodeEvent),
		}
		chore.SetNotifier(recorder)

		// Phase one: the chore must not notify yet, because not enough time has
		// elapsed since the oldest Disqualified event for this email was inserted.
		//
		// Disqualify both nodes; this should create node events in the nodeevents DB.
		require.NoError(t, sat.Overlay.Service.DisqualifyNode(ctx, firstNode.ID(), overlay.DisqualificationReasonUnknown))
		require.NoError(t, sat.Overlay.Service.DisqualifyNode(ctx, secondNode.ID(), overlay.DisqualificationReasonUnknown))

		// Run the chore once; Notifier.Notify must NOT have received the events.
		chore.Loop.TriggerWait()
		require.Empty(t, recorder.notifications[email])

		// Phase two: move the chore's notion of "now" forward by the wait period
		// so that the events become eligible for notification.
		chore.SetNow(func() time.Time {
			return time.Now().Add(waitPeriod)
		})

		// Run the chore again; this time Notifier.Notify must have received both events.
		chore.Loop.TriggerWait()
		delivered := recorder.notifications[email]
		require.Len(t, delivered, 2)

		var sawFirst, sawSecond bool
		for _, event := range delivered {
			require.Equal(t, email, event.Email)
			require.Equal(t, nodeevents.Disqualified, event.Event)

			switch event.NodeID {
			case firstNode.ID():
				sawFirst = true
			case secondNode.ID():
				sawSecond = true
			}
		}
		require.True(t, sawFirst)
		require.True(t, sawSecond)
	})
}
// TestNodeEventsChoreFailedNotify checks that when the notifier returns an
// error, the chore still records an attempt on the event (LastAttempted is set)
// while the event stays unsent (EmailSent is nil).
func TestNodeEventsChoreFailedNotify(t *testing.T) {
	// Used both as the configured selection wait period and as the offset
	// applied to the chore's clock further down.
	const waitPeriod = 5 * time.Minute

	testplanet.Run(t, testplanet.Config{
		SatelliteCount:   1,
		StorageNodeCount: 1,
		UplinkCount:      0,
		Reconfigure: testplanet.Reconfigure{
			Satellite: func(log *zap.Logger, index int, config *satellite.Config) {
				config.Overlay.SendNodeEmails = true
				config.NodeEvents.SelectionWaitPeriod = waitPeriod
			},
		},
	}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
		sat := planet.Satellites[0]
		dqNode := planet.StorageNodes[0]

		chore := sat.NodeEvents.Chore
		chore.Loop.Pause()

		// Replace the notifier with one that fails on every call.
		failing := &ErrorNotifier{}
		chore.SetNotifier(failing)

		// Disqualify the node; this should create a node event in the nodeevents DB.
		require.NoError(t, sat.Overlay.Service.DisqualifyNode(ctx, dqNode.ID(), overlay.DisqualificationReasonUnknown))

		// Move the chore's notion of "now" forward by the wait period so that the
		// event is eligible and the chore attempts to notify.
		chore.SetNow(func() time.Time {
			return time.Now().Add(waitPeriod)
		})

		// Run the chore once: the notifier must have been hit exactly once.
		chore.Loop.TriggerWait()
		require.Equal(t, 1, failing.errCount)

		// The stored event must carry a last-attempted value but no email-sent value.
		stored, err := sat.DB.NodeEvents().GetByID(ctx, failing.errID)
		require.NoError(t, err)
		require.NotNil(t, stored.LastAttempted)
		require.Nil(t, stored.EmailSent)
	})
}