storagenodedb/orders: select unsent satellite with expiration

In production we are seeing ~115 storage nodes (out of ~6,500) are not using the new SettlementWithWindow endpoint (but they are upgraded to > v1.12).

We analyzed data being reported by monkit for the nodes who were above version 1.11 but were not successfully submitting orders to the new endpoint.
The nodes fell into a few categories:
1. Always fail to list orders from the db; never get to try sending orders from the filestore
2. Successfully list/send orders from the db; never get to calling satellite endpoint for submitting filestore orders
3. Successfully list/send orders from the db; successfully list filestore orders, but satellite endpoint fails (with "unauthenticated" drpc error)

The code change here add the following to address these issues:
- modify the query for ordersDB.listUnsentBySatellite so that we no longer select expired orders from the unsent_orders table
- always process any orders that are in the ordersDB and also any orders stored in the filestore
- add monkit monitoring to filestore.ListUnsentBySatellite so that we can see the failures/successes

Change-Id: I0b473e5d75252e7ab5fa6b5c204ed260ab5094ec
This commit is contained in:
Jessica Grebenschikov 2020-10-15 11:57:02 -07:00 committed by Jess G
parent 360ab17869
commit 89bdb20a62
7 changed files with 89 additions and 59 deletions

View File

@ -52,7 +52,7 @@ func TestSendingReceivingOrders(t *testing.T) {
sumBeforeSend := 0
for _, storageNode := range planet.StorageNodes {
// change settle buffer so orders can be sent
unsentMap, err := storageNode.OrdersStore.ListUnsentBySatellite(tomorrow)
unsentMap, err := storageNode.OrdersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
for _, satUnsent := range unsentMap {
sumBeforeSend += len(satUnsent.InfoList)
@ -66,7 +66,7 @@ func TestSendingReceivingOrders(t *testing.T) {
for _, storageNode := range planet.StorageNodes {
storageNode.Storage2.Orders.SendOrders(ctx, tomorrow)
unsentMap, err := storageNode.OrdersStore.ListUnsentBySatellite(tomorrow)
unsentMap, err := storageNode.OrdersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
for _, satUnsent := range unsentMap {
sumUnsent += len(satUnsent.InfoList)
@ -105,7 +105,7 @@ func TestUnableToSendOrders(t *testing.T) {
sumBeforeSend := 0
for _, storageNode := range planet.StorageNodes {
unsentMap, err := storageNode.OrdersStore.ListUnsentBySatellite(tomorrow)
unsentMap, err := storageNode.OrdersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
for _, satUnsent := range unsentMap {
sumBeforeSend += len(satUnsent.InfoList)
@ -121,7 +121,7 @@ func TestUnableToSendOrders(t *testing.T) {
for _, storageNode := range planet.StorageNodes {
storageNode.Storage2.Orders.SendOrders(ctx, tomorrow)
unsentMap, err := storageNode.OrdersStore.ListUnsentBySatellite(tomorrow)
unsentMap, err := storageNode.OrdersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
for _, satUnsent := range unsentMap {
sumUnsent += len(satUnsent.InfoList)
@ -171,7 +171,7 @@ func TestUploadDownloadBandwidth(t *testing.T) {
var expectedBucketBandwidth int64
expectedStorageBandwidth := make(map[storj.NodeID]int64)
for _, storageNode := range planet.StorageNodes {
infos, err := storageNode.OrdersStore.ListUnsentBySatellite(tomorrow)
infos, err := storageNode.OrdersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
for _, unsentInfo := range infos {
for _, orderInfo := range unsentInfo.InfoList {

View File

@ -103,12 +103,7 @@ func TestDB(t *testing.T) {
unsentGrouped, err := ordersdb.ListUnsentBySatellite(ctx)
require.NoError(t, err)
expectedGrouped := map[storj.NodeID][]*ordersfile.Info{
satellite0.ID: {
{Limit: infos[0].Limit, Order: infos[0].Order},
{Limit: infos[1].Limit, Order: infos[1].Order},
},
}
expectedGrouped := map[storj.NodeID][]*ordersfile.Info{}
require.Empty(t, cmp.Diff(expectedGrouped, unsentGrouped, cmp.Comparer(pb.Equal)))
// test archival
@ -217,9 +212,7 @@ func TestDB_Trivial(t *testing.T) {
{ // Ensure ListUnsentBySatellite works at all
infos, err := db.Orders().ListUnsentBySatellite(ctx)
require.NoError(t, err)
require.Len(t, infos, 1)
require.Contains(t, infos, satelliteID)
require.Len(t, infos[satelliteID], 1)
require.Len(t, infos, 0)
}
{ // Ensure Archive works at all

View File

@ -179,11 +179,7 @@ func (service *Service) SendOrders(ctx context.Context, now time.Time) {
// If there are orders in the database, send from there.
// Otherwise, send from the filestore.
hasOrders := service.sendOrdersFromDB(ctx)
if hasOrders {
return
}
service.sendOrdersFromDB(ctx)
service.sendOrdersFromFileStore(ctx, now)
}
@ -390,10 +386,9 @@ func (service *Service) sendOrdersFromFileStore(ctx context.Context, now time.Ti
// Continue sending until there are no more windows to send, or all relevant satellites are offline.
for {
ordersBySatellite, err := service.ordersStore.ListUnsentBySatellite(now)
ordersBySatellite, err := service.ordersStore.ListUnsentBySatellite(ctx, now)
if err != nil {
service.log.Error("listing orders", zap.Error(err))
return
}
if len(ordersBySatellite) == 0 {
service.log.Debug("no orders to send")

View File

@ -97,7 +97,7 @@ func TestOrderFileStoreSettle(t *testing.T) {
err := uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
require.NoError(t, err)
toSend, err := node.OrdersStore.ListUnsentBySatellite(tomorrow)
toSend, err := node.OrdersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, toSend, 1)
ordersForSat := toSend[satellite.ID()]
@ -106,7 +106,7 @@ func TestOrderFileStoreSettle(t *testing.T) {
// trigger order send
service.SendOrders(ctx, tomorrow)
toSend, err = node.OrdersStore.ListUnsentBySatellite(tomorrow)
toSend, err = node.OrdersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, toSend, 0)
@ -117,7 +117,7 @@ func TestOrderFileStoreSettle(t *testing.T) {
}
// TODO remove when db is removed.
// TestOrderFileStoreAndDBSettle ensures that if orders exist in both DB and filestore, that the DB orders are settled first.
// TestOrderFileStoreAndDBSettle ensures that if orders exist in both DB and filestore, that the DB orders and filestore are both settled.
func TestOrderFileStoreAndDBSettle(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 1,
@ -169,7 +169,7 @@ func TestOrderFileStoreAndDBSettle(t *testing.T) {
err = uplinkPeer.Upload(ctx, satellite, "testbucket", "test/path", testData)
require.NoError(t, err)
toSendFileStore, err := node.OrdersStore.ListUnsentBySatellite(tomorrow)
toSendFileStore, err := node.OrdersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, toSendFileStore, 1)
ordersForSat := toSendFileStore[satellite.ID()]
@ -178,7 +178,7 @@ func TestOrderFileStoreAndDBSettle(t *testing.T) {
// trigger order send
service.SendOrders(ctx, tomorrow)
// DB orders should be archived, but filestore orders should still be unsent.
// DB and filestore orders should both be archived.
toSendDB, err = node.DB.Orders().ListUnsent(ctx, 10)
require.NoError(t, err)
require.Len(t, toSendDB, 0)
@ -187,23 +187,12 @@ func TestOrderFileStoreAndDBSettle(t *testing.T) {
require.NoError(t, err)
require.Len(t, archived, 1)
toSendFileStore, err = node.OrdersStore.ListUnsentBySatellite(tomorrow)
require.NoError(t, err)
require.Len(t, toSendFileStore, 1)
ordersForSat = toSendFileStore[satellite.ID()]
require.Len(t, ordersForSat.InfoList, 1)
// trigger order send again
service.SendOrders(ctx, tomorrow)
// now FileStore orders should be archived too.
toSendFileStore, err = node.OrdersStore.ListUnsentBySatellite(tomorrow)
toSendFileStore, err = node.OrdersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, toSendFileStore, 0)
archived, err = node.OrdersStore.ListArchived()
filestoreArchived, err := node.OrdersStore.ListArchived()
require.NoError(t, err)
require.Len(t, archived, 1)
require.Len(t, filestoreArchived, 1)
})
}

View File

@ -4,6 +4,7 @@
package orders
import (
"context"
"io"
"os"
"path/filepath"
@ -180,7 +181,8 @@ type UnsentInfo struct {
// It only reads files where the order limit grace period has passed, meaning no new orders will be appended.
// There is a separate window for each created at hour, so if a satellite has 2 windows, `ListUnsentBySatellite`
// needs to be called twice, with calls to `Archive` in between each call, to see all unsent orders.
func (store *FileStore) ListUnsentBySatellite(now time.Time) (infoMap map[storj.NodeID]UnsentInfo, err error) {
func (store *FileStore) ListUnsentBySatellite(ctx context.Context, now time.Time) (infoMap map[storj.NodeID]UnsentInfo, err error) {
defer mon.Task()(&ctx)(&err)
// shouldn't be necessary, but acquire archiveMu to ensure we do not attempt to archive files during list
store.archiveMu.Lock()
defer store.archiveMu.Unlock()

View File

@ -16,6 +16,7 @@ import (
"storj.io/common/storj"
"storj.io/common/testcontext"
"storj.io/common/testrand"
"storj.io/storj/private/testplanet"
"storj.io/storj/storagenode/orders"
"storj.io/storj/storagenode/orders/ordersfile"
)
@ -82,7 +83,7 @@ func TestOrdersStore_ListUnsentBySatellite(t *testing.T) {
status2 := pb.SettlementWithWindowResponse_REJECTED
for i := 0; i < 3; i++ {
// This should return all the orders created more than 1 hour before "now".
unsentMap, err := ordersStore.ListUnsentBySatellite(now.Add(12 * time.Hour))
unsentMap, err := ordersStore.ListUnsentBySatellite(ctx, now.Add(12*time.Hour))
require.NoError(t, err)
// on last iteration, expect nothing returned
@ -189,7 +190,7 @@ func TestOrdersStore_ListUnsentBySatellite_Ongoing(t *testing.T) {
require.NoError(t, err)
// empty store means no orders can be listed
unsent, err := ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err := ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 0)
@ -209,7 +210,7 @@ func TestOrdersStore_ListUnsentBySatellite_Ongoing(t *testing.T) {
}))
// check that we can list it tomorrow
unsent, err = ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err = ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 1)
@ -218,7 +219,7 @@ func TestOrdersStore_ListUnsentBySatellite_Ongoing(t *testing.T) {
require.NoError(t, err)
// we should no longer be able to list that window
unsent, err = ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err = ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 0)
@ -238,11 +239,60 @@ func TestOrdersStore_ListUnsentBySatellite_Ongoing(t *testing.T) {
}))
// check that we can list it tomorrow
unsent, err = ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err = ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 1)
}
func TestOrdersDB_ListUnsentBySatellite_Expired(t *testing.T) {
testplanet.Run(t, testplanet.Config{
SatelliteCount: 1, StorageNodeCount: 1, UplinkCount: 1,
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) {
satellitePeer := planet.Satellites[0]
storagenodePeer := planet.StorageNodes[0]
storageNodeOrdersDB := storagenodePeer.DB.Orders()
now := time.Now().UTC()
testSerialNumber := testrand.SerialNumber()
// Setup: add one order that is not expired
require.NoError(t, storageNodeOrdersDB.Enqueue(ctx, &ordersfile.Info{
Limit: &pb.OrderLimit{
SerialNumber: testSerialNumber,
SatelliteId: satellitePeer.ID(),
Action: pb.PieceAction_GET,
OrderCreation: now,
OrderExpiration: now.Add(5 * time.Hour),
},
Order: &pb.Order{
SerialNumber: testSerialNumber,
Amount: 100,
},
}))
testSerialNumber2 := testrand.SerialNumber()
// Setup: add one order that IS expired
require.NoError(t, storageNodeOrdersDB.Enqueue(ctx, &ordersfile.Info{
Limit: &pb.OrderLimit{
SerialNumber: testSerialNumber2,
SatelliteId: satellitePeer.ID(),
Action: pb.PieceAction_GET,
OrderExpiration: now.Add(-5 * time.Hour),
},
Order: &pb.Order{
SerialNumber: testSerialNumber2,
Amount: 20,
},
}))
// Confirm that when you list unsent orders that expired orders are not returned
unsentOrdersBySA, err := storageNodeOrdersDB.ListUnsentBySatellite(ctx)
require.NoError(t, err)
require.Equal(t, len(unsentOrdersBySA), 1)
// there should only be 1 unsent order, since the other order is expired
require.Equal(t, len(unsentOrdersBySA[satellitePeer.ID()]), 1)
// the unsent order should be the unexpired order
require.Equal(t, unsentOrdersBySA[satellitePeer.ID()][0].Limit.SerialNumber, testSerialNumber)
})
}
func TestOrdersStore_CorruptUnsentV0(t *testing.T) {
ctx := testcontext.New(t)
defer ctx.Cleanup()
@ -256,7 +306,7 @@ func TestOrdersStore_CorruptUnsentV0(t *testing.T) {
require.NoError(t, err)
// empty store means no orders can be listed
unsent, err := ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err := ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 0)
@ -284,7 +334,7 @@ func TestOrdersStore_CorruptUnsentV0(t *testing.T) {
require.NoError(t, of.Close())
// check that we can see both orders tomorrow
unsent, err = ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err = ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 1)
require.Len(t, unsent[satellite].InfoList, 2)
@ -307,7 +357,7 @@ func TestOrdersStore_CorruptUnsentV0(t *testing.T) {
require.NoError(t, of.Close())
// only the second order should be corrupted, so we should still see one order
unsent, err = ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err = ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 1)
require.Len(t, unsent[satellite].InfoList, 1)
@ -326,7 +376,7 @@ func TestOrdersStore_CorruptUnsentV1(t *testing.T) {
require.NoError(t, err)
// empty store means no orders can be listed
unsent, err := ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err := ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 0)
@ -352,7 +402,7 @@ func TestOrdersStore_CorruptUnsentV1(t *testing.T) {
require.NoError(t, ordersStore.Enqueue(info))
// check that we can see both orders tomorrow
unsent, err = ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err = ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 1)
require.Len(t, unsent[satellite].InfoList, 2)
@ -369,7 +419,7 @@ func TestOrdersStore_CorruptUnsentV1(t *testing.T) {
require.NoError(t, err)
// only the second order should be corrupted, so we should still see one order (sn1)
unsent, err = ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err = ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 1)
require.Len(t, unsent[satellite].InfoList, 1)
@ -381,7 +431,7 @@ func TestOrdersStore_CorruptUnsentV1(t *testing.T) {
require.NoError(t, ordersStore.Enqueue(info))
// only the second order should be corrupted, so we should still see first and last orders (sn1, sn3)
unsent, err = ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err = ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 1)
require.Len(t, unsent[satellite].InfoList, 2)
@ -403,7 +453,7 @@ func TestOrdersStore_V0ToV1(t *testing.T) {
require.NoError(t, err)
// empty store means no orders can be listed
unsent, err := ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err := ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 0)
@ -436,7 +486,7 @@ func TestOrdersStore_V0ToV1(t *testing.T) {
require.NoError(t, of.Close())
// check that we can see both orders tomorrow
unsent, err = ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err = ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 1)
require.Len(t, unsent[satellite].InfoList, 2)
@ -447,7 +497,7 @@ func TestOrdersStore_V0ToV1(t *testing.T) {
// new file should be created with version V1
require.NoError(t, ordersStore.Enqueue(info))
unsent, err = ordersStore.ListUnsentBySatellite(tomorrow)
unsent, err = ordersStore.ListUnsentBySatellite(ctx, tomorrow)
require.NoError(t, err)
require.Len(t, unsent, 1)
require.Len(t, unsent[satellite].InfoList, 1)

View File

@ -112,8 +112,8 @@ func (db *ordersDB) ListUnsent(ctx context.Context, limit int) (_ []*ordersfile.
return infos, ErrOrders.Wrap(rows.Err())
}
// ListUnsentBySatellite returns orders that haven't been sent yet grouped by
// satellite. Does not return uplink identity.
// ListUnsentBySatellite returns orders that haven't been sent yet and are not expired.
// The orders are ordered by the Satellite ID.
//
// If there is some unmarshal error while reading an order, the method proceed
// with the following ones and the function will return the ones which have
@ -128,7 +128,8 @@ func (db *ordersDB) ListUnsentBySatellite(ctx context.Context) (_ map[storj.Node
rows, err := db.QueryContext(ctx, `
SELECT order_limit_serialized, order_serialized
FROM unsent_order
`)
WHERE order_limit_expiration >= $1
`, time.Now().UTC())
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
return nil, nil