storage/postgreskv: fix ultra-slow nonrecursive list (#3564)
This is based on Jeff's most excellent work to identify why non-recursive listing under postgreskv was phenomenally slow. It turns out PostgreSQL's query planner was actually using two sequential scans of the pathdata table to do its job. It's unclear for how long that has been happening, but obviously it won't scale any further. The main change is propagating bucket association with pathnames through the CTE so that the query planner lets itself use the pathdata index on (bucket, fullpath) for the skipping-forward part. Jeff also had some changes to the range ends to keep NULL from being used — I believe with the intent of making sure the query planner was able to use the pathdata index. My tests on postgres 9.6 and 11 indicate that those changes don't make any appreciable difference in performance or query plan, so I'm going to leave them off for now to avoid a careful audit of the semantic differences. There is a test included here, which only serves to check that the new version of the function is indeed active. To actually ensure that no sequential scans are being used in the query plan anymore, our tests would need to be run against a test db with lots of data already loaded in it, and that isn't feasible for now. Change-Id: Iffe9a1f411c54a2f742a4abb8f2df0c64fd662cb
This commit is contained in:
parent
bd89f51c66
commit
d5963d81d0
@ -6,6 +6,7 @@ package postgreskv
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/lib/pq"
|
||||
@ -44,6 +45,44 @@ func TestSuite(t *testing.T) {
|
||||
testsuite.RunTests(t, store)
|
||||
}
|
||||
|
||||
func TestThatMigrationActuallyHappened(t *testing.T) {
|
||||
store, cleanup := newTestPostgres(t)
|
||||
defer cleanup()
|
||||
|
||||
rows, err := store.pgConn.Query(`
|
||||
SELECT prosrc
|
||||
FROM pg_catalog.pg_proc p,
|
||||
pg_catalog.pg_namespace n
|
||||
WHERE p.pronamespace = n.oid
|
||||
AND p.proname = 'list_directory'
|
||||
AND n.nspname = ANY(current_schemas(true))
|
||||
AND p.pronargs = 4
|
||||
`)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to get list_directory source: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := rows.Close(); err != nil {
|
||||
t.Fatalf("failed to close rows: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
numFound := 0
|
||||
for rows.Next() {
|
||||
numFound++
|
||||
if numFound > 1 {
|
||||
t.Fatal("there are multiple eligible list_directory() functions??")
|
||||
}
|
||||
var source string
|
||||
if err := rows.Scan(&source); err != nil {
|
||||
t.Fatalf("failed to read list_directory source: %v", err)
|
||||
}
|
||||
if strings.Contains(source, "distinct_prefix (truncatedpath)") {
|
||||
t.Fatal("list_directory() function in pg appears to be the oldnbusted one")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSuite(b *testing.B) {
|
||||
store, cleanup := newTestPostgres(b)
|
||||
defer cleanup()
|
||||
|
@ -0,0 +1,44 @@
|
||||
-- list_directory performs a non-recursive listing of the paths directly
-- under dirpath in the given bucket: one row per distinct child
-- component (paths truncated after the first delimiter past dirpath),
-- ordered by path, optionally starting at start_at and capped at
-- limit_to rows (NULL means no limit).
--
-- NOTE(review): this is the pre-fix version of the function. Its
-- distinct_prefix CTE carries only truncatedpath (no bucket column),
-- and the final joins do not constrain pathdata by bucket; per the
-- commit description this kept the query planner from using the
-- pathdata (bucket, fullpath) index, producing sequential scans.
CREATE OR REPLACE FUNCTION list_directory(bucket BYTEA, dirpath BYTEA, start_at BYTEA = ''::BYTEA, limit_to INTEGER = NULL)
RETURNS SETOF path_and_meta AS $$
    WITH RECURSIVE
        -- inputs: one row of constants for the search. range_low/range_high
        -- bound the subtree of paths under dirpath (both NULL when dirpath
        -- is empty, i.e. list the whole bucket); component_start is the
        -- byte offset where child components begin; delim is the bucket's
        -- path delimiter.
        inputs AS (
            SELECT CASE WHEN dirpath = ''::BYTEA THEN NULL ELSE dirpath END AS range_low,
                   CASE WHEN dirpath = ''::BYTEA THEN NULL ELSE bytea_increment(dirpath) END AS range_high,
                   octet_length(dirpath) + 1 AS component_start,
                   b.delim AS delim,
                   b.bucketname AS bucket
              FROM buckets b
             WHERE bucketname = bucket
        ),
        -- distinct_prefix skips forward through pathdata: the first term
        -- finds the lowest matching path; each recursive term jumps past
        -- everything sharing the previous truncated prefix (via
        -- component_increment), so each iteration yields at most one new
        -- distinct child component. Recursion ends when a step finds no
        -- further path (truncatedpath is NULL).
        distinct_prefix (truncatedpath) AS (
            SELECT (SELECT truncate_after(pd.fullpath, i.delim, i.component_start)
                      FROM pathdata pd
                     WHERE (i.range_low IS NULL OR pd.fullpath > i.range_low)
                       AND (i.range_high IS NULL OR pd.fullpath < i.range_high)
                       AND (start_at = '' OR pd.fullpath >= start_at)
                       AND pd.bucket = i.bucket
                     ORDER BY pd.fullpath
                     LIMIT 1)
              FROM inputs i
            UNION ALL
            SELECT (SELECT truncate_after(pd.fullpath, i.delim, i.component_start)
                      FROM pathdata pd
                     WHERE pd.fullpath >= component_increment(pfx.truncatedpath, i.delim)
                       AND (i.range_high IS NULL OR pd.fullpath < i.range_high)
                       AND pd.bucket = i.bucket
                     ORDER BY pd.fullpath
                     LIMIT 1)
              FROM distinct_prefix pfx, inputs i
             WHERE pfx.truncatedpath IS NOT NULL
        )
    -- Join each distinct prefix back to pathdata so prefixes that are
    -- themselves stored paths pick up their metadata; the LEFT JOIN
    -- leaves metadata NULL for prefixes with no matching pathdata row.
    -- NOTE(review): this join matches on fullpath alone, with no bucket
    -- condition — compare the bucket-aware successor version.
    SELECT pfx.truncatedpath AS fullpath,
           pd.metadata
      FROM distinct_prefix pfx LEFT OUTER JOIN pathdata pd ON pfx.truncatedpath = pd.fullpath
     WHERE pfx.truncatedpath IS NOT NULL
    UNION ALL
    -- this one, if it exists, can't be part of distinct_prefix (or it would cause us to skip over all
    -- subcontents of the prefix we're looking for), so we tack it on here
    SELECT pd.fullpath, pd.metadata FROM pathdata pd, inputs i WHERE pd.fullpath = i.range_low
    ORDER BY fullpath
    LIMIT limit_to;
$$ LANGUAGE 'sql' STABLE;
|
@ -0,0 +1,46 @@
|
||||
-- list_directory performs a non-recursive listing of the paths directly
-- under dirpath in the given bucket: one row per distinct child
-- component (paths truncated after the first delimiter past dirpath),
-- ordered by path, optionally starting at start_at and capped at
-- limit_to rows (NULL means no limit).
--
-- This version propagates the bucket through the distinct_prefix CTE
-- and adds bucket conditions to the final joins; per the commit
-- description, this lets the query planner use the pathdata
-- (bucket, fullpath) index for the skipping-forward steps instead of
-- sequential scans.
CREATE OR REPLACE FUNCTION list_directory(bucket BYTEA, dirpath BYTEA, start_at BYTEA = ''::BYTEA, limit_to INTEGER = NULL)
RETURNS SETOF path_and_meta AS $$
    WITH RECURSIVE
        -- inputs: one row of constants for the search. range_low/range_high
        -- bound the subtree of paths under dirpath (both NULL when dirpath
        -- is empty, i.e. list the whole bucket); component_start is the
        -- byte offset where child components begin; delim is the bucket's
        -- path delimiter.
        inputs AS (
            SELECT CASE WHEN dirpath = ''::BYTEA THEN NULL ELSE dirpath END AS range_low,
                   CASE WHEN dirpath = ''::BYTEA THEN NULL ELSE bytea_increment(dirpath) END AS range_high,
                   octet_length(dirpath) + 1 AS component_start,
                   b.delim AS delim,
                   b.bucketname AS bucket
              FROM buckets b
             WHERE bucketname = bucket
        ),
        -- distinct_prefix skips forward through pathdata: the first term
        -- finds the lowest matching path; each recursive term jumps past
        -- everything sharing the previous truncated prefix (via
        -- component_increment), so each iteration yields at most one new
        -- distinct child component. The bucket column is carried along so
        -- every pathdata lookup can be constrained by (bucket, fullpath).
        distinct_prefix (bucket, truncatedpath) AS (
            SELECT i.bucket,
                   (SELECT truncate_after(pd.fullpath, i.delim, i.component_start)
                      FROM pathdata pd
                     WHERE (i.range_low IS NULL OR pd.fullpath > i.range_low)
                       AND (i.range_high IS NULL OR pd.fullpath < i.range_high)
                       AND (start_at = '' OR pd.fullpath >= start_at)
                       AND pd.bucket = i.bucket
                     ORDER BY pd.fullpath
                     LIMIT 1)
              FROM inputs i
            UNION ALL
            SELECT i.bucket,
                   (SELECT truncate_after(pd.fullpath, i.delim, i.component_start)
                      FROM pathdata pd
                     WHERE pd.fullpath >= component_increment(pfx.truncatedpath, i.delim)
                       AND (i.range_high IS NULL OR pd.fullpath < i.range_high)
                       AND pd.bucket = i.bucket
                     ORDER BY pd.fullpath
                     LIMIT 1)
              FROM distinct_prefix pfx, inputs i
             WHERE pfx.truncatedpath IS NOT NULL
        )
    -- Join each distinct prefix back to pathdata (matching on both path
    -- and bucket) so prefixes that are themselves stored paths pick up
    -- their metadata; the LEFT JOIN leaves metadata NULL for prefixes
    -- with no matching pathdata row.
    SELECT pfx.truncatedpath AS fullpath,
           pd.metadata
      FROM distinct_prefix pfx LEFT OUTER JOIN pathdata pd ON pfx.truncatedpath = pd.fullpath AND pd.bucket = pfx.bucket
     WHERE pfx.truncatedpath IS NOT NULL
    UNION ALL
    -- this one, if it exists, can't be part of distinct_prefix (or it would cause us to skip over all
    -- subcontents of the prefix we're looking for), so we tack it on here
    SELECT pd.fullpath, pd.metadata FROM pathdata pd, inputs i WHERE pd.fullpath = i.range_low AND pd.bucket = i.bucket
    ORDER BY fullpath
    LIMIT limit_to;
$$ LANGUAGE 'sql' STABLE;
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user