Fix locking when fixing an incomplete split of a GIN internal page

ginFinishSplit() expects the caller to hold an exclusive lock on the buffer, but when finishing an earlier "leftover" incomplete split of an internal page, the caller held a shared lock. That caused an assertion failure in MarkBufferDirty(). Without assertions, it could lead to corruption if two backends tried to complete the split at the same time. On master, add a test case using the new injection point facility. Report and analysis by Fei Changhong. Backpatch the fix to all supported versions. Reviewed-by: Fei Changhong, Michael Paquier Discussion: https://www.postgresql.org/message-id/tencent_A3CE810F59132D8E230475A5F0F7A08C8307@qq.com
2024-01-29 13:46:22 +02:00 · 2024-01-29 13:46:22 +02:00 · 6a1ea02c49
commit 6a1ea02c49
parent 6d4565a05f
7 changed files with 418 additions and 20 deletions
--- a/src/backend/access/gin/ginbtree.c
+++ b/src/backend/access/gin/ginbtree.c
@ -19,6 +19,7 @@
 #include "access/xloginsert.h"
 #include "miscadmin.h"
 #include "storage/predicate.h"
+#include "utils/injection_point.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"

@ -28,6 +29,8 @@ static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
 						   Buffer childbuf, GinStatsData *buildStats);
 static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack,
 						   bool freestack, GinStatsData *buildStats);
+static void ginFinishOldSplit(GinBtree btree, GinBtreeStack *stack,
+							  GinStatsData *buildStats, int access);

 /*
 * Lock buffer by needed method for search.
@ -108,7 +111,7 @@ ginFindLeafPage(GinBtree btree, bool searchMode,
 		 * encounter on the way.
 		 */
 		if (!searchMode && GinPageIsIncompleteSplit(page))
-			ginFinishSplit(btree, stack, false, NULL);
+			ginFinishOldSplit(btree, stack, NULL, access);

 		/*
 		 * ok, page is correctly locked, we should check to move right ..,
@ -128,7 +131,7 @@ ginFindLeafPage(GinBtree btree, bool searchMode,
 			page = BufferGetPage(stack->buffer);

 			if (!searchMode && GinPageIsIncompleteSplit(page))
-				ginFinishSplit(btree, stack, false, NULL);
+				ginFinishOldSplit(btree, stack, NULL, access);
 		}

 		if (GinPageIsLeaf(page))	/* we found, return locked page */
@ -164,8 +167,11 @@ ginFindLeafPage(GinBtree btree, bool searchMode,
 * Step right from current page.
 *
 * The next page is locked first, before releasing the current page. This is
- * crucial to protect from concurrent page deletion (see comment in
- * ginDeletePage).
+ * crucial to prevent concurrent VACUUM from deleting a page that we are about
+ * to step to. (The lock-coupling isn't strictly necessary when we are
+ * traversing the tree to find an insert location, because page deletion grabs
+ * a cleanup lock on the root to prevent any concurrent inserts. See Page
+ * deletion section in the README. But there's no harm in doing it always.)
 */
 Buffer
 ginStepRight(Buffer buffer, Relation index, int lockmode)
@ -262,7 +268,7 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack)
 			ptr->parent = root;
 			ptr->off = InvalidOffsetNumber;

-			ginFinishSplit(btree, ptr, false, NULL);
+			ginFinishOldSplit(btree, ptr, NULL, GIN_EXCLUSIVE);
 		}

 		leftmostBlkno = btree->getLeftMostChild(btree, page);
@ -291,7 +297,7 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack)
 				ptr->parent = root;
 				ptr->off = InvalidOffsetNumber;

-				ginFinishSplit(btree, ptr, false, NULL);
+				ginFinishOldSplit(btree, ptr, NULL, GIN_EXCLUSIVE);
 			}
 		}

@ -670,15 +676,6 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
 	bool		done;
 	bool		first = true;

-	/*
-	 * freestack == false when we encounter an incompletely split page during
-	 * a scan, while freestack == true is used in the normal scenario that a
-	 * split is finished right after the initial insert.
-	 */
-	if (!freestack)
-		elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"",
-			 stack->blkno, RelationGetRelationName(btree->index));
-
 	/* this loop crawls up the stack until the insertion is complete */
 	do
 	{
@ -686,6 +683,13 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
 		void	   *insertdata;
 		BlockNumber updateblkno;

+#ifdef USE_INJECTION_POINTS
+		if (GinPageIsLeaf(BufferGetPage(stack->buffer)))
+			INJECTION_POINT("gin-leave-leaf-split-incomplete");
+		else
+			INJECTION_POINT("gin-leave-internal-split-incomplete");
+#endif
+
 		/* search parent to lock */
 		LockBuffer(parent->buffer, GIN_EXCLUSIVE);

@ -699,7 +703,7 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
 		 * would fail.
 		 */
 		if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer)))
-			ginFinishSplit(btree, parent, false, buildStats);
+			ginFinishOldSplit(btree, parent, buildStats, GIN_EXCLUSIVE);

 		/* move right if it's needed */
 		page = BufferGetPage(parent->buffer);
@ -723,7 +727,7 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
 			page = BufferGetPage(parent->buffer);

 			if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer)))
-				ginFinishSplit(btree, parent, false, buildStats);
+				ginFinishOldSplit(btree, parent, buildStats, GIN_EXCLUSIVE);
 		}

 		/* insert the downlink */
@ -759,6 +763,43 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
 		freeGinBtreeStack(stack);
 }

+/*
+ * An entry point to ginFinishSplit() that is used when we stumble upon an
+ * existing incompletely split page in the tree, as opposed to completing a
+ * split that we just made outselves. The difference is that stack->buffer may
+ * be merely share-locked on entry, and will be upgraded to exclusive mode.
+ *
+ * Note: Upgrading the lock momentarily releases it. Doing that in a scan
+ * would not be OK, because a concurrent VACUUM might delete the page while
+ * we're not holding the lock. It's OK in an insert, though, because VACUUM
+ * has a different mechanism that prevents it from running concurrently with
+ * inserts. (Namely, it holds a cleanup lock on the root.)
+ */
+static void
+ginFinishOldSplit(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats, int access)
+{
+	INJECTION_POINT("gin-finish-incomplete-split");
+	elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"",
+		 stack->blkno, RelationGetRelationName(btree->index));
+
+	if (access == GIN_SHARE)
+	{
+		LockBuffer(stack->buffer, GIN_UNLOCK);
+		LockBuffer(stack->buffer, GIN_EXCLUSIVE);
+
+		if (!GinPageIsIncompleteSplit(BufferGetPage(stack->buffer)))
+		{
+			/*
+			 * Someone else already completed the split while we were not
+			 * holding the lock.
+			 */
+			return;
+		}
+	}
+
+	ginFinishSplit(btree, stack, false, buildStats);
+}
+
 /*
 * Insert a value to tree described by stack.
 *
@ -779,7 +820,7 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack, void *insertdata,

 	/* If the leaf page was incompletely split, finish the split first */
 	if (GinPageIsIncompleteSplit(BufferGetPage(stack->buffer)))
-		ginFinishSplit(btree, stack, false, buildStats);
+		ginFinishOldSplit(btree, stack, buildStats, GIN_EXCLUSIVE);

 	done = ginPlaceToPage(btree, stack,
 						  insertdata, InvalidBlockNumber,
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@ -40,9 +40,9 @@ SUBDIRS = \


 ifeq ($(enable_injection_points),yes)
-SUBDIRS += injection_points
+SUBDIRS += injection_points gin
 else
-ALWAYS_SUBDIRS += injection_points
+ALWAYS_SUBDIRS += injection_points gin
 endif

 ifeq ($(with_ssl),openssl)
--- a/src/test/modules/gin/Makefile
+++ b/src/test/modules/gin/Makefile
@ -0,0 +1,16 @@
+# src/test/modules/gin/Makefile
+
+EXTRA_INSTALL = src/test/modules/injection_points
+
+REGRESS = gin_incomplete_splits
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/gin
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
--- a/src/test/modules/gin/expected/gin_incomplete_splits.out
+++ b/src/test/modules/gin/expected/gin_incomplete_splits.out
@ -0,0 +1,180 @@
+--
+-- The test uses a GIN index over int[].  The table contains arrays
+-- with integers from 1 to :next_i.  Each integer occurs exactly once,
+-- no gaps or duplicates, although the index does contain some
+-- duplicate elements because some of the inserting transactions are
+-- rolled back during the test. The exact contents of the table depend
+-- on the physical layout of the index, which in turn depends at least
+-- on the block size, so instead of check for the exact contents, we
+-- check those invariants.  :next_i psql variable is maintained at all
+-- times to hold the last inserted integer + 1.
+--
+-- This uses injection points to cause errors that leave some page
+-- splits in "incomplete" state
+create extension injection_points;
+-- Use the index for all the queries
+set enable_seqscan=off;
+-- Print a NOTICE whenever an incomplete split gets fixed
+SELECT injection_points_attach('gin-finish-incomplete-split', 'notice');
+ injection_points_attach 
+-------------------------
+ 
+(1 row)
+
+--
+-- First create the test table and some helper functions
+--
+create table gin_incomplete_splits(i int4[]) with (autovacuum_enabled = off);
+create index on gin_incomplete_splits using gin (i) with (fastupdate = off);
+-- Creates an array with all integers from $1 (inclusive) $2 (exclusive)
+create function range_array(int, int) returns int[] language sql immutable as $$
+  select array_agg(g) from generate_series($1, $2 - 1) g
+$$;
+-- Inserts an array with 'n' rows to the test table. Pass :next_i as
+-- the first argument, returns the new value for :next_i.
+create function insert_n(first_i int, n int) returns int language plpgsql as $$
+begin
+  insert into gin_incomplete_splits values (range_array(first_i, first_i+n));
+  return first_i + n;
+end;
+$$;
+-- Inserts to the table until an insert fails. Like insert_n(), returns the
+-- new value for :next_i.
+create function insert_until_fail(next_i int, step int default 1) returns int language plpgsql as $$
+declare
+  i integer;
+begin
+  -- Insert arrays with 'step' elements each, until an error occurs.
+  loop
+    begin
+      select insert_n(next_i, step) into next_i;
+    exception when others then
+      raise notice 'failed with: %', sqlerrm;
+      exit;
+    end;
+
+    -- The caller is expected to set an injection point that eventuall
+    -- causes an error. But bail out if still no error after 10000
+    -- attempts, so that we don't get stuck in an infinite loop.
+    i := i + 1;
+    if i = 10000 then
+      raise 'no error on inserts after ';
+    end if;
+  end loop;
+
+  return next_i;
+end;
+$$;
+-- Check the invariants.
+create function verify(next_i int) returns bool language plpgsql as $$
+declare
+  a integer[];
+  elem integer;
+  c integer;
+begin
+  -- Perform a scan over the trailing part of the index, where the
+  -- possible incomplete splits are. (We don't check the whole table,
+  -- because that'd be pretty slow.)
+  c := 0;
+  -- Find all arrays that overlap with the last 200 inserted integers. Or
+  -- the next 100, which shouldn't exist.
+  for a in select i from gin_incomplete_splits where i && range_array(next_i - 200, next_i + 100)
+  loop
+    -- count all elements in those arrays in the window.
+    foreach elem in ARRAY a loop
+      if elem >= next_i then
+        raise 'unexpected value % in array', elem;
+      end if;
+      if elem >= next_i - 200 then
+        c := c + 1;
+      end if;
+    end loop;
+  end loop;
+  if c <> 200 then
+    raise 'unexpected count % ', c;
+  end if;
+
+  return true;
+end;
+$$;
+-- Insert one array to get started.
+select insert_n(1, 1000) as next_i
+\gset
+select verify(:next_i);
+ verify 
+--------
+ t
+(1 row)
+
+--
+-- Test incomplete leaf split
+--
+SELECT injection_points_attach('gin-leave-leaf-split-incomplete', 'error');
+ injection_points_attach 
+-------------------------
+ 
+(1 row)
+
+select insert_until_fail(:next_i) as next_i
+\gset
+NOTICE:  failed with: error triggered for injection point gin-leave-leaf-split-incomplete
+SELECT injection_points_detach('gin-leave-leaf-split-incomplete');
+ injection_points_detach 
+-------------------------
+ 
+(1 row)
+
+-- Verify that a scan works even though there's an incomplete split
+select verify(:next_i);
+ verify 
+--------
+ t
+(1 row)
+
+-- Insert some more rows, finishing the split
+select insert_n(:next_i, 10) as next_i
+\gset
+NOTICE:  notice triggered for injection point gin-finish-incomplete-split
+-- Verify that a scan still works
+select verify(:next_i);
+ verify 
+--------
+ t
+(1 row)
+
+--
+-- Test incomplete internal page split
+--
+SELECT injection_points_attach('gin-leave-internal-split-incomplete', 'error');
+ injection_points_attach 
+-------------------------
+ 
+(1 row)
+
+select insert_until_fail(:next_i, 100) as next_i
+\gset
+NOTICE:  failed with: error triggered for injection point gin-leave-internal-split-incomplete
+SELECT injection_points_detach('gin-leave-internal-split-incomplete');
+ injection_points_detach 
+-------------------------
+ 
+(1 row)
+
+ -- Verify that a scan works even though there's an incomplete split
+select verify(:next_i);
+ verify 
+--------
+ t
+(1 row)
+
+-- Insert some more rows, finishing the split
+select insert_n(:next_i, 10) as next_i
+\gset
+NOTICE:  notice triggered for injection point gin-finish-incomplete-split
+-- Verify that a scan still works
+select verify(:next_i);
+ verify 
+--------
+ t
+(1 row)
+
--- a/src/test/modules/gin/meson.build
+++ b/src/test/modules/gin/meson.build
@ -0,0 +1,16 @@
+# Copyright (c) 2022-2024, PostgreSQL Global Development Group
+
+if not get_option('injection_points')
+  subdir_done()
+endif
+
+tests += {
+  'name': 'gin',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'regress': {
+    'sql': [
+      'gin_incomplete_splits',
+    ],
+  },
+}
--- a/src/test/modules/gin/sql/gin_incomplete_splits.sql
+++ b/src/test/modules/gin/sql/gin_incomplete_splits.sql
@ -0,0 +1,144 @@
+--
+-- The test uses a GIN index over int[].  The table contains arrays
+-- with integers from 1 to :next_i.  Each integer occurs exactly once,
+-- no gaps or duplicates, although the index does contain some
+-- duplicate elements because some of the inserting transactions are
+-- rolled back during the test. The exact contents of the table depend
+-- on the physical layout of the index, which in turn depends at least
+-- on the block size, so instead of check for the exact contents, we
+-- check those invariants.  :next_i psql variable is maintained at all
+-- times to hold the last inserted integer + 1.
+--
+
+-- This uses injection points to cause errors that leave some page
+-- splits in "incomplete" state
+create extension injection_points;
+
+-- Use the index for all the queries
+set enable_seqscan=off;
+
+-- Print a NOTICE whenever an incomplete split gets fixed
+SELECT injection_points_attach('gin-finish-incomplete-split', 'notice');
+
+--
+-- First create the test table and some helper functions
+--
+create table gin_incomplete_splits(i int4[]) with (autovacuum_enabled = off);
+
+create index on gin_incomplete_splits using gin (i) with (fastupdate = off);
+
+-- Creates an array with all integers from $1 (inclusive) $2 (exclusive)
+create function range_array(int, int) returns int[] language sql immutable as $$
+  select array_agg(g) from generate_series($1, $2 - 1) g
+$$;
+
+-- Inserts an array with 'n' rows to the test table. Pass :next_i as
+-- the first argument, returns the new value for :next_i.
+create function insert_n(first_i int, n int) returns int language plpgsql as $$
+begin
+  insert into gin_incomplete_splits values (range_array(first_i, first_i+n));
+  return first_i + n;
+end;
+$$;
+
+-- Inserts to the table until an insert fails. Like insert_n(), returns the
+-- new value for :next_i.
+create function insert_until_fail(next_i int, step int default 1) returns int language plpgsql as $$
+declare
+  i integer;
+begin
+  -- Insert arrays with 'step' elements each, until an error occurs.
+  loop
+    begin
+      select insert_n(next_i, step) into next_i;
+    exception when others then
+      raise notice 'failed with: %', sqlerrm;
+      exit;
+    end;
+
+    -- The caller is expected to set an injection point that eventuall
+    -- causes an error. But bail out if still no error after 10000
+    -- attempts, so that we don't get stuck in an infinite loop.
+    i := i + 1;
+    if i = 10000 then
+      raise 'no error on inserts after ';
+    end if;
+  end loop;
+
+  return next_i;
+end;
+$$;
+
+-- Check the invariants.
+create function verify(next_i int) returns bool language plpgsql as $$
+declare
+  a integer[];
+  elem integer;
+  c integer;
+begin
+  -- Perform a scan over the trailing part of the index, where the
+  -- possible incomplete splits are. (We don't check the whole table,
+  -- because that'd be pretty slow.)
+  c := 0;
+  -- Find all arrays that overlap with the last 200 inserted integers. Or
+  -- the next 100, which shouldn't exist.
+  for a in select i from gin_incomplete_splits where i && range_array(next_i - 200, next_i + 100)
+  loop
+    -- count all elements in those arrays in the window.
+    foreach elem in ARRAY a loop
+      if elem >= next_i then
+        raise 'unexpected value % in array', elem;
+      end if;
+      if elem >= next_i - 200 then
+        c := c + 1;
+      end if;
+    end loop;
+  end loop;
+  if c <> 200 then
+    raise 'unexpected count % ', c;
+  end if;
+
+  return true;
+end;
+$$;
+
+-- Insert one array to get started.
+select insert_n(1, 1000) as next_i
+\gset
+select verify(:next_i);
+
+
+--
+-- Test incomplete leaf split
+--
+SELECT injection_points_attach('gin-leave-leaf-split-incomplete', 'error');
+select insert_until_fail(:next_i) as next_i
+\gset
+SELECT injection_points_detach('gin-leave-leaf-split-incomplete');
+
+-- Verify that a scan works even though there's an incomplete split
+select verify(:next_i);
+
+-- Insert some more rows, finishing the split
+select insert_n(:next_i, 10) as next_i
+\gset
+-- Verify that a scan still works
+select verify(:next_i);
+
+
+--
+-- Test incomplete internal page split
+--
+SELECT injection_points_attach('gin-leave-internal-split-incomplete', 'error');
+select insert_until_fail(:next_i, 100) as next_i
+\gset
+SELECT injection_points_detach('gin-leave-internal-split-incomplete');
+
+ -- Verify that a scan works even though there's an incomplete split
+select verify(:next_i);
+
+-- Insert some more rows, finishing the split
+select insert_n(:next_i, 10) as next_i
+\gset
+-- Verify that a scan still works
+select verify(:next_i);
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@ -5,6 +5,7 @@ subdir('commit_ts')
 subdir('delay_execution')
 subdir('dummy_index_am')
 subdir('dummy_seclabel')
+subdir('gin')
 subdir('injection_points')
 subdir('ldap_password_func')
 subdir('libpq_pipeline')