From f695b032bf669ce602f1ef527108b343605dd30f Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 12 Dec 2021 05:22:25 +0300 Subject: [PATCH 01/79] Custom user shared invalidation message --- src/backend/utils/cache/inval.c | 48 +++++++++++++++++++++++++++++++++ src/include/storage/sinval.h | 11 ++++++++ src/include/utils/inval.h | 4 +++ 3 files changed, 63 insertions(+) diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 66e04f973f6..4f1bdbba328 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -252,6 +252,7 @@ int debug_discard_caches = 0; #define MAX_SYSCACHE_CALLBACKS 64 #define MAX_RELCACHE_CALLBACKS 10 +#define MAX_USERCACHE_CALLBACKS 10 static struct SYSCACHECALLBACK { @@ -273,6 +274,14 @@ static struct RELCACHECALLBACK static int relcache_callback_count = 0; +static struct USERCACHECALLBACK +{ + UsercacheCallbackFunction function; + Datum arg; +} usercache_callback_list[MAX_RELCACHE_CALLBACKS]; + +static int usercache_callback_count = 0; + /* ---------------------------------------------------------------- * Invalidation subgroup support functions * ---------------------------------------------------------------- @@ -693,6 +702,16 @@ InvalidateSystemCachesExtended(bool debug_discard) ccitem->function(ccitem->arg, InvalidOid); } + + for (i = 0; i < usercache_callback_count; i++) + { + struct USERCACHECALLBACK *ccitem = usercache_callback_list + i; + + ccitem->function(ccitem->arg, + InvalidOid, + InvalidOid, + InvalidOid); + } } /* @@ -774,6 +793,19 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) else if (msg->sn.dbId == MyDatabaseId) InvalidateCatalogSnapshot(); } + else if (msg->id == SHAREDINVALUSERCACHE_ID) + { + int i; + for (i = 0; i < usercache_callback_count; i++) + { + struct USERCACHECALLBACK *ccitem = usercache_callback_list + i; + + ccitem->function(ccitem->arg, + msg->usr.arg1, + msg->usr.arg2, + msg->usr.arg3); + } + } else elog(FATAL, "unrecognized 
SI message ID: %d", msg->id); } @@ -1570,6 +1602,22 @@ CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, ++relcache_callback_count; } +/* + * CacheRegisterUsercacheCallback + */ +void +CacheRegisterUsercacheCallback(UsercacheCallbackFunction func, + Datum arg) +{ + if (usercache_callback_count >= MAX_USERCACHE_CALLBACKS) + elog(FATAL, "out of usercache_callback_list slots"); + + usercache_callback_list[usercache_callback_count].function = func; + usercache_callback_list[usercache_callback_count].arg = arg; + + ++usercache_callback_count; +} + /* * CallSyscacheCallbacks * diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h index 8f5744b21bc..6d262ed080c 100644 --- a/src/include/storage/sinval.h +++ b/src/include/storage/sinval.h @@ -110,6 +110,16 @@ typedef struct Oid relId; /* relation ID */ } SharedInvalSnapshotMsg; +#define SHAREDINVALUSERCACHE_ID (-6) + +typedef struct +{ + int8 id; /* type field --- must be first */ + Oid arg1; /* user-specific values */ + Oid arg2; + Oid arg3; +} SharedInvalUserMsg; + typedef union { int8 id; /* type field --- must be first */ @@ -119,6 +129,7 @@ typedef union SharedInvalSmgrMsg sm; SharedInvalRelmapMsg rm; SharedInvalSnapshotMsg sn; + SharedInvalUserMsg usr; } SharedInvalidationMessage; diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index 24695facf22..225b8e4ddaa 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -22,6 +22,7 @@ extern PGDLLIMPORT int debug_discard_caches; typedef void (*SyscacheCallbackFunction) (Datum arg, int cacheid, uint32 hashvalue); typedef void (*RelcacheCallbackFunction) (Datum arg, Oid relid); +typedef void (*UsercacheCallbackFunction) (Datum arg, Oid arg1, Oid arg2, Oid arg3); extern void AcceptInvalidationMessages(void); @@ -59,6 +60,9 @@ extern void CacheRegisterSyscacheCallback(int cacheid, extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, Datum arg); +extern void 
CacheRegisterUsercacheCallback(UsercacheCallbackFunction func, + Datum arg); + extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue); extern void InvalidateSystemCaches(void); From ef0774f29074978c3dd99862d85916ccc86de1b4 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 12 Dec 2021 05:24:57 +0300 Subject: [PATCH 02/79] CacheInvalidateRelcacheByDbidRelid() --- src/backend/utils/cache/inval.c | 19 +++++++++++++++++++ src/include/utils/inval.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 4f1bdbba328..f5775880416 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -1464,6 +1464,25 @@ CacheInvalidateRelcacheByRelid(Oid relid) ReleaseSysCache(tup); } +/* + * CacheInvalidateRelcacheByDbidRelid + */ +void +CacheInvalidateRelcacheByDbidRelid(Oid dbid, Oid relid) +{ + SharedInvalidationMessage msg; + + PrepareInvalidationState(); + + msg.rc.id = SHAREDINVALRELCACHE_ID; + msg.rc.dbId = dbid; + msg.rc.relId = relid; + /* check AddCatcacheInvalidationMessage() for an explanation */ + VALGRIND_MAKE_MEM_DEFINED(&msg, sizeof(msg)); + + SendSharedInvalidMessages(&msg, 1); +} + /* * CacheInvalidateSmgr diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index 225b8e4ddaa..69498b9f77f 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -49,6 +49,8 @@ extern void CacheInvalidateRelcacheByTuple(HeapTuple classTuple); extern void CacheInvalidateRelcacheByRelid(Oid relid); +extern void CacheInvalidateRelcacheByDbidRelid(Oid dbid, Oid relid); + extern void CacheInvalidateSmgr(RelFileLocatorBackend rlocator); extern void CacheInvalidateRelmap(Oid databaseId); From 78ffa84441413ca59b6da62937ddd13aae7d4e27 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 12 Dec 2021 20:26:34 +0300 Subject: [PATCH 03/79] CommitSeqNo data type --- src/include/access/transam.h | 16 ++++++++++++++++ src/include/c.h | 4 +++- 2 
files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 07b9be6cdfc..fcf8466cba7 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -196,6 +196,22 @@ FullTransactionIdAdvance(FullTransactionId *dest) #define FirstUnpinnedObjectId 12000 #define FirstNormalObjectId 16384 +#define COMMITSEQNO_INPROGRESS UINT64CONST(0x0) +#define COMMITSEQNO_NON_DELETED UINT64CONST(0x1) +#define COMMITSEQNO_ABORTED UINT64CONST(0x2) +#define COMMITSEQNO_FROZEN UINT64CONST(0x3) +#define COMMITSEQNO_COMMITTING UINT64CONST(0x4) +#define COMMITSEQNO_FIRST_NORMAL UINT64CONST(0x5) +#define COMMITSEQNO_MAX_NORMAL UINT64CONST(0x7FFFFFFFFFFFFFFF) + +#define COMMITSEQNO_IS_INPROGRESS(csn) ((csn) == COMMITSEQNO_INPROGRESS || (csn) == COMMITSEQNO_NON_DELETED) +#define COMMITSEQNO_IS_NON_DELETED(csn) ((csn) == COMMITSEQNO_NON_DELETED) +#define COMMITSEQNO_IS_ABORTED(csn) ((csn) == COMMITSEQNO_ABORTED) +#define COMMITSEQNO_IS_FROZEN(csn) ((csn) == COMMITSEQNO_FROZEN) +#define COMMITSEQNO_IS_NORMAL(csn) ((csn) >= COMMITSEQNO_FIRST_NORMAL) +#define COMMITSEQNO_IS_COMMITTING(csn) ((csn) == COMMITSEQNO_COMMITTING) +#define COMMITSEQNO_IS_COMMITTED(csn) ((csn) >= COMMITSEQNO_FROZEN) + /* * TransamVariables is a data structure in shared memory that is used to track * OID and XID assignment state. For largely historical reasons, there is diff --git a/src/include/c.h b/src/include/c.h index bc26c6aa7f1..90da28a80e9 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -637,7 +637,7 @@ typedef double float8; /* * Oid, RegProcedure, TransactionId, SubTransactionId, MultiXactId, - * CommandId + * CommandId, CommitSeqNo */ /* typedef Oid is in postgres_ext.h */ @@ -668,6 +668,8 @@ typedef uint32 CommandId; #define FirstCommandId ((CommandId) 0) #define InvalidCommandId (~(CommandId)0) +typedef uint64 CommitSeqNo; + /* ---------------- * Variable-length datatypes all share the 'struct varlena' header. 
From a78a5834c327b34258040035d9d70912826d9da3 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 12 Dec 2021 20:36:18 +0300 Subject: [PATCH 04/79] Custom TOAST --- contrib/pageinspect/heapfuncs.c | 1 + contrib/test_decoding/test_decoding.c | 2 +- src/backend/access/common/detoast.c | 42 ++++++++++++++++--- src/backend/access/common/toast_compression.c | 7 +++- src/backend/access/common/toast_internals.c | 4 +- src/backend/access/table/toast_helper.c | 6 +-- src/backend/replication/logical/proto.c | 2 +- src/backend/replication/pgoutput/pgoutput.c | 4 +- src/backend/utils/adt/varlena.c | 2 +- src/include/access/detoast.h | 14 +++++++ src/include/varatt.h | 33 ++++++++++++++- src/test/regress/regress.c | 2 +- 12 files changed, 99 insertions(+), 20 deletions(-) diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index 9fc5f815fda..4330bca0785 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -377,6 +377,7 @@ tuple_data_split_internal(Oid relid, char *tupdata, */ if (VARATT_IS_EXTERNAL(tupdata + off) && !VARATT_IS_EXTERNAL_ONDISK(tupdata + off) && + !VARATT_IS_EXTERNAL_ORIOLEDB(tupdata + off) && !VARATT_IS_EXTERNAL_INDIRECT(tupdata + off)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), diff --git a/contrib/test_decoding/test_decoding.c b/contrib/test_decoding/test_decoding.c index 7c50d139698..02d5c2e07da 100644 --- a/contrib/test_decoding/test_decoding.c +++ b/contrib/test_decoding/test_decoding.c @@ -578,7 +578,7 @@ tuple_to_stringinfo(StringInfo s, TupleDesc tupdesc, HeapTuple tuple, bool skip_ /* print data */ if (isnull) appendStringInfoString(s, "null"); - else if (typisvarlena && VARATT_IS_EXTERNAL_ONDISK(origval)) + else if (typisvarlena && (VARATT_IS_EXTERNAL_ONDISK(origval) || VARATT_IS_EXTERNAL_ORIOLEDB(origval))) appendStringInfoString(s, "unchanged-toast-datum"); else if (!typisvarlena) print_literal(s, typid, diff --git a/src/backend/access/common/detoast.c 
b/src/backend/access/common/detoast.c index 3547cdba56e..d9ab4fb0956 100644 --- a/src/backend/access/common/detoast.c +++ b/src/backend/access/common/detoast.c @@ -26,7 +26,6 @@ static struct varlena *toast_fetch_datum(struct varlena *attr); static struct varlena *toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, int32 slicelength); -static struct varlena *toast_decompress_datum(struct varlena *attr); static struct varlena *toast_decompress_datum_slice(struct varlena *attr, int32 slicelength); /* ---------- @@ -46,7 +45,7 @@ detoast_external_attr(struct varlena *attr) { struct varlena *result; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr)) { /* * This is an external stored plain value @@ -115,7 +114,7 @@ detoast_external_attr(struct varlena *attr) struct varlena * detoast_attr(struct varlena *attr) { - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr)) { /* * This is an externally stored datum --- fetch it back from there @@ -332,6 +331,20 @@ detoast_attr_slice(struct varlena *attr, return result; } +static ToastFunc o_detoast_func = NULL; + +void +register_o_detoast_func(ToastFunc func) +{ + o_detoast_func = func; +} + +void +deregister_o_detoast_func() +{ + o_detoast_func = NULL; +} + /* ---------- * toast_fetch_datum - * @@ -347,6 +360,17 @@ toast_fetch_datum(struct varlena *attr) struct varatt_external toast_pointer; int32 attrsize; + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + if (o_detoast_func != NULL) + { + result = o_detoast_func(attr); + if (result == NULL) + elog(ERROR, "unexpected NULL detoast result"); + return result; + } + } + if (!VARATT_IS_EXTERNAL_ONDISK(attr)) elog(ERROR, "toast_fetch_datum shouldn't be called for non-ondisk datums"); @@ -467,7 +491,7 @@ toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, * * Decompress a compressed version of a varlena datum */ -static struct varlena * 
+struct varlena * toast_decompress_datum(struct varlena *attr) { ToastCompressionId cmid; @@ -547,11 +571,17 @@ toast_raw_datum_size(Datum value) struct varlena *attr = (struct varlena *) DatumGetPointer(value); Size result; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + OToastExternal *toasted = (OToastExternal*) VARDATA_EXTERNAL(attr); + result = toasted->raw_size + VARHDRSZ; + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { - /* va_rawsize is the size of the original datum -- including header */ struct varatt_external toast_pointer; + /* va_rawsize is the size of the original datum -- including header */ + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); result = toast_pointer.va_rawsize; } diff --git a/src/backend/access/common/toast_compression.c b/src/backend/access/common/toast_compression.c index 52230f31c68..0717947d689 100644 --- a/src/backend/access/common/toast_compression.c +++ b/src/backend/access/common/toast_compression.c @@ -260,7 +260,12 @@ toast_get_compression_id(struct varlena *attr) * the external toast pointer. If compressed inline, fetch it from the * toast compression header. 
*/ - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + OToastExternal *toasted = (OToastExternal*) VARDATA_EXTERNAL(attr); + cmid = toasted->formatFlags >> ORIOLEDB_EXT_FORMAT_FLAGS_BITS; + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { struct varatt_external toast_pointer; diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 90d0654e629..538a554c917 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -239,7 +239,7 @@ toast_save_datum(Relation rel, Datum value, { struct varatt_external old_toast_pointer; - Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal)); + Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal) || VARATT_IS_EXTERNAL_ORIOLEDB(oldexternal)); /* Must copy to access aligned fields */ VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal); if (old_toast_pointer.va_toastrelid == rel->rd_toastoid) @@ -395,7 +395,7 @@ toast_delete_datum(Relation rel, Datum value, bool is_speculative) int validIndex; SnapshotData SnapshotToast; - if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + if (!VARATT_IS_EXTERNAL_ONDISK(attr) && !VARATT_IS_EXTERNAL_ORIOLEDB(attr)) return; /* Must copy to access aligned fields */ diff --git a/src/backend/access/table/toast_helper.c b/src/backend/access/table/toast_helper.c index 53224932f0d..a0738622657 100644 --- a/src/backend/access/table/toast_helper.c +++ b/src/backend/access/table/toast_helper.c @@ -71,10 +71,10 @@ toast_tuple_init(ToastTupleContext *ttc) * we have to delete it later. 
*/ if (att->attlen == -1 && !ttc->ttc_oldisnull[i] && - VARATT_IS_EXTERNAL_ONDISK(old_value)) + (VARATT_IS_EXTERNAL_ONDISK(old_value) || VARATT_IS_EXTERNAL_ORIOLEDB(old_value))) { if (ttc->ttc_isnull[i] || - !VARATT_IS_EXTERNAL_ONDISK(new_value) || + !(VARATT_IS_EXTERNAL_ONDISK(new_value) || VARATT_IS_EXTERNAL_ORIOLEDB(new_value)) || memcmp((char *) old_value, (char *) new_value, VARSIZE_EXTERNAL(old_value)) != 0) { @@ -330,7 +330,7 @@ toast_delete_external(Relation rel, const Datum *values, const bool *isnull, if (isnull[i]) continue; - else if (VARATT_IS_EXTERNAL_ONDISK(value)) + else if (VARATT_IS_EXTERNAL_ONDISK(value) || VARATT_IS_EXTERNAL_ORIOLEDB(value)) toast_delete_datum(rel, value, is_speculative); } } diff --git a/src/backend/replication/logical/proto.c b/src/backend/replication/logical/proto.c index 95c09c95167..db41c955ec1 100644 --- a/src/backend/replication/logical/proto.c +++ b/src/backend/replication/logical/proto.c @@ -814,7 +814,7 @@ logicalrep_write_tuple(StringInfo out, Relation rel, TupleTableSlot *slot, continue; } - if (att->attlen == -1 && VARATT_IS_EXTERNAL_ONDISK(values[i])) + if (att->attlen == -1 && (VARATT_IS_EXTERNAL_ONDISK(values[i]) || VARATT_IS_EXTERNAL_ORIOLEDB(values[i]))) { /* * Unchanged toasted datum. (Note that we don't promise to detect diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index 99518c6b6dd..759f23f176d 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -1346,8 +1346,8 @@ pgoutput_row_filter(Relation relation, TupleTableSlot *old_slot, * VARTAG_INDIRECT. See ReorderBufferToastReplace. 
*/ if (att->attlen == -1 && - VARATT_IS_EXTERNAL_ONDISK(new_slot->tts_values[i]) && - !VARATT_IS_EXTERNAL_ONDISK(old_slot->tts_values[i])) + (VARATT_IS_EXTERNAL_ONDISK(new_slot->tts_values[i]) || VARATT_IS_EXTERNAL_ORIOLEDB(new_slot->tts_values[i])) && + !(VARATT_IS_EXTERNAL_ONDISK(old_slot->tts_values[i]) || VARATT_IS_EXTERNAL_ORIOLEDB(old_slot->tts_values[i])) ) { if (!tmp_new_slot) { diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index d2e2e9bbba0..66625735b21 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -5139,7 +5139,7 @@ pg_column_toast_chunk_id(PG_FUNCTION_ARGS) attr = (struct varlena *) DatumGetPointer(PG_GETARG_DATUM(0)); - if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + if (!(VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr))) PG_RETURN_NULL(); VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); diff --git a/src/include/access/detoast.h b/src/include/access/detoast.h index 12d8cdb356a..9d78980e986 100644 --- a/src/include/access/detoast.h +++ b/src/include/access/detoast.h @@ -63,6 +63,13 @@ extern struct varlena *detoast_attr_slice(struct varlena *attr, int32 sliceoffset, int32 slicelength); +/* ---------- + * toast_decompress_datum - + * + * Decompress a compressed version of a varlena datum + */ +extern struct varlena *toast_decompress_datum(struct varlena *attr); + /* ---------- * toast_raw_datum_size - * @@ -79,4 +86,11 @@ extern Size toast_raw_datum_size(Datum value); */ extern Size toast_datum_size(Datum value); +/* + * for in_memory module + */ +typedef struct varlena* (*ToastFunc) (struct varlena *attr); +extern void register_o_detoast_func(ToastFunc func); +extern void deregister_o_detoast_func(void); + #endif /* DETOAST_H */ diff --git a/src/include/varatt.h b/src/include/varatt.h index f04435e9ef3..9da76dea1d6 100644 --- a/src/include/varatt.h +++ b/src/include/varatt.h @@ -38,6 +38,23 @@ typedef struct varatt_external Oid va_toastrelid; /* RelID of TOAST table 
containing it */ } varatt_external; +typedef struct OToastExternal +{ + uint16 data_size; /* length of OToastExternal data */ + int16 attnum; + int32 raw_size; /* original data size */ + int32 toasted_size; /* compressed original data size */ + /* for fetching data from TOAST tree */ + CommitSeqNo csn; + /* for finding TOAST tree */ + Oid datoid; + Oid relid; + Oid relnode; + /* for storing primary index tuple */ + uint8 formatFlags; /* primary index tuple flags */ + char data[FLEXIBLE_ARRAY_MEMBER]; /* data (primary index tuple) */ +} OToastExternal; + /* * These macros define the "saved size" portion of va_extinfo. Its remaining * two high-order bits identify the compression method. @@ -86,17 +103,21 @@ typedef enum vartag_external VARTAG_INDIRECT = 1, VARTAG_EXPANDED_RO = 2, VARTAG_EXPANDED_RW = 3, - VARTAG_ONDISK = 18 + VARTAG_ONDISK = 18, + VARTAG_ORIOLEDB = 34 } vartag_external; /* this test relies on the specific tag values above */ #define VARTAG_IS_EXPANDED(tag) \ (((tag) & ~1) == VARTAG_EXPANDED_RO) +#define O_TOAST_EXTERNAL_SZ offsetof(OToastExternal, data) + #define VARTAG_SIZE(tag) \ ((tag) == VARTAG_INDIRECT ? sizeof(varatt_indirect) : \ VARTAG_IS_EXPANDED(tag) ? sizeof(varatt_expanded) : \ (tag) == VARTAG_ONDISK ? sizeof(varatt_external) : \ + (tag) == VARTAG_ORIOLEDB ? O_TOAST_EXTERNAL_SZ : \ (AssertMacro(false), 0)) /* @@ -282,11 +303,16 @@ typedef struct #define VARDATA_SHORT(PTR) VARDATA_1B(PTR) #define VARTAG_EXTERNAL(PTR) VARTAG_1B_E(PTR) -#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR))) +#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR)) \ + + (VARATT_IS_EXTERNAL_ORIOLEDB(PTR) ? 
\ + *((uint16 *) VARDATA_1B_E(PTR)) \ + : 0)) + #define VARDATA_EXTERNAL(PTR) VARDATA_1B_E(PTR) #define VARATT_IS_COMPRESSED(PTR) VARATT_IS_4B_C(PTR) #define VARATT_IS_EXTERNAL(PTR) VARATT_IS_1B_E(PTR) + #define VARATT_IS_EXTERNAL_ONDISK(PTR) \ (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK) #define VARATT_IS_EXTERNAL_INDIRECT(PTR) \ @@ -299,6 +325,9 @@ typedef struct (VARATT_IS_EXTERNAL(PTR) && VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) #define VARATT_IS_EXTERNAL_NON_EXPANDED(PTR) \ (VARATT_IS_EXTERNAL(PTR) && !VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) +#define VARATT_IS_EXTERNAL_ORIOLEDB(PTR) \ + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ORIOLEDB) + #define VARATT_IS_SHORT(PTR) VARATT_IS_1B(PTR) #define VARATT_IS_EXTENDED(PTR) (!VARATT_IS_4B_U(PTR)) diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 423add66502..c77200729cb 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -606,7 +606,7 @@ make_tuple_indirect(PG_FUNCTION_ARGS) continue; /* copy datum, so it still lives later */ - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr)) attr = detoast_external_attr(attr); else { From 5eb27dbee214a58aee008d1e701f93d55c659936 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 23 Mar 2023 00:12:00 +0300 Subject: [PATCH 05/79] Allow locking updated tuples in tuple_update() and tuple_delete() Discussion: https://postgr.es/m/CAPpHfdua-YFw3XTprfutzGp28xXLigFtzNbuFY8yPhqeq6X5kg%40mail.gmail.com Reviewed-by: Aleksander Alekseev, Pavel Borisov, Vignesh C, Mason Sharp Reviewed-by: Andres Freund, Chris Travers --- src/backend/access/heap/heapam.c | 205 ++++++++++---- src/backend/access/heap/heapam_handler.c | 94 +++++-- src/backend/access/table/tableam.c | 26 +- src/backend/commands/trigger.c | 57 +--- src/backend/executor/execReplication.c | 19 +- src/backend/executor/nodeModifyTable.c | 342 +++++++++-------------- 
src/include/access/heapam.h | 19 +- src/include/access/tableam.h | 69 +++-- src/include/commands/trigger.h | 4 +- 9 files changed, 480 insertions(+), 355 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index cce38f482bd..7a735261e35 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2717,10 +2717,11 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) } /* - * heap_delete - delete a tuple + * heap_delete - delete a tuple, optionally fetching it into a slot * * See table_tuple_delete() for an explanation of the parameters, except that - * this routine directly takes a tuple rather than a slot. + * this routine directly takes a tuple rather than a slot. Also, we don't + * place a lock on the tuple in this function, just fetch the existing version. * * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last @@ -2729,8 +2730,9 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) */ TM_Result heap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) + CommandId cid, Snapshot crosscheck, int options, + TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot) { TM_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -2810,7 +2812,7 @@ heap_delete(Relation relation, ItemPointer tid, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("attempted to delete invisible tuple"))); } - else if (result == TM_BeingModified && wait) + else if (result == TM_BeingModified && (options & TABLE_MODIFY_WAIT)) { TransactionId xwait; uint16 infomask; @@ -2951,7 +2953,30 @@ heap_delete(Relation relation, ItemPointer tid, tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data); else tmfd->cmax = InvalidCommandId; - UnlockReleaseBuffer(buffer); + + /* + * If we're asked to lock the updated 
tuple, we just fetch the + * existing tuple. That let's the caller save some resources on + * placing the lock. + */ + if (result == TM_Updated && + (options & TABLE_MODIFY_LOCK_UPDATED)) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + bslot->base.tupdata = tp; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + UnlockReleaseBuffer(buffer); + } if (have_tuple_lock) UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); if (vmbuffer != InvalidBuffer) @@ -3125,8 +3150,24 @@ heap_delete(Relation relation, ItemPointer tid, */ CacheInvalidateHeapTuple(relation, &tp, NULL); - /* Now we can release the buffer */ - ReleaseBuffer(buffer); + /* Fetch the old tuple version if we're asked for that. */ + if (options & TABLE_MODIFY_FETCH_OLD_TUPLE) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + bslot->base.tupdata = tp; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + } /* * Release the lmgr tuple lock, if we had it. @@ -3158,8 +3199,8 @@ simple_heap_delete(Relation relation, ItemPointer tid) result = heap_delete(relation, tid, GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ , - &tmfd, false /* changingPart */ ); + TABLE_MODIFY_WAIT /* wait for commit */ , + &tmfd, false /* changingPart */ , NULL); switch (result) { case TM_SelfModified: @@ -3186,10 +3227,11 @@ simple_heap_delete(Relation relation, ItemPointer tid) } /* - * heap_update - replace a tuple + * heap_update - replace a tuple, optionally fetching it into a slot * * See table_tuple_update() for an explanation of the parameters, except that - * this routine directly takes a tuple rather than a slot. 
+ * this routine directly takes a tuple rather than a slot. Also, we don't + * place a lock on the tuple in this function, just fetch the existing version. * * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last @@ -3198,9 +3240,9 @@ simple_heap_delete(Relation relation, ItemPointer tid) */ TM_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, + CommandId cid, Snapshot crosscheck, int options, TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes) + TU_UpdateIndexes *update_indexes, TupleTableSlot *oldSlot) { TM_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -3428,7 +3470,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer); /* see below about the "no wait" case */ - Assert(result != TM_BeingModified || wait); + Assert(result != TM_BeingModified || (options & TABLE_MODIFY_WAIT)); if (result == TM_Invisible) { @@ -3437,7 +3479,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("attempted to update invisible tuple"))); } - else if (result == TM_BeingModified && wait) + else if (result == TM_BeingModified && (options & TABLE_MODIFY_WAIT)) { TransactionId xwait; uint16 infomask; @@ -3641,7 +3683,30 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data); else tmfd->cmax = InvalidCommandId; - UnlockReleaseBuffer(buffer); + + /* + * If we're asked to lock the updated tuple, we just fetch the + * existing tuple. That let's the caller save some resouces on + * placing the lock. 
+ */ + if (result == TM_Updated && + (options & TABLE_MODIFY_LOCK_UPDATED)) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + bslot->base.tupdata = oldtup; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + UnlockReleaseBuffer(buffer); + } if (have_tuple_lock) UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); if (vmbuffer != InvalidBuffer) @@ -4120,7 +4185,26 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, /* Now we can release the buffer(s) */ if (newbuf != buffer) ReleaseBuffer(newbuf); - ReleaseBuffer(buffer); + + /* Fetch the old tuple version if we're asked for that. */ + if (options & TABLE_MODIFY_FETCH_OLD_TUPLE) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + bslot->base.tupdata = oldtup; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + } + if (BufferIsValid(vmbuffer_new)) ReleaseBuffer(vmbuffer_new); if (BufferIsValid(vmbuffer)) @@ -4450,8 +4534,8 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup, result = heap_update(relation, otid, tup, GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ , - &tmfd, &lockmode, update_indexes); + TABLE_MODIFY_WAIT /* wait for commit */ , + &tmfd, &lockmode, update_indexes, NULL); switch (result) { case TM_SelfModified: @@ -4514,12 +4598,14 @@ get_mxact_status_for_lock(LockTupleMode mode, bool is_update) * tuples. 
* * Output parameters: - * *tuple: all fields filled in - * *buffer: set to buffer holding tuple (pinned but not locked at exit) + * *slot: BufferHeapTupleTableSlot filled with tuple * *tmfd: filled in failure cases (see below) * * Function results are the same as the ones for table_tuple_lock(). * + * If *slot already contains the target tuple, it takes advantage on that by + * skipping the ReadBuffer() call. + * * In the failure cases other than TM_Invisible, the routine fills * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact, * if necessary), and t_cmax (the last only for TM_SelfModified, @@ -4530,15 +4616,14 @@ get_mxact_status_for_lock(LockTupleMode mode, bool is_update) * See README.tuplock for a thorough explanation of this mechanism. */ TM_Result -heap_lock_tuple(Relation relation, HeapTuple tuple, +heap_lock_tuple(Relation relation, ItemPointer tid, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_updates, - Buffer *buffer, TM_FailureData *tmfd) + bool follow_updates, TM_FailureData *tmfd) { TM_Result result; - ItemPointer tid = &(tuple->t_self); ItemId lp; Page page; + Buffer buffer; Buffer vmbuffer = InvalidBuffer; BlockNumber block; TransactionId xid, @@ -4550,8 +4635,24 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, bool skip_tuple_lock = false; bool have_tuple_lock = false; bool cleared_all_frozen = false; + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + HeapTuple tuple = &bslot->base.tupdata; + + Assert(TTS_IS_BUFFERTUPLE(slot)); - *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + /* Take advantage if slot already contains the relevant tuple */ + if (!TTS_EMPTY(slot) && + slot->tts_tableOid == relation->rd_id && + ItemPointerCompare(&slot->tts_tid, tid) == 0 && + BufferIsValid(bslot->buffer)) + { + buffer = bslot->buffer; + IncrBufferRefCount(buffer); + } + else + { + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + 
} block = ItemPointerGetBlockNumber(tid); /* @@ -4560,21 +4661,22 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * in the middle of changing this, so we'll need to recheck after we have * the lock. */ - if (PageIsAllVisible(BufferGetPage(*buffer))) + if (PageIsAllVisible(BufferGetPage(buffer))) visibilitymap_pin(relation, block, &vmbuffer); - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - page = BufferGetPage(*buffer); + page = BufferGetPage(buffer); lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); Assert(ItemIdIsNormal(lp)); + tuple->t_self = *tid; tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); tuple->t_len = ItemIdGetLength(lp); tuple->t_tableOid = RelationGetRelid(relation); l3: - result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer); + result = HeapTupleSatisfiesUpdate(tuple, cid, buffer); if (result == TM_Invisible) { @@ -4603,7 +4705,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, infomask2 = tuple->t_data->t_infomask2; ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid); - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); /* * If any subtransaction of the current top transaction already holds @@ -4755,12 +4857,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, { result = res; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } } - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * Make sure it's still an appropriate lock, else start over. @@ -4795,7 +4897,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) && !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * Make sure it's still an appropriate lock, else start over. 
@@ -4823,7 +4925,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * No conflict, but if the xmax changed under us in the * meantime, start over. */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), xwait)) @@ -4835,7 +4937,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, } else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* if the xmax changed in the meantime, start over */ if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || @@ -4863,7 +4965,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, TransactionIdIsCurrentTransactionId(xwait)) { /* ... but if the xmax changed in the meantime, start over */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), xwait)) @@ -4885,7 +4987,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ if (require_sleep && (result == TM_Updated || result == TM_Deleted)) { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } else if (require_sleep) @@ -4910,7 +5012,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } @@ -4936,7 +5038,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, { result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } break; @@ -4976,7 +5078,7 @@ heap_lock_tuple(Relation relation, 
HeapTuple tuple, { result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } break; @@ -5002,12 +5104,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, { result = res; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } } - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * xwait is done, but if xwait had just locked the tuple then some @@ -5029,7 +5131,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * don't check for this in the multixact case, because some * locker transactions might still be running. */ - UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); + UpdateXmaxHintBits(tuple->t_data, buffer, xwait); } } @@ -5088,9 +5190,9 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) { - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); visibilitymap_pin(relation, block, &vmbuffer); - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto l3; } @@ -5153,7 +5255,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, cleared_all_frozen = true; - MarkBufferDirty(*buffer); + MarkBufferDirty(buffer); /* * XLOG stuff. 
You might think that we don't need an XLOG record because @@ -5173,7 +5275,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, XLogRecPtr recptr; XLogBeginInsert(); - XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); xlrec.xmax = xid; @@ -5194,7 +5296,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, result = TM_Ok; out_locked: - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); out_unlocked: if (BufferIsValid(vmbuffer)) @@ -5212,6 +5314,9 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, if (have_tuple_lock) UnlockTupleTuplock(relation, tid, mode); + /* Put the target tuple to the slot */ + ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); + return result; } diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 6f8b1b79298..ed830464aea 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -46,6 +46,12 @@ #include "utils/builtins.h" #include "utils/rel.h" +static TM_Result heapam_tuple_lock(Relation relation, ItemPointer tid, + Snapshot snapshot, TupleTableSlot *slot, + CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, uint8 flags, + TM_FailureData *tmfd); + static void reform_and_rewrite_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap, Datum *values, bool *isnull, RewriteState rwstate); @@ -299,23 +305,55 @@ heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, static TM_Result heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, - Snapshot snapshot, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) + Snapshot snapshot, Snapshot crosscheck, int options, + TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot) { + TM_Result result; + /* * Currently Deleting of index tuples are handled at vacuum, in case if * 
the storage itself is cleaning the dead tuples by itself, it is the * time to call the index tuple deletion also. */ - return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart); + result = heap_delete(relation, tid, cid, crosscheck, options, + tmfd, changingPart, oldSlot); + + /* + * If the tuple has been concurrently updated, then get the lock on it. + * (Do only if caller asked for this by setting the + * TABLE_MODIFY_LOCK_UPDATED option) With the lock held retry of the + * delete should succeed even if there are more concurrent update + * attempts. + */ + if (result == TM_Updated && (options & TABLE_MODIFY_LOCK_UPDATED)) + { + /* + * heapam_tuple_lock() will take advantage of tuple loaded into + * oldSlot by heap_delete(). + */ + result = heapam_tuple_lock(relation, tid, snapshot, + oldSlot, cid, LockTupleExclusive, + (options & TABLE_MODIFY_WAIT) ? + LockWaitBlock : + LockWaitSkip, + TUPLE_LOCK_FLAG_FIND_LAST_VERSION, + tmfd); + + if (result == TM_Ok) + return TM_Updated; + } + + return result; } static TM_Result heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, TM_FailureData *tmfd, - LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes) + int options, TM_FailureData *tmfd, + LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot) { bool shouldFree = true; HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); @@ -325,8 +363,8 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, slot->tts_tableOid = RelationGetRelid(relation); tuple->t_tableOid = slot->tts_tableOid; - result = heap_update(relation, otid, tuple, cid, crosscheck, wait, - tmfd, lockmode, update_indexes); + result = heap_update(relation, otid, tuple, cid, crosscheck, options, + tmfd, lockmode, update_indexes, oldSlot); ItemPointerCopy(&tuple->t_self, &slot->tts_tid); /* @@ -353,6 +391,31 @@ 
heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, if (shouldFree) pfree(tuple); + /* + * If the tuple has been concurrently updated, then get the lock on it. + * (Do only if caller asked for this by setting the + * TABLE_MODIFY_LOCK_UPDATED option) With the lock held retry of the + * update should succeed even if there are more concurrent update + * attempts. + */ + if (result == TM_Updated && (options & TABLE_MODIFY_LOCK_UPDATED)) + { + /* + * heapam_tuple_lock() will take advantage of tuple loaded into + * oldSlot by heap_update(). + */ + result = heapam_tuple_lock(relation, otid, snapshot, + oldSlot, cid, *lockmode, + (options & TABLE_MODIFY_WAIT) ? + LockWaitBlock : + LockWaitSkip, + TUPLE_LOCK_FLAG_FIND_LAST_VERSION, + tmfd); + + if (result == TM_Ok) + return TM_Updated; + } + return result; } @@ -364,7 +427,6 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, { BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; TM_Result result; - Buffer buffer; HeapTuple tuple = &bslot->base.tupdata; bool follow_updates; @@ -374,9 +436,8 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, Assert(TTS_IS_BUFFERTUPLE(slot)); tuple_lock_retry: - tuple->t_self = *tid; - result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy, - follow_updates, &buffer, tmfd); + result = heap_lock_tuple(relation, tid, slot, cid, mode, wait_policy, + follow_updates, tmfd); if (result == TM_Updated && (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION)) @@ -384,8 +445,6 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, /* Should not encounter speculative tuple on recheck */ Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data)); - ReleaseBuffer(buffer); - if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self)) { SnapshotData SnapshotDirty; @@ -407,6 +466,8 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, InitDirtySnapshot(SnapshotDirty); for (;;) { + Buffer 
buffer = InvalidBuffer; + if (ItemPointerIndicatesMovedPartitions(tid)) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), @@ -501,7 +562,7 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, /* * This is a live tuple, so try to lock it again. */ - ReleaseBuffer(buffer); + ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); goto tuple_lock_retry; } @@ -512,7 +573,7 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, */ if (tuple->t_data == NULL) { - Assert(!BufferIsValid(buffer)); + ReleaseBuffer(buffer); return TM_Deleted; } @@ -565,9 +626,6 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, slot->tts_tableOid = RelationGetRelid(relation); tuple->t_tableOid = slot->tts_tableOid; - /* store in slot, transferring existing pin */ - ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); - return result; } diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 3ec11fb52c0..4445af3a44e 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -288,16 +288,23 @@ simple_table_tuple_insert(Relation rel, TupleTableSlot *slot) * via ereport(). 
*/ void -simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot) +simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, + TupleTableSlot *oldSlot) { TM_Result result; TM_FailureData tmfd; + int options = TABLE_MODIFY_WAIT; /* wait for commit */ + + /* Fetch old tuple if the relevant slot is provided */ + if (oldSlot) + options |= TABLE_MODIFY_FETCH_OLD_TUPLE; result = table_tuple_delete(rel, tid, GetCurrentCommandId(true), snapshot, InvalidSnapshot, - true /* wait for commit */ , - &tmfd, false /* changingPart */ ); + options, + &tmfd, false /* changingPart */ , + oldSlot); switch (result) { @@ -336,17 +343,24 @@ void simple_table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, Snapshot snapshot, - TU_UpdateIndexes *update_indexes) + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot) { TM_Result result; TM_FailureData tmfd; LockTupleMode lockmode; + int options = TABLE_MODIFY_WAIT; /* wait for commit */ + + /* Fetch old tuple if the relevant slot is provided */ + if (oldSlot) + options |= TABLE_MODIFY_FETCH_OLD_TUPLE; result = table_tuple_update(rel, otid, slot, GetCurrentCommandId(true), snapshot, InvalidSnapshot, - true /* wait for commit */ , - &tmfd, &lockmode, update_indexes); + options, + &tmfd, &lockmode, update_indexes, + oldSlot); switch (result) { diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index c89763440c9..2a4b40422e3 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2811,8 +2811,8 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *slot, TransitionCaptureState *transition_capture, bool is_crosspart_update) { @@ -2830,21 +2830,11 @@ ExecARDeleteTriggers(EState *estate, if ((trigdesc && trigdesc->trig_delete_after_row) || (transition_capture && 
transition_capture->tcs_delete_old_table)) { - TupleTableSlot *slot = ExecGetTriggerOldSlot(estate, relinfo); - - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); - if (fdw_trigtuple == NULL) - GetTupleForTrigger(estate, - NULL, - relinfo, - tupleid, - LockTupleExclusive, - slot, - false, - NULL, - NULL, - NULL); - else + /* + * Put the FDW old tuple to the slot. Otherwise, caller is expected + * to have old tuple already fetched to the slot. + */ + if (fdw_trigtuple != NULL) ExecForceStoreHeapTuple(fdw_trigtuple, slot, false); AfterTriggerSaveEvent(estate, relinfo, NULL, NULL, @@ -3161,18 +3151,17 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, * Note: 'src_partinfo' and 'dst_partinfo', when non-NULL, refer to the source * and destination partitions, respectively, of a cross-partition update of * the root partitioned table mentioned in the query, given by 'relinfo'. - 'tupleid' in that case refers to the ctid of the "old" tuple in the source - partition, and 'newslot' contains the "new" tuple in the destination - partition. This interface allows to support the requirements of - ExecCrossPartitionUpdateForeignKey(); is_crosspart_update must be true in - that case. + 'oldslot' contains the "old" tuple in the source partition, and 'newslot' + contains the "new" tuple in the destination partition. This interface + allows to support the requirements of ExecCrossPartitionUpdateForeignKey(); + is_crosspart_update must be true in that case. */ void ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, ResultRelInfo *dst_partinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *oldslot, TupleTableSlot *newslot, List *recheckIndexes, TransitionCaptureState *transition_capture, @@ -3201,30 +3190,14 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, * separately for DELETE and INSERT to capture transition table rows. 
* In such case, either old tuple or new tuple can be NULL. */ - TupleTableSlot *oldslot; - ResultRelInfo *tupsrc; - Assert((src_partinfo != NULL && dst_partinfo != NULL) || !is_crosspart_update); - tupsrc = src_partinfo ? src_partinfo : relinfo; - oldslot = ExecGetTriggerOldSlot(estate, tupsrc); - - if (fdw_trigtuple == NULL && ItemPointerIsValid(tupleid)) - GetTupleForTrigger(estate, - NULL, - tupsrc, - tupleid, - LockTupleExclusive, - oldslot, - false, - NULL, - NULL, - NULL); - else if (fdw_trigtuple != NULL) + if (fdw_trigtuple != NULL) + { + Assert(oldslot); ExecForceStoreHeapTuple(fdw_trigtuple, oldslot, false); - else - ExecClearTuple(oldslot); + } AfterTriggerSaveEvent(estate, relinfo, src_partinfo, dst_partinfo, diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index cb1202e4506..0133b2a8538 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -581,6 +581,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, { List *recheckIndexes = NIL; TU_UpdateIndexes update_indexes; + TupleTableSlot *oldSlot = NULL; /* Compute stored generated columns */ if (rel->rd_att->constr && @@ -594,8 +595,12 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, if (rel->rd_rel->relispartition) ExecPartitionCheck(resultRelInfo, slot, estate, true); + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_after_row) + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + simple_table_tuple_update(rel, tid, slot, estate->es_snapshot, - &update_indexes); + &update_indexes, oldSlot); if (resultRelInfo->ri_NumIndices > 0 && (update_indexes != TU_None)) recheckIndexes = ExecInsertIndexTuples(resultRelInfo, @@ -606,7 +611,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, - tid, NULL, slot, + NULL, oldSlot, slot, recheckIndexes, NULL, false); list_free(recheckIndexes); 
@@ -640,12 +645,18 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, if (!skip_tuple) { + TupleTableSlot *oldSlot = NULL; + + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_delete_after_row) + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + /* OK, delete the tuple */ - simple_table_tuple_delete(rel, tid, estate->es_snapshot); + simple_table_tuple_delete(rel, tid, estate->es_snapshot, oldSlot); /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, - tid, NULL, NULL, false); + NULL, oldSlot, NULL, false); } } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index c230b666706..78fd7690b28 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -137,7 +137,7 @@ static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, ResultRelInfo *sourcePartInfo, ResultRelInfo *destPartInfo, ItemPointer tupleid, - TupleTableSlot *oldslot, + TupleTableSlot *oldSlot, TupleTableSlot *newslot); static bool ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, @@ -597,6 +597,10 @@ ExecInitInsertProjection(ModifyTableState *mtstate, resultRelInfo->ri_newTupleSlot = table_slot_create(resultRelInfo->ri_RelationDesc, &estate->es_tupleTable); + if (node->onConflictAction == ONCONFLICT_UPDATE) + resultRelInfo->ri_oldTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); /* Build ProjectionInfo if needed (it probably isn't). 
*/ if (need_projection) @@ -1186,7 +1190,7 @@ ExecInsert(ModifyTableContext *context, ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, NULL, - NULL, + resultRelInfo->ri_oldTupleSlot, slot, NULL, mtstate->mt_transition_capture, @@ -1367,7 +1371,8 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static TM_Result ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, bool changingPart) + ItemPointer tupleid, bool changingPart, int options, + TupleTableSlot *oldSlot) { EState *estate = context->estate; @@ -1375,9 +1380,10 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, estate->es_output_cid, estate->es_snapshot, estate->es_crosscheck_snapshot, - true /* wait for commit */ , + options /* wait for commit */ , &context->tmfd, - changingPart); + changingPart, + oldSlot); } /* @@ -1389,7 +1395,8 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static void ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, bool changingPart) + ItemPointer tupleid, HeapTuple oldtuple, + TupleTableSlot *slot, bool changingPart) { ModifyTableState *mtstate = context->mtstate; EState *estate = context->estate; @@ -1407,8 +1414,8 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, { ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, - tupleid, oldtuple, - NULL, NULL, mtstate->mt_transition_capture, + oldtuple, + slot, NULL, NULL, mtstate->mt_transition_capture, false); /* @@ -1419,10 +1426,30 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, } /* AFTER ROW DELETE Triggers */ - ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple, + ExecARDeleteTriggers(estate, resultRelInfo, oldtuple, slot, ar_delete_trig_tcs, changingPart); } +/* + * Initializes the tuple slot in a ResultRelInfo for DELETE action. 
+ * + * We mark 'projectNewInfoValid' even though the projections themselves + * are not initialized here. + */ +static void +ExecInitDeleteTupleSlot(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo) +{ + EState *estate = mtstate->ps.state; + + Assert(!resultRelInfo->ri_projectNewInfoValid); + + resultRelInfo->ri_oldTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + resultRelInfo->ri_projectNewInfoValid = true; +} + /* ---------------------------------------------------------------- * ExecDelete * @@ -1450,6 +1477,7 @@ ExecDelete(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ItemPointer tupleid, HeapTuple oldtuple, + TupleTableSlot *oldSlot, bool processReturning, bool changingPart, bool canSetTag, @@ -1513,6 +1541,11 @@ ExecDelete(ModifyTableContext *context, } else { + int options = TABLE_MODIFY_WAIT | TABLE_MODIFY_FETCH_OLD_TUPLE; + + if (!IsolationUsesXactSnapshot()) + options |= TABLE_MODIFY_LOCK_UPDATED; + /* * delete the tuple * @@ -1523,7 +1556,8 @@ ExecDelete(ModifyTableContext *context, * transaction-snapshot mode transactions. */ ldelete: - result = ExecDeleteAct(context, resultRelInfo, tupleid, changingPart); + result = ExecDeleteAct(context, resultRelInfo, tupleid, changingPart, + options, oldSlot); if (tmresult) *tmresult = result; @@ -1570,7 +1604,6 @@ ExecDelete(ModifyTableContext *context, case TM_Updated: { - TupleTableSlot *inputslot; TupleTableSlot *epqslot; if (IsolationUsesXactSnapshot()) @@ -1579,87 +1612,29 @@ ExecDelete(ModifyTableContext *context, errmsg("could not serialize access due to concurrent update"))); /* - * Already know that we're going to need to do EPQ, so - * fetch tuple directly into the right slot. + * We need to do EPQ. The latest tuple is already found + * and locked as a result of TABLE_MODIFY_LOCK_UPDATED. 
*/ - EvalPlanQualBegin(context->epqstate); - inputslot = EvalPlanQualSlot(context->epqstate, resultRelationDesc, - resultRelInfo->ri_RangeTableIndex); + Assert(context->tmfd.traversed); + epqslot = EvalPlanQual(context->epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + oldSlot); + if (TupIsNull(epqslot)) + /* Tuple not passing quals anymore, exiting... */ + return NULL; - result = table_tuple_lock(resultRelationDesc, tupleid, - estate->es_snapshot, - inputslot, estate->es_output_cid, - LockTupleExclusive, LockWaitBlock, - TUPLE_LOCK_FLAG_FIND_LAST_VERSION, - &context->tmfd); - - switch (result) + /* + * If requested, skip delete and pass back the updated + * row. + */ + if (epqreturnslot) { - case TM_Ok: - Assert(context->tmfd.traversed); - epqslot = EvalPlanQual(context->epqstate, - resultRelationDesc, - resultRelInfo->ri_RangeTableIndex, - inputslot); - if (TupIsNull(epqslot)) - /* Tuple not passing quals anymore, exiting... */ - return NULL; - - /* - * If requested, skip delete and pass back the - * updated row. - */ - if (epqreturnslot) - { - *epqreturnslot = epqslot; - return NULL; - } - else - goto ldelete; - - case TM_SelfModified: - - /* - * This can be reached when following an update - * chain from a tuple updated by another session, - * reaching a tuple that was already updated in - * this transaction. If previously updated by this - * command, ignore the delete, otherwise error - * out. - * - * See also TM_SelfModified response to - * table_tuple_delete() above. 
- */ - if (context->tmfd.cmax != estate->es_output_cid) - ereport(ERROR, - (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), - errmsg("tuple to be deleted was already modified by an operation triggered by the current command"), - errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); - return NULL; - - case TM_Deleted: - /* tuple already deleted; nothing to do */ - return NULL; - - default: - - /* - * TM_Invisible should be impossible because we're - * waiting for updated row versions, and would - * already have errored out if the first version - * is invisible. - * - * TM_Updated should be impossible, because we're - * locking the latest version via - * TUPLE_LOCK_FLAG_FIND_LAST_VERSION. - */ - elog(ERROR, "unexpected table_tuple_lock status: %u", - result); - return NULL; + *epqreturnslot = epqslot; + return NULL; } - - Assert(false); - break; + else + goto ldelete; } case TM_Deleted: @@ -1693,7 +1668,8 @@ ExecDelete(ModifyTableContext *context, if (tupleDeleted) *tupleDeleted = true; - ExecDeleteEpilogue(context, resultRelInfo, tupleid, oldtuple, changingPart); + ExecDeleteEpilogue(context, resultRelInfo, tupleid, oldtuple, + oldSlot, changingPart); /* Process RETURNING if present and if requested */ if (processReturning && resultRelInfo->ri_projectReturning) @@ -1711,17 +1687,13 @@ ExecDelete(ModifyTableContext *context, } else { + /* Copy old tuple to the returning slot */ slot = ExecGetReturningSlot(estate, resultRelInfo); if (oldtuple != NULL) - { ExecForceStoreHeapTuple(oldtuple, slot, false); - } else - { - if (!table_tuple_fetch_row_version(resultRelationDesc, tupleid, - SnapshotAny, slot)) - elog(ERROR, "failed to fetch deleted tuple for DELETE RETURNING"); - } + ExecCopySlot(slot, oldSlot); + Assert(!TupIsNull(slot)); } rslot = ExecProcessReturning(resultRelInfo, slot, context->planSlot); @@ -1821,12 +1793,16 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, MemoryContextSwitchTo(oldcxt); } 
+ /* Make sure ri_oldTupleSlot is initialized. */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitUpdateProjection(mtstate, resultRelInfo); + /* * Row movement, part 1. Delete the tuple, but skip RETURNING processing. * We want to return rows from INSERT. */ ExecDelete(context, resultRelInfo, - tupleid, oldtuple, + tupleid, oldtuple, resultRelInfo->ri_oldTupleSlot, false, /* processReturning */ true, /* changingPart */ false, /* canSetTag */ @@ -1867,21 +1843,13 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, return true; else { - /* Fetch the most recent version of old tuple. */ - TupleTableSlot *oldSlot; - - /* ... but first, make sure ri_oldTupleSlot is initialized. */ - if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) - ExecInitUpdateProjection(mtstate, resultRelInfo); - oldSlot = resultRelInfo->ri_oldTupleSlot; - if (!table_tuple_fetch_row_version(resultRelInfo->ri_RelationDesc, - tupleid, - SnapshotAny, - oldSlot)) - elog(ERROR, "failed to fetch tuple being updated"); - /* and project the new tuple to retry the UPDATE with */ + /* + * ExecDelete already fetches the most recent version of old tuple + * to resultRelInfo->ri_oldTupleSlot. So, just project the new + * tuple to retry the UPDATE with. 
+ */ *retry_slot = ExecGetUpdateNewTuple(resultRelInfo, epqslot, - oldSlot); + resultRelInfo->ri_oldTupleSlot); return false; } } @@ -2001,7 +1969,8 @@ ExecUpdatePrepareSlot(ResultRelInfo *resultRelInfo, static TM_Result ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, - bool canSetTag, UpdateContext *updateCxt) + bool canSetTag, int options, TupleTableSlot *oldSlot, + UpdateContext *updateCxt) { EState *estate = context->estate; Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -2093,7 +2062,8 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ExecCrossPartitionUpdateForeignKey(context, resultRelInfo, insert_destrel, - tupleid, slot, + tupleid, + resultRelInfo->ri_oldTupleSlot, inserted_tuple); return TM_Ok; @@ -2136,10 +2106,10 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, estate->es_output_cid, estate->es_snapshot, estate->es_crosscheck_snapshot, - true /* wait for commit */ , + options /* wait for commit */ , &context->tmfd, &updateCxt->lockmode, - &updateCxt->updateIndexes); - + &updateCxt->updateIndexes, + oldSlot); return result; } @@ -2152,7 +2122,8 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, static void ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, ResultRelInfo *resultRelInfo, ItemPointer tupleid, - HeapTuple oldtuple, TupleTableSlot *slot) + HeapTuple oldtuple, TupleTableSlot *slot, + TupleTableSlot *oldSlot) { ModifyTableState *mtstate = context->mtstate; List *recheckIndexes = NIL; @@ -2168,7 +2139,7 @@ ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(context->estate, resultRelInfo, NULL, NULL, - tupleid, oldtuple, slot, + oldtuple, oldSlot, slot, recheckIndexes, mtstate->operation == CMD_INSERT ? 
mtstate->mt_oc_transition_capture : @@ -2257,7 +2228,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, /* Perform the root table's triggers. */ ExecARUpdateTriggers(context->estate, rootRelInfo, sourcePartInfo, destPartInfo, - tupleid, NULL, newslot, NIL, NULL, true); + NULL, oldslot, newslot, NIL, NULL, true); } /* ---------------------------------------------------------------- @@ -2279,6 +2250,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, * NULL when the foreign table has no relevant triggers. * * slot contains the new tuple value to be stored. + * oldSlot is the slot to store the old tuple. * planSlot is the output of the ModifyTable's subplan; we use it * to access values from other input tables (for RETURNING), * row-ID junk columns, etc. @@ -2291,7 +2263,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, static TupleTableSlot * ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, - bool canSetTag) + TupleTableSlot *oldSlot, bool canSetTag, bool locked) { EState *estate = context->estate; Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -2346,6 +2318,11 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, { ItemPointerData lockedtid; + int options = TABLE_MODIFY_WAIT | TABLE_MODIFY_FETCH_OLD_TUPLE; + + if (!locked && !IsolationUsesXactSnapshot()) + options |= TABLE_MODIFY_LOCK_UPDATED; + /* * If we generate a new candidate tuple after EvalPlanQual testing, we * must loop back here to try again. 
(We don't need to redo triggers, @@ -2356,7 +2333,7 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, redo_act: lockedtid = *tupleid; result = ExecUpdateAct(context, resultRelInfo, tupleid, oldtuple, slot, - canSetTag, &updateCxt); + canSetTag, options, oldSlot, &updateCxt); /* * If ExecUpdateAct reports that a cross-partition update was done, @@ -2407,96 +2384,39 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, case TM_Updated: { - TupleTableSlot *inputslot; TupleTableSlot *epqslot; - TupleTableSlot *oldSlot; if (IsolationUsesXactSnapshot()) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); + Assert(!locked); /* - * Already know that we're going to need to do EPQ, so - * fetch tuple directly into the right slot. + * We need to do EPQ. The latest tuple is already found + * and locked as a result of TABLE_MODIFY_LOCK_UPDATED. */ - inputslot = EvalPlanQualSlot(context->epqstate, resultRelationDesc, - resultRelInfo->ri_RangeTableIndex); + Assert(context->tmfd.traversed); + epqslot = EvalPlanQual(context->epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + oldSlot); + if (TupIsNull(epqslot)) + /* Tuple not passing quals anymore, exiting... */ + return NULL; - result = table_tuple_lock(resultRelationDesc, tupleid, - estate->es_snapshot, - inputslot, estate->es_output_cid, - updateCxt.lockmode, LockWaitBlock, - TUPLE_LOCK_FLAG_FIND_LAST_VERSION, - &context->tmfd); - - switch (result) + if (resultRelInfo->ri_needLockTagTuple) { - case TM_Ok: - Assert(context->tmfd.traversed); - - epqslot = EvalPlanQual(context->epqstate, - resultRelationDesc, - resultRelInfo->ri_RangeTableIndex, - inputslot); - if (TupIsNull(epqslot)) - /* Tuple not passing quals anymore, exiting... */ - return NULL; - - /* Make sure ri_oldTupleSlot is initialized. 
*/ - if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) - ExecInitUpdateProjection(context->mtstate, - resultRelInfo); - - if (resultRelInfo->ri_needLockTagTuple) - { - UnlockTuple(resultRelationDesc, - &lockedtid, InplaceUpdateTupleLock); - LockTuple(resultRelationDesc, - tupleid, InplaceUpdateTupleLock); - } - - /* Fetch the most recent version of old tuple. */ - oldSlot = resultRelInfo->ri_oldTupleSlot; - if (!table_tuple_fetch_row_version(resultRelationDesc, - tupleid, - SnapshotAny, - oldSlot)) - elog(ERROR, "failed to fetch tuple being updated"); - slot = ExecGetUpdateNewTuple(resultRelInfo, - epqslot, oldSlot); - goto redo_act; - - case TM_Deleted: - /* tuple already deleted; nothing to do */ - return NULL; - - case TM_SelfModified: - - /* - * This can be reached when following an update - * chain from a tuple updated by another session, - * reaching a tuple that was already updated in - * this transaction. If previously modified by - * this command, ignore the redundant update, - * otherwise error out. - * - * See also TM_SelfModified response to - * table_tuple_update() above. 
- */ - if (context->tmfd.cmax != estate->es_output_cid) - ereport(ERROR, - (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), - errmsg("tuple to be updated was already modified by an operation triggered by the current command"), - errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); - return NULL; - - default: - /* see table_tuple_lock call in ExecDelete() */ - elog(ERROR, "unexpected table_tuple_lock status: %u", - result); - return NULL; + UnlockTuple(resultRelationDesc, + &lockedtid, InplaceUpdateTupleLock); + LockTuple(resultRelationDesc, + tupleid, InplaceUpdateTupleLock); } + + slot = ExecGetUpdateNewTuple(resultRelInfo, + epqslot, + oldSlot); + goto redo_act; } break; @@ -2520,7 +2440,7 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, (estate->es_processed)++; ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, tupleid, oldtuple, - slot); + slot, oldSlot); /* Process RETURNING if present */ if (resultRelInfo->ri_projectReturning) @@ -2746,7 +2666,8 @@ ExecOnConflictUpdate(ModifyTableContext *context, *returning = ExecUpdate(context, resultRelInfo, conflictTid, NULL, resultRelInfo->ri_onConflict->oc_ProjSlot, - canSetTag); + existing, + canSetTag, true); /* * Clear out existing tuple, as there might not be another conflict among @@ -3046,7 +2967,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, Assert(oldtuple == NULL); result = ExecUpdateAct(context, resultRelInfo, tupleid, - NULL, newslot, canSetTag, + NULL, newslot, canSetTag, TABLE_MODIFY_WAIT, NULL, &updateCxt); /* @@ -3069,7 +2990,8 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (result == TM_Ok) { ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, - tupleid, NULL, newslot); + tupleid, NULL, newslot, + resultRelInfo->ri_oldTupleSlot); mtstate->mt_merge_updated += 1; } break; @@ -3099,13 +3021,13 @@ ExecMergeMatched(ModifyTableContext *context, 
ResultRelInfo *resultRelInfo, Assert(oldtuple == NULL); result = ExecDeleteAct(context, resultRelInfo, tupleid, - false); + false, TABLE_MODIFY_WAIT, NULL); } if (result == TM_Ok) { ExecDeleteEpilogue(context, resultRelInfo, tupleid, NULL, - false); + resultRelInfo->ri_oldTupleSlot, false); mtstate->mt_merge_deleted += 1; } break; @@ -4314,15 +4236,21 @@ ExecModifyTable(PlanState *pstate) /* Now apply the update. */ slot = ExecUpdate(&context, resultRelInfo, tupleid, oldtuple, - slot, node->canSetTag); + slot, resultRelInfo->ri_oldTupleSlot, + node->canSetTag, false); if (tuplock) UnlockTuple(resultRelInfo->ri_RelationDesc, tupleid, InplaceUpdateTupleLock); break; case CMD_DELETE: + /* Initialize slot for DELETE to fetch the old tuple */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitDeleteTupleSlot(node, resultRelInfo); + slot = ExecDelete(&context, resultRelInfo, tupleid, oldtuple, - true, false, node->canSetTag, NULL, NULL, NULL); + resultRelInfo->ri_oldTupleSlot, true, false, + node->canSetTag, NULL, NULL, NULL); break; case CMD_MERGE: diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 65999dd64e1..ad71a220ed9 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -322,19 +322,22 @@ extern void heap_multi_insert(Relation relation, struct TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate); extern TM_Result heap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - struct TM_FailureData *tmfd, bool changingPart); + CommandId cid, Snapshot crosscheck, int options, + struct TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot); extern void heap_finish_speculative(Relation relation, ItemPointer tid); extern void heap_abort_speculative(Relation relation, ItemPointer tid); extern TM_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, + 
CommandId cid, Snapshot crosscheck, int options, struct TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes); -extern TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, - CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_updates, - Buffer *buffer, struct TM_FailureData *tmfd); + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot); +extern TM_Result heap_lock_tuple(Relation relation, ItemPointer tid, + TupleTableSlot *slot, + CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, bool follow_updates, + struct TM_FailureData *tmfd); extern bool heap_inplace_lock(Relation relation, HeapTuple oldtup_ptr, Buffer buffer, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 7be7887b4a8..c2d6972b310 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -268,6 +268,11 @@ typedef struct TM_IndexDeleteOp /* Follow update chain and lock latest version of tuple */ #define TUPLE_LOCK_FLAG_FIND_LAST_VERSION (1 << 1) +/* "options" flag bits for table_tuple_update and table_tuple_delete */ +#define TABLE_MODIFY_WAIT 0x0001 +#define TABLE_MODIFY_FETCH_OLD_TUPLE 0x0002 +#define TABLE_MODIFY_LOCK_UPDATED 0x0004 + /* Typedef for callback function for table_index_build_scan */ typedef void (*IndexBuildCallback) (Relation index, @@ -537,9 +542,10 @@ typedef struct TableAmRoutine CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, + int options, TM_FailureData *tmfd, - bool changingPart); + bool changingPart, + TupleTableSlot *oldSlot); /* see table_tuple_update() for reference about parameters */ TM_Result (*tuple_update) (Relation rel, @@ -548,10 +554,11 @@ typedef struct TableAmRoutine CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, + int options, TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes); + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot); /* see 
table_tuple_lock() for reference about parameters */ TM_Result (*tuple_lock) (Relation rel, @@ -1463,7 +1470,7 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, } /* - * Delete a tuple. + * Delete a tuple (and optionally lock the last tuple version). * * NB: do not call this directly unless prepared to deal with * concurrent-update conditions. Use simple_table_tuple_delete instead. @@ -1474,11 +1481,21 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, * cid - delete command ID (used for visibility test, and stored into * cmax if successful) * crosscheck - if not InvalidSnapshot, also check tuple against this - * wait - true if should wait for any conflicting update to commit/abort + * options: + * If TABLE_MODIFY_WAIT, wait for any conflicting update to commit/abort. + * If TABLE_MODIFY_FETCH_OLD_TUPLE option is given, the existing tuple is + * fetched into oldSlot when the update is successful. + * If TABLE_MODIFY_LOCK_UPDATED option is given and the tuple is + * concurrently updated, then the last tuple version is locked and fetched + * into oldSlot. + * * Output parameters: * tmfd - filled in failure cases (see below) * changingPart - true iff the tuple is being moved to another partition * table due to an update of the partition key. Otherwise, false. + * oldSlot - slot to save the deleted or locked tuple. Can be NULL if none of + * TABLE_MODIFY_FETCH_OLD_TUPLE or TABLE_MODIFY_LOCK_UPDATED options + * is specified. * * Normal, successful return value is TM_Ok, which means we did actually * delete it. 
Failure return codes are TM_SelfModified, TM_Updated, and @@ -1490,16 +1507,18 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, */ static inline TM_Result table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, - Snapshot snapshot, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) + Snapshot snapshot, Snapshot crosscheck, int options, + TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot) { return rel->rd_tableam->tuple_delete(rel, tid, cid, snapshot, crosscheck, - wait, tmfd, changingPart); + options, tmfd, changingPart, + oldSlot); } /* - * Update a tuple. + * Update a tuple (and optionally lock the last tuple version). * * NB: do not call this directly unless you are prepared to deal with * concurrent-update conditions. Use simple_table_tuple_update instead. @@ -1511,13 +1530,23 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, * cid - update command ID (used for visibility test, and stored into * cmax/cmin if successful) * crosscheck - if not InvalidSnapshot, also check old tuple against this - * wait - true if should wait for any conflicting update to commit/abort + * options: + * If TABLE_MODIFY_WAIT, wait for any conflicting update to commit/abort. + * If TABLE_MODIFY_FETCH_OLD_TUPLE option is given, the existing tuple is + * fetched into oldSlot when the update is successful. + * If TABLE_MODIFY_LOCK_UPDATED option is given and the tuple is + * concurrently updated, then the last tuple version is locked and fetched + * into oldSlot. + * * Output parameters: * tmfd - filled in failure cases (see below) * lockmode - filled with lock mode acquired on tuple * update_indexes - in success cases this is set to true if new index entries * are required for this tuple - * + * oldSlot - slot to save the deleted or locked tuple. Can be NULL if none of + * TABLE_MODIFY_FETCH_OLD_TUPLE or TABLE_MODIFY_LOCK_UPDATED options + * is specified. 
+ * Normal, successful return value is TM_Ok, which means we did actually * update it. Failure return codes are TM_SelfModified, TM_Updated, and * TM_BeingModified (the last only possible if wait == false). @@ -1535,13 +1564,15 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, static inline TM_Result table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes) + int options, TM_FailureData *tmfd, LockTupleMode *lockmode, + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot) { return rel->rd_tableam->tuple_update(rel, otid, slot, cid, snapshot, crosscheck, - wait, tmfd, - lockmode, update_indexes); + options, tmfd, + lockmode, update_indexes, + oldSlot); } /* @@ -2055,10 +2086,12 @@ table_scan_sample_next_tuple(TableScanDesc scan, extern void simple_table_tuple_insert(Relation rel, TupleTableSlot *slot); extern void simple_table_tuple_delete(Relation rel, ItemPointer tid, - Snapshot snapshot); + Snapshot snapshot, + TupleTableSlot *oldSlot); extern void simple_table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, Snapshot snapshot, - TU_UpdateIndexes *update_indexes); + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot); /* ---------------------------------------------------------------------------- diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index f9e4dc4f3cd..ca0165e6e03 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -225,8 +225,8 @@ extern bool ExecBRDeleteTriggers(EState *estate, TM_FailureData *tmfd); extern void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *slot, TransitionCaptureState *transition_capture, bool is_crosspart_update); extern bool ExecIRDeleteTriggers(EState *estate, @@ -258,8 +258,8 
@@ extern void ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, ResultRelInfo *dst_partinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *oldslot, TupleTableSlot *newslot, List *recheckIndexes, TransitionCaptureState *transition_capture, From 61adf29cee80206d03f82bc5de4f52a786c7e377 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 22 Mar 2023 16:47:09 -0700 Subject: [PATCH 06/79] Add EvalPlanQual delete returning isolation test Author: Andres Freund Reviewed-by: Pavel Borisov Discussion: https://www.postgresql.org/message-id/flat/CAPpHfdua-YFw3XTprfutzGp28xXLigFtzNbuFY8yPhqeq6X5kg%40mail.gmail.com --- .../isolation/expected/eval-plan-qual-2.out | 37 +++++++++++++++++++ src/test/isolation/isolation_schedule | 1 + .../isolation/specs/eval-plan-qual-2.spec | 30 +++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 src/test/isolation/expected/eval-plan-qual-2.out create mode 100644 src/test/isolation/specs/eval-plan-qual-2.spec diff --git a/src/test/isolation/expected/eval-plan-qual-2.out b/src/test/isolation/expected/eval-plan-qual-2.out new file mode 100644 index 00000000000..117a3d3be8d --- /dev/null +++ b/src/test/isolation/expected/eval-plan-qual-2.out @@ -0,0 +1,37 @@ +Parsed test spec with 3 sessions + +starting permutation: read_u wx2 wb1 c2 c1 read_u read +step read_u: SELECT * FROM accounts; +accountid|balance|balance2 +---------+-------+-------- +checking | 600| 1200 +savings | 600| 1200 +(2 rows) + +step wx2: UPDATE accounts SET balance = balance + 450 WHERE accountid = 'checking' RETURNING balance; +balance +------- + 1050 +(1 row) + +step wb1: DELETE FROM accounts WHERE balance = 600 RETURNING *; +step c2: COMMIT; +step wb1: <... 
completed> +accountid|balance|balance2 +---------+-------+-------- +savings | 600| 1200 +(1 row) + +step c1: COMMIT; +step read_u: SELECT * FROM accounts; +accountid|balance|balance2 +---------+-------+-------- +checking | 1050| 2100 +(1 row) + +step read: SELECT * FROM accounts ORDER BY accountid; +accountid|balance|balance2 +---------+-------+-------- +checking | 1050| 2100 +(1 row) + diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 143109aa4da..f4df2146488 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -36,6 +36,7 @@ test: fk-partitioned-2 test: fk-snapshot test: subxid-overflow test: eval-plan-qual +test: eval-plan-qual-2 test: eval-plan-qual-trigger test: inplace-inval test: intra-grant-inplace diff --git a/src/test/isolation/specs/eval-plan-qual-2.spec b/src/test/isolation/specs/eval-plan-qual-2.spec new file mode 100644 index 00000000000..30447bef24a --- /dev/null +++ b/src/test/isolation/specs/eval-plan-qual-2.spec @@ -0,0 +1,30 @@ +setup +{ + CREATE TABLE accounts (accountid text PRIMARY KEY, balance numeric not null, + balance2 numeric GENERATED ALWAYS AS (balance * 2) STORED); + INSERT INTO accounts VALUES ('checking', 600), ('savings', 600); +} + +teardown +{ + DROP TABLE accounts; +} + +session s1 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step wb1 { DELETE FROM accounts WHERE balance = 600 RETURNING *; } +step c1 { COMMIT; } + +session s2 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step wx2 { UPDATE accounts SET balance = balance + 450 WHERE accountid = 'checking' RETURNING balance; } +step c2 { COMMIT; } + +session s3 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step read { SELECT * FROM accounts ORDER BY accountid; } +step read_u { SELECT * FROM accounts; } + +teardown { COMMIT; } + +permutation read_u wx2 wb1 c2 c1 read_u read From d92b22a9942909aef71708f517530f5a9393d9c5 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 
13 Dec 2021 00:04:21 +0300 Subject: [PATCH 07/79] Improvements to TableAM API --- contrib/amcheck/verify_nbtree.c | 2 +- src/backend/access/common/detoast.c | 20 +- src/backend/access/common/heaptuple.c | 4 + src/backend/access/common/reloptions.c | 6 +- src/backend/access/heap/heapam_handler.c | 343 +++++++++++++++++- src/backend/access/table/tableam.c | 4 +- src/backend/access/table/tableamapi.c | 26 +- src/backend/catalog/aclchk.c | 2 +- src/backend/commands/analyze.c | 14 +- src/backend/commands/tablecmds.c | 58 +-- src/backend/commands/trigger.c | 265 ++++++++++---- src/backend/executor/execExprInterp.c | 4 +- src/backend/executor/execMain.c | 28 +- src/backend/executor/execReplication.c | 10 +- src/backend/executor/nodeLockRows.c | 17 +- src/backend/executor/nodeModifyTable.c | 440 ++++++++--------------- src/backend/executor/nodeTidscan.c | 2 +- src/backend/nodes/read.c | 11 + src/backend/optimizer/plan/planner.c | 16 +- src/backend/optimizer/prep/preptlist.c | 20 +- src/backend/optimizer/util/appendinfo.c | 32 +- src/backend/optimizer/util/inherit.c | 48 ++- src/backend/parser/parse_relation.c | 13 + src/backend/postmaster/autovacuum.c | 4 +- src/backend/rewrite/rewriteHandler.c | 1 + src/backend/utils/adt/ri_triggers.c | 5 +- src/backend/utils/cache/relcache.c | 38 +- src/backend/utils/sort/tuplestore.c | 30 ++ src/include/access/reloptions.h | 2 + src/include/access/sysattr.h | 3 +- src/include/access/tableam.h | 189 ++++++---- src/include/commands/trigger.h | 8 +- src/include/commands/vacuum.h | 3 + src/include/foreign/fdwapi.h | 6 +- src/include/nodes/execnodes.h | 3 + src/include/nodes/parsenodes.h | 1 + src/include/nodes/plannodes.h | 4 +- src/include/nodes/primnodes.h | 7 + src/include/nodes/readfuncs.h | 1 + src/include/optimizer/appendinfo.h | 5 + src/include/optimizer/planner.h | 3 +- src/include/utils/tuplestore.h | 3 + src/include/varatt.h | 2 + 43 files changed, 1146 insertions(+), 557 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c 
b/contrib/amcheck/verify_nbtree.c index 34990c5cea3..ed4497f9620 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -994,7 +994,7 @@ heap_entry_is_visible(BtreeCheckState *state, ItemPointer tid) TupleTableSlot *slot = table_slot_create(state->heaprel, NULL); tid_visible = table_tuple_fetch_row_version(state->heaprel, - tid, state->snapshot, slot); + PointerGetDatum(tid), state->snapshot, slot); if (slot != NULL) ExecDropSingleTupleTableSlot(slot); diff --git a/src/backend/access/common/detoast.c b/src/backend/access/common/detoast.c index d9ab4fb0956..27d0e37607a 100644 --- a/src/backend/access/common/detoast.c +++ b/src/backend/access/common/detoast.c @@ -28,6 +28,8 @@ static struct varlena *toast_fetch_datum_slice(struct varlena *attr, int32 slicelength); static struct varlena *toast_decompress_datum_slice(struct varlena *attr, int32 slicelength); +static ToastFunc o_detoast_func = NULL; + /* ---------- * detoast_external_attr - * @@ -222,7 +224,14 @@ detoast_attr_slice(struct varlena *attr, else if (pg_add_s32_overflow(sliceoffset, slicelength, &slicelimit)) slicelength = slicelimit = -1; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + Assert(o_detoast_func != NULL); + preslice = o_detoast_func(attr); + if (preslice == NULL) + elog(ERROR, "unexpected NULL detoast result"); + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { struct varatt_external toast_pointer; @@ -331,8 +340,6 @@ detoast_attr_slice(struct varlena *attr, return result; } -static ToastFunc o_detoast_func = NULL; - void register_o_detoast_func(ToastFunc func) { @@ -633,7 +640,12 @@ toast_datum_size(Datum value) struct varlena *attr = (struct varlena *) DatumGetPointer(value); Size result; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + OToastExternal *toasted = (OToastExternal*) VARDATA_EXTERNAL(attr); + result = toasted->toasted_size - VARHDRSZ; + } + else if 
(VARATT_IS_EXTERNAL_ONDISK(attr)) { /* * Attribute is stored externally - return the extsize whether diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index 9e3407bf987..a1b8a99b739 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -755,6 +755,10 @@ heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) case TableOidAttributeNumber: result = ObjectIdGetDatum(tup->t_tableOid); break; + case RowIdAttributeNumber: + *isnull = true; + result = 0; + break; default: elog(ERROR, "invalid attnum: %d", attnum); result = 0; /* keep compiler quiet */ diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index c6a2d13be8d..515dd383cbf 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -24,6 +24,7 @@ #include "access/nbtree.h" #include "access/reloptions.h" #include "access/spgist_private.h" +#include "access/tableam.h" #include "catalog/pg_type.h" #include "commands/defrem.h" #include "commands/tablespace.h" @@ -1386,7 +1387,7 @@ untransformRelOptions(Datum options) */ bytea * extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, - amoptions_function amoptions) + const TableAmRoutine *tableam, amoptions_function amoptions) { bytea *options; bool isnull; @@ -1408,7 +1409,8 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, case RELKIND_RELATION: case RELKIND_TOASTVALUE: case RELKIND_MATVIEW: - options = heap_reloptions(classForm->relkind, datum, false); + options = tableam_reloptions(tableam, classForm->relkind, + datum, false); break; case RELKIND_PARTITIONED_TABLE: options = partitioned_table_reloptions(datum, false); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index ed830464aea..2c2c7061189 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -23,6 +23,7 @@ #include 
"access/heapam.h" #include "access/heaptoast.h" #include "access/multixact.h" +#include "access/reloptions.h" #include "access/rewriteheap.h" #include "access/syncscan.h" #include "access/tableam.h" @@ -46,7 +47,7 @@ #include "utils/builtins.h" #include "utils/rel.h" -static TM_Result heapam_tuple_lock(Relation relation, ItemPointer tid, +static TM_Result heapam_tuple_lock(Relation relation, Datum tid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags, @@ -76,6 +77,20 @@ heapam_slot_callbacks(Relation relation) return &TTSOpsBufferHeapTuple; } +static RowRefType +heapam_get_row_ref_type(Relation rel) +{ + return ROW_REF_TID; +} + +static void +heapam_free_rd_amcache(Relation rel) +{ + if (rel->rd_amcache) + pfree(rel->rd_amcache); + rel->rd_amcache = NULL; +} + /* ------------------------------------------------------------------------ * Index Scan Callbacks for heap AM @@ -185,7 +200,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, static bool heapam_fetch_row_version(Relation relation, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot) { @@ -194,7 +209,7 @@ heapam_fetch_row_version(Relation relation, Assert(TTS_IS_BUFFERTUPLE(slot)); - bslot->base.tupdata.t_self = *tid; + bslot->base.tupdata.t_self = *DatumGetItemPointer(tupleid); if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false)) { /* store in slot, transferring existing pin */ @@ -244,7 +259,7 @@ heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, * ---------------------------------------------------------------------------- */ -static void +static TupleTableSlot * heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, int options, BulkInsertState bistate) { @@ -261,6 +276,8 @@ heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, if (shouldFree) pfree(tuple); + + return slot; } static void @@ -303,13 +320,285 @@ 
heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, pfree(tuple); } +/* + * ExecCheckTupleVisible -- verify tuple is visible + * + * It would not be consistent with guarantees of the higher isolation levels to + * proceed with avoiding insertion (taking speculative insertion's alternative + * path) on the basis of another tuple that is not visible to MVCC snapshot. + * Check for the need to raise a serialization failure, and do so as necessary. + */ +static void +ExecCheckTupleVisible(EState *estate, + Relation rel, + TupleTableSlot *slot) +{ + if (!IsolationUsesXactSnapshot()) + return; + + if (!table_tuple_satisfies_snapshot(rel, slot, estate->es_snapshot)) + { + Datum xminDatum; + TransactionId xmin; + bool isnull; + + xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + + /* + * We should not raise a serialization failure if the conflict is + * against a tuple inserted by our own transaction, even if it's not + * visible to our snapshot. (This would happen, for example, if + * conflicting keys are proposed for insertion in a single command.) 
+ */ + if (!TransactionIdIsCurrentTransactionId(xmin)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + } +} + +/* + * ExecCheckTIDVisible -- convenience variant of ExecCheckTupleVisible() + */ +static void +ExecCheckTIDVisible(EState *estate, + Relation rel, + ItemPointer tid, + TupleTableSlot *tempSlot) +{ + /* Redundantly check isolation level */ + if (!IsolationUsesXactSnapshot()) + return; + + if (!table_tuple_fetch_row_version(rel, PointerGetDatum(tid), + SnapshotAny, tempSlot)) + elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT"); + ExecCheckTupleVisible(estate, rel, tempSlot); + ExecClearTuple(tempSlot); +} + +static inline TupleTableSlot * +heapam_tuple_insert_with_arbiter(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + List *arbiterIndexes, + EState *estate, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + TupleTableSlot *tempSlot) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + uint32 specToken; + ItemPointerData conflictTid; + bool specConflict; + List *recheckIndexes = NIL; + + while (true) + { + specConflict = false; + if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate, &conflictTid, + arbiterIndexes)) + { + if (lockedSlot) + { + TM_Result test; + TM_FailureData tmfd; + Datum xminDatum; + TransactionId xmin; + bool isnull; + + /* Determine lock mode to use */ + lockmode = ExecUpdateLockMode(estate, resultRelInfo); + + /* + * Lock tuple for update. Don't follow updates when tuple cannot be + * locked without doing so. A row locking conflict here means our + * previous conclusion that the tuple is conclusively committed is not + * true anymore. + */ + test = table_tuple_lock(rel, PointerGetDatum(&conflictTid), + estate->es_snapshot, + lockedSlot, estate->es_output_cid, + lockmode, LockWaitBlock, 0, + &tmfd); + switch (test) + { + case TM_Ok: + /* success! 
*/ + break; + + case TM_Invisible: + + /* + * This can occur when a just inserted tuple is updated again in + * the same command. E.g. because multiple rows with the same + * conflicting key values are inserted. + * + * This is somewhat similar to the ExecUpdate() TM_SelfModified + * case. We do not want to proceed because it would lead to the + * same row being updated a second time in some unspecified order, + * and in contrast to plain UPDATEs there's no historical behavior + * to break. + * + * It is the user's responsibility to prevent this situation from + * occurring. These problems are why the SQL standard similarly + * specifies that for SQL MERGE, an exception must be raised in + * the event of an attempt to update the same row twice. + */ + xminDatum = slot_getsysattr(lockedSlot, + MinTransactionIdAttributeNumber, + &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + + if (TransactionIdIsCurrentTransactionId(xmin)) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + /* translator: %s is a SQL command name */ + errmsg("%s command cannot affect row a second time", + "ON CONFLICT DO UPDATE"), + errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values."))); + + /* This shouldn't happen */ + elog(ERROR, "attempted to lock invisible tuple"); + break; + + case TM_SelfModified: + + /* + * This state should never be reached. As a dirty snapshot is used + * to find conflicting tuples, speculative insertion wouldn't have + * seen this row to conflict with. 
+ */ + elog(ERROR, "unexpected self-updated tuple"); + break; + + case TM_Updated: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + + /* + * As long as we don't support an UPDATE of INSERT ON CONFLICT for + * a partitioned table we shouldn't reach to a case where tuple to + * be lock is moved to another partition due to concurrent update + * of the partition key. + */ + Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); + + /* + * Tell caller to try again from the very start. + * + * It does not make sense to use the usual EvalPlanQual() style + * loop here, as the new version of the row might not conflict + * anymore, or the conflicting tuple has actually been deleted. + */ + ExecClearTuple(lockedSlot); + return false; + + case TM_Deleted: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent delete"))); + + /* see TM_Updated case */ + Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); + ExecClearTuple(lockedSlot); + return false; + + default: + elog(ERROR, "unrecognized table_tuple_lock status: %u", test); + } + + /* Success, the tuple is locked. */ + + /* + * Verify that the tuple is visible to our MVCC snapshot if the current + * isolation level mandates that. + * + * It's not sufficient to rely on the check within ExecUpdate() as e.g. + * CONFLICT ... WHERE clause may prevent us from reaching that. + * + * This means we only ever continue when a new command in the current + * transaction could see the row, even though in READ COMMITTED mode the + * tuple will not be visible according to the current statement's + * snapshot. This is in line with the way UPDATE deals with newer tuple + * versions. 
+ */ + ExecCheckTupleVisible(estate, rel, lockedSlot); + return NULL; + } + else + { + ExecCheckTIDVisible(estate, rel, &conflictTid, tempSlot); + return NULL; + } + } + + /* + * Before we start insertion proper, acquire our "speculative + * insertion lock". Others can use that to wait for us to decide + * if we're going to go ahead with the insertion, instead of + * waiting for the whole transaction to complete. + */ + specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); + + /* insert the tuple, with the speculative token */ + heapam_tuple_insert_speculative(rel, slot, + estate->es_output_cid, + 0, + NULL, + specToken); + + /* insert index entries for tuple */ + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, false, true, + &specConflict, + arbiterIndexes, + false); + + /* adjust the tuple's state accordingly */ + heapam_tuple_complete_speculative(rel, slot, + specToken, !specConflict); + + /* + * Wake up anyone waiting for our decision. They will re-check + * the tuple, see that it's no longer speculative, and wait on our + * XID as if this was a regularly inserted tuple all along. Or if + * we killed the tuple, they will see it's dead, and proceed as if + * the tuple never existed. + */ + SpeculativeInsertionLockRelease(GetCurrentTransactionId()); + + /* + * If there was a conflict, start from the beginning. We'll do + * the pre-check again, which will now find the conflicting tuple + * (unless it aborts before we get there). 
+ */ + if (specConflict) + { + list_free(recheckIndexes); + CHECK_FOR_INTERRUPTS(); + continue; + } + + return slot; + } +} + static TM_Result -heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, +heapam_tuple_delete(Relation relation, Datum tupleid, CommandId cid, Snapshot snapshot, Snapshot crosscheck, int options, TM_FailureData *tmfd, bool changingPart, TupleTableSlot *oldSlot) { TM_Result result; + ItemPointer tid = DatumGetItemPointer(tupleid); /* * Currently Deleting of index tuples are handled at vacuum, in case if @@ -332,7 +621,7 @@ heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, * heapam_tuple_lock() will take advantage of tuple loaded into * oldSlot by heap_delete(). */ - result = heapam_tuple_lock(relation, tid, snapshot, + result = heapam_tuple_lock(relation, tupleid, snapshot, oldSlot, cid, LockTupleExclusive, (options & TABLE_MODIFY_WAIT) ? LockWaitBlock : @@ -349,7 +638,7 @@ heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, static TM_Result -heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, +heapam_tuple_update(Relation relation, Datum tupleid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, int options, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes, @@ -358,6 +647,7 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, bool shouldFree = true; HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); TM_Result result; + ItemPointer otid = DatumGetItemPointer(tupleid); /* Update the tuple with table oid */ slot->tts_tableOid = RelationGetRelid(relation); @@ -404,7 +694,7 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, * heapam_tuple_lock() will take advantage of tuple loaded into * oldSlot by heap_update(). 
*/ - result = heapam_tuple_lock(relation, otid, snapshot, + result = heapam_tuple_lock(relation, tupleid, snapshot, oldSlot, cid, *lockmode, (options & TABLE_MODIFY_WAIT) ? LockWaitBlock : @@ -420,7 +710,7 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, } static TM_Result -heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, +heapam_tuple_lock(Relation relation, Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags, TM_FailureData *tmfd) @@ -428,6 +718,7 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; TM_Result result; HeapTuple tuple = &bslot->base.tupdata; + ItemPointer tid = DatumGetItemPointer(tupleid); bool follow_updates; follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0; @@ -2641,6 +2932,29 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, } } +static bool +heapam_tuple_is_current(Relation rel, TupleTableSlot *slot) +{ + Datum xminDatum; + TransactionId xmin; + bool isnull; + + xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + return TransactionIdIsCurrentTransactionId(xmin); +} + +static bytea * +heapam_reloptions(char relkind, Datum reloptions, bool validate) +{ + if (relkind == RELKIND_RELATION || + relkind == RELKIND_TOASTVALUE || + relkind == RELKIND_MATVIEW) + return heap_reloptions(relkind, reloptions, validate); + + return NULL; +} /* ------------------------------------------------------------------------ * Definition of the heap table access method. 
@@ -2651,6 +2965,8 @@ static const TableAmRoutine heapam_methods = { .type = T_TableAmRoutine, .slot_callbacks = heapam_slot_callbacks, + .get_row_ref_type = heapam_get_row_ref_type, + .free_rd_amcache = heapam_free_rd_amcache, .scan_begin = heap_beginscan, .scan_end = heap_endscan, @@ -2670,8 +2986,7 @@ static const TableAmRoutine heapam_methods = { .index_fetch_tuple = heapam_index_fetch_tuple, .tuple_insert = heapam_tuple_insert, - .tuple_insert_speculative = heapam_tuple_insert_speculative, - .tuple_complete_speculative = heapam_tuple_complete_speculative, + .tuple_insert_with_arbiter = heapam_tuple_insert_with_arbiter, .multi_insert = heap_multi_insert, .tuple_delete = heapam_tuple_delete, .tuple_update = heapam_tuple_update, @@ -2703,7 +3018,11 @@ static const TableAmRoutine heapam_methods = { .scan_bitmap_next_block = heapam_scan_bitmap_next_block, .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple, .scan_sample_next_block = heapam_scan_sample_next_block, - .scan_sample_next_tuple = heapam_scan_sample_next_tuple + .scan_sample_next_tuple = heapam_scan_sample_next_tuple, + + .tuple_is_current = heapam_tuple_is_current, + + .reloptions = heapam_reloptions }; diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 4445af3a44e..0c02e428e4e 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -299,7 +299,7 @@ simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, if (oldSlot) options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_delete(rel, tid, + result = table_tuple_delete(rel, PointerGetDatum(tid), GetCurrentCommandId(true), snapshot, InvalidSnapshot, options, @@ -355,7 +355,7 @@ simple_table_tuple_update(Relation rel, ItemPointer otid, if (oldSlot) options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_update(rel, otid, slot, + result = table_tuple_update(rel, PointerGetDatum(otid), slot, GetCurrentCommandId(true), snapshot, InvalidSnapshot, 
options, diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index e9b598256fb..cd01bd9934f 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -13,10 +13,11 @@ #include "access/tableam.h" #include "access/xact.h" +#include "catalog/pg_am.h" #include "commands/defrem.h" #include "miscadmin.h" #include "utils/guc_hooks.h" - +#include "utils/syscache.h" /* * GetTableAmRoutine @@ -68,8 +69,7 @@ GetTableAmRoutine(Oid amhandler) * Could be made optional, but would require throwing error during * parse-analysis. */ - Assert(routine->tuple_insert_speculative != NULL); - Assert(routine->tuple_complete_speculative != NULL); + Assert(routine->tuple_insert_with_arbiter != NULL); Assert(routine->multi_insert != NULL); Assert(routine->tuple_delete != NULL); @@ -97,9 +97,29 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->scan_sample_next_block != NULL); Assert(routine->scan_sample_next_tuple != NULL); + Assert(routine->tuple_is_current != NULL); + return routine; } +const TableAmRoutine * +GetTableAmRoutineByAmOid(Oid amoid) +{ + HeapTuple ht_am; + Form_pg_am amrec; + const TableAmRoutine *tableam = NULL; + + ht_am = SearchSysCache1(AMOID, ObjectIdGetDatum(amoid)); + if (!HeapTupleIsValid(ht_am)) + elog(ERROR, "cache lookup failed for access method %u", + amoid); + amrec = (Form_pg_am)GETSTRUCT(ht_am); + + tableam = GetTableAmRoutine(amrec->amhandler); + ReleaseSysCache(ht_am); + return tableam; +} + /* check_hook: validate new default_table_access_method */ bool check_default_table_access_method(char **newval, void **extra, GucSource source) diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c index bc0e259f69f..4dd03c48fc7 100644 --- a/src/backend/catalog/aclchk.c +++ b/src/backend/catalog/aclchk.c @@ -1639,7 +1639,7 @@ expand_all_col_privileges(Oid table_oid, Form_pg_class classForm, AttrNumber curr_att; Assert(classForm->relnatts - FirstLowInvalidHeapAttributeNumber < 
num_col_privileges); - for (curr_att = FirstLowInvalidHeapAttributeNumber + 1; + for (curr_att = FirstLowInvalidHeapAttributeNumber + 2; curr_att <= classForm->relnatts; curr_att++) { diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index c590a2adc35..f63faedfcfb 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -87,9 +87,6 @@ static void compute_index_stats(Relation onerel, double totalrows, MemoryContext col_context); static VacAttrStats *examine_attribute(Relation onerel, int attnum, Node *index_expr); -static int acquire_sample_rows(Relation onerel, int elevel, - HeapTuple *rows, int targrows, - double *totalrows, double *totaldeadrows); static int compare_rows(const void *a, const void *b, void *arg); static int acquire_inherited_sample_rows(Relation onerel, int elevel, HeapTuple *rows, int targrows, @@ -190,10 +187,7 @@ analyze_rel(Oid relid, RangeVar *relation, if (onerel->rd_rel->relkind == RELKIND_RELATION || onerel->rd_rel->relkind == RELKIND_MATVIEW) { - /* Regular table, so we'll use the regular row acquisition function */ - acquirefunc = acquire_sample_rows; - /* Also get regular table's size */ - relpages = RelationGetNumberOfBlocks(onerel); + table_analyze(onerel, &acquirefunc, &relpages); } else if (onerel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) { @@ -1154,7 +1148,7 @@ block_sampling_read_stream_next(ReadStream *stream, * block. The previous sampling method put too much credence in the row * density near the start of the table. 
*/ -static int +int acquire_sample_rows(Relation onerel, int elevel, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows) @@ -1421,9 +1415,7 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, if (childrel->rd_rel->relkind == RELKIND_RELATION || childrel->rd_rel->relkind == RELKIND_MATVIEW) { - /* Regular table, so use the regular row acquisition function */ - acquirefunc = acquire_sample_rows; - relpages = RelationGetNumberOfBlocks(childrel); + table_analyze(childrel, &acquirefunc, &relpages); } else if (childrel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) { diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index fb64730a7e1..76912a87b8c 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -717,6 +717,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, ObjectAddress address; LOCKMODE parentLockmode; Oid accessMethodId = InvalidOid; + const TableAmRoutine *tableam = NULL; /* * Truncate relname to appropriate length (probably a waste of time, as @@ -852,6 +853,29 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, if (!OidIsValid(ownerId)) ownerId = GetUserId(); + + /* + * For relations with table AM and partitioned tables, select access + * method to use: an explicitly indicated one, or (in the case of a + * partitioned table) the parent's, if it has one. + */ + if (stmt->accessMethod != NULL) + { + Assert(RELKIND_HAS_TABLE_AM(relkind) || relkind == RELKIND_PARTITIONED_TABLE); + accessMethodId = get_table_am_oid(stmt->accessMethod, false); + } + else if (RELKIND_HAS_TABLE_AM(relkind) || relkind == RELKIND_PARTITIONED_TABLE) + { + if (stmt->partbound) + { + Assert(list_length(inheritOids) == 1); + accessMethodId = get_rel_relam(linitial_oid(inheritOids)); + } + + if (RELKIND_HAS_TABLE_AM(relkind) && !OidIsValid(accessMethodId)) + accessMethodId = get_table_am_oid(default_table_access_method, false); + } + /* * Parse and validate reloptions, if any. 
*/ @@ -860,6 +884,12 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, switch (relkind) { + case RELKIND_RELATION: + case RELKIND_TOASTVALUE: + case RELKIND_MATVIEW: + tableam = GetTableAmRoutineByAmOid(accessMethodId); + (void) tableam_reloptions(tableam, relkind, reloptions, true); + break; case RELKIND_VIEW: (void) view_reloptions(reloptions, true); break; @@ -868,6 +898,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, break; default: (void) heap_reloptions(relkind, reloptions, true); + break; } if (stmt->ofTypename) @@ -958,28 +989,6 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, } } - /* - * For relations with table AM and partitioned tables, select access - * method to use: an explicitly indicated one, or (in the case of a - * partitioned table) the parent's, if it has one. - */ - if (stmt->accessMethod != NULL) - { - Assert(RELKIND_HAS_TABLE_AM(relkind) || relkind == RELKIND_PARTITIONED_TABLE); - accessMethodId = get_table_am_oid(stmt->accessMethod, false); - } - else if (RELKIND_HAS_TABLE_AM(relkind) || relkind == RELKIND_PARTITIONED_TABLE) - { - if (stmt->partbound) - { - Assert(list_length(inheritOids) == 1); - accessMethodId = get_rel_relam(linitial_oid(inheritOids)); - } - - if (RELKIND_HAS_TABLE_AM(relkind) && !OidIsValid(accessMethodId)) - accessMethodId = get_table_am_oid(default_table_access_method, false); - } - /* * Create the relation. 
Inherited defaults and constraints are passed in * for immediate handling --- since they don't need parsing, they can be @@ -6327,8 +6336,10 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) /* Write the tuple out to the new relation */ if (newrel) + { table_tuple_insert(newrel, insertslot, mycid, ti_options, bistate); + } ResetExprContext(econtext); @@ -15099,7 +15110,8 @@ ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, case RELKIND_RELATION: case RELKIND_TOASTVALUE: case RELKIND_MATVIEW: - (void) heap_reloptions(rel->rd_rel->relkind, newOptions, true); + (void) table_reloptions(rel, rel->rd_rel->relkind, + newOptions, true); break; case RELKIND_PARTITIONED_TABLE: (void) partitioned_table_reloptions(newOptions, true); diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 2a4b40422e3..afea7b45e34 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -76,7 +76,7 @@ static void SetTriggerFlags(TriggerDesc *trigdesc, Trigger *trigger); static bool GetTupleForTrigger(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tid, + Datum tupleid, LockTupleMode lockmode, TupleTableSlot *oldslot, bool do_epq_recheck, @@ -2692,13 +2692,13 @@ ExecASDeleteTriggers(EState *estate, ResultRelInfo *relinfo, */ bool ExecBRDeleteTriggersNew(EState *estate, EPQState *epqstate, - ResultRelInfo *relinfo, - ItemPointer tupleid, - HeapTuple fdw_trigtuple, - TupleTableSlot **epqslot, - TM_Result *tmresult, - TM_FailureData *tmfd, - bool is_merge_delete) + ResultRelInfo *relinfo, + Datum tupleid, + HeapTuple fdw_trigtuple, + TupleTableSlot **epqslot, + TM_Result *tmresult, + TM_FailureData *tmfd, + bool is_merge_delete) { TupleTableSlot *slot = ExecGetTriggerOldSlot(estate, relinfo); TriggerDesc *trigdesc = relinfo->ri_TrigDesc; @@ -2708,7 +2708,7 @@ ExecBRDeleteTriggersNew(EState *estate, EPQState *epqstate, bool should_free = false; int i; - 
Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); + Assert(HeapTupleIsValid(fdw_trigtuple) ^ (DatumGetPointer(tupleid) != NULL)); if (fdw_trigtuple == NULL) { TupleTableSlot *epqslot_candidate = NULL; @@ -2793,7 +2793,7 @@ ExecBRDeleteTriggersNew(EState *estate, EPQState *epqstate, bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, @@ -2970,13 +2970,13 @@ ExecASUpdateTriggers(EState *estate, ResultRelInfo *relinfo, bool ExecBRUpdateTriggersNew(EState *estate, EPQState *epqstate, - ResultRelInfo *relinfo, - ItemPointer tupleid, - HeapTuple fdw_trigtuple, - TupleTableSlot *newslot, - TM_Result *tmresult, - TM_FailureData *tmfd, - bool is_merge_update) + ResultRelInfo *relinfo, + Datum tupleid, + HeapTuple fdw_trigtuple, + TupleTableSlot *newslot, + TM_Result *tmresult, + TM_FailureData *tmfd, + bool is_merge_update) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; TupleTableSlot *oldslot = ExecGetTriggerOldSlot(estate, relinfo); @@ -2992,7 +2992,7 @@ ExecBRUpdateTriggersNew(EState *estate, EPQState *epqstate, /* Determine lock mode to use */ lockmode = ExecUpdateLockMode(estate, relinfo); - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); + Assert(HeapTupleIsValid(fdw_trigtuple) ^ (DatumGetPointer(tupleid) != NULL)); if (fdw_trigtuple == NULL) { TupleTableSlot *epqslot_candidate = NULL; @@ -3136,7 +3136,7 @@ ExecBRUpdateTriggersNew(EState *estate, EPQState *epqstate, bool ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, @@ -3344,7 +3344,7 @@ static bool GetTupleForTrigger(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tid, + Datum tupleid, LockTupleMode lockmode, TupleTableSlot *oldslot, bool do_epq_recheck, @@ 
-3370,7 +3370,9 @@ GetTupleForTrigger(EState *estate, */ if (!IsolationUsesXactSnapshot()) lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION; - test = table_tuple_lock(relation, tid, estate->es_snapshot, oldslot, + + test = table_tuple_lock(relation, tupleid, + estate->es_snapshot, oldslot, estate->es_output_cid, lockmode, LockWaitBlock, lockflags, @@ -3467,8 +3469,8 @@ GetTupleForTrigger(EState *estate, * We expect the tuple to be present, thus very simple error handling * suffices. */ - if (!table_tuple_fetch_row_version(relation, tid, SnapshotAny, - oldslot)) + if (!table_tuple_fetch_row_version(relation, tupleid, + SnapshotAny, oldslot)) elog(ERROR, "failed to fetch tuple for trigger"); } @@ -3674,18 +3676,22 @@ typedef SetConstraintStateData *SetConstraintState; * cycles. So we need only ensure that ats_firing_id is zero when attaching * a new event to an existing AfterTriggerSharedData record. */ -typedef uint32 TriggerFlags; +typedef uint64 TriggerFlags; -#define AFTER_TRIGGER_OFFSET 0x07FFFFFF /* must be low-order bits */ -#define AFTER_TRIGGER_DONE 0x80000000 -#define AFTER_TRIGGER_IN_PROGRESS 0x40000000 +#define AFTER_TRIGGER_SIZE UINT64CONST(0xFFFF000000000) /* must be low-order bits */ +#define AFTER_TRIGGER_SIZE_SHIFT (36) +#define AFTER_TRIGGER_OFFSET UINT64CONST(0x000000FFFFFFF) /* must be low-order bits */ +#define AFTER_TRIGGER_DONE UINT64CONST(0x0000800000000) +#define AFTER_TRIGGER_IN_PROGRESS UINT64CONST(0x0000400000000) /* bits describing the size and tuple sources of this event */ -#define AFTER_TRIGGER_FDW_REUSE 0x00000000 -#define AFTER_TRIGGER_FDW_FETCH 0x20000000 -#define AFTER_TRIGGER_1CTID 0x10000000 -#define AFTER_TRIGGER_2CTID 0x30000000 -#define AFTER_TRIGGER_CP_UPDATE 0x08000000 -#define AFTER_TRIGGER_TUP_BITS 0x38000000 +#define AFTER_TRIGGER_FDW_REUSE UINT64CONST(0x0000000000000) +#define AFTER_TRIGGER_FDW_FETCH UINT64CONST(0x0000200000000) +#define AFTER_TRIGGER_1CTID UINT64CONST(0x0000100000000) +#define AFTER_TRIGGER_ROWID1 
UINT64CONST(0x0000010000000) +#define AFTER_TRIGGER_2CTID UINT64CONST(0x0000300000000) +#define AFTER_TRIGGER_ROWID2 UINT64CONST(0x0000020000000) +#define AFTER_TRIGGER_CP_UPDATE UINT64CONST(0x0000080000000) +#define AFTER_TRIGGER_TUP_BITS UINT64CONST(0x0000380000000) typedef struct AfterTriggerSharedData *AfterTriggerShared; typedef struct AfterTriggerSharedData @@ -3737,6 +3743,9 @@ typedef struct AfterTriggerEventDataZeroCtids } AfterTriggerEventDataZeroCtids; #define SizeofTriggerEvent(evt) \ + (((evt)->ate_flags & AFTER_TRIGGER_SIZE) >> AFTER_TRIGGER_SIZE_SHIFT) + +#define BasicSizeofTriggerEvent(evt) \ (((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_CP_UPDATE ? \ sizeof(AfterTriggerEventData) : \ (((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_2CTID ? \ @@ -4082,14 +4091,34 @@ afterTriggerCopyBitmap(Bitmapset *src) */ static void afterTriggerAddEvent(AfterTriggerEventList *events, - AfterTriggerEvent event, AfterTriggerShared evtshared) + AfterTriggerEvent event, AfterTriggerShared evtshared, + bytea *rowid1, bytea *rowid2) { - Size eventsize = SizeofTriggerEvent(event); - Size needed = eventsize + sizeof(AfterTriggerSharedData); + Size basiceventsize = MAXALIGN(BasicSizeofTriggerEvent(event)); + Size eventsize; + Size needed; AfterTriggerEventChunk *chunk; AfterTriggerShared newshared; AfterTriggerEvent newevent; + if (SizeofTriggerEvent(event) == 0) + { + eventsize = basiceventsize; + if (event->ate_flags & AFTER_TRIGGER_ROWID1) + eventsize += MAXALIGN(VARSIZE(rowid1)); + + if (event->ate_flags & AFTER_TRIGGER_ROWID2) + eventsize += MAXALIGN(VARSIZE(rowid2)); + + event->ate_flags |= eventsize << AFTER_TRIGGER_SIZE_SHIFT; + } + else + { + eventsize = SizeofTriggerEvent(event); + } + + needed = eventsize + sizeof(AfterTriggerSharedData); + /* * If empty list or not enough room in the tail chunk, make a new chunk. * We assume here that a new shared record will always be needed. 
@@ -4122,7 +4151,7 @@ afterTriggerAddEvent(AfterTriggerEventList *events, * sizes used should be MAXALIGN multiples, to ensure that the shared * records will be aligned safely. */ -#define MIN_CHUNK_SIZE 1024 +#define MIN_CHUNK_SIZE (1024*4) #define MAX_CHUNK_SIZE (1024*1024) #if MAX_CHUNK_SIZE > (AFTER_TRIGGER_OFFSET+1) @@ -4141,6 +4170,7 @@ afterTriggerAddEvent(AfterTriggerEventList *events, chunksize *= 2; /* okay, double it */ else chunksize /= 2; /* too many shared records */ + chunksize = Max(chunksize, MIN_CHUNK_SIZE); chunksize = Min(chunksize, MAX_CHUNK_SIZE); } chunk = MemoryContextAlloc(afterTriggers.event_cxt, chunksize); @@ -4186,7 +4216,26 @@ afterTriggerAddEvent(AfterTriggerEventList *events, /* Insert the data */ newevent = (AfterTriggerEvent) chunk->freeptr; - memcpy(newevent, event, eventsize); + if (!rowid1 && !rowid2) + { + memcpy(newevent, event, eventsize); + } + else + { + Pointer ptr = chunk->freeptr; + + memcpy(newevent, event, basiceventsize); + ptr += basiceventsize; + + if (event->ate_flags & AFTER_TRIGGER_ROWID1) + { + memcpy(ptr, rowid1, MAXALIGN(VARSIZE(rowid1))); + ptr += MAXALIGN(VARSIZE(rowid1)); + } + + if (event->ate_flags & AFTER_TRIGGER_ROWID2) + memcpy(ptr, rowid2, MAXALIGN(VARSIZE(rowid2))); + } /* ... and link the new event to its shared record */ newevent->ate_flags &= ~AFTER_TRIGGER_OFFSET; newevent->ate_flags |= (char *) newshared - (char *) newevent; @@ -4346,6 +4395,7 @@ AfterTriggerExecute(EState *estate, int tgindx; bool should_free_trig = false; bool should_free_new = false; + Pointer ptr; /* * Locate trigger in trigdesc. 
It might not be present, and in fact the @@ -4381,15 +4431,17 @@ AfterTriggerExecute(EState *estate, { Tuplestorestate *fdw_tuplestore = GetCurrentFDWTuplestore(); - if (!tuplestore_gettupleslot(fdw_tuplestore, true, false, - trig_tuple_slot1)) + if (!tuplestore_force_gettupleslot(fdw_tuplestore, true, false, + trig_tuple_slot1)) elog(ERROR, "failed to fetch tuple1 for AFTER trigger"); if ((evtshared->ats_event & TRIGGER_EVENT_OPMASK) == TRIGGER_EVENT_UPDATE && - !tuplestore_gettupleslot(fdw_tuplestore, true, false, - trig_tuple_slot2)) + !tuplestore_force_gettupleslot(fdw_tuplestore, true, false, + trig_tuple_slot2)) elog(ERROR, "failed to fetch tuple2 for AFTER trigger"); + trig_tuple_slot1->tts_tid = event->ate_ctid1; + trig_tuple_slot2->tts_tid = event->ate_ctid2; } /* fall through */ case AFTER_TRIGGER_FDW_REUSE: @@ -4421,13 +4473,26 @@ AfterTriggerExecute(EState *estate, break; default: - if (ItemPointerIsValid(&(event->ate_ctid1))) + ptr = (Pointer) event + MAXALIGN(BasicSizeofTriggerEvent(event)); + if (ItemPointerIsValid(&(event->ate_ctid1)) || + (event->ate_flags & AFTER_TRIGGER_ROWID1)) { + Datum tupleid; + TupleTableSlot *src_slot = ExecGetTriggerOldSlot(estate, src_relInfo); - if (!table_tuple_fetch_row_version(src_rel, - &(event->ate_ctid1), + if (event->ate_flags & AFTER_TRIGGER_ROWID1) + { + tupleid = PointerGetDatum(ptr); + ptr += MAXALIGN(VARSIZE(ptr)); + } + else + { + tupleid = PointerGetDatum(&(event->ate_ctid1)); + } + + if (!table_tuple_fetch_row_version(src_rel, tupleid, SnapshotAny, src_slot)) elog(ERROR, "failed to fetch tuple1 for AFTER trigger"); @@ -4463,13 +4528,23 @@ AfterTriggerExecute(EState *estate, /* don't touch ctid2 if not there */ if (((event->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_2CTID || (event->ate_flags & AFTER_TRIGGER_CP_UPDATE)) && - ItemPointerIsValid(&(event->ate_ctid2))) + (ItemPointerIsValid(&(event->ate_ctid2)) || + (event->ate_flags & AFTER_TRIGGER_ROWID2))) { + Datum tupleid; + TupleTableSlot 
*dst_slot = ExecGetTriggerNewSlot(estate, dst_relInfo); - if (!table_tuple_fetch_row_version(dst_rel, - &(event->ate_ctid2), + if (event->ate_flags & AFTER_TRIGGER_ROWID2) + { + tupleid = PointerGetDatum(ptr); + } + else + { + tupleid = PointerGetDatum(&(event->ate_ctid2)); + } + if (!table_tuple_fetch_row_version(dst_rel, tupleid, SnapshotAny, dst_slot)) elog(ERROR, "failed to fetch tuple2 for AFTER trigger"); @@ -4643,7 +4718,7 @@ afterTriggerMarkEvents(AfterTriggerEventList *events, { deferred_found = true; /* add it to move_list */ - afterTriggerAddEvent(move_list, event, evtshared); + afterTriggerAddEvent(move_list, event, evtshared, NULL, NULL); /* mark original copy "done" so we don't do it again */ event->ate_flags |= AFTER_TRIGGER_DONE; } @@ -4747,6 +4822,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, /* caution: trigdesc could be NULL here */ finfo = rInfo->ri_TrigFunctions; instr = rInfo->ri_TrigInstrument; + if (slot1 != NULL) { ExecDropSingleTupleTableSlot(slot1); @@ -6136,6 +6212,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, int tgtype_level; int i; Tuplestorestate *fdw_tuplestore = NULL; + bytea *rowId1 = NULL; + bytea *rowId2 = NULL; /* * Check state. We use a normal test not Assert because it is possible to @@ -6229,6 +6307,21 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, * if so. This preserves the behavior that statement-level triggers fire * just once per statement and fire after row-level triggers. 
*/ + + /* Determine flags */ + if (!(relkind == RELKIND_FOREIGN_TABLE && row_trigger)) + { + if (row_trigger && event == TRIGGER_EVENT_UPDATE) + { + if (relkind == RELKIND_PARTITIONED_TABLE) + new_event.ate_flags = AFTER_TRIGGER_CP_UPDATE; + else + new_event.ate_flags = AFTER_TRIGGER_2CTID; + } + else + new_event.ate_flags = AFTER_TRIGGER_1CTID; + } + switch (event) { case TRIGGER_EVENT_INSERT: @@ -6239,6 +6332,13 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, Assert(newslot != NULL); ItemPointerCopy(&(newslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerSetInvalid(&(new_event.ate_ctid2)); + if (table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + bool isnull; + rowId1 = DatumGetByteaP(slot_getsysattr(newslot, RowIdAttributeNumber, &isnull)); + new_event.ate_flags |= AFTER_TRIGGER_ROWID1; + Assert(!isnull); + } } else { @@ -6258,6 +6358,13 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, Assert(newslot == NULL); ItemPointerCopy(&(oldslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerSetInvalid(&(new_event.ate_ctid2)); + if (table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + bool isnull; + rowId1 = DatumGetByteaP(slot_getsysattr(oldslot, RowIdAttributeNumber, &isnull)); + new_event.ate_flags |= AFTER_TRIGGER_ROWID1; + Assert(!isnull); + } } else { @@ -6273,10 +6380,54 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, tgtype_event = TRIGGER_TYPE_UPDATE; if (row_trigger) { + bool src_rowid = false, + dst_rowid = false; Assert(oldslot != NULL); Assert(newslot != NULL); ItemPointerCopy(&(oldslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerCopy(&(newslot->tts_tid), &(new_event.ate_ctid2)); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + Relation src_rel = src_partinfo->ri_RelationDesc; + Relation dst_rel = dst_partinfo->ri_RelationDesc; + + src_rowid = table_get_row_ref_type(src_rel) == + ROW_REF_ROWID; + dst_rowid = table_get_row_ref_type(dst_rel) == + ROW_REF_ROWID; + } + else + { + if 
(table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + src_rowid = true; + dst_rowid = true; + } + } + + if (src_rowid) + { + Datum val; + bool isnull; + val = slot_getsysattr(oldslot, + RowIdAttributeNumber, + &isnull); + rowId1 = DatumGetByteaP(val); + Assert(!isnull); + new_event.ate_flags |= AFTER_TRIGGER_ROWID1; + } + + if (dst_rowid) + { + Datum val; + bool isnull; + val = slot_getsysattr(newslot, + RowIdAttributeNumber, + &isnull); + rowId2 = DatumGetByteaP(val); + Assert(!isnull); + new_event.ate_flags |= AFTER_TRIGGER_ROWID2; + } /* * Also remember the OIDs of partitions to fetch these tuples @@ -6314,20 +6465,6 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, break; } - /* Determine flags */ - if (!(relkind == RELKIND_FOREIGN_TABLE && row_trigger)) - { - if (row_trigger && event == TRIGGER_EVENT_UPDATE) - { - if (relkind == RELKIND_PARTITIONED_TABLE) - new_event.ate_flags = AFTER_TRIGGER_CP_UPDATE; - else - new_event.ate_flags = AFTER_TRIGGER_2CTID; - } - else - new_event.ate_flags = AFTER_TRIGGER_1CTID; - } - /* else, we'll initialize ate_flags for each trigger */ tgtype_level = (row_trigger ? 
TRIGGER_TYPE_ROW : TRIGGER_TYPE_STATEMENT); @@ -6493,7 +6630,7 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, new_shared.ats_modifiedcols = modifiedCols; afterTriggerAddEvent(&afterTriggers.query_stack[afterTriggers.query_depth].events, - &new_event, &new_shared); + &new_event, &new_shared, rowId1, rowId2); } /* diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 366975dad68..a70d6e25f9b 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -5005,7 +5005,9 @@ ExecEvalSysVar(ExprState *state, ExprEvalStep *op, ExprContext *econtext, op->resnull); *op->resvalue = d; /* this ought to be unreachable, but it's cheap enough to check */ - if (unlikely(*op->resnull)) + if (op->d.var.attnum != RowIdAttributeNumber && + op->d.var.attnum != SelfItemPointerAttributeNumber && + unlikely(*op->resnull)) elog(ERROR, "failed to fetch attribute from slot"); } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index e35ddd0e898..6f5a572be62 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -860,13 +860,15 @@ InitPlan(QueryDesc *queryDesc, int eflags) Oid relid; Relation relation; ExecRowMark *erm; + RangeTblEntry *rangeEntry; /* ignore "parent" rowmarks; they are irrelevant at runtime */ if (rc->isParent) continue; /* get relation's OID (will produce InvalidOid if subquery) */ - relid = exec_rt_fetch(rc->rti, estate)->relid; + rangeEntry = exec_rt_fetch(rc->rti, estate); + relid = rangeEntry->relid; /* open relation, if we need to access it for this mark type */ switch (rc->markType) @@ -899,6 +901,10 @@ InitPlan(QueryDesc *queryDesc, int eflags) erm->prti = rc->prti; erm->rowmarkId = rc->rowmarkId; erm->markType = rc->markType; + if (erm->markType == ROW_MARK_COPY) + erm->refType = ROW_REF_COPY; + else + erm->refType = rangeEntry->reftype; erm->strength = rc->strength; erm->waitPolicy = rc->waitPolicy; 
erm->ermActive = false; @@ -1270,6 +1276,8 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_ChildToRootMap = NULL; resultRelInfo->ri_ChildToRootMapValid = false; resultRelInfo->ri_CopyMultiInsertBuffer = NULL; + + resultRelInfo->ri_RowRefType = table_get_row_ref_type(resultRelationDesc); } /* @@ -2407,17 +2415,28 @@ ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist) aerm->rowmark = erm; /* Look up the resjunk columns associated with this rowmark */ - if (erm->markType != ROW_MARK_COPY) + if (erm->refType == ROW_REF_TID) { + Assert(erm->markType != ROW_MARK_COPY); /* need ctid for all methods other than COPY */ snprintf(resname, sizeof(resname), "ctid%u", erm->rowmarkId); aerm->ctidAttNo = ExecFindJunkAttributeInTlist(targetlist, resname); if (!AttributeNumberIsValid(aerm->ctidAttNo)) elog(ERROR, "could not find junk %s column", resname); + } else if (erm->refType == ROW_REF_ROWID) + { + Assert(erm->markType != ROW_MARK_COPY); + /* need ctid for all methods other than COPY */ + snprintf(resname, sizeof(resname), "rowid%u", erm->rowmarkId); + aerm->ctidAttNo = ExecFindJunkAttributeInTlist(targetlist, + resname); + if (!AttributeNumberIsValid(aerm->ctidAttNo)) + elog(ERROR, "could not find junk %s column", resname); } else { + Assert(erm->markType == ROW_MARK_COPY); /* need wholerow if COPY */ snprintf(resname, sizeof(resname), "wholerow%u", erm->rowmarkId); aerm->wholeAttNo = ExecFindJunkAttributeInTlist(targetlist, @@ -2707,8 +2726,9 @@ EvalPlanQualFetchRowMark(EPQState *epqstate, Index rti, TupleTableSlot *slot) { /* ordinary table, fetch the tuple */ if (!table_tuple_fetch_row_version(erm->relation, - (ItemPointer) DatumGetPointer(datum), - SnapshotAny, slot)) + datum, + SnapshotAny, + slot)) elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); return true; } diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 0133b2a8538..5c2a03d257a 100644 --- 
a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -250,7 +250,8 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, &(outslot->tts_tid), GetActiveSnapshot(), + res = table_tuple_lock(rel, PointerGetDatum(&(outslot->tts_tid)), + GetActiveSnapshot(), outslot, GetCurrentCommandId(false), lockmode, @@ -434,7 +435,8 @@ RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, &(outslot->tts_tid), GetActiveSnapshot(), + res = table_tuple_lock(rel, PointerGetDatum(&(outslot->tts_tid)), + GetActiveSnapshot(), outslot, GetCurrentCommandId(false), lockmode, @@ -573,7 +575,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_update_before_row) { if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, - tid, NULL, slot, NULL, NULL)) + PointerGetDatum(tid), NULL, slot, NULL, NULL)) skip_tuple = true; /* "do nothing" */ } @@ -640,7 +642,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_delete_before_row) { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - tid, NULL, NULL, NULL, NULL); + PointerGetDatum(tid), NULL, NULL, NULL, NULL); } if (!skip_tuple) diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c index 41754ddfea9..ac401d7a470 100644 --- a/src/backend/executor/nodeLockRows.c +++ b/src/backend/executor/nodeLockRows.c @@ -27,6 +27,7 @@ #include "executor/nodeLockRows.h" #include "foreign/fdwapi.h" #include "miscadmin.h" +#include "utils/datum.h" #include "utils/rel.h" @@ -157,7 +158,16 @@ ExecLockRows(PlanState *pstate) } /* okay, try to lock (and fetch) the tuple */ - tid = *((ItemPointer) DatumGetPointer(datum)); + if (erm->refType == ROW_REF_TID) + { + tid = *((ItemPointer) DatumGetPointer(datum)); + datum = PointerGetDatum(&tid); + } + else + { 
+ Assert(erm->refType == ROW_REF_ROWID); + datum = datumCopy(datum, false, -1); + } switch (erm->markType) { case ROW_MARK_EXCLUSIVE: @@ -182,12 +192,15 @@ if (!IsolationUsesXactSnapshot()) lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION; - test = table_tuple_lock(erm->relation, &tid, estate->es_snapshot, + test = table_tuple_lock(erm->relation, datum, estate->es_snapshot, markSlot, estate->es_output_cid, lockmode, erm->waitPolicy, lockflags, &tmfd); + if (erm->refType == ROW_REF_ROWID) + pfree(DatumGetPointer(datum)); + switch (test) { case TM_WouldBlock: diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 78fd7690b28..c81a9a49582 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -136,12 +136,11 @@ static void ExecPendingInserts(EState *estate); static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, ResultRelInfo *sourcePartInfo, ResultRelInfo *destPartInfo, - ItemPointer tupleid, + Datum tupleid, TupleTableSlot *oldSlot, TupleTableSlot *newslot); static bool ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer conflictTid, TupleTableSlot *excludedSlot, bool canSetTag, TupleTableSlot **returning); @@ -154,13 +153,13 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate, static TupleTableSlot *ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple oldtuple, bool canSetTag); static void ExecInitMerge(ModifyTableState *mtstate, EState *estate); static TupleTableSlot *ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple oldtuple, bool canSetTag, bool *matched); @@ -168,7 +167,6 @@ static TupleTableSlot *ExecMergeNotMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, bool canSetTag); - /* * Verify that the tuples
to be produced by INSERT match the * target relation's rowtype @@ -297,66 +295,6 @@ ExecProcessReturning(ResultRelInfo *resultRelInfo, return ExecProject(projectReturning); } -/* - * ExecCheckTupleVisible -- verify tuple is visible - * - * It would not be consistent with guarantees of the higher isolation levels to - * proceed with avoiding insertion (taking speculative insertion's alternative - * path) on the basis of another tuple that is not visible to MVCC snapshot. - * Check for the need to raise a serialization failure, and do so as necessary. - */ -static void -ExecCheckTupleVisible(EState *estate, - Relation rel, - TupleTableSlot *slot) -{ - if (!IsolationUsesXactSnapshot()) - return; - - if (!table_tuple_satisfies_snapshot(rel, slot, estate->es_snapshot)) - { - Datum xminDatum; - TransactionId xmin; - bool isnull; - - xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); - Assert(!isnull); - xmin = DatumGetTransactionId(xminDatum); - - /* - * We should not raise a serialization failure if the conflict is - * against a tuple inserted by our own transaction, even if it's not - * visible to our snapshot. (This would happen, for example, if - * conflicting keys are proposed for insertion in a single command.) 
- */ - if (!TransactionIdIsCurrentTransactionId(xmin)) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("could not serialize access due to concurrent update"))); - } -} - -/* - * ExecCheckTIDVisible -- convenience variant of ExecCheckTupleVisible() - */ -static void -ExecCheckTIDVisible(EState *estate, - ResultRelInfo *relinfo, - ItemPointer tid, - TupleTableSlot *tempSlot) -{ - Relation rel = relinfo->ri_RelationDesc; - - /* Redundantly check isolation level */ - if (!IsolationUsesXactSnapshot()) - return; - - if (!table_tuple_fetch_row_version(rel, tid, SnapshotAny, tempSlot)) - elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT"); - ExecCheckTupleVisible(estate, rel, tempSlot); - ExecClearTuple(tempSlot); -} - /* * Initialize to compute stored generated columns for a tuple * @@ -1042,12 +980,19 @@ ExecInsert(ModifyTableContext *context, if (onconflict != ONCONFLICT_NONE && resultRelInfo->ri_NumIndices > 0) { /* Perform a speculative insertion. */ - uint32 specToken; - ItemPointerData conflictTid; - bool specConflict; List *arbiterIndexes; + TupleTableSlot *existing = NULL, + *returningSlot, + *inserted; + LockTupleMode lockmode = LockTupleExclusive; arbiterIndexes = resultRelInfo->ri_onConflictArbiterIndexes; + returningSlot = ExecGetReturningSlot(estate, resultRelInfo); + if (onconflict == ONCONFLICT_UPDATE) + { + lockmode = ExecUpdateLockMode(estate, resultRelInfo); + existing = resultRelInfo->ri_onConflict->oc_Existing; + } /* * Do a non-conclusive check for conflicts first. 
@@ -1064,23 +1009,29 @@ ExecInsert(ModifyTableContext *context, */ vlock: CHECK_FOR_INTERRUPTS(); - specConflict = false; - if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate, - &conflictTid, arbiterIndexes)) + + inserted = table_tuple_insert_with_arbiter(resultRelInfo, + slot, estate->es_output_cid, + 0, NULL, arbiterIndexes, estate, + lockmode, existing, returningSlot); + if (!inserted) { - /* committed conflict tuple found */ if (onconflict == ONCONFLICT_UPDATE) { + TupleTableSlot *returning = NULL; + + if (TTS_EMPTY(existing)) + goto vlock; + /* * In case of ON CONFLICT DO UPDATE, execute the UPDATE * part. Be prepared to retry if the UPDATE fails because * of another concurrent UPDATE/DELETE to the conflict * tuple. */ - TupleTableSlot *returning = NULL; if (ExecOnConflictUpdate(context, resultRelInfo, - &conflictTid, slot, canSetTag, + slot, canSetTag, &returning)) { InstrCountTuples2(&mtstate->ps, 1); @@ -1103,57 +1054,13 @@ ExecInsert(ModifyTableContext *context, * ExecGetReturningSlot() in the DO NOTHING case... */ Assert(onconflict == ONCONFLICT_NOTHING); - ExecCheckTIDVisible(estate, resultRelInfo, &conflictTid, - ExecGetReturningSlot(estate, resultRelInfo)); InstrCountTuples2(&mtstate->ps, 1); return NULL; } } - - /* - * Before we start insertion proper, acquire our "speculative - * insertion lock". Others can use that to wait for us to decide - * if we're going to go ahead with the insertion, instead of - * waiting for the whole transaction to complete. 
- */ - specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); - - /* insert the tuple, with the speculative token */ - table_tuple_insert_speculative(resultRelationDesc, slot, - estate->es_output_cid, - 0, - NULL, - specToken); - - /* insert index entries for tuple */ - recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, estate, false, true, - &specConflict, - arbiterIndexes, - false); - - /* adjust the tuple's state accordingly */ - table_tuple_complete_speculative(resultRelationDesc, slot, - specToken, !specConflict); - - /* - * Wake up anyone waiting for our decision. They will re-check - * the tuple, see that it's no longer speculative, and wait on our - * XID as if this was a regularly inserted tuple all along. Or if - * we killed the tuple, they will see it's dead, and proceed as if - * the tuple never existed. - */ - SpeculativeInsertionLockRelease(GetCurrentTransactionId()); - - /* - * If there was a conflict, start from the beginning. We'll do - * the pre-check again, which will now find the conflicting tuple - * (unless it aborts before we get there). 
- */ - if (specConflict) + else { - list_free(recheckIndexes); - goto vlock; + slot = inserted; } /* Since there was no insertion conflict, we're done */ @@ -1161,9 +1068,9 @@ ExecInsert(ModifyTableContext *context, else { /* insert the tuple normally */ - table_tuple_insert(resultRelationDesc, slot, - estate->es_output_cid, - 0, NULL); + slot = table_tuple_insert(resultRelationDesc, slot, + estate->es_output_cid, + 0, NULL); /* insert index entries for tuple */ if (resultRelInfo->ri_NumIndices > 0) @@ -1339,7 +1246,7 @@ ExecPendingInserts(EState *estate) */ static bool ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot **epqreturnslot, TM_Result *result) { if (result) @@ -1371,7 +1278,7 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static TM_Result ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, bool changingPart, int options, + Datum tupleid, bool changingPart, int options, TupleTableSlot *oldSlot) { EState *estate = context->estate; @@ -1395,7 +1302,7 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static void ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, + HeapTuple oldtuple, TupleTableSlot *slot, bool changingPart) { ModifyTableState *mtstate = context->mtstate; @@ -1475,7 +1382,7 @@ ExecInitDeleteTupleSlot(ModifyTableState *mtstate, static TupleTableSlot * ExecDelete(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *oldSlot, bool processReturning, @@ -1668,7 +1575,7 @@ ExecDelete(ModifyTableContext *context, if (tupleDeleted) *tupleDeleted = true; - ExecDeleteEpilogue(context, resultRelInfo, tupleid, oldtuple, + ExecDeleteEpilogue(context, resultRelInfo, oldtuple, 
oldSlot, changingPart); /* Process RETURNING if present and if requested */ @@ -1685,7 +1592,7 @@ ExecDelete(ModifyTableContext *context, /* FDW must have provided a slot containing the deleted row */ Assert(!TupIsNull(slot)); } - else + else if (!slot || TupIsNull(slot)) { /* Copy old tuple to the returning slot */ slot = ExecGetReturningSlot(estate, resultRelInfo); @@ -1734,7 +1641,7 @@ ExecDelete(ModifyTableContext *context, static bool ExecCrossPartitionUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, bool canSetTag, UpdateContext *updateCxt, @@ -1890,7 +1797,7 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, */ static bool ExecUpdatePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, TM_Result *result) { Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -1968,7 +1875,7 @@ ExecUpdatePrepareSlot(ResultRelInfo *resultRelInfo, */ static TM_Result ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, bool canSetTag, int options, TupleTableSlot *oldSlot, UpdateContext *updateCxt) { @@ -2121,7 +2028,7 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static void ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, - ResultRelInfo *resultRelInfo, ItemPointer tupleid, + ResultRelInfo *resultRelInfo, HeapTuple oldtuple, TupleTableSlot *slot, TupleTableSlot *oldSlot) { @@ -2171,7 +2078,7 @@ static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, ResultRelInfo *sourcePartInfo, ResultRelInfo *destPartInfo, - ItemPointer tupleid, + Datum tupleid, TupleTableSlot *oldslot, TupleTableSlot 
*newslot) { @@ -2262,7 +2169,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, */ static TupleTableSlot * ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, TupleTableSlot *oldSlot, bool canSetTag, bool locked) { EState *estate = context->estate; @@ -2318,10 +2225,14 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, { ItemPointerData lockedtid; - int options = TABLE_MODIFY_WAIT | TABLE_MODIFY_FETCH_OLD_TUPLE; + int options = TABLE_MODIFY_WAIT; - if (!locked && !IsolationUsesXactSnapshot()) - options |= TABLE_MODIFY_LOCK_UPDATED; + if (!locked) + { + options |= TABLE_MODIFY_FETCH_OLD_TUPLE; + if (!IsolationUsesXactSnapshot()) + options |= TABLE_MODIFY_LOCK_UPDATED; + } /* * If we generate a new candidate tuple after EvalPlanQual testing, we @@ -2331,7 +2242,11 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * to do them again.) 
*/ redo_act: - lockedtid = *tupleid; + if (resultRelInfo->ri_needLockTagTuple) + { + Assert(resultRelInfo->ri_RowRefType == ROW_REF_TID); + lockedtid = *((ItemPointer) tupleid); + } result = ExecUpdateAct(context, resultRelInfo, tupleid, oldtuple, slot, canSetTag, options, oldSlot, &updateCxt); @@ -2410,7 +2325,7 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, UnlockTuple(resultRelationDesc, &lockedtid, InplaceUpdateTupleLock); LockTuple(resultRelationDesc, - tupleid, InplaceUpdateTupleLock); + (ItemPointer) tupleid, InplaceUpdateTupleLock); } slot = ExecGetUpdateNewTuple(resultRelInfo, @@ -2439,7 +2354,7 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (canSetTag) (estate->es_processed)++; - ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, tupleid, oldtuple, + ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, oldtuple, slot, oldSlot); /* Process RETURNING if present */ @@ -2463,22 +2378,15 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, static bool ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer conflictTid, TupleTableSlot *excludedSlot, bool canSetTag, TupleTableSlot **returning) { ModifyTableState *mtstate = context->mtstate; ExprContext *econtext = mtstate->ps.ps_ExprContext; - Relation relation = resultRelInfo->ri_RelationDesc; ExprState *onConflictSetWhere = resultRelInfo->ri_onConflict->oc_WhereClause; TupleTableSlot *existing = resultRelInfo->ri_onConflict->oc_Existing; - TM_FailureData tmfd; - LockTupleMode lockmode; - TM_Result test; - Datum xminDatum; - TransactionId xmin; - bool isnull; + Datum tupleid; /* * Parse analysis should have blocked ON CONFLICT for all system @@ -2488,127 +2396,16 @@ ExecOnConflictUpdate(ModifyTableContext *context, */ Assert(!resultRelInfo->ri_needLockTagTuple); - /* Determine lock mode to use */ - lockmode = ExecUpdateLockMode(context->estate, resultRelInfo); - - /* - * Lock tuple for update. 
Don't follow updates when tuple cannot be - * locked without doing so. A row locking conflict here means our - * previous conclusion that the tuple is conclusively committed is not - * true anymore. - */ - test = table_tuple_lock(relation, conflictTid, - context->estate->es_snapshot, - existing, context->estate->es_output_cid, - lockmode, LockWaitBlock, 0, - &tmfd); - switch (test) + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) { - case TM_Ok: - /* success! */ - break; - - case TM_Invisible: - - /* - * This can occur when a just inserted tuple is updated again in - * the same command. E.g. because multiple rows with the same - * conflicting key values are inserted. - * - * This is somewhat similar to the ExecUpdate() TM_SelfModified - * case. We do not want to proceed because it would lead to the - * same row being updated a second time in some unspecified order, - * and in contrast to plain UPDATEs there's no historical behavior - * to break. - * - * It is the user's responsibility to prevent this situation from - * occurring. These problems are why the SQL standard similarly - * specifies that for SQL MERGE, an exception must be raised in - * the event of an attempt to update the same row twice. - */ - xminDatum = slot_getsysattr(existing, - MinTransactionIdAttributeNumber, - &isnull); - Assert(!isnull); - xmin = DatumGetTransactionId(xminDatum); - - if (TransactionIdIsCurrentTransactionId(xmin)) - ereport(ERROR, - (errcode(ERRCODE_CARDINALITY_VIOLATION), - /* translator: %s is a SQL command name */ - errmsg("%s command cannot affect row a second time", - "ON CONFLICT DO UPDATE"), - errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values."))); - - /* This shouldn't happen */ - elog(ERROR, "attempted to lock invisible tuple"); - break; - - case TM_SelfModified: - - /* - * This state should never be reached. 
As a dirty snapshot is used - * to find conflicting tuples, speculative insertion wouldn't have - * seen this row to conflict with. - */ - elog(ERROR, "unexpected self-updated tuple"); - break; - - case TM_Updated: - if (IsolationUsesXactSnapshot()) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("could not serialize access due to concurrent update"))); - - /* - * As long as we don't support an UPDATE of INSERT ON CONFLICT for - * a partitioned table we shouldn't reach to a case where tuple to - * be lock is moved to another partition due to concurrent update - * of the partition key. - */ - Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); - - /* - * Tell caller to try again from the very start. - * - * It does not make sense to use the usual EvalPlanQual() style - * loop here, as the new version of the row might not conflict - * anymore, or the conflicting tuple has actually been deleted. - */ - ExecClearTuple(existing); - return false; - - case TM_Deleted: - if (IsolationUsesXactSnapshot()) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("could not serialize access due to concurrent delete"))); - - /* see TM_Updated case */ - Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); - ExecClearTuple(existing); - return false; - - default: - elog(ERROR, "unrecognized table_tuple_lock status: %u", test); + bool isnull; + tupleid = slot_getsysattr(existing, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + tupleid = PointerGetDatum(&existing->tts_tid); } - - /* Success, the tuple is locked. */ - - /* - * Verify that the tuple is visible to our MVCC snapshot if the current - * isolation level mandates that. - * - * It's not sufficient to rely on the check within ExecUpdate() as e.g. - * CONFLICT ... WHERE clause may prevent us from reaching that. 
- * - * This means we only ever continue when a new command in the current - * transaction could see the row, even though in READ COMMITTED mode the - * tuple will not be visible according to the current statement's - * snapshot. This is in line with the way UPDATE deals with newer tuple - * versions. - */ - ExecCheckTupleVisible(context->estate, relation, existing); /* * Make tuple and any needed join variables available to ExecQual and @@ -2664,7 +2461,7 @@ ExecOnConflictUpdate(ModifyTableContext *context, /* Execute UPDATE with projection */ *returning = ExecUpdate(context, resultRelInfo, - conflictTid, NULL, + tupleid, NULL, resultRelInfo->ri_onConflict->oc_ProjSlot, existing, canSetTag, true); @@ -2683,7 +2480,7 @@ ExecOnConflictUpdate(ModifyTableContext *context, */ static TupleTableSlot * ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, bool canSetTag) + Datum tupleid, HeapTuple oldtuple, bool canSetTag) { TupleTableSlot *rslot = NULL; bool matched; @@ -2749,7 +2546,7 @@ ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * update chain and we never switch from ExecMergeNotMatched() to * ExecMergeMatched(), there is no risk of a livelock. 
*/ - matched = tupleid != NULL || oldtuple != NULL; + matched = DatumGetPointer(tupleid) != NULL || oldtuple != NULL; if (matched) rslot = ExecMergeMatched(context, resultRelInfo, tupleid, oldtuple, canSetTag, &matched); @@ -2809,7 +2606,7 @@ ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static TupleTableSlot * ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, bool canSetTag, + Datum tupleid, HeapTuple oldtuple, bool canSetTag, bool *matched) { ModifyTableState *mtstate = context->mtstate; @@ -2850,7 +2647,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * either have the tupleid of the target row, or an old tuple from the * target wholerow junk attr. */ - Assert(tupleid != NULL || oldtuple != NULL); + Assert(DatumGetPointer(tupleid) != NULL || oldtuple != NULL); ItemPointerSetInvalid(&lockedtid); if (oldtuple != NULL) { @@ -2867,9 +2664,10 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * that don't match mas_whenqual. MERGE on system catalogs is a * minor use case, so don't bother optimizing those. 
*/ - LockTuple(resultRelInfo->ri_RelationDesc, tupleid, + Assert(resultRelInfo->ri_RowRefType == ROW_REF_TID); + LockTuple(resultRelInfo->ri_RelationDesc, (ItemPointer) tupleid, InplaceUpdateTupleLock); - lockedtid = *tupleid; + lockedtid = *((ItemPointer) tupleid); } if (!table_tuple_fetch_row_version(resultRelInfo->ri_RelationDesc, tupleid, @@ -2990,7 +2788,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (result == TM_Ok) { ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, - tupleid, NULL, newslot, + NULL, newslot, resultRelInfo->ri_oldTupleSlot); mtstate->mt_merge_updated += 1; } @@ -3026,7 +2824,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (result == TM_Ok) { - ExecDeleteEpilogue(context, resultRelInfo, tupleid, NULL, + ExecDeleteEpilogue(context, resultRelInfo, NULL, resultRelInfo->ri_oldTupleSlot, false); mtstate->mt_merge_deleted += 1; } @@ -3138,7 +2936,6 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, switch (result) { case TM_Ok: - /* * If the tuple was updated and migrated to * another partition concurrently, the current @@ -3180,9 +2977,13 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * join quals no longer pass and we switch to * the NOT MATCHED BY SOURCE case. */ - (void) ExecGetJunkAttribute(epqslot, - resultRelInfo->ri_RowIdAttNo, - &isNull); + /* + * Update tupleid to that of the new tuple, for + * the refetch we do at the top. 
+ */ + tupleid = ExecGetJunkAttribute(epqslot, + resultRelInfo->ri_RowIdAttNo, + &isNull); if (isNull) *matched = false; @@ -3200,10 +3001,10 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, InplaceUpdateTupleLock); lockedtid = context->tmfd.ctid; } - if (!table_tuple_fetch_row_version(resultRelationDesc, - &context->tmfd.ctid, - SnapshotAny, - resultRelInfo->ri_oldTupleSlot)) + if (!isNull && !table_tuple_fetch_row_version(resultRelationDesc, + tupleid, + SnapshotAny, + resultRelInfo->ri_oldTupleSlot)) elog(ERROR, "failed to fetch the target tuple"); if (*matched) @@ -3218,6 +3019,11 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, /* * Loop back and process the MATCHED or NOT * MATCHED BY SOURCE actions from the start. + * A non-NULL ctid means that we are still dealing + * with MATCHED case. Restart the loop so that we + * apply all the MATCHED rules again, to ensure + * that the first qualifying WHEN MATCHED action + * is executed. */ goto lmerge_matched; @@ -3882,10 +3688,10 @@ ExecModifyTable(PlanState *pstate) PlanState *subplanstate; TupleTableSlot *slot; TupleTableSlot *oldSlot; + Datum tupleid; ItemPointerData tuple_ctid; HeapTupleData oldtupdata; HeapTuple oldtuple; - ItemPointer tupleid; bool tuplock; CHECK_FOR_INTERRUPTS(); @@ -3935,6 +3741,8 @@ ExecModifyTable(PlanState *pstate) */ for (;;) { + RowRefType refType; + /* * Reset the per-output-tuple exprcontext. This is needed because * triggers expect to use that context as workspace. It's a bit ugly @@ -4010,7 +3818,7 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = ExecMerge(&context, node->resultRelInfo, - NULL, NULL, node->canSetTag); + PointerGetDatum(NULL), NULL, node->canSetTag); /* * If we got a RETURNING result, return it to the caller. 
@@ -4054,7 +3862,8 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = context.planSlot; - tupleid = NULL; + refType = resultRelInfo->ri_RowRefType; + tupleid = PointerGetDatum(NULL); oldtuple = NULL; /* @@ -4097,7 +3906,7 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = ExecMerge(&context, node->resultRelInfo, - NULL, NULL, node->canSetTag); + PointerGetDatum(NULL), NULL, node->canSetTag); /* * If we got a RETURNING result, return it to the @@ -4112,9 +3921,24 @@ ExecModifyTable(PlanState *pstate) elog(ERROR, "ctid is NULL"); } - tupleid = (ItemPointer) DatumGetPointer(datum); - tuple_ctid = *tupleid; /* be sure we don't free ctid!! */ - tupleid = &tuple_ctid; + if (refType == ROW_REF_TID) + { + /* shouldn't ever get a null result... */ + if (isNull) + elog(ERROR, "ctid is NULL"); + + tuple_ctid = *((ItemPointer) DatumGetPointer(datum)); /* be sure we don't free ctid!! */ + tupleid = PointerGetDatum(&tuple_ctid); + } + else + { + Assert(refType == ROW_REF_ROWID); + /* shouldn't ever get a null result... 
*/ + if (isNull) + elog(ERROR, "rowid is NULL"); + + tupleid = datumCopy(datum, false, -1); + } } /* @@ -4154,7 +3978,7 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = ExecMerge(&context, node->resultRelInfo, - NULL, NULL, node->canSetTag); + PointerGetDatum(NULL), NULL, node->canSetTag); /* * If we got a RETURNING result, return it to the @@ -4223,9 +4047,11 @@ ExecModifyTable(PlanState *pstate) if (resultRelInfo->ri_needLockTagTuple) { - LockTuple(relation, tupleid, InplaceUpdateTupleLock); + Assert(resultRelInfo->ri_RowRefType == ROW_REF_TID); + LockTuple(relation, (ItemPointer) tupleid, InplaceUpdateTupleLock); tuplock = true; } + Assert(DatumGetPointer(tupleid) != NULL); if (!table_tuple_fetch_row_version(relation, tupleid, SnapshotAny, oldSlot)) @@ -4239,7 +4065,8 @@ ExecModifyTable(PlanState *pstate) slot, resultRelInfo->ri_oldTupleSlot, node->canSetTag, false); if (tuplock) - UnlockTuple(resultRelInfo->ri_RelationDesc, tupleid, + UnlockTuple(resultRelInfo->ri_RelationDesc, + (ItemPointer) tupleid, InplaceUpdateTupleLock); break; @@ -4263,6 +4090,9 @@ ExecModifyTable(PlanState *pstate) break; } + if (refType == ROW_REF_ROWID && DatumGetPointer(tupleid) != NULL) + pfree(DatumGetPointer(tupleid)); + /* * If we got a RETURNING result, return it to caller. We'll continue * the work on next call. 
@@ -4507,10 +4337,20 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) relkind == RELKIND_MATVIEW || relkind == RELKIND_PARTITIONED_TABLE) { - resultRelInfo->ri_RowIdAttNo = - ExecFindJunkAttributeInTlist(subplan->targetlist, "ctid"); - if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) - elog(ERROR, "could not find junk ctid column"); + if (resultRelInfo->ri_RowRefType == ROW_REF_TID) + { + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, "ctid"); + if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk ctid column"); + } + else + { + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, "rowid"); + if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk rowid column"); + } } else if (relkind == RELKIND_FOREIGN_TABLE) { @@ -4820,6 +4660,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) estate->es_auxmodifytables = lcons(mtstate, estate->es_auxmodifytables); + + return mtstate; } diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c index 864a9013b62..f4a124ac4eb 100644 --- a/src/backend/executor/nodeTidscan.c +++ b/src/backend/executor/nodeTidscan.c @@ -377,7 +377,7 @@ TidNext(TidScanState *node) if (node->tss_isCurrentOf) table_tuple_get_latest_tid(scan, &tid); - if (table_tuple_fetch_row_version(heapRelation, &tid, snapshot, slot)) + if (table_tuple_fetch_row_version(heapRelation, PointerGetDatum(&tid), snapshot, slot)) return slot; /* Bad TID or failed snapshot qual; try next */ diff --git a/src/backend/nodes/read.c b/src/backend/nodes/read.c index 4eb42445c52..ffa147ee4c8 100644 --- a/src/backend/nodes/read.c +++ b/src/backend/nodes/read.c @@ -205,6 +205,17 @@ pg_strtok(int *length) return ret_str; } +bool +pg_str_hasfield(void) +{ + const char *local_str = pg_strtok_ptr; + + while (*local_str == ' ' || *local_str == '\n' || *local_str == 
'\t') + local_str++; + + return (*local_str == ':'); +} + /* * debackslash - * create a palloc'd string holding the given token. diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 5e2af9808f6..80a1e353a44 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2341,6 +2341,7 @@ preprocess_rowmarks(PlannerInfo *root) RowMarkClause *rc = lfirst_node(RowMarkClause, l); RangeTblEntry *rte = rt_fetch(rc->rti, parse->rtable); PlanRowMark *newrc; + RowRefType refType; /* * Currently, it is syntactically impossible to have FOR UPDATE et al @@ -2363,8 +2364,8 @@ preprocess_rowmarks(PlannerInfo *root) newrc = makeNode(PlanRowMark); newrc->rti = newrc->prti = rc->rti; newrc->rowmarkId = ++(root->glob->lastRowMarkId); - newrc->markType = select_rowmark_type(rte, rc->strength); - newrc->allMarkTypes = (1 << newrc->markType); + newrc->markType = select_rowmark_type(rte, rc->strength, &refType); + newrc->allRefTypes = (1 << refType); newrc->strength = rc->strength; newrc->waitPolicy = rc->waitPolicy; newrc->isParent = false; @@ -2380,6 +2381,7 @@ preprocess_rowmarks(PlannerInfo *root) { RangeTblEntry *rte = lfirst_node(RangeTblEntry, l); PlanRowMark *newrc; + RowRefType refType = ROW_REF_TID; i++; if (!bms_is_member(i, rels)) @@ -2388,8 +2390,8 @@ preprocess_rowmarks(PlannerInfo *root) newrc = makeNode(PlanRowMark); newrc->rti = newrc->prti = i; newrc->rowmarkId = ++(root->glob->lastRowMarkId); - newrc->markType = select_rowmark_type(rte, LCS_NONE); - newrc->allMarkTypes = (1 << newrc->markType); + newrc->markType = select_rowmark_type(rte, LCS_NONE, &refType); + newrc->allRefTypes = (1 << refType); newrc->strength = LCS_NONE; newrc->waitPolicy = LockWaitBlock; /* doesn't matter */ newrc->isParent = false; @@ -2404,11 +2406,13 @@ preprocess_rowmarks(PlannerInfo *root) * Select RowMarkType to use for a given table */ RowMarkType -select_rowmark_type(RangeTblEntry *rte, LockClauseStrength strength) 
+select_rowmark_type(RangeTblEntry *rte, LockClauseStrength strength, + RowRefType *refType) { if (rte->rtekind != RTE_RELATION) { /* If it's not a table at all, use ROW_MARK_COPY */ + *refType = ROW_REF_COPY; return ROW_MARK_COPY; } else if (rte->relkind == RELKIND_FOREIGN_TABLE) @@ -2419,10 +2423,12 @@ select_rowmark_type(RangeTblEntry *rte, LockClauseStrength strength) if (fdwroutine->GetForeignRowMarkType != NULL) return fdwroutine->GetForeignRowMarkType(rte, strength); /* Otherwise, use ROW_MARK_COPY by default */ + *refType = ROW_REF_COPY; return ROW_MARK_COPY; } else { + *refType = rte->reftype; /* Regular table, apply the appropriate lock type */ switch (strength) { diff --git a/src/backend/optimizer/prep/preptlist.c b/src/backend/optimizer/prep/preptlist.c index 35358b085d5..470841d4871 100644 --- a/src/backend/optimizer/prep/preptlist.c +++ b/src/backend/optimizer/prep/preptlist.c @@ -237,7 +237,7 @@ preprocess_targetlist(PlannerInfo *root) if (rc->rti != rc->prti) continue; - if (rc->allMarkTypes & ~(1 << ROW_MARK_COPY)) + if (rc->allRefTypes & (1 << ROW_REF_TID)) { /* Need to fetch TID */ var = makeVar(rc->rti, @@ -253,7 +253,23 @@ preprocess_targetlist(PlannerInfo *root) true); tlist = lappend(tlist, tle); } - if (rc->allMarkTypes & (1 << ROW_MARK_COPY)) + if (rc->allRefTypes & (1 << ROW_REF_ROWID)) + { + /* Need to fetch rowid */ + var = makeVar(rc->rti, + RowIdAttributeNumber, + BYTEAOID, + -1, + InvalidOid, + 0); + snprintf(resname, sizeof(resname), "rowid%u", rc->rowmarkId); + tle = makeTargetEntry((Expr *) var, + list_length(tlist) + 1, + pstrdup(resname), + true); + tlist = lappend(tlist, tle); + } + if (rc->allRefTypes & (1 << ROW_REF_COPY)) { /* Need the whole row as a junk var */ var = makeWholeRowVar(rt_fetch(rc->rti, range_table), diff --git a/src/backend/optimizer/util/appendinfo.c b/src/backend/optimizer/util/appendinfo.c index 6ba4eba224a..ea012b2c164 100644 --- a/src/backend/optimizer/util/appendinfo.c +++ 
b/src/backend/optimizer/util/appendinfo.c @@ -895,17 +895,35 @@ add_row_identity_columns(PlannerInfo *root, Index rtindex, relkind == RELKIND_MATVIEW || relkind == RELKIND_PARTITIONED_TABLE) { + RowRefType refType = ROW_REF_TID; + + refType = table_get_row_ref_type(target_relation); + /* * Emit CTID so that executor can find the row to merge, update or * delete. */ - var = makeVar(rtindex, - SelfItemPointerAttributeNumber, - TIDOID, - -1, - InvalidOid, - 0); - add_row_identity_var(root, var, rtindex, "ctid"); + if (refType == ROW_REF_TID) + { + var = makeVar(rtindex, + SelfItemPointerAttributeNumber, + TIDOID, + -1, + InvalidOid, + 0); + add_row_identity_var(root, var, rtindex, "ctid"); + } + else + { + Assert(refType == ROW_REF_ROWID); + var = makeVar(rtindex, + RowIdAttributeNumber, + BYTEAOID, + -1, + InvalidOid, + 0); + add_row_identity_var(root, var, rtindex, "rowid"); + } } else if (relkind == RELKIND_FOREIGN_TABLE) { diff --git a/src/backend/optimizer/util/inherit.c b/src/backend/optimizer/util/inherit.c index c5b906a9d43..17c36c03202 100644 --- a/src/backend/optimizer/util/inherit.c +++ b/src/backend/optimizer/util/inherit.c @@ -16,6 +16,7 @@ #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" #include "catalog/partition.h" #include "catalog/pg_inherits.h" #include "catalog/pg_type.h" @@ -91,7 +92,7 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, LOCKMODE lockmode; PlanRowMark *oldrc; bool old_isParent = false; - int old_allMarkTypes = 0; + int old_allRefTypes = 0; Assert(rte->inh); /* else caller error */ @@ -131,8 +132,8 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, { old_isParent = oldrc->isParent; oldrc->isParent = true; - /* Save initial value of allMarkTypes before children add to it */ - old_allMarkTypes = oldrc->allMarkTypes; + /* Save initial value of allRefTypes before children add to it */ + old_allRefTypes = oldrc->allRefTypes; } /* Scan the inheritance set and expand it */ @@ 
-239,15 +240,15 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, */ if (oldrc) { - int new_allMarkTypes = oldrc->allMarkTypes; + int new_allRefTypes = oldrc->allRefTypes; Var *var; TargetEntry *tle; char resname[32]; List *newvars = NIL; /* Add TID junk Var if needed, unless we had it already */ - if (new_allMarkTypes & ~(1 << ROW_MARK_COPY) && - !(old_allMarkTypes & ~(1 << ROW_MARK_COPY))) + if (new_allRefTypes & (1 << ROW_REF_TID) && + !(old_allRefTypes & (1 << ROW_REF_TID))) { /* Need to fetch TID */ var = makeVar(oldrc->rti, @@ -266,8 +267,8 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, } /* Add whole-row junk Var if needed, unless we had it already */ - if ((new_allMarkTypes & (1 << ROW_MARK_COPY)) && - !(old_allMarkTypes & (1 << ROW_MARK_COPY))) + if ((new_allRefTypes & (1 << ROW_REF_COPY)) && + !(old_allRefTypes & (1 << ROW_REF_COPY))) { var = makeWholeRowVar(planner_rt_fetch(oldrc->rti, root), oldrc->rti, @@ -282,6 +283,24 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, newvars = lappend(newvars, var); } + if ((new_allRefTypes & (1 << ROW_REF_ROWID)) && + !(old_allRefTypes & (1 << ROW_REF_ROWID))) + { + var = makeVar(oldrc->rti, + RowIdAttributeNumber, + BYTEAOID, + -1, + InvalidOid, + 0); + snprintf(resname, sizeof(resname), "rowid%u", oldrc->rowmarkId); + tle = makeTargetEntry((Expr *) var, + list_length(root->processed_tlist) + 1, + pstrdup(resname), + true); + root->processed_tlist = lappend(root->processed_tlist, tle); + newvars = lappend(newvars, var); + } + /* Add tableoid junk Var, unless we had it already */ if (!old_isParent) { @@ -450,7 +469,7 @@ expand_partitioned_rtentry(PlannerInfo *root, RelOptInfo *relinfo, * where the hierarchy is flattened during RTE expansion.) * * PlanRowMarks still carry the top-parent's RTI, and the top-parent's - * allMarkTypes field still accumulates values from all descendents. + * allRefTypes field still accumulates values from all descendents. 
* * "parentrte" and "parentRTindex" are immediate parent's RTE and * RTI. "top_parentrc" is top parent's PlanRowMark. @@ -494,6 +513,7 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Assert(parentrte->rtekind == RTE_RELATION); /* else this is dubious */ childrte->relid = childOID; childrte->relkind = childrel->rd_rel->relkind; + childrte->reftype = table_get_row_ref_type(childrel); /* A partitioned child will need to be expanded further. */ if (childrte->relkind == RELKIND_PARTITIONED_TABLE) { @@ -583,14 +603,16 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, if (top_parentrc) { PlanRowMark *childrc = makeNode(PlanRowMark); + RowRefType refType; childrc->rti = childRTindex; childrc->prti = top_parentrc->rti; childrc->rowmarkId = top_parentrc->rowmarkId; /* Reselect rowmark type, because relkind might not match parent */ childrc->markType = select_rowmark_type(childrte, - top_parentrc->strength); - childrc->allMarkTypes = (1 << childrc->markType); + top_parentrc->strength, + &refType); + childrc->allRefTypes = (1 << refType); childrc->strength = top_parentrc->strength; childrc->waitPolicy = top_parentrc->waitPolicy; @@ -601,8 +623,8 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, */ childrc->isParent = (childrte->relkind == RELKIND_PARTITIONED_TABLE); - /* Include child's rowmark type in top parent's allMarkTypes */ - top_parentrc->allMarkTypes |= childrc->allMarkTypes; + /* Include child's rowmark type in top parent's allRefTypes */ + top_parentrc->allRefTypes |= childrc->allRefTypes; root->rowMarks = lappend(root->rowMarks, childrc); } diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c index 2f64eaf0e37..37d9b072b38 100644 --- a/src/backend/parser/parse_relation.c +++ b/src/backend/parser/parse_relation.c @@ -20,6 +20,7 @@ #include "access/relation.h" #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" 
#include "catalog/heap.h" #include "catalog/namespace.h" #include "catalog/pg_type.h" @@ -1503,6 +1504,7 @@ addRangeTableEntry(ParseState *pstate, rte->inh = inh; rte->relkind = rel->rd_rel->relkind; rte->rellockmode = lockmode; + rte->reftype = table_get_row_ref_type(rel); /* * Build the list of effective column names using user-supplied aliases @@ -1588,6 +1590,7 @@ addRangeTableEntryForRelation(ParseState *pstate, rte->inh = inh; rte->relkind = rel->rd_rel->relkind; rte->rellockmode = lockmode; + rte->reftype = table_get_row_ref_type(rel); /* * Build the list of effective column names using user-supplied aliases @@ -1656,6 +1659,7 @@ addRangeTableEntryForSubquery(ParseState *pstate, rte->rtekind = RTE_SUBQUERY; rte->subquery = subquery; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? copyObject(alias) : makeAlias("unnamed_subquery", NIL); numaliases = list_length(eref->colnames); @@ -1763,6 +1767,7 @@ addRangeTableEntryForFunction(ParseState *pstate, rte->functions = NIL; /* we'll fill this list below */ rte->funcordinality = rangefunc->ordinality; rte->alias = alias; + rte->reftype = ROW_REF_COPY; /* * Choose the RTE alias name. We default to using the first function's @@ -2079,6 +2084,7 @@ addRangeTableEntryForTableFunc(ParseState *pstate, rte->coltypmods = tf->coltypmods; rte->colcollations = tf->colcollations; rte->alias = alias; + rte->reftype = ROW_REF_COPY; refname = alias ? alias->aliasname : pstrdup(tf->functype == TFT_XMLTABLE ? "xmltable" : "json_table"); @@ -2156,6 +2162,7 @@ addRangeTableEntryForValues(ParseState *pstate, rte->coltypmods = coltypmods; rte->colcollations = colcollations; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? copyObject(alias) : makeAlias(refname, NIL); @@ -2252,6 +2259,7 @@ addRangeTableEntryForJoin(ParseState *pstate, rte->joinrightcols = rightcols; rte->join_using_alias = join_using_alias; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? 
copyObject(alias) : makeAlias("unnamed_join", NIL); numaliases = list_length(eref->colnames); @@ -2332,6 +2340,7 @@ addRangeTableEntryForCTE(ParseState *pstate, rte->rtekind = RTE_CTE; rte->ctename = cte->ctename; rte->ctelevelsup = levelsup; + rte->reftype = ROW_REF_COPY; /* Self-reference if and only if CTE's parse analysis isn't completed */ rte->self_reference = !IsA(cte->ctequery, Query); @@ -2494,6 +2503,7 @@ addRangeTableEntryForENR(ParseState *pstate, * if they access transition tables linked to a table that is altered. */ rte->relid = enrmd->reliddesc; + rte->reftype = ROW_REF_COPY; /* * Build the list of effective column names using user-supplied aliases @@ -3262,6 +3272,9 @@ get_rte_attribute_name(RangeTblEntry *rte, AttrNumber attnum) attnum > 0 && attnum <= list_length(rte->alias->colnames)) return strVal(list_nth(rte->alias->colnames, attnum - 1)); + if (attnum == RowIdAttributeNumber) + return "rowid"; + /* * If the RTE is a relation, go to the system catalogs not the * eref->colnames list. 
This is a little slower but it will give the diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index ec5699e48e8..27d44a8e711 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -2708,7 +2708,9 @@ extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc) ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW || ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE); - relopts = extractRelOptions(tup, pg_class_desc, NULL); + relopts = extractRelOptions(tup, pg_class_desc, + GetTableAmRoutineByAmOid(((Form_pg_class) GETSTRUCT(tup))->relam), + NULL); if (relopts == NULL) return NULL; diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index beca0a9c747..e7cd65e3bdc 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -23,6 +23,7 @@ #include "access/relation.h" #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" #include "catalog/dependency.h" #include "commands/trigger.h" #include "executor/executor.h" diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index 62601a6d80c..9760febe7cc 100644 --- a/src/backend/utils/adt/ri_triggers.c +++ b/src/backend/utils/adt/ri_triggers.c @@ -244,6 +244,7 @@ RI_FKey_check(TriggerData *trigdata) TupleTableSlot *newslot; RI_QueryKey qkey; SPIPlanPtr qplan; + Relation rel = trigdata->tg_relation; riinfo = ri_FetchConstraintInfo(trigdata->tg_trigger, trigdata->tg_relation, false); @@ -261,7 +262,7 @@ RI_FKey_check(TriggerData *trigdata) * and lock on the buffer to call HeapTupleSatisfiesVisibility. Caller * should be holding pin, but not lock. 
*/ - if (!table_tuple_satisfies_snapshot(trigdata->tg_relation, newslot, SnapshotSelf)) + if (!table_tuple_satisfies_snapshot(rel, newslot, SnapshotSelf)) return PointerGetDatum(NULL); /* @@ -1327,7 +1328,7 @@ RI_FKey_fk_upd_check_required(Trigger *trigger, Relation fk_rel, * this if we knew the INSERT trigger already fired, but there is no easy * way to know that.) */ - if (slot_is_current_xact_tuple(oldslot)) + if (table_tuple_is_current(fk_rel, oldslot)) return true; /* If all old and new key values are equal, no check is needed */ diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 3f1e8ce1f5f..31066221b27 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -33,6 +33,7 @@ #include "access/htup_details.h" #include "access/multixact.h" #include "access/parallel.h" +#include "access/relation.h" #include "access/reloptions.h" #include "access/sysattr.h" #include "access/table.h" @@ -319,6 +320,7 @@ static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid, StrategyNumber numSupport); static void RelationCacheInitFileRemoveInDir(const char *tblspcpath); static void unlink_initfile(const char *initfilename, int elevel); +static void release_rd_amcache(Relation rel); /* @@ -463,8 +465,9 @@ AllocateRelationDesc(Form_pg_class relp) static void RelationParseRelOptions(Relation relation, HeapTuple tuple) { - bytea *options; - amoptions_function amoptsfn; + bytea *options; + amoptions_function amoptsfn; + const TableAmRoutine *tableam = NULL; relation->rd_options = NULL; @@ -476,9 +479,10 @@ RelationParseRelOptions(Relation relation, HeapTuple tuple) { case RELKIND_RELATION: case RELKIND_TOASTVALUE: - case RELKIND_VIEW: case RELKIND_MATVIEW: + case RELKIND_VIEW: case RELKIND_PARTITIONED_TABLE: + tableam = relation->rd_tableam; amoptsfn = NULL; break; case RELKIND_INDEX: @@ -490,11 +494,12 @@ RelationParseRelOptions(Relation relation, HeapTuple tuple) } /* - * Fetch reloptions from tuple; 
have to use a hardwired descriptor because - * we might not have any other for pg_class yet (consider executing this - * code for pg_class itself) - */ - options = extractRelOptions(tuple, GetPgClassDescriptor(), amoptsfn); + * Fetch reloptions from tuple; have to use a hardwired descriptor because + * we might not have any other for pg_class yet (consider executing this + * code for pg_class itself) + */ + options = extractRelOptions(tuple, GetPgClassDescriptor(), + tableam, amoptsfn); /* * Copy parsed data into CacheMemoryContext. To guard against the @@ -2270,9 +2275,7 @@ RelationReloadIndexInfo(Relation relation) RelationCloseSmgr(relation); /* Must free any AM cached data upon relcache flush */ - if (relation->rd_amcache) - pfree(relation->rd_amcache); - relation->rd_amcache = NULL; + release_rd_amcache(relation); /* * If it's a shared index, we might be called before backend startup has @@ -2492,8 +2495,7 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) pfree(relation->rd_options); if (relation->rd_indextuple) pfree(relation->rd_indextuple); - if (relation->rd_amcache) - pfree(relation->rd_amcache); + release_rd_amcache(relation); if (relation->rd_fdwroutine) pfree(relation->rd_fdwroutine); if (relation->rd_indexcxt) @@ -2580,9 +2582,7 @@ RelationClearRelation(Relation relation, bool rebuild) RelationCloseSmgr(relation); /* Free AM cached data, if any */ - if (relation->rd_amcache) - pfree(relation->rd_amcache); - relation->rd_amcache = NULL; + release_rd_amcache(relation); /* * Treat nailed-in system relations separately, they always need to be @@ -6899,3 +6899,9 @@ ResOwnerReleaseRelation(Datum res) RelationCloseCleanup((Relation) res); } + +static void +release_rd_amcache(Relation rel) +{ + table_free_rd_amcache(rel); +} diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index 947a868e569..d3a41533552 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -1100,6 
+1100,36 @@ tuplestore_gettupleslot(Tuplestorestate *state, bool forward, } } +/* + * Same as tuplestore_gettupleslot(), but forces tuple storage to slot. Thus, + * it can work with slot types different than minimal tuple. + */ +bool +tuplestore_force_gettupleslot(Tuplestorestate *state, bool forward, + bool copy, TupleTableSlot *slot) + { + MinimalTuple tuple; + bool should_free; + + tuple = (MinimalTuple) tuplestore_gettuple(state, forward, &should_free); + + if (tuple) + { + if (copy && !should_free) + { + tuple = heap_copy_minimal_tuple(tuple); + should_free = true; + } + ExecForceStoreMinimalTuple(tuple, slot, should_free); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + /* * tuplestore_advance - exported function to adjust position without fetching * diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h index 81829b8270a..8ddc75df287 100644 --- a/src/include/access/reloptions.h +++ b/src/include/access/reloptions.h @@ -21,6 +21,7 @@ #include "access/amapi.h" #include "access/htup.h" +#include "access/tableam.h" #include "access/tupdesc.h" #include "nodes/pg_list.h" #include "storage/lock.h" @@ -224,6 +225,7 @@ extern Datum transformRelOptions(Datum oldOptions, List *defList, bool acceptOidsOff, bool isReset); extern List *untransformRelOptions(Datum options); extern bytea *extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, + const TableAmRoutine *tableam, amoptions_function amoptions); extern void *build_reloptions(Datum reloptions, bool validate, relopt_kind kind, diff --git a/src/include/access/sysattr.h b/src/include/access/sysattr.h index e88dec71ee9..867b5eb489e 100644 --- a/src/include/access/sysattr.h +++ b/src/include/access/sysattr.h @@ -24,6 +24,7 @@ #define MaxTransactionIdAttributeNumber (-4) #define MaxCommandIdAttributeNumber (-5) #define TableOidAttributeNumber (-6) -#define FirstLowInvalidHeapAttributeNumber (-7) +#define RowIdAttributeNumber (-7) +#define 
FirstLowInvalidHeapAttributeNumber (-8) #endif /* SYSATTR_H */ diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index c2d6972b310..e16e5cbf5d7 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -17,11 +17,15 @@ #ifndef TABLEAM_H #define TABLEAM_H +#include "access/amapi.h" #include "access/relscan.h" #include "access/sdir.h" #include "access/xact.h" #include "executor/tuptable.h" #include "storage/read_stream.h" +#include "nodes/execnodes.h" +#include "storage/bufmgr.h" +#include "utils/guc.h" #include "utils/rel.h" #include "utils/snapshot.h" @@ -40,6 +44,16 @@ struct TBMIterateResult; struct VacuumParams; struct ValidateIndexState; +typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel, + HeapTuple *rows, int targrows, + double *totalrows, + double *totaldeadrows); + +/* in commands/analyze.c */ +extern int acquire_sample_rows(Relation onerel, int elevel, + HeapTuple *rows, int targrows, + double *totalrows, double *totaldeadrows); + /* * Bitmask values for the flags argument to the scan_begin callback. */ @@ -309,6 +323,9 @@ typedef struct TableAmRoutine */ const TupleTableSlotOps *(*slot_callbacks) (Relation rel); + RowRefType (*get_row_ref_type) (Relation rel); + + void (*free_rd_amcache) (Relation rel); /* ------------------------------------------------------------------------ * Table scan callbacks. @@ -478,7 +495,7 @@ typedef struct TableAmRoutine * test, returns true, false otherwise. 
*/ bool (*tuple_fetch_row_version) (Relation rel, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot); @@ -514,23 +531,19 @@ typedef struct TableAmRoutine */ /* see table_tuple_insert() for reference about parameters */ - void (*tuple_insert) (Relation rel, TupleTableSlot *slot, + TupleTableSlot *(*tuple_insert) (Relation rel, TupleTableSlot *slot, CommandId cid, int options, struct BulkInsertStateData *bistate); - /* see table_tuple_insert_speculative() for reference about parameters */ - void (*tuple_insert_speculative) (Relation rel, - TupleTableSlot *slot, - CommandId cid, - int options, - struct BulkInsertStateData *bistate, - uint32 specToken); - - /* see table_tuple_complete_speculative() for reference about parameters */ - void (*tuple_complete_speculative) (Relation rel, - TupleTableSlot *slot, - uint32 specToken, - bool succeeded); + TupleTableSlot *(*tuple_insert_with_arbiter) (ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + List *arbiterIndexes, + EState *estate, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + TupleTableSlot *tempSlot); /* see table_multi_insert() for reference about parameters */ void (*multi_insert) (Relation rel, TupleTableSlot **slots, int nslots, @@ -538,7 +551,7 @@ typedef struct TableAmRoutine /* see table_tuple_delete() for reference about parameters */ TM_Result (*tuple_delete) (Relation rel, - ItemPointer tid, + Datum tupleid, CommandId cid, Snapshot snapshot, Snapshot crosscheck, @@ -549,7 +562,7 @@ typedef struct TableAmRoutine /* see table_tuple_update() for reference about parameters */ TM_Result (*tuple_update) (Relation rel, - ItemPointer otid, + Datum tupleid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, @@ -562,7 +575,7 @@ typedef struct TableAmRoutine /* see table_tuple_lock() for reference about parameters */ TM_Result (*tuple_lock) (Relation rel, - ItemPointer tid, + Datum tupleid, Snapshot 
snapshot, TupleTableSlot *slot, CommandId cid, @@ -881,6 +894,14 @@ typedef struct TableAmRoutine struct SampleScanState *scanstate, TupleTableSlot *slot); + /* Check if tuple in the slot belongs to the current transaction */ + bool (*tuple_is_current) (Relation rel, TupleTableSlot *slot); + + void (*analyze_table) (Relation relation, + AcquireSampleRowsFunc *func, + BlockNumber *totalpages); + + bytea *(*reloptions) (char relkind, Datum reloptions, bool validate); } TableAmRoutine; @@ -1294,7 +1315,7 @@ extern bool table_index_fetch_tuple_check(Relation rel, */ static inline bool table_tuple_fetch_row_version(Relation rel, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot) { @@ -1306,7 +1327,7 @@ table_tuple_fetch_row_version(Relation rel, if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) elog(ERROR, "unexpected table_tuple_fetch_row_version call during logical decoding"); - return rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot); + return rel->rd_tableam->tuple_fetch_row_version(rel, tupleid, snapshot, slot); } /* @@ -1406,45 +1427,32 @@ table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) * insertion. But note that any toasting of fields within the slot is NOT * reflected in the slots contents. */ -static inline void +static inline TupleTableSlot * table_tuple_insert(Relation rel, TupleTableSlot *slot, CommandId cid, int options, struct BulkInsertStateData *bistate) { - rel->rd_tableam->tuple_insert(rel, slot, cid, options, - bistate); -} - -/* - * Perform a "speculative insertion". These can be backed out afterwards - * without aborting the whole transaction. Other sessions can wait for the - * speculative insertion to be confirmed, turning it into a regular tuple, or - * aborted, as if it never existed. Speculatively inserted tuples behave as - * "value locks" of short duration, used to implement INSERT .. ON CONFLICT. 
- * - * A transaction having performed a speculative insertion has to either abort, - * or finish the speculative insertion with - * table_tuple_complete_speculative(succeeded = ...). - */ -static inline void -table_tuple_insert_speculative(Relation rel, TupleTableSlot *slot, - CommandId cid, int options, - struct BulkInsertStateData *bistate, - uint32 specToken) -{ - rel->rd_tableam->tuple_insert_speculative(rel, slot, cid, options, - bistate, specToken); + return rel->rd_tableam->tuple_insert(rel, slot, cid, options, bistate); } -/* - * Complete "speculative insertion" started in the same transaction. If - * succeeded is true, the tuple is fully inserted, if false, it's removed. - */ -static inline void -table_tuple_complete_speculative(Relation rel, TupleTableSlot *slot, - uint32 specToken, bool succeeded) +static inline TupleTableSlot * +table_tuple_insert_with_arbiter(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + List *arbiterIndexes, + EState *estate, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + TupleTableSlot *tempSlot) { - rel->rd_tableam->tuple_complete_speculative(rel, slot, specToken, - succeeded); + Relation rel = resultRelInfo->ri_RelationDesc; + + return rel->rd_tableam->tuple_insert_with_arbiter(resultRelInfo, + slot, cid, options, + bistate, arbiterIndexes, + estate, + lockmode, lockedSlot, + tempSlot); } /* @@ -1506,12 +1514,12 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, * TM_FailureData for additional info. 
*/ static inline TM_Result -table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, +table_tuple_delete(Relation rel, Datum tupleid, CommandId cid, Snapshot snapshot, Snapshot crosscheck, int options, TM_FailureData *tmfd, bool changingPart, TupleTableSlot *oldSlot) { - return rel->rd_tableam->tuple_delete(rel, tid, cid, + return rel->rd_tableam->tuple_delete(rel, tupleid, cid, snapshot, crosscheck, options, tmfd, changingPart, oldSlot); @@ -1562,13 +1570,13 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, * for additional info. */ static inline TM_Result -table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, +table_tuple_update(Relation rel, Datum tupleid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, int options, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes, TupleTableSlot *oldSlot) { - return rel->rd_tableam->tuple_update(rel, otid, slot, + return rel->rd_tableam->tuple_update(rel, tupleid, slot, cid, snapshot, crosscheck, options, tmfd, lockmode, update_indexes, @@ -1609,12 +1617,12 @@ table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, * comments for struct TM_FailureData for additional info. 
*/ static inline TM_Result -table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot, +table_tuple_lock(Relation rel, Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags, TM_FailureData *tmfd) { - return rel->rd_tableam->tuple_lock(rel, tid, snapshot, slot, + return rel->rd_tableam->tuple_lock(rel, tupleid, snapshot, slot, cid, mode, wait_policy, flags, tmfd); } @@ -2078,6 +2086,11 @@ table_scan_sample_next_tuple(TableScanDesc scan, slot); } +static inline bool +table_tuple_is_current(Relation rel, TupleTableSlot *slot) +{ + return rel->rd_tableam->tuple_is_current(rel, slot); +} /* ---------------------------------------------------------------------------- * Functions to make modifications a bit simpler. @@ -2132,12 +2145,60 @@ extern void table_block_relation_estimate_size(Relation rel, */ extern const TableAmRoutine *GetTableAmRoutine(Oid amhandler); +extern const TableAmRoutine *GetTableAmRoutineByAmOid(Oid amoid); +extern const TableAmRoutine *GetHeapamTableAmRoutine(void); -/* ---------------------------------------------------------------------------- - * Functions in heapam_handler.c - * ---------------------------------------------------------------------------- - */ +static inline RowRefType +table_get_row_ref_type(Relation rel) +{ + if (rel->rd_tableam) + return rel->rd_tableam->get_row_ref_type(rel); + else + return ROW_REF_TID; +} -extern const TableAmRoutine *GetHeapamTableAmRoutine(void); +static inline void +table_free_rd_amcache(Relation rel) +{ + if (rel->rd_tableam) + { + rel->rd_tableam->free_rd_amcache(rel); + } + else + { + if (rel->rd_amcache) + pfree(rel->rd_amcache); + rel->rd_amcache = NULL; + } +} + +static inline void +table_analyze(Relation relation, AcquireSampleRowsFunc *func, + BlockNumber *totalpages) +{ + if (relation->rd_tableam->analyze_table) + { + relation->rd_tableam->analyze_table(relation, func, totalpages); + } + else + { + *func = 
acquire_sample_rows; + *totalpages = RelationGetNumberOfBlocks(relation); + } +} + +static inline bytea * +table_reloptions(Relation rel, char relkind, + Datum reloptions, bool validate) +{ + return rel->rd_tableam->reloptions(relkind, reloptions, validate); +} + +static inline bytea * +tableam_reloptions(const TableAmRoutine *tableam, char relkind, + Datum reloptions, bool validate) +{ + return tableam->reloptions(relkind, reloptions, validate); +} #endif /* TABLEAM_H */ diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index ca0165e6e03..76cd1672011 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -209,7 +209,7 @@ extern void ExecASDeleteTriggers(EState *estate, extern bool ExecBRDeleteTriggersNew(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, @@ -218,7 +218,7 @@ extern bool ExecBRDeleteTriggersNew(EState *estate, extern bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, @@ -240,7 +240,7 @@ extern void ExecASUpdateTriggers(EState *estate, extern bool ExecBRUpdateTriggersNew(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, @@ -249,7 +249,7 @@ extern bool ExecBRUpdateTriggersNew(EState *estate, extern bool ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 759f9a87d38..dfea1e93e33 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -378,6 +378,9 @@ extern void 
parallel_vacuum_main(dsm_segment *seg, shm_toc *toc); extern void analyze_rel(Oid relid, RangeVar *relation, VacuumParams *params, List *va_cols, bool in_outer_xact, BufferAccessStrategy bstrategy); +extern int acquire_sample_rows(Relation onerel, int elevel, + HeapTuple *rows, int targrows, + double *totalrows, double *totaldeadrows); extern bool std_typanalyze(VacAttrStats *stats); /* in utils/misc/sampling.c --- duplicate of declarations in utils/sampling.h */ diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index fcde3876b28..777e59c86e9 100644 --- a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -13,6 +13,7 @@ #define FDWAPI_H #include "access/parallel.h" +#include "access/tableam.h" #include "nodes/execnodes.h" #include "nodes/pathnodes.h" @@ -148,11 +149,6 @@ typedef void (*ExplainForeignModify_function) (ModifyTableState *mtstate, typedef void (*ExplainDirectModify_function) (ForeignScanState *node, struct ExplainState *es); -typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel, - HeapTuple *rows, int targrows, - double *totalrows, - double *totaldeadrows); - typedef bool (*AnalyzeForeignTable_function) (Relation relation, AcquireSampleRowsFunc *func, BlockNumber *totalpages); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 0696ec05b16..32abf5d3b16 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -457,6 +457,8 @@ typedef struct ResultRelInfo /* relation descriptor for result relation */ Relation ri_RelationDesc; + RowRefType ri_RowRefType; + /* # of indices existing on result relation */ int ri_NumIndices; @@ -757,6 +759,7 @@ typedef struct ExecRowMark Index prti; /* parent range table index, if child */ Index rowmarkId; /* unique identifier for resjunk columns */ RowMarkType markType; /* see enum in nodes/plannodes.h */ + RowRefType refType; LockClauseStrength strength; /* LockingClause's strength, or LCS_NONE */ LockWaitPolicy 
waitPolicy; /* NOWAIT and SKIP LOCKED */ bool ermActive; /* is this mark relevant for current tuple? */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 67c90a2bd32..ddc80007b34 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -1103,6 +1103,7 @@ typedef struct RangeTblEntry Index perminfoindex pg_node_attr(query_jumble_ignore); /* sampling info, or NULL */ struct TableSampleClause *tablesample; + RowRefType reftype; /* * Fields valid for a subquery RTE (else NULL): diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 1aeeaec95e1..9b41e298b0b 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -1353,7 +1353,7 @@ typedef enum RowMarkType * child relations will also have entries with isParent = true. The child * entries have rti == child rel's RT index and prti == top parent's RT index, * and can therefore be recognized as children by the fact that prti != rti. - * The parent's allMarkTypes field gets the OR of (1< Date: Mon, 13 Dec 2021 00:19:41 +0300 Subject: [PATCH 08/79] Hook for custom error cleanup --- src/backend/access/transam/xact.c | 2 ++ src/backend/postmaster/autovacuum.c | 1 + src/backend/postmaster/auxprocess.c | 1 + src/backend/postmaster/bgwriter.c | 1 + src/backend/postmaster/checkpointer.c | 2 ++ src/backend/postmaster/walwriter.c | 1 + src/backend/replication/walsender.c | 1 + src/backend/storage/lmgr/proc.c | 2 ++ src/backend/utils/error/elog.c | 9 +++++++++ src/include/utils/elog.h | 6 ++++++ 10 files changed, 26 insertions(+) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 4cecf630060..da7b20b3f05 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2770,6 +2770,7 @@ AbortTransaction(void) * while cleaning up! 
*/ LWLockReleaseAll(); + CustomErrorCleanup(); /* Clear wait information and command progress indicator */ pgstat_report_wait_end(); @@ -5180,6 +5181,7 @@ AbortSubTransaction(void) * Buffer locks, for example? I don't think so but I'm not sure. */ LWLockReleaseAll(); + CustomErrorCleanup(); pgstat_report_wait_end(); pgstat_progress_end_command(); diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 27d44a8e711..72f2665f3b3 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -458,6 +458,7 @@ AutoVacLauncherMain(char *startup_data, size_t startup_data_len) * transaction. */ LWLockReleaseAll(); + CustomErrorCleanup(); pgstat_report_wait_end(); UnlockBuffers(); /* this is probably dead code, but let's be safe: */ diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index 78f4263eeb1..4dae7ce9c3c 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -101,6 +101,7 @@ static void ShutdownAuxiliaryProcess(int code, Datum arg) { LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); } diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 0f75548759a..74cc63cc89f 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -167,6 +167,7 @@ BackgroundWriterMain(char *startup_data, size_t startup_data_len) * about in bgwriter, but we do have LWLocks, buffers, and temp files. 
*/ LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); UnlockBuffers(); ReleaseAuxProcessResources(false); diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 342376f933e..4caf069a20e 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -207,6 +207,7 @@ CheckpointerMain(char *startup_data, size_t startup_data_len) */ pqsignal(SIGCHLD, SIG_DFL); + /* * Initialize so that first time-driven event happens at the correct time. */ @@ -269,6 +270,7 @@ CheckpointerMain(char *startup_data, size_t startup_data_len) * files. */ LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); UnlockBuffers(); diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 6e7918a78d4..3cb439d377a 100644 --- a/src/backend/postmaster/walwriter.c +++ b/src/backend/postmaster/walwriter.c @@ -164,6 +164,7 @@ WalWriterMain(char *startup_data, size_t startup_data_len) * about in walwriter, but we do have LWLocks, and perhaps buffers? */ LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); UnlockBuffers(); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 4019eb90292..fc7a432f0a2 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -331,6 +331,7 @@ void WalSndErrorCleanup(void) { LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index b50e2eff218..3584bceec0c 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -866,6 +866,7 @@ ProcKill(int code, Datum arg) * facility by releasing our PGPROC ... 
*/ LWLockReleaseAll(); + CustomErrorCleanup(); /* Cancel any pending condition variable sleep, too */ ConditionVariableCancelSleep(); @@ -987,6 +988,7 @@ AuxiliaryProcKill(int code, Datum arg) /* Release any LW locks I am holding (see notes above) */ LWLockReleaseAll(); + CustomErrorCleanup(); /* Cancel any pending condition variable sleep, too */ ConditionVariableCancelSleep(); diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 948bec886a2..d95d3321e64 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -3770,3 +3770,12 @@ write_stderr(const char *fmt,...) #endif va_end(ap); } + +CustomErrorCleanupHookType CustomErrorCleanupHook = NULL; + +void +CustomErrorCleanup(void) +{ + if (CustomErrorCleanupHook) + CustomErrorCleanupHook(); +} diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h index e54eca5b489..f583eca37ee 100644 --- a/src/include/utils/elog.h +++ b/src/include/utils/elog.h @@ -537,4 +537,10 @@ extern void write_jsonlog(ErrorData *edata); */ extern void write_stderr(const char *fmt,...) pg_attribute_printf(1, 2); +typedef void (*CustomErrorCleanupHookType) (void); + +extern CustomErrorCleanupHookType CustomErrorCleanupHook; + +extern void CustomErrorCleanup(void); + #endif /* ELOG_H */ From d98bbc2e7a53209bfcae60f5cd9e668da192b77b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 01:51:03 +0300 Subject: [PATCH 09/79] Snapshot extension and hooks Snapshot have two pairing heap nodes: for data and system undos. 
--- src/backend/access/transam/xact.c | 11 ++++++++ src/backend/access/transam/xlog.c | 3 ++ src/backend/storage/ipc/procarray.c | 8 ++++++ src/backend/utils/time/snapmgr.c | 44 +++++++++++++++++++++++++++++ src/include/access/transam.h | 11 ++++++-- src/include/access/xlog.h | 1 + src/include/storage/proc.h | 1 + src/include/storage/procarray.h | 2 ++ src/include/utils/snapmgr.h | 11 +++++++- src/include/utils/snapshot.h | 13 +++++++++ 10 files changed, 102 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index da7b20b3f05..c740d46023a 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -212,6 +212,7 @@ typedef struct TransactionStateData bool parallelChildXact; /* is any parent transaction parallel? */ bool chain; /* start a new block after this one */ bool topXidLogged; /* for a subxact: is top-level XID logged? */ + CommitSeqNo csn; struct TransactionStateData *parent; /* back link to parent */ } TransactionStateData; @@ -245,6 +246,7 @@ static TransactionStateData TopTransactionStateData = { .state = TRANS_DEFAULT, .blockState = TBLOCK_DEFAULT, .topXidLogged = false, + .csn = COMMITSEQNO_INPROGRESS }; /* @@ -2035,6 +2037,7 @@ StartTransaction(void) */ s->state = TRANS_START; s->fullTransactionId = InvalidFullTransactionId; /* until assigned */ + s->csn = COMMITSEQNO_INPROGRESS; /* Determine if statements are logged in this transaction */ xact_is_sampled = log_xact_sample_rate != 0 && @@ -2336,7 +2339,9 @@ CommitTransaction(void) * must be done _before_ releasing locks we hold and _after_ * RecordTransactionCommit. */ + MyProc->lastCommittedCSN = s->csn; ProcArrayEndTransaction(MyProc, latestXid); + s->csn = MyProc->lastCommittedCSN; /* * This is all post-commit cleanup. 
Note that if an error is raised here, @@ -6384,3 +6389,9 @@ xact_redo(XLogReaderState *record) else elog(PANIC, "xact_redo: unknown op code %u", info); } + +CommitSeqNo +GetCurrentCSN(void) +{ + return TopTransactionStateData.csn; +} diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ffea4993177..5e2150f5b03 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -136,6 +136,7 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; int wal_decode_buffer_size = 512 * 1024; bool track_wal_io_timing = false; +CommitSeqNo startupCommitSeqNo = COMMITSEQNO_FIRST_NORMAL + 1; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -5051,6 +5052,7 @@ BootStrapXLOG(void) TransamVariables->nextXid = checkPoint.nextXid; TransamVariables->nextOid = checkPoint.nextOid; TransamVariables->oidCount = 0; + pg_atomic_write_u64(&TransamVariables->nextCommitSeqNo, COMMITSEQNO_FIRST_NORMAL + 1); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -5527,6 +5529,7 @@ StartupXLOG(void) TransamVariables->nextXid = checkPoint.nextXid; TransamVariables->nextOid = checkPoint.nextOid; TransamVariables->oidCount = 0; + pg_atomic_write_u64(&TransamVariables->nextCommitSeqNo, startupCommitSeqNo); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 7b931f028e9..593cf8f9252 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -306,6 +306,8 @@ static GlobalVisState GlobalVisTempRels; */ static TransactionId ComputeXidHorizonsResultLastXmin; +snapshot_hook_type snapshot_hook = NULL; + #ifdef XIDCACHE_DEBUG /* counters for 
XidCache measurement */ @@ -749,6 +751,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) proc->delayChkptFlags = 0; proc->recoveryConflictPending = false; + proc->lastCommittedCSN = pg_atomic_fetch_add_u64(&TransamVariables->nextCommitSeqNo, 1); /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ @@ -2234,6 +2237,8 @@ GetSnapshotData(Snapshot snapshot) if (GetSnapshotDataReuse(snapshot)) { + if (snapshot_hook) + snapshot_hook(snapshot); LWLockRelease(ProcArrayLock); return snapshot; } @@ -2415,6 +2420,9 @@ GetSnapshotData(Snapshot snapshot) if (!TransactionIdIsValid(MyProc->xmin)) MyProc->xmin = TransactionXmin = xmin; + if (snapshot_hook) + snapshot_hook(snapshot); + LWLockRelease(ProcArrayLock); /* maintain state for GlobalVis* */ diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index b41c307237f..f498d9fb42e 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -101,6 +101,10 @@ TransactionId RecentXmin = FirstNormalTransactionId; /* (table, ctid) => (cmin, cmax) mapping during timetravel */ static HTAB *tuplecid_data = NULL; +snapshot_hook_type snapshot_register_hook = NULL; +snapshot_hook_type snapshot_deregister_hook = NULL; +reset_xmin_hook_type reset_xmin_hook = NULL; + /* * Elements of the active snapshot stack. 
* @@ -201,6 +205,11 @@ typedef struct SerializedSnapshotData CommandId curcid; TimestampTz whenTaken; XLogRecPtr lsn; + CommitSeqNo snapshotcsn; + uint64 undoRegularLocation; + uint64 undoRegularXmin; + uint64 undoSystemLocation; + uint64 undoSystemXmin; } SerializedSnapshotData; /* @@ -263,6 +272,8 @@ GetTransactionSnapshot(void) /* Mark it as "registered" in FirstXactSnapshot */ FirstXactSnapshot->regd_count++; pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(FirstXactSnapshot); } else CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); @@ -403,6 +414,8 @@ GetNonHistoricCatalogSnapshot(Oid relid) * CatalogSnapshot pointer is already valid. */ pairingheap_add(&RegisteredSnapshots, &CatalogSnapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(CatalogSnapshot); } return CatalogSnapshot; @@ -424,6 +437,8 @@ InvalidateCatalogSnapshot(void) if (CatalogSnapshot) { pairingheap_remove(&RegisteredSnapshots, &CatalogSnapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(CatalogSnapshot); CatalogSnapshot = NULL; SnapshotResetXmin(); } @@ -558,6 +573,8 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, /* Mark it as "registered" in FirstXactSnapshot */ FirstXactSnapshot->regd_count++; pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(FirstXactSnapshot); } FirstSnapshotSet = true; @@ -820,7 +837,11 @@ RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner) ResourceOwnerRememberSnapshot(owner, snap); if (snap->regd_count == 1) + { pairingheap_add(&RegisteredSnapshots, &snap->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(snap); + } return snap; } @@ -863,7 +884,11 @@ UnregisterSnapshotNoOwner(Snapshot snapshot) snapshot->regd_count--; if (snapshot->regd_count == 0) + { pairingheap_remove(&RegisteredSnapshots, &snapshot->ph_node); 
+ if (snapshot_deregister_hook) + snapshot_deregister_hook(snapshot); + } if (snapshot->regd_count == 0 && snapshot->active_count == 0) { @@ -915,6 +940,9 @@ SnapshotResetXmin(void) { Snapshot minSnapshot; + if (reset_xmin_hook) + reset_xmin_hook(); + if (ActiveSnapshot != NULL) return; @@ -1008,6 +1036,8 @@ AtEOXact_Snapshot(bool isCommit, bool resetXmin) Assert(FirstXactSnapshot->regd_count > 0); Assert(!pairingheap_is_empty(&RegisteredSnapshots)); pairingheap_remove(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(FirstXactSnapshot); } FirstXactSnapshot = NULL; @@ -1039,6 +1069,8 @@ AtEOXact_Snapshot(bool isCommit, bool resetXmin) pairingheap_remove(&RegisteredSnapshots, &esnap->snapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(esnap->snapshot); } exportedSnapshots = NIL; @@ -1167,6 +1199,8 @@ ExportSnapshot(Snapshot snapshot) snapshot->regd_count++; pairingheap_add(&RegisteredSnapshots, &snapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(snapshot); /* * Fill buf with a text serialization of the snapshot, plus identification @@ -1729,6 +1763,11 @@ SerializeSnapshot(Snapshot snapshot, char *start_address) serialized_snapshot.curcid = snapshot->curcid; serialized_snapshot.whenTaken = snapshot->whenTaken; serialized_snapshot.lsn = snapshot->lsn; + serialized_snapshot.snapshotcsn = snapshot->snapshotcsn; + serialized_snapshot.undoRegularXmin = snapshot->undoRegularLocationPhNode.xmin; + serialized_snapshot.undoRegularLocation = snapshot->undoRegularLocationPhNode.undoLocation; + serialized_snapshot.undoSystemXmin = snapshot->undoSystemLocationPhNode.xmin; + serialized_snapshot.undoSystemLocation = snapshot->undoSystemLocationPhNode.undoLocation; /* * Ignore the SubXID array if it has overflowed, unless the snapshot was @@ -1804,6 +1843,11 @@ RestoreSnapshot(char *start_address) snapshot->whenTaken = serialized_snapshot.whenTaken; snapshot->lsn = 
serialized_snapshot.lsn; snapshot->snapXactCompletionCount = 0; + snapshot->snapshotcsn = serialized_snapshot.snapshotcsn; + snapshot->undoRegularLocationPhNode.xmin = serialized_snapshot.undoRegularXmin; + snapshot->undoRegularLocationPhNode.undoLocation = serialized_snapshot.undoRegularLocation; + snapshot->undoSystemLocationPhNode.xmin = serialized_snapshot.undoSystemXmin; + snapshot->undoSystemLocationPhNode.undoLocation = serialized_snapshot.undoSystemLocation; /* Copy XIDs, if present. */ if (serialized_snapshot.xcnt > 0) diff --git a/src/include/access/transam.h b/src/include/access/transam.h index fcf8466cba7..01d584f4965 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -15,7 +15,9 @@ #define TRANSAM_H #include "access/xlogdefs.h" - +#ifndef FRONTEND +#include "port/atomics.h" +#endif /* ---------------- * Special transaction ID values @@ -268,9 +270,13 @@ typedef struct TransamVariablesData */ TransactionId oldestClogXid; /* oldest it's safe to look up in clog */ +#ifndef FRONTEND + pg_atomic_uint64 nextCommitSeqNo; +#else + CommitSeqNo nextCommitSeqNo; +#endif } TransamVariablesData; - /* ---------------- * extern declarations * ---------------- @@ -310,6 +316,7 @@ extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid); extern bool ForceTransactionIdLimitUpdate(void); extern Oid GetNewObjectId(void); extern void StopGeneratingPinnedObjectIds(void); +extern CommitSeqNo GetCurrentCSN(void); #ifdef USE_ASSERT_CHECKING extern void AssertTransactionIdInAllowableRange(TransactionId xid); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 2c507ea618c..a88968ca648 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -56,6 +56,7 @@ extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; +extern PGDLLIMPORT CommitSeqNo startupCommitSeqNo; /* Archive modes */ typedef enum ArchiveMode diff --git 
a/src/include/storage/proc.h b/src/include/storage/proc.h index bf0a714d60e..2acc1bd080d 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -298,6 +298,7 @@ struct PGPROC bool fpVXIDLock; /* are we holding a fast-path VXID lock? */ LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID * lock */ + CommitSeqNo lastCommittedCSN; /* * Support for lock groups. Use LockHashPartitionLockByProc on the group diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 8ca60504622..5d065eebd42 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -100,4 +100,6 @@ extern void ProcArraySetReplicationSlotXmin(TransactionId xmin, extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin, TransactionId *catalog_xmin); +extern snapshot_hook_type snapshot_hook; + #endif /* PROCARRAY_H */ diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 9398a84051c..3f6952d9895 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -18,6 +18,9 @@ #include "utils/resowner.h" #include "utils/snapshot.h" +#ifndef SNAPSHOT_H +typedef void (*snapshot_hook_type) (Snapshot snapshot); +#endif extern PGDLLIMPORT bool FirstSnapshotSet; @@ -78,7 +81,7 @@ extern void PushActiveSnapshotWithLevel(Snapshot snapshot, int snap_level); extern void PushCopiedSnapshot(Snapshot snapshot); extern void UpdateActiveSnapshotCommandId(void); extern void PopActiveSnapshot(void); -extern Snapshot GetActiveSnapshot(void); +extern PGDLLIMPORT Snapshot GetActiveSnapshot(void); extern bool ActiveSnapshotSet(void); extern Snapshot RegisterSnapshot(Snapshot snapshot); @@ -127,4 +130,10 @@ extern void SerializeSnapshot(Snapshot snapshot, char *start_address); extern Snapshot RestoreSnapshot(char *start_address); extern void RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc); +typedef void (*reset_xmin_hook_type) (void); + +extern snapshot_hook_type 
snapshot_register_hook; +extern snapshot_hook_type snapshot_deregister_hook; +extern reset_xmin_hook_type reset_xmin_hook; + #endif /* SNAPMGR_H */ diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 8d1e31e888e..49c913b12f8 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -122,6 +122,13 @@ typedef struct SnapshotData *Snapshot; #define InvalidSnapshot ((Snapshot) NULL) +typedef struct +{ + uint64 undoLocation; /* undo log location retained by this snapshot */ + uint64 xmin; + pairingheap_node ph_node; +} RetainUndoLocationPHNode; + /* * Struct representing all kind of possible snapshots. * @@ -214,6 +221,12 @@ typedef struct SnapshotData * transactions completed since the last GetSnapshotData(). */ uint64 snapXactCompletionCount; + + RetainUndoLocationPHNode undoRegularLocationPhNode; + RetainUndoLocationPHNode undoSystemLocationPhNode; + CommitSeqNo snapshotcsn; } SnapshotData; +typedef void (*snapshot_hook_type) (Snapshot snapshot); + #endif /* SNAPSHOT_H */ From a469aee2a80cb261db4c055f343d1838ab70a55a Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 01:57:10 +0300 Subject: [PATCH 10/79] Hooks for builtin functions and datatypes and orioledb recovery * Added SearchCatCacheInternal_hook, SearchCatCacheList_hook * Added SysCacheGetAttr_hook --- src/backend/commands/indexcmds.c | 4 ++++ src/backend/executor/execExpr.c | 2 ++ src/backend/utils/cache/catcache.c | 25 +++++++++++++++++++++++++ src/backend/utils/cache/syscache.c | 10 ++++++++-- src/backend/utils/cache/typcache.c | 14 ++++++++++++++ src/backend/utils/fmgr/fmgr.c | 4 ++-- src/include/commands/defrem.h | 3 +++ src/include/utils/catcache.h | 24 ++++++++++++++++++++++++ src/include/utils/fmgrtab.h | 3 +++ src/include/utils/typcache.h | 5 +++++ 10 files changed, 90 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 75edb3697b3..b2cf89e1be6 100644 --- 
a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -73,6 +73,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" +GetDefaultOpClass_hook_type GetDefaultOpClass_hook = NULL; /* non-export function prototypes */ static bool CompareOpclassOptions(const Datum *opts1, const Datum *opts2, int natts); @@ -2290,6 +2291,9 @@ GetDefaultOpClass(Oid type_id, Oid am_id) /* If it's a domain, look at the base type instead */ type_id = getBaseType(type_id); + if (GetDefaultOpClass_hook) + return GetDefaultOpClass_hook(type_id, am_id); + tcategory = TypeCategory(type_id); /* diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c index 1e3b93a69d8..7fe19f3ea69 100644 --- a/src/backend/executor/execExpr.c +++ b/src/backend/executor/execExpr.c @@ -48,6 +48,8 @@ #include "utils/array.h" #include "utils/builtins.h" #include "utils/jsonfuncs.h" +#include "utils/json.h" +#include "utils/jsonb.h" #include "utils/jsonpath.h" #include "utils/lsyscache.h" #include "utils/typcache.h" diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 59d625b244c..5d5bf4fd806 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -83,6 +83,10 @@ static CatCInProgress *catcache_in_progress_stack = NULL; /* Cache management header --- pointer is NULL until created */ static CatCacheHeader *CacheHdr = NULL; +SearchCatCacheInternal_hook_type SearchCatCacheInternal_hook = NULL; +SearchCatCacheList_hook_type SearchCatCacheList_hook = NULL; +GetCatCacheHashValue_hook_type GetCatCacheHashValue_hook = NULL; + static inline HeapTuple SearchCatCacheInternal(CatCache *cache, int nkeys, Datum v1, Datum v2, @@ -1374,6 +1378,14 @@ SearchCatCacheInternal(CatCache *cache, dlist_head *bucket; CatCTup *ct; + if (SearchCatCacheInternal_hook) + { + ct = SearchCatCacheInternal_hook(cache, nkeys, v1, v2, v3, v4); + + if (ct) + return &ct->tuple; + } + /* Make sure we're in an xact, even if this ends 
up being a cache hit */ Assert(IsTransactionState()); @@ -1666,6 +1678,11 @@ GetCatCacheHashValue(CatCache *cache, Datum v3, Datum v4) { + if (GetCatCacheHashValue_hook) + { + return GetCatCacheHashValue_hook(cache, cache->cc_nkeys, + v1, v2, v3, v4); + } /* * one-time startup overhead for each cache */ @@ -1718,6 +1735,14 @@ SearchCatCacheList(CatCache *cache, CatCInProgress *save_in_progress; CatCInProgress in_progress_ent; + if (SearchCatCacheList_hook) + { + cl = SearchCatCacheList_hook(cache, nkeys, v1, v2, v3); + + if (cl) + return cl; + } + /* * one-time startup overhead for each cache */ diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index 88c47a99965..fbbe0f59411 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -97,6 +97,7 @@ static int SysCacheSupportingRelOidSize; static int oid_compare(const void *a, const void *b); +SysCacheGetAttr_hook_type SysCacheGetAttr_hook = NULL; /* * InitCatalogCache - initialize the caches @@ -602,6 +603,7 @@ SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull) { + TupleDesc cc_tupdesc = SysCache[cacheId]->cc_tupdesc; /* * We just need to get the TupleDesc out of the cache entry, and then we * can apply heap_getattr(). 
Normally the cache control data is already @@ -611,14 +613,18 @@ SysCacheGetAttr(int cacheId, HeapTuple tup, if (cacheId < 0 || cacheId >= SysCacheSize || !PointerIsValid(SysCache[cacheId])) elog(ERROR, "invalid cache ID: %d", cacheId); - if (!PointerIsValid(SysCache[cacheId]->cc_tupdesc)) + + if (!PointerIsValid(cc_tupdesc) && SysCacheGetAttr_hook) + cc_tupdesc = SysCacheGetAttr_hook(SysCache[cacheId]); + if (!PointerIsValid(cc_tupdesc)) { InitCatCachePhase2(SysCache[cacheId], false); Assert(PointerIsValid(SysCache[cacheId]->cc_tupdesc)); + cc_tupdesc = SysCache[cacheId]->cc_tupdesc; } return heap_getattr(tup, attributeNumber, - SysCache[cacheId]->cc_tupdesc, + cc_tupdesc, isNull); } diff --git a/src/backend/utils/cache/typcache.c b/src/backend/utils/cache/typcache.c index aa4720cb598..b18e50df27d 100644 --- a/src/backend/utils/cache/typcache.c +++ b/src/backend/utils/cache/typcache.c @@ -292,6 +292,8 @@ static int32 NextRecordTypmod = 0; /* number of entries used */ * as identifiers, so we start the counter at INVALID_TUPLEDESC_IDENTIFIER. 
*/ static uint64 tupledesc_id_counter = INVALID_TUPLEDESC_IDENTIFIER; +load_typcache_tupdesc_hook_type load_typcache_tupdesc_hook = NULL; +load_enum_cache_data_hook_type load_enum_cache_data_hook = NULL; static void load_typcache_tupdesc(TypeCacheEntry *typentry); static void load_rangetype_info(TypeCacheEntry *typentry); @@ -881,6 +883,12 @@ load_typcache_tupdesc(TypeCacheEntry *typentry) { Relation rel; + if (load_typcache_tupdesc_hook) + { + load_typcache_tupdesc_hook(typentry); + return; + } + if (!OidIsValid(typentry->typrelid)) /* should not happen */ elog(ERROR, "invalid typrelid for composite type %u", typentry->type_id); @@ -2563,6 +2571,12 @@ load_enum_cache_data(TypeCacheEntry *tcache) int bm_size, start_pos; + if (load_enum_cache_data_hook) + { + load_enum_cache_data_hook(tcache); + return; + } + /* Check that this is actually an enum */ if (tcache->typtype != TYPTYPE_ENUM) ereport(ERROR, diff --git a/src/backend/utils/fmgr/fmgr.c b/src/backend/utils/fmgr/fmgr.c index e48a86be54b..5b7888c705f 100644 --- a/src/backend/utils/fmgr/fmgr.c +++ b/src/backend/utils/fmgr/fmgr.c @@ -72,7 +72,7 @@ extern Datum fmgr_security_definer(PG_FUNCTION_ARGS); * or name, but search by Oid is much faster. */ -static const FmgrBuiltin * +const FmgrBuiltin * fmgr_isbuiltin(Oid id) { uint16 index; @@ -97,7 +97,7 @@ fmgr_isbuiltin(Oid id) * the array with the same name, but they should all point to the same * routine. 
*/ -static const FmgrBuiltin * +const FmgrBuiltin * fmgr_lookupByName(const char *name) { int i; diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 29c511e3196..a1ade77b732 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -158,4 +158,7 @@ extern int defGetTypeLength(DefElem *def); extern List *defGetStringList(DefElem *def); extern void errorConflictingDefElem(DefElem *defel, ParseState *pstate) pg_attribute_noreturn(); +typedef Oid (*GetDefaultOpClass_hook_type)(Oid type_id, Oid am_id); +extern PGDLLIMPORT GetDefaultOpClass_hook_type GetDefaultOpClass_hook; + #endif /* DEFREM_H */ diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h index 99169a93d91..b9cb53dbf0c 100644 --- a/src/include/utils/catcache.h +++ b/src/include/utils/catcache.h @@ -228,4 +228,28 @@ extern void PrepareToInvalidateCacheTuple(Relation relation, HeapTuple newtuple, void (*function) (int, uint32, Oid)); +typedef CatCTup *(*SearchCatCacheInternal_hook_type)(CatCache *cache, + int nkeys, + Datum v1, Datum v2, + Datum v3, Datum v4); +extern SearchCatCacheInternal_hook_type SearchCatCacheInternal_hook; + +typedef CatCList *(*SearchCatCacheList_hook_type)(CatCache *cache, + int nkeys, + Datum v1, + Datum v2, + Datum v3); +extern SearchCatCacheList_hook_type SearchCatCacheList_hook; + +typedef TupleDesc (*SysCacheGetAttr_hook_type)(CatCache *SysCache); +extern SysCacheGetAttr_hook_type SysCacheGetAttr_hook; + +typedef uint32 (*GetCatCacheHashValue_hook_type)(CatCache *cache, + int nkeys, + Datum v1, + Datum v2, + Datum v3, + Datum v4); +extern GetCatCacheHashValue_hook_type GetCatCacheHashValue_hook; + #endif /* CATCACHE_H */ diff --git a/src/include/utils/fmgrtab.h b/src/include/utils/fmgrtab.h index 151dd74055d..f8666ba7087 100644 --- a/src/include/utils/fmgrtab.h +++ b/src/include/utils/fmgrtab.h @@ -46,4 +46,7 @@ extern PGDLLIMPORT const Oid fmgr_last_builtin_oid; /* highest function OID in #define 
InvalidOidBuiltinMapping PG_UINT16_MAX extern PGDLLIMPORT const uint16 fmgr_builtin_oid_index[]; +extern const FmgrBuiltin *fmgr_isbuiltin(Oid id); +extern const FmgrBuiltin *fmgr_lookupByName(const char *name); + #endif /* FMGRTAB_H */ diff --git a/src/include/utils/typcache.h b/src/include/utils/typcache.h index f506cc4aa35..7c84978b7fa 100644 --- a/src/include/utils/typcache.h +++ b/src/include/utils/typcache.h @@ -207,4 +207,9 @@ extern void SharedRecordTypmodRegistryInit(SharedRecordTypmodRegistry *, extern void SharedRecordTypmodRegistryAttach(SharedRecordTypmodRegistry *); +typedef void (*load_typcache_tupdesc_hook_type)(TypeCacheEntry *typentry); +extern PGDLLIMPORT load_typcache_tupdesc_hook_type load_typcache_tupdesc_hook; +typedef void (*load_enum_cache_data_hook_type)(TypeCacheEntry *tcache); +extern PGDLLIMPORT load_enum_cache_data_hook_type load_enum_cache_data_hook; + #endif /* TYPCACHE_H */ From dcd403fffa8a082c138d39bd46425e65483dfcfe Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:01:16 +0300 Subject: [PATCH 11/79] Recovery and checkpointer hooks --- src/backend/access/transam/transam.c | 1 + src/backend/access/transam/xact.c | 4 ++++ src/backend/access/transam/xlog.c | 18 ++++++++++++++++++ src/backend/access/transam/xlogrecovery.c | 2 ++ src/backend/storage/buffer/bufmgr.c | 6 +++++- src/include/access/xact.h | 3 +++ src/include/access/xlog.h | 10 ++++++++++ 7 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c index 75b5325df8b..95647a357ea 100644 --- a/src/backend/access/transam/transam.c +++ b/src/backend/access/transam/transam.c @@ -22,6 +22,7 @@ #include "access/clog.h" #include "access/subtrans.h" #include "access/transam.h" +#include "storage/proc.h" #include "utils/snapmgr.h" /* diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index c740d46023a..198f1b403c5 100644 --- 
a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -325,6 +325,7 @@ typedef struct SubXactCallbackItem static SubXactCallbackItem *SubXact_callbacks = NULL; +xact_redo_hook_type xact_redo_hook = NULL; /* local function prototypes */ static void AssignTransactionId(TransactionState s); @@ -6080,6 +6081,9 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, TransactionId max_xid; TimestampTz commit_time; + if (xact_redo_hook) + xact_redo_hook(xid, lsn); + Assert(TransactionIdIsValid(xid)); max_xid = TransactionIdLatest(xid, parsed->nsubxacts, parsed->subxacts); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 5e2150f5b03..b6a11a5d561 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -144,6 +144,11 @@ bool XLOG_DEBUG = false; int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; +/* Hook for plugins to get control in CheckPointGuts() */ +CheckPoint_hook_type CheckPoint_hook = NULL; +double CheckPointProgress; +after_checkpoint_cleanup_hook_type after_checkpoint_cleanup_hook = NULL; + /* * Number of WAL insertion locks to use. A higher value allows more insertions * to happen concurrently, but adds some CPU overhead to flushing the WAL, @@ -5400,6 +5405,7 @@ StartupXLOG(void) XLogRecPtr missingContrecPtr; TransactionId oldestActiveXID; bool promoted = false; + bool wasInRecovery; /* * We should have an aux process resource owner to use, and we should not @@ -6052,6 +6058,8 @@ StartupXLOG(void) */ PreallocXlogFiles(EndOfLog, newTLI); + wasInRecovery = InRecovery; + /* * Okay, we're officially UP. */ @@ -6130,6 +6138,9 @@ StartupXLOG(void) */ CompleteCommitTsInitialization(); + if (wasInRecovery && after_checkpoint_cleanup_hook) + after_checkpoint_cleanup_hook(EndOfLog, 0); + /* * All done with end-of-recovery actions. 
* @@ -7347,6 +7358,9 @@ CreateCheckPoint(int flags) if (!RecoveryInProgress()) TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); + if (after_checkpoint_cleanup_hook) + after_checkpoint_cleanup_hook(ProcLastRecPtr, flags); + /* Real work is done; log and update stats. */ LogCheckpointEnd(false); @@ -7506,6 +7520,8 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { + if (CheckPoint_hook) + CheckPoint_hook(checkPointRedo, flags); CheckPointRelationMap(); CheckPointReplicationSlots(flags & CHECKPOINT_IS_SHUTDOWN); CheckPointSnapBuild(); @@ -9529,3 +9545,5 @@ SetWalWriterSleeping(bool sleeping) XLogCtl->WalWriterSleeping = sleeping; SpinLockRelease(&XLogCtl->info_lck); } + +void (*RedoShutdownHook) (void) = NULL; diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 1917cd4f449..cfcf4763b15 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1856,6 +1856,8 @@ PerformWalRecovery(void) * exit with special return code to request shutdown of * postmaster. Log messages issued from postmaster. */ + if (RedoShutdownHook != NULL) + RedoShutdownHook(); proc_exit(3); case RECOVERY_TARGET_ACTION_PAUSE: diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f8d30bf71e1..0f0dad39884 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3086,6 +3086,7 @@ BufferSync(int flags) BufferDesc *bufHdr = NULL; CkptTsStatus *ts_stat = (CkptTsStatus *) DatumGetPointer(binaryheap_first(ts_heap)); + double progress; buf_id = CkptBufferIds[ts_stat->index].buf_id; Assert(buf_id != -1); @@ -3140,7 +3141,10 @@ BufferSync(int flags) * * (This will check for barrier events even if it doesn't sleep.) 
*/ - CheckpointWriteDelay(flags, (double) num_processed / num_to_scan); + progress = (double) num_processed / num_to_scan; + progress = CheckPointProgress + progress * (1 - CheckPointProgress); + + CheckpointWriteDelay(flags, progress); } /* diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 6d4439f0524..327328da54c 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -527,4 +527,7 @@ extern void EnterParallelMode(void); extern void ExitParallelMode(void); extern bool IsInParallelMode(void); +typedef void (*xact_redo_hook_type) (TransactionId xid, XLogRecPtr lsn); +extern xact_redo_hook_type xact_redo_hook; + #endif /* XACT_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index a88968ca648..bd9eff2709a 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -308,4 +308,14 @@ extern SessionBackupState get_backup_status(void); /* files to signal promotion to primary */ #define PROMOTE_SIGNAL_FILE "promote" +typedef void (*CheckPoint_hook_type) (XLogRecPtr checkPointRedo, int flags); +extern PGDLLIMPORT CheckPoint_hook_type CheckPoint_hook; +extern double CheckPointProgress; +typedef void (*after_checkpoint_cleanup_hook_type)(XLogRecPtr checkPointRedo, + int flags); +extern PGDLLIMPORT after_checkpoint_cleanup_hook_type + after_checkpoint_cleanup_hook; + +extern void (*RedoShutdownHook) (void); + #endif /* XLOG_H */ From 4226b4860631cd5850037d604129a63cf49be2e6 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:07:13 +0300 Subject: [PATCH 12/79] Allow skipping logging for AccessExclusiveLock --- src/backend/storage/lmgr/lock.c | 14 ++++++++++++-- src/include/storage/lock.h | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index e5e7ab55716..f5825f20fc2 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -784,7 +784,7 @@ 
LockAcquireExtended(const LOCKTAG *locktag, bool reportMemoryError, LOCALLOCK **locallockp) { - LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LOCKMETHODID lockmethodid; LockMethod lockMethodTable; LOCALLOCKTAG localtag; LOCALLOCK *locallock; @@ -796,6 +796,15 @@ LockAcquireExtended(const LOCKTAG *locktag, LWLock *partitionLock; bool found_conflict; bool log_lock = false; + bool no_log_lock = false; + + if (locktag->locktag_lockmethodid == NO_LOG_LOCKMETHOD) + { + ((LOCKTAG *)locktag)->locktag_lockmethodid = DEFAULT_LOCKMETHOD; + no_log_lock = true; + } + + lockmethodid = locktag->locktag_lockmethodid; if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) elog(ERROR, "unrecognized lock method: %d", lockmethodid); @@ -910,7 +919,8 @@ LockAcquireExtended(const LOCKTAG *locktag, if (lockmode >= AccessExclusiveLock && locktag->locktag_type == LOCKTAG_RELATION && !RecoveryInProgress() && - XLogStandbyInfoActive()) + XLogStandbyInfoActive() && + !no_log_lock) { LogAccessExclusiveLockPrepare(); log_lock = true; diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index cc1f6e78c39..29b7226fe50 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -124,6 +124,7 @@ typedef uint16 LOCKMETHODID; /* These identify the known lock methods */ #define DEFAULT_LOCKMETHOD 1 #define USER_LOCKMETHOD 2 +#define NO_LOG_LOCKMETHOD 255 /* Skip logging of AccessExclusiveLock */ /* * LOCKTAG is the key information needed to look up a LOCK item in the From 2b02ef3af87f1f5f14fdee3645db35e579537375 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:08:21 +0300 Subject: [PATCH 13/79] Add convenience functions IsFatalError() have_backup_in_progress() SnapBuildNextPhaseAt() DoLocalLockExist() --- src/backend/access/transam/xlog.c | 13 +++++++++++++ src/backend/postmaster/postmaster.c | 6 ++++++ src/backend/replication/logical/snapbuild.c | 11 +++++++++++ src/backend/storage/lmgr/lock.c | 21 
+++++++++++++++++++++ src/include/access/xlog.h | 1 + src/include/postmaster/postmaster.h | 1 + src/include/replication/snapbuild.h | 1 + src/include/storage/lock.h | 1 + 8 files changed, 55 insertions(+) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index b6a11a5d561..1963127bb16 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9138,6 +9138,19 @@ get_backup_status(void) return sessionBackupState; } +/* + * Check if there is a backup in progress. + * + * We do this check without lock assuming 32-bit reads are atomic. In fact, + * the false result means that there was at least a moment of time when there + * were no backups. + */ +bool +have_backup_in_progress(void) +{ + return (XLogCtl->Insert.runningBackups > 0); +} + /* * do_pg_backup_stop * diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index d032091495b..abb7ae9c718 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -482,6 +482,12 @@ int postmaster_alive_fds[2] = {-1, -1}; HANDLE PostmasterHandle; #endif +bool +IsFatalError(void) +{ + return FatalError; +} + /* * Postmaster main entry point */ diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 5c3bbf0e93f..2972156c071 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -418,6 +418,17 @@ SnapBuildCurrentState(SnapBuild *builder) return builder->state; } +/* + * At which transaction id will the next phase of initial snapshot + * building happen? + */ +TransactionId +SnapBuildNextPhaseAt(SnapBuild *builder) +{ + return builder->next_phase_at; +} + + /* * Return the LSN at which the two-phase decoding was first enabled. 
*/ diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index f5825f20fc2..249cf961cc1 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -635,6 +635,27 @@ GetLockMethodLocalHash(void) } #endif +/* + * Returns true if any LOCKMODE lock with given locktag exists in LockMethodLocalHash. + */ +bool +DoLocalLockExist(const LOCKTAG *locktag) +{ + HASH_SEQ_STATUS scan_status; + LOCALLOCK* locallock; + + hash_seq_init(&scan_status, LockMethodLocalHash); + while ((locallock = (LOCALLOCK *) hash_seq_search(&scan_status)) != NULL) + { + if (memcmp(&locallock->tag.lock, locktag, sizeof(LOCKTAG)) == 0) + { + hash_seq_term(&scan_status); + return true; + } + } + return false; +} + /* * LockHasWaiters -- look up 'locktag' and check if releasing this * lock would wake up other processes waiting for it. diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index bd9eff2709a..da077b00ee1 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -293,6 +293,7 @@ extern void do_pg_backup_start(const char *backupidstr, bool fast, StringInfo tblspcmapfile); extern void do_pg_backup_stop(BackupState *state, bool waitforarchive); extern void do_pg_abort_backup(int code, Datum arg); +extern bool have_backup_in_progress(void); extern void register_persistent_abort_backup_handler(void); extern SessionBackupState get_backup_status(void); diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 89ad13b788b..9f1d8d7cd6c 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -54,6 +54,7 @@ extern PGDLLIMPORT const char *progname; extern PGDLLIMPORT bool LoadedSSL; +extern bool IsFatalError(void); extern void PostmasterMain(int argc, char *argv[]) pg_attribute_noreturn(); extern void ClosePostmasterPorts(bool am_syslogger); extern void InitProcessGlobals(void); diff --git a/src/include/replication/snapbuild.h 
b/src/include/replication/snapbuild.h index caa5113ff81..6eee98557ad 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -74,6 +74,7 @@ extern void SnapBuildClearExportedSnapshot(void); extern void SnapBuildResetExportedSnapshotState(void); extern SnapBuildState SnapBuildCurrentState(SnapBuild *builder); +extern TransactionId SnapBuildNextPhaseAt(SnapBuild *builder); extern Snapshot SnapBuildGetOrBuildSnapshot(SnapBuild *builder); extern bool SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index 29b7226fe50..d271c32cd31 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -550,6 +550,7 @@ extern LockMethod GetLocksMethodTable(const LOCK *lock); extern LockMethod GetLockTagsMethodTable(const LOCKTAG *locktag); extern uint32 LockTagHashCode(const LOCKTAG *locktag); extern bool DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2); +extern bool DoLocalLockExist(const LOCKTAG *locktag); extern LockAcquireResult LockAcquire(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock, From 2edd0b830767c92e1eb13b41ba7c0ba48723efcb Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:11:14 +0300 Subject: [PATCH 14/79] PERFORM_DELETION_OF_RELATION flag for object hooks --- src/backend/catalog/dependency.c | 36 +++++++++++++++++++++++++++++++- src/include/catalog/dependency.h | 2 ++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 0489cbabcb8..b3873fbd2ac 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -186,6 +186,7 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, int flags) { int i; + bool *depends_on_relation; /* * Keep track of objects for event triggers, if necessary. 
@@ -213,6 +214,33 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, } } + depends_on_relation = palloc0(sizeof(bool) * targetObjects->numrefs); + + for (i = targetObjects->numrefs - 1; i >= 0; i--) + { + ObjectAddressExtra *thisextra = targetObjects->extras + i; + int j; + + if (thisextra->dependee.classId == RelationRelationId && + thisextra->dependee.objectSubId == 0) + { + depends_on_relation[i] = true; + continue; + } + + for (j = i + 1; j < targetObjects->numrefs; j++) + { + ObjectAddress *depobj = targetObjects->refs + j; + if (depobj->classId == thisextra->dependee.classId && + depobj->objectId == thisextra->dependee.objectId && + depobj->objectSubId == thisextra->dependee.objectSubId) + { + depends_on_relation[i] = depends_on_relation[j]; + break; + } + } + } + /* * Delete all the objects in the proper order, except that if told to, we * should skip the original object(s). @@ -221,13 +249,19 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, { ObjectAddress *thisobj = targetObjects->refs + i; ObjectAddressExtra *thisextra = targetObjects->extras + i; + int temp_flags = flags; if ((flags & PERFORM_DELETION_SKIP_ORIGINAL) && (thisextra->flags & DEPFLAG_ORIGINAL)) continue; - deleteOneObject(thisobj, depRel, flags); + if (depends_on_relation[i]) + temp_flags |= PERFORM_DELETION_OF_RELATION; + + deleteOneObject(thisobj, depRel, temp_flags); } + + pfree(depends_on_relation); } /* diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 6908ca7180a..c9b59706373 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -96,6 +96,8 @@ typedef struct ObjectAddresses ObjectAddresses; #define PERFORM_DELETION_SKIP_EXTENSIONS 0x0010 /* keep extensions */ #define PERFORM_DELETION_CONCURRENT_LOCK 0x0020 /* normal drop with * concurrent lock mode */ +#define PERFORM_DELETION_OF_RELATION 0x0040 /* used for orioledb + * extension */ /* in dependency.c */ From 
a74c13c3c3dec79d0d99ba2d2a2ebeae5d683c6e Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:20:32 +0300 Subject: [PATCH 15/79] Expose existing planning funcs and structs --- src/backend/catalog/index.c | 5 +---- src/backend/commands/explain.c | 14 +++----------- src/backend/commands/indexcmds.c | 8 ++------ src/backend/optimizer/path/indxpath.c | 13 +------------ src/backend/optimizer/plan/createplan.c | 16 ++++++++++------ src/include/catalog/index.h | 2 ++ src/include/commands/defrem.h | 4 ++++ src/include/commands/explain.h | 8 ++++++++ src/include/optimizer/paths.h | 12 ++++++++++++ src/include/optimizer/planmain.h | 5 +++++ 10 files changed, 48 insertions(+), 39 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index abd8eef0865..cb69d84afa8 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -119,9 +119,6 @@ static void UpdateIndexRelation(Oid indexoid, Oid heapoid, bool immediate, bool isvalid, bool isready); -static void index_update_stats(Relation rel, - bool hasindex, - double reltuples); static void IndexCheckExclusion(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo); @@ -2777,7 +2774,7 @@ FormIndexDatum(IndexInfo *indexInfo, * index. When updating an index, it's important because some index AMs * expect a relcache flush to occur after REINDEX. 
*/ -static void +void index_update_stats(Relation rel, bool hasindex, double reltuples) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 8086607710e..e6c989aea19 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -81,9 +81,6 @@ static void report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es); static double elapsed_time(instr_time *starttime); static bool ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used); -static void ExplainNode(PlanState *planstate, List *ancestors, - const char *relationship, const char *plan_name, - ExplainState *es); static void show_plan_tlist(PlanState *planstate, List *ancestors, ExplainState *es); static void show_expression(Node *node, const char *qlabel, @@ -92,9 +89,6 @@ static void show_expression(Node *node, const char *qlabel, static void show_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, bool useprefix, ExplainState *es); -static void show_scan_qual(List *qual, const char *qlabel, - PlanState *planstate, List *ancestors, - ExplainState *es); static void show_upper_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, ExplainState *es); @@ -131,8 +125,6 @@ static void show_memoize_info(MemoizeState *mstate, List *ancestors, static void show_hashagg_info(AggState *aggstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); -static void show_instrumentation_count(const char *qlabel, int which, - PlanState *planstate, ExplainState *es); static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); @@ -1363,7 +1355,7 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) * to the nesting depth of logical output groups, and therefore is controlled * by 
ExplainOpenGroup/ExplainCloseGroup. */ -static void +void ExplainNode(PlanState *planstate, List *ancestors, const char *relationship, const char *plan_name, ExplainState *es) @@ -2527,7 +2519,7 @@ show_qual(List *qual, const char *qlabel, /* * Show a qualifier expression for a scan plan node */ -static void +void show_scan_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, ExplainState *es) @@ -3618,7 +3610,7 @@ show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es) * * "which" identifies which instrumentation counter to print */ -static void +void show_instrumentation_count(const char *qlabel, int which, PlanState *planstate, ExplainState *es) { diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index b2cf89e1be6..b0fa957a456 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -94,11 +94,7 @@ static void ComputeIndexAttrs(IndexInfo *indexInfo, Oid ddl_userid, int ddl_sec_context, int *ddl_save_nestlevel); -static char *ChooseIndexName(const char *tabname, Oid namespaceId, - const List *colnames, const List *exclusionOpNames, - bool primary, bool isconstraint); static char *ChooseIndexNameAddition(const List *colnames); -static List *ChooseIndexColumnNames(const List *indexElems); static void ReindexIndex(const ReindexStmt *stmt, const ReindexParams *params, bool isTopLevel); static void RangeVarCallbackForReindexIndex(const RangeVar *relation, @@ -2543,7 +2539,7 @@ ChooseRelationName(const char *name1, const char *name2, * * The argument list is pretty ad-hoc :-( */ -static char * +char * ChooseIndexName(const char *tabname, Oid namespaceId, const List *colnames, const List *exclusionOpNames, bool primary, bool isconstraint) @@ -2632,7 +2628,7 @@ ChooseIndexNameAddition(const List *colnames) * * Returns a List of plain strings (char *, not String nodes). 
*/ -static List * +List * ChooseIndexColumnNames(const List *indexElems) { List *result = NIL; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index c0fcc7d78df..7c043c53133 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -48,14 +48,6 @@ typedef enum ST_ANYSCAN, /* either is okay */ } ScanTypeControl; -/* Data structure for collecting qual clauses that match an index */ -typedef struct -{ - bool nonempty; /* True if lists are not all empty */ - /* Lists of IndexClause nodes, one list per index column */ - List *indexclauses[INDEX_MAX_KEYS]; -} IndexClauseSet; - /* Per-path data used within choose_bitmap_and() */ typedef struct { @@ -129,9 +121,6 @@ static double adjust_rowcount_for_semijoins(PlannerInfo *root, Index outer_relid, double rowcount); static double approximate_joinrel_size(PlannerInfo *root, Relids relids); -static void match_restriction_clauses_to_index(PlannerInfo *root, - IndexOptInfo *index, - IndexClauseSet *clauseset); static void match_join_clauses_to_index(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauseset, @@ -1964,7 +1953,7 @@ approximate_joinrel_size(PlannerInfo *root, Relids relids) * Identify restriction clauses for the rel that match the index. * Matching clauses are added to *clauseset. 
*/ -static void +void match_restriction_clauses_to_index(PlannerInfo *root, IndexOptInfo *index, IndexClauseSet *clauseset) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 25e126d1c39..3ab5e54338a 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -165,16 +165,12 @@ static MergeJoin *create_mergejoin_plan(PlannerInfo *root, MergePath *best_path) static HashJoin *create_hashjoin_plan(PlannerInfo *root, HashPath *best_path); static Node *replace_nestloop_params(PlannerInfo *root, Node *expr); static Node *replace_nestloop_params_mutator(Node *node, PlannerInfo *root); -static void fix_indexqual_references(PlannerInfo *root, IndexPath *index_path, - List **stripped_indexquals_p, - List **fixed_indexquals_p); static List *fix_indexorderby_references(PlannerInfo *root, IndexPath *index_path); static Node *fix_indexqual_clause(PlannerInfo *root, IndexOptInfo *index, int indexcol, Node *clause, List *indexcolnos); static Node *fix_indexqual_operand(Node *node, IndexOptInfo *index, int indexcol); static List *get_switched_clauses(List *clauses, Relids outerrelids); -static List *order_qual_clauses(PlannerInfo *root, List *clauses); static void copy_generic_path_info(Plan *dest, Path *src); static void copy_plan_costsize(Plan *dest, Plan *src); static void label_sort_with_costsize(PlannerInfo *root, Sort *plan, @@ -4939,6 +4935,14 @@ replace_nestloop_params(PlannerInfo *root, Node *expr) return replace_nestloop_params_mutator(expr, root); } +Node * +replace_nestloop_params_compat(PlannerInfo *root, Node *expr) +{ + /* No setup needed for tree walk, so away we go */ + return replace_nestloop_params_mutator(expr, root); +} + + static Node * replace_nestloop_params_mutator(Node *node, PlannerInfo *root) { @@ -5019,7 +5023,7 @@ replace_nestloop_params_mutator(Node *node, PlannerInfo *root) * are subplans in it (we need two separate copies of the subplan tree, or * things will 
go awry). */ -static void +void fix_indexqual_references(PlannerInfo *root, IndexPath *index_path, List **stripped_indexquals_p, List **fixed_indexquals_p) { @@ -5312,7 +5316,7 @@ get_switched_clauses(List *clauses, Relids outerrelids) * instead of bare clauses. This is another reason why trying to consider * selectivity in the ordering would likely do the wrong thing. */ -static List * +List * order_qual_clauses(PlannerInfo *root, List *clauses) { typedef struct diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 7d434f8e653..0beab397c79 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -215,4 +215,6 @@ itemptr_decode(ItemPointer itemptr, int64 encoded) ItemPointerSet(itemptr, block, offset); } +extern void index_update_stats(Relation rel, bool hasindex, double reltuples); + #endif /* INDEX_H */ diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index a1ade77b732..628e43dc33f 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -41,6 +41,10 @@ extern char *makeObjectName(const char *name1, const char *name2, extern char *ChooseRelationName(const char *name1, const char *name2, const char *label, Oid namespaceid, bool isconstraint); +extern List *ChooseIndexColumnNames(const List *indexElems); +extern char *ChooseIndexName(const char *tabname, Oid namespaceId, + const List *colnames, const List *exclusionOpNames, + bool primary, bool isconstraint); extern bool CheckIndexCompatible(Oid oldId, const char *accessMethodName, const List *attributeList, diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 9b8b351d9a2..5a6fabe8ed9 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -107,6 +107,14 @@ extern void ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, const instr_time *planduration, const BufferUsage *bufusage, const MemoryContextCounters *mem_counters); +extern void ExplainNode(PlanState 
*planstate, List *ancestors, + const char *relationship, const char *plan_name, + ExplainState *es); +extern void show_scan_qual(List *qual, const char *qlabel, + PlanState *planstate, List *ancestors, + ExplainState *es); +extern void show_instrumentation_count(const char *qlabel, int which, + PlanState *planstate, ExplainState *es); extern void ExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc); extern void ExplainPrintTriggers(ExplainState *es, QueryDesc *queryDesc); diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 5e88c0224a4..58a2deb0094 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -64,6 +64,14 @@ extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel, extern void generate_partitionwise_join_paths(PlannerInfo *root, RelOptInfo *rel); +/* Data structure for collecting qual clauses that match an index */ +typedef struct +{ + bool nonempty; /* True if lists are not all empty */ + /* Lists of IndexClause nodes, one list per index column */ + List *indexclauses[INDEX_MAX_KEYS]; +} IndexClauseSet; + /* * indxpath.c * routines to generate index paths @@ -79,6 +87,10 @@ extern bool match_index_to_operand(Node *operand, int indexcol, IndexOptInfo *index); extern void check_index_predicates(PlannerInfo *root, RelOptInfo *rel); +extern void match_restriction_clauses_to_index(PlannerInfo *root, + IndexOptInfo *index, + IndexClauseSet *clauseset); + /* * tidpath.c * routines to generate tid paths diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index aafc1737921..ef7658c378e 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -39,6 +39,11 @@ extern void preprocess_minmax_aggregates(PlannerInfo *root); * prototypes for plan/createplan.c */ extern Plan *create_plan(PlannerInfo *root, Path *best_path); +extern List *order_qual_clauses(PlannerInfo *root, List *clauses); +extern void 
fix_indexqual_references(PlannerInfo *root, IndexPath *index_path, + List **stripped_indexquals_p, + List **fixed_indexquals_p); +extern Node *replace_nestloop_params_compat(PlannerInfo *root, Node *expr); extern ForeignScan *make_foreignscan(List *qptlist, List *qpqual, Index scanrelid, List *fdw_exprs, List *fdw_private, List *fdw_scan_tlist, List *fdw_recheck_quals, From 9b69260bcaa5ade0192d50b06c01531e6c2dc947 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:22:17 +0300 Subject: [PATCH 16/79] Allow locks in checkpointer --- src/backend/postmaster/checkpointer.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 4caf069a20e..67d79892f8b 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -54,11 +54,20 @@ #include "storage/proc.h" #include "storage/procsignal.h" #include "storage/shmem.h" +#include "storage/sinvaladt.h" #include "storage/smgr.h" #include "storage/spin.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/resowner.h" +#include "utils/syscache.h" + +/* + * Included for InitializeTimeouts and RegisterTimeout functions that + * needed for correct working of OrioleDB checkpoint. + * See comment for InitializeTimeouts call in CheckpointerMain for details. + */ +#include "utils/timeout.h" /*---------- @@ -207,6 +216,20 @@ CheckpointerMain(char *startup_data, size_t startup_data_len) */ pqsignal(SIGCHLD, SIG_DFL); + /* + * To use OrioleDB checkpoint, we must initialize the data for the primary + * lock mechanism (lock.h) to work correctly. Because locks of this type are + * needed by the OrioleDB module for debug events and relation locks, but + * they are not used by the postgres checkpointer and are not initialized + * for it. 
+ */ + InitializeTimeouts(); /* establishes SIGALRM handler */ + InitDeadLockChecking(); + RegisterTimeout(DEADLOCK_TIMEOUT, CheckDeadLockAlert); + RelationCacheInitialize(); + InitCatalogCache(); + SharedInvalBackendInit(false); + /* * Initialize so that first time-driven event happens at the correct time. From 3f7fb15172bbd93dae58655e8a85faaf5c5e5b53 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 03:14:57 +0300 Subject: [PATCH 17/79] Add base_init_startup_hook and HandleStartupProcInterrupts_hook --- src/backend/postmaster/startup.c | 5 +++++ src/backend/utils/init/postinit.c | 5 ++++- src/include/postmaster/postmaster.h | 4 ++++ src/include/postmaster/startup.h | 3 +++ 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index ef6f98ebcd7..5cea0f97a30 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -75,6 +75,8 @@ static volatile sig_atomic_t startup_progress_timer_expired = false; */ int log_startup_progress_interval = 10000; /* 10 sec */ +HandleStartupProcInterrupts_hook_type HandleStartupProcInterrupts_hook = NULL; + /* Signal handlers */ static void StartupProcTriggerHandler(SIGNAL_ARGS); static void StartupProcSigHupHandler(SIGNAL_ARGS); @@ -157,6 +159,9 @@ HandleStartupProcInterrupts(void) static uint32 postmaster_poll_count = 0; #endif + if (HandleStartupProcInterrupts_hook) + HandleStartupProcInterrupts_hook(); + /* * Process any requests or signals received recently. 
*/ diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 2ed7c7c02db..b278c286d83 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -81,7 +81,7 @@ static void ClientCheckTimeoutHandler(void); static bool ThereIsAtLeastOneRole(void); static void process_startup_options(Port *port, bool am_superuser); static void process_settings(Oid databaseid, Oid roleid); - +base_init_startup_hook_type base_init_startup_hook = NULL; /*** InitPostgres support ***/ @@ -659,6 +659,9 @@ BaseInit(void) */ InitFileAccess(); + if (base_init_startup_hook) + base_init_startup_hook(); + /* * Initialize statistics reporting. This needs to happen early to ensure * that pgstat's shutdown callback runs after the shutdown callbacks of diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 9f1d8d7cd6c..67c9b39423a 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -65,6 +65,10 @@ extern bool PostmasterMarkPIDForWorkerNotify(int); extern void processCancelRequest(int backendPID, int32 cancelAuthCode); +typedef void (*base_init_startup_hook_type)(void); + +extern PGDLLIMPORT base_init_startup_hook_type base_init_startup_hook; + #ifdef EXEC_BACKEND extern Size ShmemBackendArraySize(void); extern void ShmemBackendArrayAllocation(void); diff --git a/src/include/postmaster/startup.h b/src/include/postmaster/startup.h index dde7ebde881..17b60012a90 100644 --- a/src/include/postmaster/startup.h +++ b/src/include/postmaster/startup.h @@ -23,7 +23,10 @@ ereport(LOG, errmsg(msg, secs, (usecs / 10000), __VA_ARGS__ )); \ } while(0) +typedef void (*HandleStartupProcInterrupts_hook_type)(void); + extern PGDLLIMPORT int log_startup_progress_interval; +extern PGDLLIMPORT HandleStartupProcInterrupts_hook_type HandleStartupProcInterrupts_hook; extern void HandleStartupProcInterrupts(void); extern void StartupProcessMain(char *startup_data, size_t 
startup_data_len) pg_attribute_noreturn(); From 88a083bc5f41b9b47c012b58cc8d21b94d0e2cd4 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 03:17:36 +0300 Subject: [PATCH 18/79] Don't cancel recovery processes because of deadlocks --- src/backend/storage/lmgr/proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 3584bceec0c..750d49c4b3d 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -1258,7 +1258,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable, bool dontWait) * If InHotStandby we set lock waits slightly later for clarity with other * code. */ - if (!InHotStandby) + if (!InHotStandby && !InRecovery) { if (LockTimeout > 0) { @@ -1618,7 +1618,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable, bool dontWait) * already caused QueryCancelPending to become set, we want the cancel to * be reported as a lock timeout, not a user cancel. 
*/ - if (!InHotStandby) + if (!InHotStandby && !InRecovery) { if (LockTimeout > 0) { From 246a6a0568906de11075261e8fda9fbf6bb5535b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 03:42:18 +0300 Subject: [PATCH 19/79] set_plain_rel_pathlist_hook --- src/backend/optimizer/path/allpaths.c | 7 +++++-- src/include/optimizer/paths.h | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 4895cee9944..7e02b670931 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -83,6 +83,7 @@ int min_parallel_index_scan_size; /* Hook for plugins to get control in set_rel_pathlist() */ set_rel_pathlist_hook_type set_rel_pathlist_hook = NULL; +set_plain_rel_pathlist_hook_type set_plain_rel_pathlist_hook = NULL; /* Hook for plugins to replace standard_join_search() */ join_search_hook_type join_search_hook = NULL; @@ -772,8 +773,10 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) */ required_outer = rel->lateral_relids; - /* Consider sequential scan */ - add_path(rel, create_seqscan_path(root, rel, required_outer, 0)); + if (!set_plain_rel_pathlist_hook || + set_plain_rel_pathlist_hook(root, rel, rte)) + /* Consider sequential scan */ + add_path(rel, create_seqscan_path(root, rel, required_outer, 0)); /* If appropriate, consider parallel sequential scan */ if (rel->consider_parallel && required_outer == NULL) diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 58a2deb0094..e15e83bc7a3 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -32,6 +32,10 @@ typedef void (*set_rel_pathlist_hook_type) (PlannerInfo *root, Index rti, RangeTblEntry *rte); extern PGDLLIMPORT set_rel_pathlist_hook_type set_rel_pathlist_hook; +typedef bool (*set_plain_rel_pathlist_hook_type)(PlannerInfo *root, + RelOptInfo *rel, + RangeTblEntry *rte); +extern 
PGDLLIMPORT set_plain_rel_pathlist_hook_type set_plain_rel_pathlist_hook; /* Hook for plugins to get control in add_paths_to_joinrel() */ typedef void (*set_join_pathlist_hook_type) (PlannerInfo *root, From 25e1f000d6bfa76cb6f34a8093cb5ffa028d98f7 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 14:17:57 +0300 Subject: [PATCH 20/79] Let locker tolerate being removed from the waiting queue without obtaining a lock. --- src/backend/storage/lmgr/lock.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 249cf961cc1..95ff26a20f7 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -1118,6 +1118,8 @@ LockAcquireExtended(const LOCKTAG *locktag, */ if (!(proclock->holdMask & LOCKBIT_ON(lockmode))) { + int i; + AbortStrongLockAcquire(); if (dontWait) @@ -1167,7 +1169,27 @@ LockAcquireExtended(const LOCKTAG *locktag, PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock); LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode); LWLockRelease(partitionLock); - elog(ERROR, "LockAcquire failed"); + /* + * We've been removed from the queue without obtaining a lock. + * That's OK, we're going to return LOCKACQUIRE_NOT_AVAIL, but + * need to release a local lock first. + */ + locallock->nLocks--; + for (i = 0; i < locallock->numLockOwners; i++) + { + if (locallock->lockOwners[i].owner == owner) + { + locallock->lockOwners[i].nLocks--; + if (locallock->lockOwners[i].nLocks == 0) + { + ResourceOwnerForgetLock(owner, locallock); + locallock->lockOwners[i] = locallock->lockOwners[--locallock->numLockOwners]; + } + break; + } + } + + return LOCKACQUIRE_NOT_AVAIL; } } PROCLOCK_PRINT("LockAcquire: granted", proclock); @@ -4687,8 +4709,8 @@ VirtualXactLock(VirtualTransactionId vxid, bool wait) LWLockRelease(&proc->fpInfoLock); /* Time to wait. 
*/ - (void) LockAcquire(&tag, ShareLock, false, false); - + if (LockAcquire(&tag, ShareLock, false, false) == LOCKACQUIRE_NOT_AVAIL) + return false; LockRelease(&tag, ShareLock, false); return XactLockForVirtualXact(vxid, xid, wait); } From 465a7a3b47553a8df7c6a926ed9f727d7da88d2b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 17 Feb 2022 07:46:49 +0300 Subject: [PATCH 21/79] Count extension wait events in pg_isolation_test_session_is_blocked() --- src/backend/utils/adt/lockfuncs.c | 3 +++ src/backend/utils/adt/waitfuncs.c | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index e790f856ab3..b26e51246c1 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -16,8 +16,11 @@ #include "funcapi.h" #include "miscadmin.h" #include "storage/predicate_internals.h" +#include "storage/proc.h" +#include "storage/procarray.h" #include "utils/array.h" #include "utils/builtins.h" +#include "utils/wait_event.h" /* diff --git a/src/backend/utils/adt/waitfuncs.c b/src/backend/utils/adt/waitfuncs.c index e135c9e5e45..c68b36121e3 100644 --- a/src/backend/utils/adt/waitfuncs.c +++ b/src/backend/utils/adt/waitfuncs.c @@ -38,6 +38,7 @@ Datum pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) { + PGPROC *blocked_proc; int blocked_pid = PG_GETARG_INT32(0); ArrayType *interesting_pids_a = PG_GETARG_ARRAYTYPE_P(1); PGPROC *proc; @@ -109,5 +110,9 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) if (GetSafeSnapshotBlockingPids(blocked_pid, &dummy, 1) > 0) PG_RETURN_BOOL(true); + blocked_proc = BackendPidGetProc(blocked_pid); + if ((blocked_proc->wait_event_info & 0xFF000000) == PG_WAIT_EXTENSION) + PG_RETURN_BOOL(true); + PG_RETURN_BOOL(false); } From 435ed018607b8eefbcf9fc551502ddb7e62724df Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 24 Feb 2022 03:19:39 +0300 Subject: [PATCH 22/79] Support for custom table AM in pgbench --- 
src/bin/pgbench/pgbench.c | 45 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 7d2811ebe42..23e8c75d9a8 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -210,6 +210,11 @@ double throttle_delay = 0; */ int64 latency_limit = 0; +/* + * tableam selection + */ +char *tableam = NULL; + /* * tablespace selection */ @@ -893,6 +898,7 @@ usage(void) " --partition-method=(range|hash)\n" " partition pgbench_accounts with this method (default: range)\n" " --partitions=NUM partition pgbench_accounts into NUM parts (default: 0)\n" + " --tableam=TABLEAM create tables using the specified tableam\n" " --tablespace=TABLESPACE create tables in the specified tablespace\n" " --unlogged-tables create tables as unlogged tables\n" "\nOptions to select what to run:\n" @@ -4793,14 +4799,34 @@ createPartitions(PGconn *con) appendPQExpBufferStr(&query, "maxvalue"); appendPQExpBufferChar(&query, ')'); + + if (tableam != NULL) + { + char *escape_tableam; + + escape_tableam = PQescapeIdentifier(con, tableam, strlen(tableam)); + appendPQExpBuffer(&query, " using %s", escape_tableam); + PQfreemem(escape_tableam); + } } else if (partition_method == PART_HASH) + { printfPQExpBuffer(&query, "create%s table pgbench_accounts_%d\n" " partition of pgbench_accounts\n" " for values with (modulus %d, remainder %d)", unlogged_tables ? 
" unlogged" : "", p, partitions, p - 1); + + if (tableam != NULL) + { + char *escape_tableam; + + escape_tableam = PQescapeIdentifier(con, tableam, strlen(tableam)); + appendPQExpBuffer(&query, " using %s", escape_tableam); + PQfreemem(escape_tableam); + } + } else /* cannot get there */ Assert(0); @@ -4887,10 +4913,20 @@ initCreateTables(PGconn *con) if (partition_method != PART_NONE && strcmp(ddl->table, "pgbench_accounts") == 0) appendPQExpBuffer(&query, " partition by %s (aid)", PARTITION_METHOD[partition_method]); - else if (ddl->declare_fillfactor) + else { + if (tableam != NULL) + { + char *escape_tableam; + + escape_tableam = PQescapeIdentifier(con, tableam, strlen(tableam)); + appendPQExpBuffer(&query, " using %s", escape_tableam); + PQfreemem(escape_tableam); + } + /* fillfactor is only expected on actual tables */ - appendPQExpBuffer(&query, " with (fillfactor=%d)", fillfactor); + if (ddl->declare_fillfactor) + appendPQExpBuffer(&query, " with (fillfactor=%d)", fillfactor); } if (tablespace != NULL) @@ -6699,6 +6735,7 @@ main(int argc, char **argv) {"verbose-errors", no_argument, NULL, 15}, {"exit-on-abort", no_argument, NULL, 16}, {"debug", no_argument, NULL, 17}, + {"tableam", required_argument, NULL, 18}, {NULL, 0, NULL, 0} }; @@ -7039,6 +7076,10 @@ main(int argc, char **argv) case 17: /* debug */ pg_logging_increase_verbosity(); break; + case 18: /* tableam */ + initialization_option_set = true; + tableam = pg_strdup(optarg); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); From 108092953bede5b623761e6ad33615b1d22df509 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Wed, 2 Mar 2022 14:49:29 +0300 Subject: [PATCH 23/79] Support for outline atomics on aarch64 Outline-atomics is a gcc compilation flag that enables runtime detection of CPU support for atomic instructions. 
Performance on CPUs that do support atomic instructions is improved, while compatibility and performance on CPUs without atomic instructions is not hurt. Discussion: https://postgr.es/m/flat/099F69EE-51D3-4214-934A-1F28C0A1A7A7%40amazon.com Author: Tsahi Zidenberg --- configure | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++ configure.ac | 4 +++ 2 files changed, 97 insertions(+) diff --git a/configure b/configure index 240e84a2099..eff1c7f8147 100755 --- a/configure +++ b/configure @@ -6663,6 +6663,99 @@ fi if test -n "$NOT_THE_CFLAGS"; then CFLAGS="$CFLAGS -Wno-cast-function-type-strict" fi + if test x"$host_cpu" == x"aarch64"; then + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -moutline-atomics, for CFLAGS" >&5 +$as_echo_n "checking whether ${CC} supports -moutline-atomics, for CFLAGS... " >&6; } +if ${pgac_cv_prog_CC_cflags__moutline_atomics+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +pgac_save_CC=$CC +CC=${CC} +CFLAGS="${CFLAGS} -moutline-atomics" +ac_save_c_werror_flag=$ac_c_werror_flag +ac_c_werror_flag=yes +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_prog_CC_cflags__moutline_atomics=yes +else + pgac_cv_prog_CC_cflags__moutline_atomics=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_c_werror_flag=$ac_save_c_werror_flag +CFLAGS="$pgac_save_CFLAGS" +CC="$pgac_save_CC" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CC_cflags__moutline_atomics" >&5 +$as_echo "$pgac_cv_prog_CC_cflags__moutline_atomics" >&6; } +if test x"$pgac_cv_prog_CC_cflags__moutline_atomics" = x"yes"; then + CFLAGS="${CFLAGS} -moutline-atomics" +fi + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CXX} supports -moutline-atomics, for CXXFLAGS" >&5 +$as_echo_n "checking whether ${CXX} supports -moutline-atomics, for CXXFLAGS... 
" >&6; } +if ${pgac_cv_prog_CXX_cxxflags__moutline_atomics+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CXXFLAGS=$CXXFLAGS +pgac_save_CXX=$CXX +CXX=${CXX} +CXXFLAGS="${CXXFLAGS} -moutline-atomics" +ac_save_cxx_werror_flag=$ac_cxx_werror_flag +ac_cxx_werror_flag=yes +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + pgac_cv_prog_CXX_cxxflags__moutline_atomics=yes +else + pgac_cv_prog_CXX_cxxflags__moutline_atomics=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +ac_cxx_werror_flag=$ac_save_cxx_werror_flag +CXXFLAGS="$pgac_save_CXXFLAGS" +CXX="$pgac_save_CXX" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CXX_cxxflags__moutline_atomics" >&5 +$as_echo "$pgac_cv_prog_CXX_cxxflags__moutline_atomics" >&6; } +if test x"$pgac_cv_prog_CXX_cxxflags__moutline_atomics" = x"yes"; then + CXXFLAGS="${CXXFLAGS} -moutline-atomics" +fi + + + fi elif test "$ICC" = yes; then # Intel's compiler has a bug/misoptimization in checking for # division by NAN (NaN == 0), -mp1 fixes it, so add it to the CFLAGS. 
diff --git a/configure.ac b/configure.ac index d1c56f0125a..8a73fa1fd14 100644 --- a/configure.ac +++ b/configure.ac @@ -580,6 +580,10 @@ if test "$GCC" = yes -a "$ICC" = no; then if test -n "$NOT_THE_CFLAGS"; then CFLAGS="$CFLAGS -Wno-cast-function-type-strict" fi + if test x"$host_cpu" == x"aarch64"; then + PGAC_PROG_CC_CFLAGS_OPT([-moutline-atomics]) + PGAC_PROG_CXX_CFLAGS_OPT([-moutline-atomics]) + fi elif test "$ICC" = yes; then # Intel's compiler has a bug/misoptimization in checking for # division by NAN (NaN == 0), -mp1 fixes it, so add it to the CFLAGS. From 214bc21c3c285df6c6ae001dddaea781fac5cc10 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 17 Feb 2022 08:43:32 +0300 Subject: [PATCH 24/79] OrioleDB specific CI --- .github/workflows/build.yml | 31 +++++++++++++++++++++++++++++++ ci/build.sh | 21 +++++++++++++++++++++ ci/check.sh | 11 +++++++++++ ci/check_output.sh | 30 ++++++++++++++++++++++++++++++ ci/prerequisites.sh | 22 ++++++++++++++++++++++ configure | 5 +++++ configure.ac | 4 ++++ meson.build | 1 + src/Makefile.global.in | 3 +++ src/bin/pg_rewind/meson.build | 6 ++++++ src/makefiles/meson.build | 1 + 11 files changed, 135 insertions(+) create mode 100644 .github/workflows/build.yml create mode 100644 ci/build.sh create mode 100644 ci/check.sh create mode 100644 ci/check_output.sh create mode 100644 ci/prerequisites.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000000..c6f1bef64aa --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,31 @@ +name: build + +on: + push: + pull_request: + +jobs: + test: + runs-on: + - ubuntu-20.04 + strategy: + fail-fast: false + matrix: + compiler: [clang, gcc] + check_type: [normal, debug] + env: + LLVM_VER: 10 + COMPILER: ${{ matrix.compiler }} + CHECK_TYPE: ${{ matrix.check_type }} + steps: + - name: Checkout code into workspace directory + uses: actions/checkout@v2 + - name: Setup prerequisites + run: bash ./ci/prerequisites.sh + - 
name: Build + run: bash ./ci/build.sh + - name: Check + run: bash ./ci/check.sh + - name: Check output + run: bash ./ci/check_output.sh + if: ${{ success() || failure() }} diff --git a/ci/build.sh b/ci/build.sh new file mode 100644 index 00000000000..f541929e69c --- /dev/null +++ b/ci/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -eu + +if [ $COMPILER = "clang" ]; then + export CC=clang-$LLVM_VER +else + export CC=gcc +fi + +# configure & build +if [ $CHECK_TYPE = "debug" ]; then + CFLAGS="-O0" ./configure --enable-debug --enable-cassert --enable-tap-tests --with-icu +else + ./configure --disable-debug --disable-cassert --enable-tap-tests --with-icu +fi + +make -sj4 +cd contrib +make -sj4 +cd .. diff --git a/ci/check.sh b/ci/check.sh new file mode 100644 index 00000000000..faa8c25e84a --- /dev/null +++ b/ci/check.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -eu + +# unsets limit for coredumps size +ulimit -c unlimited -S +# sets a coredump file pattern +mkdir -p /tmp/cores-$GITHUB_SHA-$TIMESTAMP +sudo sh -c "echo \"/tmp/cores-$GITHUB_SHA-$TIMESTAMP/%t_%p_%s.core\" > /proc/sys/kernel/core_pattern" + +make check-world -j4 diff --git a/ci/check_output.sh b/ci/check_output.sh new file mode 100644 index 00000000000..ae26cf63d68 --- /dev/null +++ b/ci/check_output.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -eu + +status=0 + +# show diff if it exists +for f in ` find . 
-name regression.diffs ` ; do + echo "========= Contents of $f" + cat $f + status=1 +done + +# check core dumps if any +cores=$(find /tmp/cores-$GITHUB_SHA-$TIMESTAMP/ -name '*.core' 2>/dev/null) + +if [ -n "$cores" ]; then + for corefile in $cores ; do + if [[ $corefile != *_3.core ]]; then + binary=$(gdb -quiet -core $corefile -batch -ex 'info auxv' | grep AT_EXECFN | perl -pe "s/^.*\"(.*)\"\$/\$1/g") + echo dumping $corefile for $binary + gdb --batch --quiet -ex "thread apply all bt full" -ex "quit" $binary $corefile + status=1 + fi + done +fi + +rm -rf /tmp/cores-$GITHUB_SHA-$TIMESTAMP + +exit $status diff --git a/ci/prerequisites.sh b/ci/prerequisites.sh new file mode 100644 index 00000000000..b26251b711c --- /dev/null +++ b/ci/prerequisites.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -eu + +# print the hostname to be able to identify runner by logs +echo "HOSTNAME=`hostname`" +TIMESTAMP=$(date +%s) +echo "TIMESTAMP=$TIMESTAMP" >> $GITHUB_ENV +echo "TIMESTAMP=$TIMESTAMP" + +sudo apt-get -y install -qq wget ca-certificates + +sudo apt-get update -qq + +apt_packages="build-essential flex bison pkg-config libreadline-dev make gdb libipc-run-perl libicu-dev python3 python3-dev python3-pip python3-setuptools python3-testresources" + +if [ $COMPILER = "clang" ]; then + apt_packages="$apt_packages llvm-$LLVM_VER clang-$LLVM_VER clang-tools-$LLVM_VER" +fi + +# install required packages +sudo apt-get -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" -y install -qq $apt_packages diff --git a/configure b/configure index eff1c7f8147..d73589b56a2 100755 --- a/configure +++ b/configure @@ -628,6 +628,7 @@ ac_includes_default="\ ac_subst_vars='LTLIBOBJS vpath_build PG_SYSROOT +ORIOLEDB_PATCHSET_VERSION PG_VERSION_NUM LDFLAGS_EX_BE PROVE @@ -19376,6 +19377,10 @@ _ACEOF +# Needed to check postgresql patches git tag during orioledb extension build +ORIOLEDB_PATCHSET_VERSION=`git describe --tags | cut -d'_' -f2` + + # If we are inserting PG_SYSROOT into 
CPPFLAGS, do so symbolically not # literally, so that it's possible to override it at build time using # a command like "make ... PG_SYSROOT=path". This has to be done after diff --git a/configure.ac b/configure.ac index 8a73fa1fd14..45baa47af54 100644 --- a/configure.ac +++ b/configure.ac @@ -2461,6 +2461,10 @@ $AWK '{printf "%d%04d", $1, $2}'`"] AC_DEFINE_UNQUOTED(PG_VERSION_NUM, $PG_VERSION_NUM, [PostgreSQL version as a number]) AC_SUBST(PG_VERSION_NUM) +# Needed to check postgresql patches git tag during orioledb extension build +[ORIOLEDB_PATCHSET_VERSION=`git describe --tags | cut -d'_' -f2`] +AC_SUBST(ORIOLEDB_PATCHSET_VERSION) + # If we are inserting PG_SYSROOT into CPPFLAGS, do so symbolically not # literally, so that it's possible to override it at build time using # a command like "make ... PG_SYSROOT=path". This has to be done after diff --git a/meson.build b/meson.build index 5c33f569b43..7f173875e94 100644 --- a/meson.build +++ b/meson.build @@ -153,6 +153,7 @@ cdata.set('PG_VERSION_NUM', pg_version_num) # PG_VERSION_STR is built later, it depends on compiler test results cdata.set_quoted('CONFIGURE_ARGS', '') +orioledb_patchset_version = '22' ############################################################### diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 0c6c31b5bee..72271af0ab1 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -44,6 +44,9 @@ VERSION_NUM = @PG_VERSION_NUM@ PACKAGE_URL = @PACKAGE_URL@ +# OrioleDB patchset git tag number +ORIOLEDB_PATCHSET_VERSION = @ORIOLEDB_PATCHSET_VERSION@ + # Set top_srcdir, srcdir, and VPATH. 
ifdef PGXS top_srcdir = $(top_builddir) diff --git a/src/bin/pg_rewind/meson.build b/src/bin/pg_rewind/meson.build index 200ebf84eb9..4a874bbf19d 100644 --- a/src/bin/pg_rewind/meson.build +++ b/src/bin/pg_rewind/meson.build @@ -2,6 +2,7 @@ pg_rewind_sources = files( 'datapagemap.c', + 'extension.c', 'file_ops.c', 'filemap.c', 'libpq_source.c', @@ -23,6 +24,7 @@ pg_rewind = executable('pg_rewind', pg_rewind_sources, dependencies: [frontend_code, libpq, lz4, zstd], c_args: ['-DFRONTEND'], # needed for xlogreader et al + export_dynamic: true, kwargs: default_bin_args, ) bin_targets += pg_rewind @@ -49,3 +51,7 @@ tests += { } subdir('po', if_found: libintl) + +install_headers( + 'pg_rewind_ext.h' +) \ No newline at end of file diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index 5618050b306..192d3303f55 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -37,6 +37,7 @@ pgxs_kv = { 'PACKAGE_VERSION': pg_version, 'PG_MAJORVERSION': pg_version_major, 'PG_VERSION_NUM': pg_version_num, + 'ORIOLEDB_PATCHSET_VERSION': orioledb_patchset_version, 'configure_input': 'meson', 'vpath_build': 'yes', From 31831e1548ee96ac13e59b2c5a8a60485607391b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 9 Apr 2023 01:57:21 +0300 Subject: [PATCH 25/79] Close indices in AttachPartitionEnsureIndexes() before DefineIndex() --- src/backend/commands/tablecmds.c | 43 +++++++++++++++++++------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 76912a87b8c..eb68bd8b552 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -18815,12 +18815,14 @@ static void AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) { List *idxes; + List *buildIdxes = NIL; List *attachRelIdxs; Relation *attachrelIdxRels; IndexInfo **attachInfos; ListCell *cell; MemoryContext cxt; MemoryContext oldcxt; + AttrMap *attmap; 
cxt = AllocSetContextCreate(CurrentMemoryContext, "AttachPartitionEnsureIndexes", @@ -18869,6 +18871,10 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) goto out; } + attmap = build_attrmap_by_name(RelationGetDescr(attachrel), + RelationGetDescr(rel), + false); + /* * For each index on the partitioned table, find a matching one in the * partition-to-be; if one is not found, create one. @@ -18878,7 +18884,6 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) Oid idx = lfirst_oid(cell); Relation idxRel = index_open(idx, AccessShareLock); IndexInfo *info; - AttrMap *attmap; bool found = false; Oid constraintOid; @@ -18894,9 +18899,6 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) /* construct an indexinfo to compare existing indexes against */ info = BuildIndexInfo(idxRel); - attmap = build_attrmap_by_name(RelationGetDescr(attachrel), - RelationGetDescr(rel), - false); constraintOid = get_relation_idx_constraint_oid(RelationGetRelid(rel), idx); /* @@ -18962,19 +18964,7 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) * now. */ if (!found) - { - IndexStmt *stmt; - Oid conOid; - - stmt = generateClonedIndexStmt(NULL, - idxRel, attmap, - &conOid); - DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, - RelationGetRelid(idxRel), - conOid, - -1, - true, false, false, false, false); - } + buildIdxes = lappend_oid(buildIdxes, RelationGetRelid(idxRel)); index_close(idxRel, AccessShareLock); } @@ -18983,6 +18973,25 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) /* Clean up. 
*/ for (int i = 0; i < list_length(attachRelIdxs); i++) index_close(attachrelIdxRels[i], AccessShareLock); + + foreach(cell, buildIdxes) + { + Oid idx = lfirst_oid(cell); + Relation idxRel = index_open(idx, AccessShareLock); + IndexStmt *stmt; + Oid conOid; + + stmt = generateClonedIndexStmt(NULL, + idxRel, attmap, + &conOid); + DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, + RelationGetRelid(idxRel), + conOid, + -1, + true, false, false, false, false); + index_close(idxRel, AccessShareLock); + } + MemoryContextSwitchTo(oldcxt); MemoryContextDelete(cxt); } From 1a7a81fd6bfe8d0f523ef0fd7b38e07b7755847b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Fri, 30 Jun 2023 01:35:54 +0300 Subject: [PATCH 26/79] New BGWORKER_CLASS_SYSTEM bgworkers class They are allowed to stay during shutdown checkpointing and help checkpointer do its work. --- src/backend/postmaster/postmaster.c | 39 +++++++++++++++++++++-------- src/include/postmaster/bgworker.h | 6 +++++ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index abb7ae9c718..fd6097abf98 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -137,7 +137,8 @@ #define BACKEND_TYPE_AUTOVAC 0x0002 /* autovacuum worker process */ #define BACKEND_TYPE_WALSND 0x0004 /* walsender process */ #define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */ -#define BACKEND_TYPE_ALL 0x000F /* OR of all the above */ +#define BACKEND_TYPE_SYSTEM_BGWORKER 0x0010 /* system bgworker process */ +#define BACKEND_TYPE_ALL 0x001F /* OR of all the above */ /* * List of active backends (or child processes anyway; we don't actually @@ -1896,8 +1897,9 @@ processCancelRequest(int backendPID, int32 cancelAuthCode) /* * canAcceptConnections --- check to see if database state allows connections * of the specified type. backend_type can be BACKEND_TYPE_NORMAL, - * BACKEND_TYPE_AUTOVAC, or BACKEND_TYPE_BGWORKER. 
(Note that we don't yet - * know whether a NORMAL connection might turn into a walsender.) + * BACKEND_TYPE_AUTOVAC, BACKEND_TYPE_BGWORKER or BACKEND_TYPE_SYSTEM_BGWORKER. + * (Note that we don't yet know whether a NORMAL connection might turn into + * a walsender.) */ static CAC_state canAcceptConnections(int backend_type) @@ -1911,7 +1913,8 @@ canAcceptConnections(int backend_type) * bgworker_should_start_now() decided whether the DB state allows them. */ if (pmState != PM_RUN && pmState != PM_HOT_STANDBY && - backend_type != BACKEND_TYPE_BGWORKER) + backend_type != BACKEND_TYPE_BGWORKER && + backend_type != BACKEND_TYPE_SYSTEM_BGWORKER) { if (Shutdown > NoShutdown) return CAC_SHUTDOWN; /* shutdown is pending */ @@ -2541,6 +2544,13 @@ process_pm_child_exit(void) if (PgArchPID != 0) signal_child(PgArchPID, SIGUSR2); + /* + * Terminate system background workers since checpoint is + * complete. + */ + SignalSomeChildren(SIGTERM, + BACKEND_TYPE_SYSTEM_BGWORKER); + /* * Waken walsenders for the last time. No regular backends * should be around anymore. @@ -2972,7 +2982,8 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) * Background workers were already processed above; ignore them * here. */ - if (bp->bkend_type == BACKEND_TYPE_BGWORKER) + if (bp->bkend_type == BACKEND_TYPE_BGWORKER || + bp->bkend_type == BACKEND_TYPE_SYSTEM_BGWORKER) continue; if (take_action) @@ -3163,7 +3174,7 @@ PostmasterStateMachine(void) /* Signal all backend children except walsenders */ SignalSomeChildren(SIGTERM, - BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND); + BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND - BACKEND_TYPE_SYSTEM_BGWORKER); /* and the autovac launcher too */ if (AutoVacPID != 0) signal_child(AutoVacPID, SIGTERM); @@ -3205,7 +3216,7 @@ PostmasterStateMachine(void) * here. Walsenders and archiver are also disregarded, they will be * terminated later after writing the checkpoint record. 
*/ - if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND) == 0 && + if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND - BACKEND_TYPE_SYSTEM_BGWORKER) == 0 && StartupPID == 0 && WalReceiverPID == 0 && WalSummarizerPID == 0 && @@ -4308,16 +4319,20 @@ do_start_bgworker(RegisteredBgWorker *rw) * specified start_time? */ static bool -bgworker_should_start_now(BgWorkerStartTime start_time) +bgworker_should_start_now(BgWorkerStartTime start_time, int flags) { switch (pmState) { case PM_NO_CHILDREN: case PM_WAIT_DEAD_END: case PM_SHUTDOWN_2: + break; + case PM_SHUTDOWN: case PM_WAIT_BACKENDS: case PM_STOP_BACKENDS: + if (flags & BGWORKER_CLASS_SYSTEM) + return true; break; case PM_RUN: @@ -4392,7 +4407,10 @@ assign_backendlist_entry(RegisteredBgWorker *rw) bn->cancel_key = MyCancelKey; bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); - bn->bkend_type = BACKEND_TYPE_BGWORKER; + if (rw->rw_worker.bgw_flags & BGWORKER_CLASS_SYSTEM) + bn->bkend_type = BACKEND_TYPE_SYSTEM_BGWORKER; + else + bn->bkend_type = BACKEND_TYPE_BGWORKER; bn->dead_end = false; bn->bgworker_notify = false; @@ -4490,7 +4508,8 @@ maybe_start_bgworkers(void) } } - if (bgworker_should_start_now(rw->rw_worker.bgw_start_time)) + if (bgworker_should_start_now(rw->rw_worker.bgw_start_time, + rw->rw_worker.bgw_flags)) { /* reset crash time before trying to start worker */ rw->rw_crashed_at = 0; diff --git a/src/include/postmaster/bgworker.h b/src/include/postmaster/bgworker.h index 22fc49ec27f..9a1cac58dee 100644 --- a/src/include/postmaster/bgworker.h +++ b/src/include/postmaster/bgworker.h @@ -66,6 +66,12 @@ * background workers should not use this class. */ #define BGWORKER_CLASS_PARALLEL 0x0010 + +/* + * This class of bgworkers are allowed to stay working during shutdown + * checkpointing. 
+ */ +#define BGWORKER_CLASS_SYSTEM 0x0020 /* add additional bgworker classes here */ From 7c15745b37174d73a6814b74bde879c5b70f2e8e Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Thu, 7 Sep 2023 21:33:03 +0200 Subject: [PATCH 27/79] Add pg_newlocale_from_collation_hook to perform stricter collation checks --- src/backend/utils/adt/pg_locale.c | 7 ++++++- src/include/utils/pg_locale.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 4c85a01b284..056785b1444 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -134,6 +134,7 @@ typedef struct static HTAB *collation_cache = NULL; +pg_newlocale_from_collation_hook_type pg_newlocale_from_collation_hook = NULL; #if defined(WIN32) && defined(LC_MESSAGES) static char *IsoLocaleName(const char *); @@ -1696,6 +1697,7 @@ pg_newlocale_from_collation(Oid collid) { char *actual_versionstr; char *collversionstr; + int level = WARNING; collversionstr = TextDatumGetCString(datum); @@ -1718,8 +1720,11 @@ pg_newlocale_from_collation(Oid collid) NameStr(collform->collname)))); } + if (pg_newlocale_from_collation_hook && pg_newlocale_from_collation_hook()) + level = ERROR; + if (strcmp(actual_versionstr, collversionstr) != 0) - ereport(WARNING, + ereport(level, (errmsg("collation \"%s\" has version mismatch", NameStr(collform->collname)), errdetail("The collation in the database was created using version %s, " diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 0bc93142e91..68722e7f539 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -106,6 +106,8 @@ extern void make_icu_collator(const char *iculocstr, extern bool pg_locale_deterministic(pg_locale_t locale); extern pg_locale_t pg_newlocale_from_collation(Oid collid); +typedef bool (*pg_newlocale_from_collation_hook_type)(); +extern pg_newlocale_from_collation_hook_type 
pg_newlocale_from_collation_hook; extern char *get_collation_actual_version(char collprovider, const char *collcollate); extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); From 47a445118e810f75ae1a8ac1c46c25dba27b189f Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Wed, 12 Jul 2023 23:40:12 +0300 Subject: [PATCH 28/79] Archive preload callback --- src/backend/postmaster/pgarch.c | 16 ++++++++++++++++ src/include/archive/archive_module.h | 6 +++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 02f91431f5f..35af55cd678 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -760,6 +760,22 @@ pgarch_readyXlog(char *xlog) for (int i = 0; i < arch_files->arch_files_size; i++) arch_files->arch_files[i] = DatumGetCString(binaryheap_remove_first(arch_files->arch_heap)); + /* + * Preload the WAL files if the relevant callback is provided. + */ + if (ArchiveCallbacks->archive_preload_file_cb) + { + for (int i = 0; i < arch_files->arch_files_size; i++) + { + char *xlog1 = arch_files->arch_files[i]; + char pathname[MAXPGPATH]; + + snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog1); + ArchiveCallbacks->archive_preload_file_cb(archive_module_state, + xlog1, pathname); + } + } + /* Return the highest priority file. 
*/ arch_files->arch_files_size--; strcpy(xlog, arch_files->arch_files[arch_files->arch_files_size]); diff --git a/src/include/archive/archive_module.h b/src/include/archive/archive_module.h index 763af76e542..d73b9661a4f 100644 --- a/src/include/archive/archive_module.h +++ b/src/include/archive/archive_module.h @@ -37,13 +37,17 @@ typedef struct ArchiveModuleState */ typedef void (*ArchiveStartupCB) (ArchiveModuleState *state); typedef bool (*ArchiveCheckConfiguredCB) (ArchiveModuleState *state); -typedef bool (*ArchiveFileCB) (ArchiveModuleState *state, const char *file, const char *path); +typedef void (*ArchivePreloadFileCB) (ArchiveModuleState *state, + const char *file, const char *path); +typedef bool (*ArchiveFileCB) (ArchiveModuleState *state, + const char *file, const char *path); typedef void (*ArchiveShutdownCB) (ArchiveModuleState *state); typedef struct ArchiveModuleCallbacks { ArchiveStartupCB startup_cb; ArchiveCheckConfiguredCB check_configured_cb; + ArchivePreloadFileCB archive_preload_file_cb; ArchiveFileCB archive_file_cb; ArchiveShutdownCB shutdown_cb; } ArchiveModuleCallbacks; From 78db7cc7660ed20753c99e8c2fb3d17e1ea7d570 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 18 Feb 2024 06:10:50 +0200 Subject: [PATCH 29/79] Remove pthread_is_threaded_np() call To use curl during shared_preload_libraries initialization. 
--- configure | 2 +- configure.ac | 1 - meson.build | 1 - src/backend/postmaster/postmaster.c | 49 ----------------------------- src/include/pg_config.h.in | 3 -- 5 files changed, 1 insertion(+), 55 deletions(-) diff --git a/configure b/configure index d73589b56a2..ae1e42f3046 100755 --- a/configure +++ b/configure @@ -15366,7 +15366,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strsignal syncfs sync_file_range uselocale wcstombs_l +for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l posix_fallocate ppoll setproctitle setproctitle_fast strsignal syncfs sync_file_range uselocale wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.ac b/configure.ac index 45baa47af54..74149784c30 100644 --- a/configure.ac +++ b/configure.ac @@ -1763,7 +1763,6 @@ AC_CHECK_FUNCS(m4_normalize([ mbstowcs_l posix_fallocate ppoll - pthread_is_threaded_np setproctitle setproctitle_fast strsignal diff --git a/meson.build b/meson.build index 7f173875e94..8d3e24cba47 100644 --- a/meson.build +++ b/meson.build @@ -2732,7 +2732,6 @@ func_checks = [ ['posix_fallocate'], ['ppoll'], ['pthread_barrier_wait', {'dependencies': [thread_dep]}], - ['pthread_is_threaded_np', {'dependencies': [thread_dep]}], ['sem_init', {'dependencies': [rt_dep, thread_dep], 'skip': sema_kind != 'unnamed_posix', 'define': false}], ['setproctitle', {'dependencies': [util_dep]}], ['setproctitle_fast'], diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index fd6097abf98..d0ae33c08f7 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -85,10 +85,6 @@ #include 
#endif -#ifdef HAVE_PTHREAD_IS_THREADED_NP -#include -#endif - #include "access/xlog.h" #include "access/xlogrecovery.h" #include "common/file_perm.h" @@ -1330,26 +1326,6 @@ PostmasterMain(int argc, char *argv[]) */ } -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * On macOS, libintl replaces setlocale() with a version that calls - * CFLocaleCopyCurrent() when its second argument is "" and every relevant - * environment variable is unset or empty. CFLocaleCopyCurrent() makes - * the process multithreaded. The postmaster calls sigprocmask() and - * calls fork() without an immediate exec(), both of which have undefined - * behavior in a multithreaded program. A multithreaded postmaster is the - * normal case on Windows, which offers neither fork() nor sigprocmask(). - * Currently, macOS is the only platform having pthread_is_threaded_np(), - * so we need not worry whether this HINT is appropriate elsewhere. - */ - if (pthread_is_threaded_np() != 0) - ereport(FATAL, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("postmaster became multithreaded during startup"), - errhint("Set the LC_ALL environment variable to a valid locale."))); -#endif - /* * Remember postmaster startup time */ @@ -1758,15 +1734,6 @@ ServerLoop(void) if (StartWorkerNeeded || HaveCrashedWorker) maybe_start_bgworkers(); -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * With assertions enabled, check regularly for appearance of - * additional threads. All builds check at start and exit. - */ - Assert(pthread_is_threaded_np() == 0); -#endif - /* * Lastly, check to see if it's time to do some things that we don't * want to do every single time through the loop, because they're a @@ -3685,22 +3652,6 @@ report_fork_failure_to_client(ClientSocket *client_sock, int errnum) static void ExitPostmaster(int status) { -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * There is no known cause for a postmaster to become multithreaded after - * startup. 
However, we might reach here via an error exit before - * reaching the test in PostmasterMain, so provide the same hint as there. - * This message uses LOG level, because an unclean shutdown at this point - * would usually not look much different from a clean shutdown. - */ - if (pthread_is_threaded_np() != 0) - ereport(LOG, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("postmaster became multithreaded"), - errhint("Set the LC_ALL environment variable to a valid locale."))); -#endif - /* should cleanup shared memory and kill all backends */ /* diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 1371ac055f4..569b23ecb68 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -332,9 +332,6 @@ /* Define to 1 if you have the `pthread_barrier_wait' function. */ #undef HAVE_PTHREAD_BARRIER_WAIT -/* Define to 1 if you have the `pthread_is_threaded_np' function. */ -#undef HAVE_PTHREAD_IS_THREADED_NP - /* Have PTHREAD_PRIO_INHERIT. */ #undef HAVE_PTHREAD_PRIO_INHERIT From 7744e805801724fde0573926742a0aa2b3957df7 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 8 Dec 2023 01:37:02 +0100 Subject: [PATCH 30/79] Added option to pg_rewind to perform extension specific rewind - added option --extension for pg_rewind - extracted SimpleXLogRead from extractPageMap for generic wal iteration in pg_rewind --- doc/src/sgml/ref/pg_rewind.sgml | 5 ++ src/bin/pg_rewind/Makefile | 7 +- src/bin/pg_rewind/extension.c | 132 ++++++++++++++++++++++++++++++ src/bin/pg_rewind/filemap.c | 40 +++++++++ src/bin/pg_rewind/parsexlog.c | 36 +++++--- src/bin/pg_rewind/pg_rewind.c | 15 +++- src/bin/pg_rewind/pg_rewind.h | 10 +++ src/bin/pg_rewind/pg_rewind_ext.h | 44 ++++++++++ 8 files changed, 273 insertions(+), 16 deletions(-) create mode 100644 src/bin/pg_rewind/extension.c create mode 100644 src/bin/pg_rewind/pg_rewind_ext.h diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml index dc039d87566..0c8e7dd2cc3 
100644 --- a/doc/src/sgml/ref/pg_rewind.sgml +++ b/doc/src/sgml/ref/pg_rewind.sgml @@ -302,6 +302,11 @@ PostgreSQL documentation This option has no effect when is used. + + + + + Load shared library that performs custom rewind for postgres extension. The path may be full or relative to PKGLIBDIR. File extension is optional. Multiple extensions can be selected by multiple switches. diff --git a/src/bin/pg_rewind/Makefile b/src/bin/pg_rewind/Makefile index 12b138b2f2c..4f93864cf7e 100644 --- a/src/bin/pg_rewind/Makefile +++ b/src/bin/pg_rewind/Makefile @@ -21,6 +21,7 @@ LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) OBJS = \ $(WIN32RES) \ datapagemap.o \ + extension.o \ file_ops.o \ filemap.o \ libpq_source.o \ @@ -35,19 +36,21 @@ EXTRA_CLEAN = xlogreader.c all: pg_rewind pg_rewind: $(OBJS) | submake-libpq submake-libpgport - $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LDFLAGS_EX_BE) $(LIBS) -o $@$(X) xlogreader.c: % : $(top_srcdir)/src/backend/access/transam/% rm -f $@ && $(LN_S) $< . install: all installdirs $(INSTALL_PROGRAM) pg_rewind$(X) '$(DESTDIR)$(bindir)/pg_rewind$(X)' + $(INSTALL_DATA) $(srcdir)/pg_rewind_ext.h '$(DESTDIR)$(includedir)' installdirs: - $(MKDIR_P) '$(DESTDIR)$(bindir)' + $(MKDIR_P) '$(DESTDIR)$(bindir)' '$(DESTDIR)$(includedir)' uninstall: rm -f '$(DESTDIR)$(bindir)/pg_rewind$(X)' + rm -f '$(DESTDIR)$(includedir)/pg_rewind_ext.h' clean distclean: rm -f pg_rewind$(X) $(OBJS) xlogreader.c diff --git a/src/bin/pg_rewind/extension.c b/src/bin/pg_rewind/extension.c new file mode 100644 index 00000000000..29ec4b5a6f6 --- /dev/null +++ b/src/bin/pg_rewind/extension.c @@ -0,0 +1,132 @@ +/*------------------------------------------------------------------------- + * + * extension.c + * Functions for processing shared libraries loaded by pg_rewind. 
+ * + * Copyright (c) 2013-2023, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#ifndef WIN32 +#include + +/* + * On macOS, insists on including . If we're not + * using stdbool, undef bool to undo the damage. + */ +#ifndef PG_USE_STDBOOL +#ifdef bool +#undef bool +#endif +#endif +#endif /* !WIN32 */ + +#include + +#include "access/xlog_internal.h" +#include "pg_rewind.h" + +/* signature for pg_rewind extension library rewind function */ +typedef void (*PG_rewind_t) (const char *datadir_target, char *datadir_source, + char *connstr_source, XLogRecPtr startpoint, + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, const char *argv0, + bool debug); + +static bool +file_exists(const char *argv0, const char *name) +{ + struct stat st; + + Assert(name != NULL); + + if (stat(name, &st) == 0) + return !S_ISDIR(st.st_mode); + else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES)) + { + const char *progname; + + progname = get_progname(argv0); + pg_log_error("could not access file \"%s\": %m", name); + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + + return false; +} + +static char * +expand_dynamic_library_name(const char *argv0, const char *name) +{ + char *full; + char my_exec_path[MAXPGPATH]; + char pkglib_path[MAXPGPATH]; + + Assert(name); + + if (find_my_exec(argv0, my_exec_path) < 0) + pg_fatal("%s: could not locate my own executable path", argv0); + get_pkglib_path(my_exec_path, pkglib_path); + full = palloc(strlen(pkglib_path) + 1 + strlen(name) + 1); + sprintf(full, "%s/%s", pkglib_path, name); + if (file_exists(argv0, full)) + return full; + pfree(full); + + full = palloc(strlen(pkglib_path) + 1 + strlen(name) + 1 + + strlen(DLSUFFIX) + 1); + sprintf(full, "%s/%s%s", pkglib_path, name, DLSUFFIX); + if (file_exists(argv0, full)) + return full; + pfree(full); + + return pstrdup(name); +} + 
+void +process_extensions(SimpleStringList *extensions, const char *datadir_target, + char *datadir_source, char *connstr_source, + XLogRecPtr startpoint, int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, const char *argv0, + bool debug) +{ + SimpleStringListCell *cell; + + if (extensions->head == NULL) + return; /* nothing to do */ + + for (cell = extensions->head; cell; cell = cell->next) + { + char *filename = cell->val; + char *fullname; + void *lib_handle; + PG_rewind_t PG_rewind; + char *load_error; + + fullname = expand_dynamic_library_name(argv0, filename); + + lib_handle = dlopen(fullname, RTLD_NOW | RTLD_GLOBAL); + if (lib_handle == NULL) + { + load_error = dlerror(); + pg_fatal("could not load library \"%s\": %s", fullname, load_error); + } + + PG_rewind = dlsym(lib_handle, "_PG_rewind"); + + if (PG_rewind == NULL) + pg_fatal("could not find function \"_PG_rewind\" in \"%s\"", + fullname); + pfree(fullname); + + if (showprogress) + pg_log_info("performing rewind for '%s' extension", filename); + PG_rewind(datadir_target, datadir_source, connstr_source, startpoint, + tliIndex, endpoint, restoreCommand, argv0, debug); + + pg_log_debug("loaded library \"%s\"", filename); + } +} diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index aff6f37cca5..3d7d483d757 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -53,6 +53,7 @@ #define FILEHASH_INITIAL_SIZE 1000 static filehash_hash *filehash; +static SimpleStringList extensions_exclude = {NULL, NULL}; static bool isRelDataFile(const char *path); static char *datasegpath(RelFileLocator rlocator, ForkNumber forknum, @@ -322,6 +323,8 @@ process_target_file(const char *path, file_type_t type, size_t size, * from the target data folder all paths which have been filtered out from * the source data folder when processing the source files. 
*/ + if (check_file_excluded(path, false)) + return; /* * Like in process_source_file, pretend that pg_wal is always a directory. @@ -466,6 +469,31 @@ check_file_excluded(const char *path, bool is_source) } } + /* + * Exclude extensions directories + */ + if (extensions_exclude.head != NULL) + { + SimpleStringListCell *cell; + + for (cell = extensions_exclude.head; cell; cell = cell->next) + { + char *exclude_dir = cell->val; + + snprintf(localpath, sizeof(localpath), "%s/", exclude_dir); + if (strstr(path, localpath) == path) + { + if (is_source) + pg_log_debug("entry \"%s\" excluded from source file list", + path); + else + pg_log_debug("entry \"%s\" excluded from target file list", + path); + return true; + } + } + } + return false; } @@ -890,3 +918,15 @@ decide_file_actions(void) return filemap; } + +void +extensions_exclude_add(char **exclude_dirs) +{ + int i; + + for (i = 0; exclude_dirs[i] != NULL; i++) + { + simple_string_list_append(&extensions_exclude, + pstrdup(exclude_dirs[i])); + } +} diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 242326c97a7..dc31ee53e53 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -38,7 +38,7 @@ static const char *const RmgrNames[RM_MAX_ID + 1] = { #define RmgrName(rmid) (((rmid) <= RM_MAX_BUILTIN_ID) ? \ RmgrNames[rmid] : "custom") -static void extractPageInfo(XLogReaderState *record); +static void extractPageInfo(XLogReaderState *record, void *arg); static int xlogreadfd = -1; static XLogSegNo xlogreadsegno = 0; @@ -54,17 +54,11 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf); -/* - * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline - * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of - * the data blocks touched by the WAL records, and return them in a page map. - * - * 'endpoint' is the end of the last record to read. 
The record starting at - * 'endpoint' is the first one that is not read. - */ void -extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, - XLogRecPtr endpoint, const char *restoreCommand) +SimpleXLogRead(const char *datadir, XLogRecPtr startpoint, int tliIndex, + XLogRecPtr endpoint, const char *restoreCommand, + void (*page_callback) (XLogReaderState *, void *arg), + void *arg) { XLogRecord *record; XLogReaderState *xlogreader; @@ -97,7 +91,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, LSN_FORMAT_ARGS(errptr)); } - extractPageInfo(xlogreader); + page_callback(xlogreader, arg); } while (xlogreader->EndRecPtr < endpoint); /* @@ -116,6 +110,22 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, } } +/* + * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline + * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of + * the data blocks touched by the WAL records, and return them in a page map. + * + * 'endpoint' is the end of the last record to read. The record starting at + * 'endpoint' is the first one that is not read. + */ +void +extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, + XLogRecPtr endpoint, const char *restoreCommand) +{ + SimpleXLogRead(datadir, startpoint, tliIndex, endpoint, restoreCommand, + extractPageInfo, NULL); +} + /* * Reads one WAL record. Returns the end position of the record, without * doing anything with the record itself. @@ -386,7 +396,7 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, * Extract information on which blocks the current record modifies. 
*/ static void -extractPageInfo(XLogReaderState *record) +extractPageInfo(XLogReaderState *record, void *arg) { int block_id; RmgrId rmid = XLogRecGetRmid(record); diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 53eb49abdea..016c332f406 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -77,6 +77,8 @@ bool do_sync = true; bool restore_wal = false; DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; +static SimpleStringList extensions = {NULL, NULL}; + /* Target history */ TimeLineHistoryEntry *targetHistory; int targetNentries; @@ -110,6 +112,7 @@ usage(const char *progname) printf(_(" --debug write a lot of debug messages\n")); printf(_(" --no-ensure-shutdown do not automatically fix unclean shutdown\n")); printf(_(" --sync-method=METHOD set method for syncing files to disk\n")); + printf(_(" -e, --extension=PATH path to library performing rewind for extension\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" -?, --help show this help, then exit\n")); printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); @@ -135,6 +138,7 @@ main(int argc, char **argv) {"progress", no_argument, NULL, 'P'}, {"debug", no_argument, NULL, 3}, {"sync-method", required_argument, NULL, 6}, + {"extension", required_argument, NULL, 'e'}, {NULL, 0, NULL, 0} }; int option_index; @@ -173,7 +177,7 @@ main(int argc, char **argv) } } - while ((c = getopt_long(argc, argv, "cD:nNPR", long_options, &option_index)) != -1) + while ((c = getopt_long(argc, argv, "cD:nNPRe:", long_options, &option_index)) != -1) { switch (c) { @@ -225,6 +229,10 @@ main(int argc, char **argv) case 6: if (!parse_sync_method(optarg, &sync_method)) exit(1); + break; + + case 'e': /* -e or --extension */ + simple_string_list_append(&extensions, optarg); break; default: @@ -466,6 +473,12 @@ main(int argc, char **argv) /* Initialize the hash table to track the status of each file */ filehash_init(); + if (extensions.head != 
NULL) + process_extensions(&extensions, datadir_target, datadir_source, + connstr_source, chkptrec, lastcommontliIndex, + target_wal_endrec, restore_command, argv[0], + debug); + /* * Collect information about all files in the both data directories. */ diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index ec43cbe2c67..4397259e0d0 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -15,7 +15,9 @@ #include "common/logging.h" #include "common/file_utils.h" #include "datapagemap.h" +#include "fe_utils/simple_list.h" #include "libpq-fe.h" +#include "pg_rewind_ext.h" #include "storage/block.h" #include "storage/relfilelocator.h" @@ -55,4 +57,12 @@ extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer, TimeLineID targetTLI, int *nentries); +/* in extension.c */ +extern void process_extensions(SimpleStringList *extensions, + const char *datadir_target, char *datadir_source, + char *connstr_source, XLogRecPtr startpoint, + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, const char *argv0, + bool debug); + #endif /* PG_REWIND_H */ diff --git a/src/bin/pg_rewind/pg_rewind_ext.h b/src/bin/pg_rewind/pg_rewind_ext.h new file mode 100644 index 00000000000..3616d94f588 --- /dev/null +++ b/src/bin/pg_rewind/pg_rewind_ext.h @@ -0,0 +1,44 @@ +/*------------------------------------------------------------------------- + * + * pg_rewind_ext.h + * + * + * Copyright (c) 1996-2023, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ +#ifndef PG_REWIND_EXT_H +#define PG_REWIND_EXT_H + +#include "access/xlogreader.h" + +/* in parsexlog.c */ +/* + * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline + * index 'tliIndex' in target timeline history, until 'endpoint'. + * Pass all WAL records to 'page_callback'. + * + * 'endpoint' is the end of the last record to read. 
The record starting at + * 'endpoint' is the first one that is not read. + */ +extern void SimpleXLogRead(const char *datadir, XLogRecPtr startpoint, + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, + void (*page_callback) (XLogReaderState *, + void *arg), + void *arg); + + +/* in filemap.c */ +/* Add NULL-terminated list of dirs that pg_rewind can skip copying */ +extern void extensions_exclude_add(char **exclude_dirs); + +/* signature for pg_rewind extension library rewind function */ +extern PGDLLEXPORT void _PG_rewind(const char *datadir_target, + char *datadir_source, char *connstr_source, + XLogRecPtr startpoint, int tliIndex, + XLogRecPtr endpoint, + const char *restoreCommand, + const char *argv0, bool debug); + +#endif /* PG_REWIND_EXT_H */ From 9f2c6fa612487a04d381fd35405205746c14c968 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 3 May 2024 22:05:35 +0200 Subject: [PATCH 31/79] Index scan and index only scan with rowid --- src/backend/access/heap/heapam_handler.c | 3 +- src/backend/access/index/genam.c | 2 + src/backend/access/index/indexam.c | 88 +++++++++++++++++++++--- src/backend/access/table/tableam.c | 2 +- src/backend/commands/constraint.c | 2 +- src/backend/executor/nodeIndexonlyscan.c | 33 +++++++-- src/backend/utils/adt/selfuncs.c | 28 ++++++-- src/include/access/genam.h | 3 + src/include/access/relscan.h | 2 + src/include/access/tableam.h | 6 +- 10 files changed, 147 insertions(+), 22 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 2c2c7061189..7d6828db403 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -132,7 +132,7 @@ heapam_index_fetch_end(IndexFetchTableData *scan) static bool heapam_index_fetch_tuple(struct IndexFetchTableData *scan, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead) @@ -140,6 +140,7 @@ 
heapam_index_fetch_tuple(struct IndexFetchTableData *scan, IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; bool got_heap_tuple; + ItemPointer tid = DatumGetItemPointer(tupleid); Assert(TTS_IS_BUFFERTUPLE(slot)); diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index b123acc5a60..e172025fe04 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -103,6 +103,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->orderByData = NULL; scan->xs_want_itup = false; /* may be set later */ + scan->xs_want_rowid = false; /* may be set later */ /* * During recovery we ignore killed tuples and don't bother to kill them @@ -124,6 +125,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_itupdesc = NULL; scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->xs_rowid.isnull = true; return scan; } diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index dcd04b813d8..596773a5c11 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -610,6 +610,55 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) return &scan->xs_heaptid; } +/* ---------------- + * index_getnext_rowid - get the next ROWID from a scan + * + * The result is the next ROWID satisfying the scan keys, + * or isnull if no more matching tuples exist. + * ---------------- + */ +NullableDatum +index_getnext_rowid(IndexScanDesc scan, ScanDirection direction) +{ + NullableDatum result; + bool found; + + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amgettuple); + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); + + /* + * The AM's amgettuple proc finds the next index entry matching the scan + * keys, and puts the TID into scan->xs_heaptid. 
It should also set + * scan->xs_recheck and possibly scan->xs_itup/scan->xs_hitup, though we + * pay no attention to those fields here. + */ + found = scan->indexRelation->rd_indam->amgettuple(scan, direction); + + /* Reset kill flag immediately for safety */ + scan->kill_prior_tuple = false; + scan->xs_heap_continue = false; + + /* If we're out of index entries, we're done */ + if (!found) + { + /* release resources (like buffer pins) from table accesses */ + if (scan->xs_heapfetch) + table_index_fetch_reset(scan->xs_heapfetch); + + result.isnull = true; + return result; + } + /* Assert(RowidIsValid(&scan->xs_rowid)); */ + + pgstat_count_index_tuples(scan->indexRelation, 1); + + /* Return the ROWID of the tuple we found. */ + return scan->xs_rowid; +} + /* ---------------- * index_fetch_heap - get the scan's next heap tuple * @@ -633,8 +682,17 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot) { bool all_dead = false; bool found; + Datum tupleid; + + if (scan->xs_want_rowid) + { + Assert(!scan->xs_rowid.isnull); + tupleid = scan->xs_rowid.value; + } + else + tupleid = PointerGetDatum(&scan->xs_heaptid); - found = table_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid, + found = table_index_fetch_tuple(scan->xs_heapfetch, tupleid, scan->xs_snapshot, slot, &scan->xs_heap_continue, &all_dead); @@ -676,16 +734,30 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot * { if (!scan->xs_heap_continue) { - ItemPointer tid; + if (scan->xs_want_rowid) + { + NullableDatum rowid; + /* Time to fetch the next TID from the index */ + rowid = index_getnext_rowid(scan, direction); - /* Time to fetch the next TID from the index */ - tid = index_getnext_tid(scan, direction); + /* If we're out of index entries, we're done */ + if (rowid.isnull) + break; - /* If we're out of index entries, we're done */ - if (tid == NULL) - break; + /* Assert(RowidEquals(rowid, &scan->xs_rowid)); */ + } + else + { + ItemPointer tid; + /* Time to fetch the 
next TID from the index */ + tid = index_getnext_tid(scan, direction); - Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + /* If we're out of index entries, we're done */ + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + } } /* diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 0c02e428e4e..daac7275ed2 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -218,7 +218,7 @@ table_index_fetch_tuple_check(Relation rel, slot = table_slot_create(rel, NULL); scan = table_index_fetch_begin(rel); - found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, + found = table_index_fetch_tuple(scan, PointerGetDatum(tid), snapshot, slot, &call_again, all_dead); table_index_fetch_end(scan); ExecDropSingleTupleTableSlot(slot); diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index f7dc42f7452..ea5a1f365b1 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -109,7 +109,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation); bool call_again = false; - if (!table_index_fetch_tuple(scan, &tmptid, SnapshotSelf, slot, + if (!table_index_fetch_tuple(scan, PointerGetDatum(&tmptid), SnapshotSelf, slot, &call_again, NULL)) { /* diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index b49194c0167..a8424922ccc 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -65,7 +65,7 @@ IndexOnlyNext(IndexOnlyScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; - ItemPointer tid; + ItemPointer tid = NULL; /* * extract necessary information from index scan node @@ -117,12 +117,36 @@ IndexOnlyNext(IndexOnlyScanState *node) /* * OK, now that we have what we need, fetch the next tuple. 
*/ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while (true) { bool tuple_from_heap = false; CHECK_FOR_INTERRUPTS(); + if (scandesc->xs_want_rowid) + { + NullableDatum rowid; + /* Time to fetch the next TID from the index */ + rowid = index_getnext_rowid(scandesc, direction); + + /* If we're out of index entries, we're done */ + if (rowid.isnull) + break; + + /* Assert(RowidEquals(rowid, &scan->xs_rowid)); */ + } + else + { + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scandesc, direction); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &scandesc->xs_heaptid)); + } + /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, @@ -157,7 +181,8 @@ IndexOnlyNext(IndexOnlyScanState *node) * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, + if (!scandesc->xs_want_rowid && + !VM_ALL_VISIBLE(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), &node->ioss_VMBuffer)) { @@ -242,7 +267,7 @@ IndexOnlyNext(IndexOnlyScanState *node) * If we didn't access the heap, then we'll need to take a predicate * lock explicitly, as if we had. For now we do that at page level. 
*/ - if (!tuple_from_heap) + if (!tuple_from_heap && !scandesc->xs_want_rowid) PredicateLockPage(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), estate->es_snapshot); diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 4670a3d648d..82c523eacd4 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6405,12 +6405,32 @@ get_actual_variable_endpoint(Relation heapRel, index_scan->xs_want_itup = true; index_rescan(index_scan, scankeys, 1, NULL, 0); - /* Fetch first/next tuple in specified direction */ - while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL) + while (true) { - BlockNumber block = ItemPointerGetBlockNumber(tid); + BlockNumber block = InvalidBlockNumber; - if (!VM_ALL_VISIBLE(heapRel, + /* Fetch first/next tuple in specified direction */ + if (index_scan->xs_want_rowid) + { + NullableDatum rowid; + rowid = index_getnext_rowid(index_scan, indexscandir); + + if (rowid.isnull) + break; + } + else + { + tid = index_getnext_tid(index_scan, indexscandir); + + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &index_scan->xs_heaptid)); + block = ItemPointerGetBlockNumber(tid); + } + + if (!index_scan->xs_want_rowid && + !VM_ALL_VISIBLE(heapRel, block, &vmbuffer)) { diff --git a/src/include/access/genam.h b/src/include/access/genam.h index c25f5d11b53..1b1c3e09ce4 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -176,6 +176,9 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, ParallelIndexScanDesc pscan); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); +extern NullableDatum index_getnext_rowid(IndexScanDesc scan, + ScanDirection direction); +extern Datum index_getnext_tupleid(IndexScanDesc scan, ScanDirection direction); struct TupleTableSlot; extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot); extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection 
direction, diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 521043304ab..24b04709012 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -122,6 +122,7 @@ typedef struct IndexScanDescData struct ScanKeyData *keyData; /* array of index qualifier descriptors */ struct ScanKeyData *orderByData; /* array of ordering op descriptors */ bool xs_want_itup; /* caller requests index tuples */ + bool xs_want_rowid; /* caller requests index tuples */ bool xs_temp_snap; /* unregister snapshot at scan end? */ /* signaling to index AM about killing index tuples */ @@ -145,6 +146,7 @@ typedef struct IndexScanDescData struct TupleDescData *xs_hitupdesc; /* rowtype descriptor of xs_hitup */ ItemPointerData xs_heaptid; /* result */ + NullableDatum xs_rowid; /* result if xs_want_rowid */ bool xs_heap_continue; /* T if must keep walking, potential * further results */ IndexFetchTableData *xs_heapfetch; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index e16e5cbf5d7..25748822386 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -478,7 +478,7 @@ typedef struct TableAmRoutine * future searches. 
*/ bool (*index_fetch_tuple) (struct IndexFetchTableData *scan, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead); @@ -1268,7 +1268,7 @@ table_index_fetch_end(struct IndexFetchTableData *scan) */ static inline bool table_index_fetch_tuple(struct IndexFetchTableData *scan, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead) @@ -1281,7 +1281,7 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan, if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) elog(ERROR, "unexpected table_index_fetch_tuple call during logical decoding"); - return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot, + return scan->rel->rd_tableam->index_fetch_tuple(scan, tupleid, snapshot, slot, call_again, all_dead); } From f2a9278c08be1b8a4600fd4da4a64b19739358de Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Wed, 8 May 2024 04:09:19 +0200 Subject: [PATCH 32/79] Remove primary index am check --- src/backend/access/index/indexam.c | 3 ++- src/backend/catalog/index.c | 3 --- src/backend/parser/parse_utilcmd.c | 13 ------------- 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 596773a5c11..8e6fbd2555d 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -765,7 +765,8 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot * * If we don't find anything, loop around and grab the next TID from * the index. 
*/ - Assert(ItemPointerIsValid(&scan->xs_heaptid)); + if (!scan->xs_want_rowid) + Assert(ItemPointerIsValid(&scan->xs_heaptid)); if (index_fetch_heap(scan, slot)) return true; } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index cb69d84afa8..0250d68dd86 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -2648,9 +2648,6 @@ BuildSpeculativeIndexInfo(Relation index, IndexInfo *ii) */ Assert(ii->ii_Unique); - if (index->rd_rel->relam != BTREE_AM_OID) - elog(ERROR, "unexpected non-btree speculative unique index"); - ii->ii_UniqueOps = (Oid *) palloc(sizeof(Oid) * indnkeyatts); ii->ii_UniqueProcs = (Oid *) palloc(sizeof(Oid) * indnkeyatts); ii->ii_UniqueStrats = (uint16 *) palloc(sizeof(uint16) * indnkeyatts); diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index eaf46ab6871..ad207acae60 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -2310,19 +2310,6 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) errdetail("Cannot create a non-deferrable constraint using a deferrable index."), parser_errposition(cxt->pstate, constraint->location))); - /* - * Insist on it being a btree. That's the only kind that supports - * uniqueness at the moment anyway; but we must have an index that - * exactly matches what you'd get from plain ADD CONSTRAINT syntax, - * else dump and reload will produce a different index (breaking - * pg_upgrade in particular). 
- */ - if (index_rel->rd_rel->relam != get_index_am_oid(DEFAULT_INDEX_TYPE, false)) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("index \"%s\" is not a btree", index_name), - parser_errposition(cxt->pstate, constraint->location))); - /* Must get indclass the hard way */ indclassDatum = SysCacheGetAttrNotNull(INDEXRELID, index_rel->rd_indextuple, From b96191b5057b9e474a3052254e26fbf7928057ec Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Mon, 13 May 2024 20:33:54 +0200 Subject: [PATCH 33/79] Passing tupleid to insert now --- contrib/bloom/blinsert.c | 3 ++- contrib/bloom/bloom.h | 2 +- src/backend/access/brin/brin.c | 3 ++- src/backend/access/common/toast_internals.c | 2 +- src/backend/access/gin/gininsert.c | 3 ++- src/backend/access/gist/gist.c | 3 ++- src/backend/access/hash/hash.c | 3 ++- src/backend/access/heap/heapam_handler.c | 2 +- src/backend/access/index/indexam.c | 4 ++-- src/backend/access/nbtree/nbtree.c | 3 ++- src/backend/access/spgist/spginsert.c | 3 ++- src/backend/catalog/indexing.c | 2 +- src/backend/commands/constraint.c | 2 +- src/backend/executor/execIndexing.c | 18 +++++++++++++++--- src/backend/executor/nodeModifyTable.c | 4 ++-- src/include/access/amapi.h | 2 +- src/include/access/brin_internal.h | 2 +- src/include/access/genam.h | 2 +- src/include/access/gin_private.h | 2 +- src/include/access/gist_private.h | 2 +- src/include/access/hash.h | 2 +- src/include/access/nbtree.h | 2 +- src/include/access/spgist.h | 2 +- .../modules/dummy_index_am/dummy_index_am.c | 2 +- 24 files changed, 47 insertions(+), 28 deletions(-) diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c index f8a1061abb9..7873118d112 100644 --- a/contrib/bloom/blinsert.c +++ b/contrib/bloom/blinsert.c @@ -172,7 +172,7 @@ blbuildempty(Relation index) */ bool blinsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, 
IndexInfo *indexInfo) @@ -189,6 +189,7 @@ blinsert(Relation index, Datum *values, bool *isnull, BlockNumber blkno = InvalidBlockNumber; OffsetNumber nStart; GenericXLogState *state; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); insertCtx = AllocSetContextCreate(CurrentMemoryContext, "Bloom insert temporary context", diff --git a/contrib/bloom/bloom.h b/contrib/bloom/bloom.h index 83c81e640f9..3e813ab15c5 100644 --- a/contrib/bloom/bloom.h +++ b/contrib/bloom/bloom.h @@ -186,7 +186,7 @@ extern bool blvalidate(Oid opclassoid); /* index access method interface functions */ extern bool blinsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 6cbd31f0a3d..944157612cb 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -333,7 +333,7 @@ initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo) */ bool brininsert(Relation idxRel, Datum *values, bool *nulls, - ItemPointer heaptid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -348,6 +348,7 @@ brininsert(Relation idxRel, Datum *values, bool *nulls, MemoryContext tupcxt = NULL; MemoryContext oldcxt = CurrentMemoryContext; bool autosummarize = BrinGetAutoSummarize(idxRel); + ItemPointer heaptid = DatumGetItemPointer(tupleid); /* * If first time through in this statement, initialize the insert state diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 538a554c917..2b4fa1fb25a 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -338,7 +338,7 @@ toast_save_datum(Relation rel, Datum value, /* Only index relations marked as ready can be updated */ if 
(toastidxs[i]->rd_index->indisready) index_insert(toastidxs[i], t_values, t_isnull, - &(toasttup->t_self), + ItemPointerGetDatum(&(toasttup->t_self)), toastrel, toastidxs[i]->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 71f38be90c3..690c744d9a9 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -481,7 +481,7 @@ ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum, bool gininsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -490,6 +490,7 @@ gininsert(Relation index, Datum *values, bool *isnull, MemoryContext oldCtx; MemoryContext insertCtx; int i; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* Initialize GinState cache if first call in this statement */ if (ginstate == NULL) diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index e49abbcb18a..06a5a4c62ee 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -157,7 +157,7 @@ gistbuildempty(Relation index) */ bool gistinsert(Relation r, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -165,6 +165,7 @@ gistinsert(Relation r, Datum *values, bool *isnull, GISTSTATE *giststate = (GISTSTATE *) indexInfo->ii_AmCache; IndexTuple itup; MemoryContext oldCxt; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* Initialize GISTSTATE cache if first call in this statement */ if (giststate == NULL) diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 01d06b7c328..1dc15d2a53b 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -249,7 +249,7 @@ hashbuildCallback(Relation 
index, */ bool hashinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -257,6 +257,7 @@ hashinsert(Relation rel, Datum *values, bool *isnull, Datum index_values[1]; bool index_isnull[1]; IndexTuple itup; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* convert data to a hash key; on failure, do not insert anything */ if (!_hash_convert_tuple(rel, diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 7d6828db403..6f0464896c0 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2313,7 +2313,7 @@ heapam_index_validate_scan(Relation heapRelation, index_insert(indexRelation, values, isnull, - &rootTuple, + ItemPointerGetDatum(&rootTuple), heapRelation, indexInfo->ii_Unique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 8e6fbd2555d..d4d1bf52739 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -213,7 +213,7 @@ bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - ItemPointer heap_t_ctid, + Datum tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, @@ -228,7 +228,7 @@ index_insert(Relation indexRelation, InvalidBlockNumber); return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, - heap_t_ctid, heapRelation, + tupleid, heapRelation, checkUnique, indexUnchanged, indexInfo); } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 59155a7bea6..4acb3c73089 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -180,13 +180,14 @@ btbuildempty(Relation index) */ bool btinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum 
tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) { bool result; IndexTuple itup; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* generate an index tuple */ itup = index_form_tuple(RelationGetDescr(rel), values, isnull); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 1bec19c2b88..57004e79f54 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -181,7 +181,7 @@ spgbuildempty(Relation index) */ bool spginsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -189,6 +189,7 @@ spginsert(Relation index, Datum *values, bool *isnull, SpGistState spgstate; MemoryContext oldCtx; MemoryContext insertCtx; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); insertCtx = AllocSetContextCreate(CurrentMemoryContext, "SP-GiST insert temporary context", diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index d0d1abda58a..cd78b1ea55e 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -170,7 +170,7 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple, index_insert(index, /* index relation */ values, /* array of index Datums */ isnull, /* is-null flags */ - &(heapTuple->t_self), /* tid of heap tuple */ + ItemPointerGetDatum(&(heapTuple->t_self)), /* tid of heap tuple */ heapRelation, index->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index ea5a1f365b1..43618646861 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -171,7 +171,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) * the row is now dead, because that is the TID the index will know * about. 
*/ - index_insert(indexRel, values, isnull, &checktid, + index_insert(indexRel, values, isnull, ItemPointerGetDatum(&checktid), trigdata->tg_relation, UNIQUE_CHECK_EXISTING, false, indexInfo); diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index dded5e79374..d762ca1cf2b 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -304,7 +304,6 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, List *arbiterIndexes, bool onlySummarizing) { - ItemPointer tupleid = &slot->tts_tid; List *result = NIL; int i; int numIndices; @@ -314,8 +313,20 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, ExprContext *econtext; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; + Datum tupleid; - Assert(ItemPointerIsValid(tupleid)); + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&slot->tts_tid)); + tupleid = PointerGetDatum(&slot->tts_tid); + } /* * Get information from the result relation info structure. @@ -462,6 +473,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, { bool violationOK; CEOUC_WAIT_MODE waitMode; + ItemPointer raw_tupleid = DatumGetItemPointer(tupleid); if (applyNoDupErr) { @@ -482,7 +494,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, satisfiesConstraint = check_exclusion_or_unique_constraint(heapRelation, indexRelation, indexInfo, - tupleid, values, isnull, + raw_tupleid, values, isnull, estate, false, waitMode, violationOK, NULL); } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index c81a9a49582..7621ef28eb3 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -1774,8 +1774,8 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, /* Tuple routing starts from the root table. 
*/ context->cpUpdateReturningSlot = - ExecInsert(context, mtstate->rootResultRelInfo, slot, canSetTag, - inserted_tuple, insert_destrel); + ExecInsert(context, mtstate->rootResultRelInfo, + slot, canSetTag, inserted_tuple, insert_destrel); /* * Reset the transition state that may possibly have been written by diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 09801450816..36b41d69724 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -111,7 +111,7 @@ typedef void (*ambuildempty_function) (Relation indexRelation); typedef bool (*aminsert_function) (Relation indexRelation, Datum *values, bool *isnull, - ItemPointer heap_tid, + Datum tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h index a5a9772621c..442d2c96b7b 100644 --- a/src/include/access/brin_internal.h +++ b/src/include/access/brin_internal.h @@ -92,7 +92,7 @@ extern IndexBuildResult *brinbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void brinbuildempty(Relation index); extern bool brininsert(Relation idxRel, Datum *values, bool *nulls, - ItemPointer heaptid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 1b1c3e09ce4..7c807f3cf74 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -144,7 +144,7 @@ extern void index_close(Relation relation, LOCKMODE lockmode); extern bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - ItemPointer heap_t_ctid, + Datum tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 3013a44bae1..2e81017f014 100644 --- a/src/include/access/gin_private.h +++ 
b/src/include/access/gin_private.h @@ -115,7 +115,7 @@ extern IndexBuildResult *ginbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void ginbuildempty(Relation index); extern bool gininsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 7b8749c8db0..284fb49c517 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -401,7 +401,7 @@ typedef struct GiSTOptions /* gist.c */ extern void gistbuildempty(Relation index); extern bool gistinsert(Relation r, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 9c7d81525b4..e787974a3cf 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -364,7 +364,7 @@ extern IndexBuildResult *hashbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void hashbuildempty(Relation index); extern bool hashinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 74930433480..049ebf72b7b 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1164,7 +1164,7 @@ typedef struct BTOptions */ extern void btbuildempty(Relation index); extern bool btinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git 
a/src/include/access/spgist.h b/src/include/access/spgist.h index d6a49531200..b9cc48aba37 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -197,7 +197,7 @@ extern IndexBuildResult *spgbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void spgbuildempty(Relation index); extern bool spginsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index 18185d02067..80c6668666a 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -164,7 +164,7 @@ dibuildempty(Relation index) */ static bool diinsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) From 09fb4ee67f3e38389a3db0b3ee2d2116b0344d08 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 17 May 2024 00:27:02 +0200 Subject: [PATCH 34/79] Methods for index update and delete Also validates compatability of index AM with table AM at index creation --- src/backend/access/index/indexam.c | 60 ++++ src/backend/executor/execIndexing.c | 401 +++++++++++++++++++++++++ src/backend/executor/nodeModifyTable.c | 22 +- src/backend/parser/gram.y | 16 +- src/include/access/amapi.h | 23 ++ src/include/access/genam.h | 15 + src/include/executor/executor.h | 10 + src/include/nodes/parsenodes.h | 1 + 8 files changed, 542 insertions(+), 6 deletions(-) diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index d4d1bf52739..fe1efe283c2 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -247,6 +247,66 @@ index_insert_cleanup(Relation indexRelation, 
indexRelation->rd_indam->aminsertcleanup(indexRelation, indexInfo); } +/* ---------------- + * index_update - update an index tuple in a relation + * ---------------- + */ +bool +index_update(Relation indexRelation, + bool new_valid, + bool old_valid, + Datum *values, + bool *isnull, + Datum tupleid, + Datum *valuesOld, + bool *isnullOld, + Datum oldTupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) +{ + RELATION_CHECKS; + CHECK_REL_PROCEDURE(amupdate); + + if (!(indexRelation->rd_indam->ampredlocks)) + CheckForSerializableConflictIn(indexRelation, + (ItemPointer) NULL, + InvalidBlockNumber); + + return indexRelation->rd_indam->amupdate(indexRelation, + new_valid, old_valid, + values, isnull, tupleid, + valuesOld, isnullOld, oldTupleid, + heapRelation, + checkUnique, + indexInfo); +} + + +/* ---------------- + * index_delete - delete an index tuple from a relation + * ---------------- + */ +bool +index_delete(Relation indexRelation, + Datum *values, bool *isnull, Datum tupleid, + Relation heapRelation, + IndexInfo *indexInfo) +{ + RELATION_CHECKS; + CHECK_REL_PROCEDURE(amdelete); + + if (!(indexRelation->rd_indam->ampredlocks)) + CheckForSerializableConflictIn(indexRelation, + (ItemPointer) NULL, + InvalidBlockNumber); + + return indexRelation->rd_indam->amdelete(indexRelation, + values, isnull, tupleid, + heapRelation, + indexInfo); +} + /* * index_beginscan - start a scan of an index with amgettuple * diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index d762ca1cf2b..859cdd7147d 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -518,6 +518,407 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, return result; } +List * +ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *oldSlot, + EState *estate, + bool noDupErr, + bool *specConflict, + List *arbiterIndexes, + bool onlySummarizing) +{ + List 
*result = NIL; + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + Datum tupleid; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&slot->tts_tid)); + tupleid = PointerGetDatum(&slot->tts_tid); + } + + /* + * Get information from the result relation info structure. + */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* Sanity check: slot must belong to the same rel as the resultRelInfo. */ + Assert(slot->tts_tableOid == RelationGetRelid(heapRelation)); + + /* + * for each index, form and insert the index tuple + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + bool applyNoDupErr; + IndexUniqueCheck checkUnique; + bool satisfiesConstraint; + bool new_valid = true; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + /* + * Skip processing of non-summarizing indexes if we only update + * summarizing indexes + */ + if (onlySummarizing && !indexInfo->ii_Summarizing) + continue; + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). 
+ */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + { + if (!indexRelation->rd_indam->ammvccaware) + continue; + new_valid = false; + } + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* Check whether to apply noDupErr to this index */ + applyNoDupErr = noDupErr && + (arbiterIndexes == NIL || + list_member_oid(arbiterIndexes, + indexRelation->rd_index->indexrelid)); + + /* + * The index AM does the actual insertion, plus uniqueness checking. + * + * For an immediate-mode unique index, we just tell the index AM to + * throw error if not unique. + * + * For a deferrable unique index, we tell the index AM to just detect + * possible non-uniqueness, and we add the index OID to the result + * list if further checking is needed. + * + * For a speculative insertion (used by INSERT ... ON CONFLICT), do + * the same as for a deferrable unique index. 
+ */ + if (!indexRelation->rd_index->indisunique) + checkUnique = UNIQUE_CHECK_NO; + else if (applyNoDupErr) + checkUnique = UNIQUE_CHECK_PARTIAL; + else if (indexRelation->rd_index->indimmediate) + checkUnique = UNIQUE_CHECK_YES; + else + checkUnique = UNIQUE_CHECK_PARTIAL; + + if (indexRelation->rd_indam->ammvccaware) + { + Datum valuesOld[INDEX_MAX_KEYS]; + bool isnullOld[INDEX_MAX_KEYS]; + Datum oldTupleid; + bool old_valid = true; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + oldTupleid = slot_getsysattr(oldSlot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&oldSlot->tts_tid)); + oldTupleid = PointerGetDatum(&oldSlot->tts_tid); + } + + econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = oldSlot; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + old_valid = false; + } + + FormIndexDatum(indexInfo, + oldSlot, + estate, + valuesOld, + isnullOld); + + satisfiesConstraint = + index_update(indexRelation, /* index relation */ + new_valid, + old_valid, + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + valuesOld, + isnullOld, + oldTupleid, + heapRelation, /* heap relation */ + checkUnique, /* type of uniqueness check to do */ + indexInfo); /* index AM may need this */ + + } + else + { + bool indexUnchanged; + /* + * There's definitely going to be an index_insert() call for this + * index. 
If we're being called as part of an UPDATE statement, + * consider if the 'indexUnchanged' = true hint should be passed. + */ + indexUnchanged = index_unchanged_by_update(resultRelInfo, + estate, + indexInfo, + indexRelation); + + satisfiesConstraint = + index_insert(indexRelation, /* index relation */ + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + heapRelation, /* heap relation */ + checkUnique, /* type of uniqueness check to do */ + indexUnchanged, /* UPDATE without logical change? */ + indexInfo); /* index AM may need this */ + } + + /* + * If the index has an associated exclusion constraint, check that. + * This is simpler than the process for uniqueness checks since we + * always insert first and then check. If the constraint is deferred, + * we check now anyway, but don't throw error on violation or wait for + * a conclusive outcome from a concurrent insertion; instead we'll + * queue a recheck event. Similarly, noDupErr callers (speculative + * inserters) will recheck later, and wait for a conclusive outcome + * then. + * + * An index for an exclusion constraint can't also be UNIQUE (not an + * essential property, we just don't allow it in the grammar), so no + * need to preserve the prior state of satisfiesConstraint. 
+ */ + if (indexInfo->ii_ExclusionOps != NULL) + { + bool violationOK; + CEOUC_WAIT_MODE waitMode; + ItemPointer raw_tupleid = DatumGetItemPointer(tupleid); + + if (applyNoDupErr) + { + violationOK = true; + waitMode = CEOUC_LIVELOCK_PREVENTING_WAIT; + } + else if (!indexRelation->rd_index->indimmediate) + { + violationOK = true; + waitMode = CEOUC_NOWAIT; + } + else + { + violationOK = false; + waitMode = CEOUC_WAIT; + } + + satisfiesConstraint = + check_exclusion_or_unique_constraint(heapRelation, + indexRelation, indexInfo, + raw_tupleid, values, isnull, + estate, false, + waitMode, violationOK, NULL); + } + + if ((checkUnique == UNIQUE_CHECK_PARTIAL || + indexInfo->ii_ExclusionOps != NULL) && + !satisfiesConstraint) + { + /* + * The tuple potentially violates the uniqueness or exclusion + * constraint, so make a note of the index so that we can re-check + * it later. Speculative inserters are told if there was a + * speculative conflict, since that always requires a restart. + */ + result = lappend_oid(result, RelationGetRelid(indexRelation)); + if (indexRelation->rd_index->indimmediate && specConflict) + *specConflict = true; + } + } + + return result; +} + +void +ExecDeleteIndexTuples(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, + EState *estate) +{ + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + Datum tupleid; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&slot->tts_tid)); + tupleid = PointerGetDatum(&slot->tts_tid); + } + + /* + * Get information from the result relation info structure. 
+ */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* Sanity check: slot must belong to the same rel as the resultRelInfo. */ + Assert(slot->tts_tableOid == RelationGetRelid(heapRelation)); + + /* + * for each index, form and insert the index tuple + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + if (!indexRelation->rd_indam->ammvccaware) + continue; + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. 
+ */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + index_delete(indexRelation, /* index relation */ + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + heapRelation, /* heap relation */ + indexInfo); /* index AM may need this */ + } +} + /* ---------------------------------------------------------------- * ExecCheckIndexConstraints * diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 7621ef28eb3..eb5934a9f12 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -1252,6 +1252,14 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (result) *result = TM_Ok; + /* + * Open the table's indexes, if we have not done so already, so that we + * can delete index entries. + */ + if (resultRelInfo->ri_RelationDesc->rd_rel->relhasindex && + resultRelInfo->ri_IndexRelationDescs == NULL) + ExecOpenIndices(resultRelInfo, false); + /* BEFORE ROW DELETE triggers */ if (resultRelInfo->ri_TrigDesc && resultRelInfo->ri_TrigDesc->trig_delete_before_row) @@ -1309,6 +1317,10 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, EState *estate = context->estate; TransitionCaptureState *ar_delete_trig_tcs; + /* delete index entries if necessary */ + if (resultRelInfo->ri_NumIndices > 0) + ExecDeleteIndexTuples(resultRelInfo, slot, context->estate); + /* * If this delete is the result of a partition key update that moved the * tuple to a new partition, put this row into the transition OLD TABLE, @@ -2037,11 +2049,15 @@ ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, /* insert index entries for tuple if necessary */ if (resultRelInfo->ri_NumIndices > 0 && (updateCxt->updateIndexes != TU_None)) - recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, context->estate, - true, false, + { + recheckIndexes = 
ExecUpdateIndexTuples(resultRelInfo, + slot, + oldSlot, + context->estate, + false, NULL, NIL, (updateCxt->updateIndexes == TU_Summarizing)); + } /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(context->estate, resultRelInfo, diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index f230c5ff9e7..46aa27ef9bb 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -373,6 +373,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type OptSchemaEltList parameter_name_list %type am_type +%type opt_for_tableam %type TriggerForSpec TriggerForType %type TriggerActionTime @@ -5870,17 +5871,21 @@ row_security_cmd: /***************************************************************************** * * QUERY: - * CREATE ACCESS METHOD name HANDLER handler_name + * CREATE ACCESS METHOD name TYPE am_type + * [FOR tableam_name] + * HANDLER handler_name * *****************************************************************************/ -CreateAmStmt: CREATE ACCESS METHOD name TYPE_P am_type HANDLER handler_name +CreateAmStmt: CREATE ACCESS METHOD name TYPE_P am_type + opt_for_tableam HANDLER handler_name { CreateAmStmt *n = makeNode(CreateAmStmt); n->amname = $4; - n->handler_name = $8; n->amtype = $6; + n->tableam_name = $7; + n->handler_name = $9; $$ = (Node *) n; } ; @@ -5890,6 +5895,11 @@ am_type: | TABLE { $$ = AMTYPE_TABLE; } ; +opt_for_tableam: + FOR name { $$ = $2; } + | /*EMPTY*/ { $$ = NULL; } + ; + /***************************************************************************** * * QUERIES : diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 36b41d69724..38195699990 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -116,6 +116,25 @@ typedef bool (*aminsert_function) (Relation indexRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); +/* update this tuple */ +typedef bool (*amupdate_function) (Relation indexRelation, + bool 
new_valid, + bool old_valid, + Datum *values, + bool *isnull, + Datum tupleid, + Datum *valuesOld, + bool *isnullOld, + Datum oldTupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); +/* delete this tuple */ +typedef bool (*amdelete_function) (Relation indexRelation, + Datum *values, bool *isnull, + Datum tupleid, + Relation heapRelation, + struct IndexInfo *indexInfo); /* cleanup after insert */ typedef void (*aminsertcleanup_function) (Relation indexRelation, @@ -256,6 +275,8 @@ typedef struct IndexAmRoutine bool amusemaintenanceworkmem; /* does AM store tuple information only at block granularity? */ bool amsummarizing; + /* does AM can provide MVCC */ + bool ammvccaware; /* OR of parallel vacuum flags. See vacuum.h for flags. */ uint8 amparallelvacuumoptions; /* type of data stored in index, or InvalidOid if variable */ @@ -272,6 +293,8 @@ typedef struct IndexAmRoutine ambuildempty_function ambuildempty; aminsert_function aminsert; aminsertcleanup_function aminsertcleanup; /* can be NULL */ + amupdate_function amupdate; + amdelete_function amdelete; ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 7c807f3cf74..ab34a7726ff 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -151,6 +151,21 @@ extern bool index_insert(Relation indexRelation, struct IndexInfo *indexInfo); extern void index_insert_cleanup(Relation indexRelation, struct IndexInfo *indexInfo); +extern bool index_update(Relation indexRelation, + bool new_valid, + bool old_valid, + Datum *values, + bool *isnull, + Datum tupleid, + Datum *valuesOld, + bool *isnullOld, + Datum oldTupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); +extern bool index_delete(Relation indexRelation, Datum *values, bool *isnull, + Datum tupleid, Relation 
heapRelation, + struct IndexInfo *indexInfo); extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 7e6e366ceac..a044e76c437 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -634,6 +634,16 @@ extern List *ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, bool noDupErr, bool *specConflict, List *arbiterIndexes, bool onlySummarizing); +extern List *ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *oldSlot, + EState *estate, + bool noDupErr, + bool *specConflict, List *arbiterIndexes, + bool onlySummarizing); +extern void ExecDeleteIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + EState *estate); extern bool ExecCheckIndexConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate, ItemPointer conflictTid, diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index ddc80007b34..82443390a85 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2993,6 +2993,7 @@ typedef struct CreateAmStmt char *amname; /* access method name */ List *handler_name; /* handler function name */ char amtype; /* type of access method */ + char *tableam_name; /* table AM name */ } CreateAmStmt; /* ---------------------- From 9171f28c2a649d51c935f29f2af56efdc060c3dd Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 12 Aug 2024 12:30:00 +0300 Subject: [PATCH 35/79] Hook to override index AM routine --- src/backend/access/index/amapi.c | 67 ++++++++++++++++++---- src/backend/catalog/index.c | 2 +- src/backend/commands/indexcmds.c | 4 +- src/backend/commands/opclasscmds.c | 9 +-- src/backend/executor/execAmi.c | 2 +- src/backend/replication/logical/relation.c | 2 +- src/backend/utils/adt/amutils.c | 4 +- src/backend/utils/adt/ruleutils.c | 2 +- src/backend/utils/cache/relcache.c | 2 +- 
src/include/access/amapi.h | 9 ++- 10 files changed, 76 insertions(+), 27 deletions(-) diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index 079fb7cba65..40fb78e71d2 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -16,25 +16,27 @@ #include "access/amapi.h" #include "access/htup_details.h" #include "catalog/pg_am.h" +#include "catalog/pg_class.h" +#include "catalog/pg_index.h" #include "catalog/pg_opclass.h" #include "utils/fmgrprotos.h" #include "utils/syscache.h" +IndexAMRoutineHookType IndexAMRoutineHook = NULL; -/* - * GetIndexAmRoutine - call the specified access method handler routine to get - * its IndexAmRoutine struct, which will be palloc'd in the caller's context. - * - * Note that if the amhandler function is built-in, this will not involve - * any catalog access. It's therefore safe to use this while bootstrapping - * indexes for the system catalogs. relcache.c relies on that. - */ IndexAmRoutine * -GetIndexAmRoutine(Oid amhandler) +GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler) { Datum datum; IndexAmRoutine *routine; + if (IndexAMRoutineHook != NULL) + { + routine = IndexAMRoutineHook(tamoid, amhandler); + if (routine) + return routine; + } + datum = OidFunctionCall0(amhandler); routine = (IndexAmRoutine *) DatumGetPointer(datum); @@ -45,6 +47,47 @@ GetIndexAmRoutine(Oid amhandler) return routine; } + +/* + * GetIndexAmRoutine - call the specified access method handler routine to get + * its IndexAmRoutine struct, which will be palloc'd in the caller's context. + * + * Note that if the amhandler function is built-in, this will not involve + * any catalog access. It's therefore safe to use this while bootstrapping + * indexes for the system catalogs. relcache.c relies on that. 
+ */ +IndexAmRoutine * +GetIndexAmRoutine(Oid indoid, Oid amhandler) +{ + HeapTuple ht_idx; + HeapTuple ht_tblrel; + Form_pg_index idxrec; + Form_pg_class tblrelrec; + Oid indrelid; + Oid tamoid; + + if (!OidIsValid((indoid)) || indoid < FirstNormalObjectId) + return GetIndexAmRoutineWithTableAM(HEAP_TABLE_AM_OID, amhandler); + + ht_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indoid)); + if (!HeapTupleIsValid(ht_idx)) + elog(ERROR, "cache lookup failed for index %u", indoid); + idxrec = (Form_pg_index) GETSTRUCT(ht_idx); + Assert(indoid == idxrec->indexrelid); + indrelid = idxrec->indrelid; + + ht_tblrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indrelid)); + if (!HeapTupleIsValid(ht_tblrel)) + elog(ERROR, "cache lookup failed for relation %u", indrelid); + tblrelrec = (Form_pg_class) GETSTRUCT(ht_tblrel); + tamoid = tblrelrec->relam; + + ReleaseSysCache(ht_tblrel); + ReleaseSysCache(ht_idx); + + return GetIndexAmRoutineWithTableAM(tamoid, amhandler); +} + /* * GetIndexAmRoutineByAmId - look up the handler of the index access method * with the given OID, and get its IndexAmRoutine struct. @@ -53,7 +96,7 @@ GetIndexAmRoutine(Oid amhandler) * noerror is true, else throws error. */ IndexAmRoutine * -GetIndexAmRoutineByAmId(Oid amoid, bool noerror) +GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror) { HeapTuple tuple; Form_pg_am amform; @@ -103,7 +146,7 @@ GetIndexAmRoutineByAmId(Oid amoid, bool noerror) ReleaseSysCache(tuple); /* And finally, call the handler function to get the API struct. 
*/ - return GetIndexAmRoutine(amhandler); + return GetIndexAmRoutine(indoid, amhandler); } @@ -129,7 +172,7 @@ amvalidate(PG_FUNCTION_ARGS) ReleaseSysCache(classtup); - amroutine = GetIndexAmRoutineByAmId(amoid, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); if (amroutine->amvalidate == NULL) elog(ERROR, "function amvalidate is not defined for index access method %u", diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 0250d68dd86..b447a080da5 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -292,7 +292,7 @@ ConstructTupleDescriptor(Relation heapRelation, int i; /* We need access to the index AM's API struct */ - amroutine = GetIndexAmRoutineByAmId(accessMethodId, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, accessMethodId, false); /* ... and to the table's tuple descriptor */ heapTupDesc = RelationGetDescr(heapRelation); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index b0fa957a456..e6598e40317 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -222,7 +222,7 @@ CheckIndexCompatible(Oid oldId, accessMethodName))); accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); accessMethodId = accessMethodForm->oid; - amRoutine = GetIndexAmRoutine(accessMethodForm->amhandler); + amRoutine = GetIndexAmRoutine(oldId, accessMethodForm->amhandler); ReleaseSysCache(tuple); amcanorder = amRoutine->amcanorder; @@ -849,7 +849,7 @@ DefineIndex(Oid tableId, } accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); accessMethodId = accessMethodForm->oid; - amRoutine = GetIndexAmRoutine(accessMethodForm->amhandler); + amRoutine = GetIndexAmRoutineWithTableAM(rel->rd_rel->relam, accessMethodForm->amhandler); pgstat_progress_update_param(PROGRESS_CREATEIDX_ACCESS_METHOD_OID, accessMethodId); diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index d024c547cc2..0d53a20ff0a 100644 --- 
a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -42,6 +42,7 @@ #include "parser/parse_oper.h" #include "parser/parse_type.h" #include "utils/acl.h" +#include "postgres_ext.h" #include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" @@ -377,7 +378,7 @@ DefineOpClass(CreateOpClassStmt *stmt) amform = (Form_pg_am) GETSTRUCT(tup); amoid = amform->oid; - amroutine = GetIndexAmRoutineByAmId(amoid, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); ReleaseSysCache(tup); maxOpNumber = amroutine->amstrategies; @@ -835,7 +836,7 @@ AlterOpFamily(AlterOpFamilyStmt *stmt) amform = (Form_pg_am) GETSTRUCT(tup); amoid = amform->oid; - amroutine = GetIndexAmRoutineByAmId(amoid, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); ReleaseSysCache(tup); maxOpNumber = amroutine->amstrategies; @@ -882,7 +883,7 @@ AlterOpFamilyAdd(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid, int maxOpNumber, int maxProcNumber, int optsProcNumber, List *items) { - IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(amoid, false); + IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); List *operators; /* OpFamilyMember list for operators */ List *procedures; /* OpFamilyMember list for support procs */ ListCell *l; @@ -1165,7 +1166,7 @@ assignOperTypes(OpFamilyMember *member, Oid amoid, Oid typeoid) * the family has been created but not yet populated with the required * operators.) 
*/ - IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(amoid, false); + IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); if (!amroutine->amcanorderbyop) ereport(ERROR, diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 3289e3e0219..1a7f6ae2c9b 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -613,7 +613,7 @@ IndexSupportsBackwardScan(Oid indexid) idxrelrec = (Form_pg_class) GETSTRUCT(ht_idxrel); /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutineByAmId(idxrelrec->relam, false); + amroutine = GetIndexAmRoutineByAmId(indexid, idxrelrec->relam, false); result = amroutine->amcanbackward; diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c index f139e7b01e9..4429127c434 100644 --- a/src/backend/replication/logical/relation.c +++ b/src/backend/replication/logical/relation.c @@ -834,7 +834,7 @@ IsIndexUsableForReplicaIdentityFull(IndexInfo *indexInfo, AttrMap *attrmap) IndexAmRoutine *amroutine; /* The given index access method must implement amgettuple. */ - amroutine = GetIndexAmRoutineByAmId(indexInfo->ii_Am, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, indexInfo->ii_Am, false); Assert(amroutine->amgettuple != NULL); } #endif diff --git a/src/backend/utils/adt/amutils.c b/src/backend/utils/adt/amutils.c index dd39a994c8d..b7ebe6a5f76 100644 --- a/src/backend/utils/adt/amutils.c +++ b/src/backend/utils/adt/amutils.c @@ -195,7 +195,7 @@ indexam_property(FunctionCallInfo fcinfo, /* * Get AM information. If we don't have a valid AM OID, return NULL. 
*/ - routine = GetIndexAmRoutineByAmId(amoid, true); + routine = GetIndexAmRoutineByAmId(index_oid, amoid, true); if (routine == NULL) PG_RETURN_NULL(); @@ -455,7 +455,7 @@ pg_indexam_progress_phasename(PG_FUNCTION_ARGS) IndexAmRoutine *routine; char *name; - routine = GetIndexAmRoutineByAmId(amoid, true); + routine = GetIndexAmRoutineByAmId(InvalidOid, amoid, true); if (routine == NULL || !routine->ambuildphasename) PG_RETURN_NULL(); diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index d1139a268f3..ede7cf34314 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1323,7 +1323,7 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, amrec = (Form_pg_am) GETSTRUCT(ht_am); /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutine(amrec->amhandler); + amroutine = GetIndexAmRoutine(indexrelid, amrec->amhandler); /* * Get the index expressions, if any. (NOTE: we do not use the relcache diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 31066221b27..d1b5c9a65b0 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1413,7 +1413,7 @@ InitIndexAmRoutine(Relation relation) * Call the amhandler in current, short-lived memory context, just in case * it leaks anything (it probably won't, but let's be paranoid). */ - tmp = GetIndexAmRoutine(relation->rd_amhandler); + tmp = GetIndexAmRoutine(relation->rd_id, relation->rd_amhandler); /* OK, now transfer the data into relation's rd_indexcxt. 
*/ cached = (IndexAmRoutine *) MemoryContextAlloc(relation->rd_indexcxt, diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 38195699990..86a93090180 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -320,7 +320,12 @@ typedef struct IndexAmRoutine /* Functions in access/index/amapi.c */ -extern IndexAmRoutine *GetIndexAmRoutine(Oid amhandler); -extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid amoid, bool noerror); +extern IndexAmRoutine *GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutine(Oid indoid, Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror); + +typedef IndexAmRoutine *(*IndexAMRoutineHookType) (Oid tamoid, Oid amhandler); + +extern IndexAMRoutineHookType IndexAMRoutineHook; #endif /* AMAPI_H */ From bfd9c1b75a749ceddb238852ff4a957d4011e165 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Thu, 5 Sep 2024 00:03:23 +0200 Subject: [PATCH 36/79] Always building child/root maps for relations with ROW_REF_ROWID --- src/backend/executor/execUtils.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index d8dd36685c5..0ac74db32c3 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -1211,9 +1211,19 @@ ExecGetChildToRootMap(ResultRelInfo *resultRelInfo) ResultRelInfo *rootRelInfo = resultRelInfo->ri_RootResultRelInfo; if (rootRelInfo) - resultRelInfo->ri_ChildToRootMap = - convert_tuples_by_name(RelationGetDescr(resultRelInfo->ri_RelationDesc), - RelationGetDescr(rootRelInfo->ri_RelationDesc)); + { + TupleDesc indesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + TupleDesc outdesc = RelationGetDescr(rootRelInfo->ri_RelationDesc); + AttrMap *attrMap; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) != ROW_REF_ROWID) + attrMap = 
build_attrmap_by_name_if_req(indesc, outdesc, false); + else + attrMap = build_attrmap_by_name(indesc, outdesc, false); + if (attrMap) + resultRelInfo->ri_ChildToRootMap = + convert_tuples_by_name_attrmap(indesc, outdesc, attrMap); + } else /* this isn't a child result rel */ resultRelInfo->ri_ChildToRootMap = NULL; @@ -1250,8 +1260,10 @@ ExecGetRootToChildMap(ResultRelInfo *resultRelInfo, EState *estate) * to ignore by passing true for missing_ok. */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - attrMap = build_attrmap_by_name_if_req(indesc, outdesc, - !childrel->rd_rel->relispartition); + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) != ROW_REF_ROWID) + attrMap = build_attrmap_by_name_if_req(indesc, outdesc, !childrel->rd_rel->relispartition); + else + attrMap = build_attrmap_by_name(indesc, outdesc, !childrel->rd_rel->relispartition); if (attrMap) resultRelInfo->ri_RootToChildMap = convert_tuples_by_name_attrmap(indesc, outdesc, attrMap); From ddad327479a6a56d01109e5ffa5cb8e60871881b Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Tue, 20 Aug 2024 14:09:51 +0200 Subject: [PATCH 37/79] Don't run internal btree _bt_getrootheight on non-btree in get_relation_info --- src/backend/optimizer/util/plancat.c | 3 ++- src/include/optimizer/plancat.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index ac82a021e97..48b52804632 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -58,6 +58,7 @@ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; /* Hook for plugins to get control in get_relation_info() */ get_relation_info_hook_type get_relation_info_hook = NULL; +skip_tree_height_hook_type skip_tree_height_hook = NULL; static void get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, @@ -485,7 +486,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->tuples = 
rel->tuples; } - if (info->relam == BTREE_AM_OID) + if (info->relam == BTREE_AM_OID && (!skip_tree_height_hook || !skip_tree_height_hook(indexRelation))) { /* * For btrees, get tree height while we have the index diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index 1206f60a715..98e5494e300 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -24,6 +24,9 @@ typedef void (*get_relation_info_hook_type) (PlannerInfo *root, RelOptInfo *rel); extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook; +typedef bool (*skip_tree_height_hook_type) (Relation indexRelation); +extern PGDLLIMPORT skip_tree_height_hook_type skip_tree_height_hook; + extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel); From ee008439304b8c028d4e3652fe30f2e26dd176d3 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Tue, 17 Sep 2024 01:58:24 +0300 Subject: [PATCH 38/79] Fix handling tupleid in logical replication --- src/backend/access/table/tableam.c | 8 ++-- src/backend/executor/execReplication.c | 54 +++++++++++++++++------- src/backend/replication/logical/worker.c | 15 +++---- src/include/access/tableam.h | 4 +- 4 files changed, 50 insertions(+), 31 deletions(-) diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index daac7275ed2..865d30b61af 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -288,7 +288,7 @@ simple_table_tuple_insert(Relation rel, TupleTableSlot *slot) * via ereport(). 
*/ void -simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, +simple_table_tuple_delete(Relation rel, Datum tupleid, Snapshot snapshot, TupleTableSlot *oldSlot) { TM_Result result; @@ -299,7 +299,7 @@ simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, if (oldSlot) options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_delete(rel, PointerGetDatum(tid), + result = table_tuple_delete(rel, tupleid, GetCurrentCommandId(true), snapshot, InvalidSnapshot, options, @@ -340,7 +340,7 @@ simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, * via ereport(). */ void -simple_table_tuple_update(Relation rel, ItemPointer otid, +simple_table_tuple_update(Relation rel, Datum tupleid, TupleTableSlot *slot, Snapshot snapshot, TU_UpdateIndexes *update_indexes, @@ -355,7 +355,7 @@ simple_table_tuple_update(Relation rel, ItemPointer otid, if (oldSlot) options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_update(rel, PointerGetDatum(otid), slot, + result = table_tuple_update(rel, tupleid, slot, GetCurrentCommandId(true), snapshot, InvalidSnapshot, options, diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 5c2a03d257a..f3aea9b1813 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -166,6 +166,25 @@ build_replindex_scan_key(ScanKey skey, Relation rel, Relation idxrel, return skey_attoff; } +static Datum +slot_get_tupleid(Relation rel, TupleTableSlot *slot) +{ + Datum tupleid; + + if (table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + tupleid = PointerGetDatum(&slot->tts_tid); + } + + return tupleid; +} + /* * Search the relation 'rel' for tuple using the index. 
* @@ -250,7 +269,7 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, PointerGetDatum(&(outslot->tts_tid)), + res = table_tuple_lock(rel, slot_get_tupleid(rel, outslot), GetActiveSnapshot(), outslot, GetCurrentCommandId(false), @@ -435,7 +454,7 @@ RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, PointerGetDatum(&(outslot->tts_tid)), + res = table_tuple_lock(rel, slot_get_tupleid(rel, outslot), GetActiveSnapshot(), outslot, GetCurrentCommandId(false), @@ -559,7 +578,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, { bool skip_tuple = false; Relation rel = resultRelInfo->ri_RelationDesc; - ItemPointer tid = &(searchslot->tts_tid); + Datum tupleid = slot_get_tupleid(rel, searchslot); /* * We support only non-system tables, with @@ -575,7 +594,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_update_before_row) { if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, - PointerGetDatum(tid), NULL, slot, NULL, NULL)) + tupleid, NULL, slot, NULL, NULL)) skip_tuple = true; /* "do nothing" */ } @@ -597,16 +616,17 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, if (rel->rd_rel->relispartition) ExecPartitionCheck(resultRelInfo, slot, estate, true); - if (resultRelInfo->ri_TrigDesc && - resultRelInfo->ri_TrigDesc->trig_update_after_row) - oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); - simple_table_tuple_update(rel, tid, slot, estate->es_snapshot, + simple_table_tuple_update(rel, tupleid, slot, estate->es_snapshot, &update_indexes, oldSlot); if (resultRelInfo->ri_NumIndices > 0 && (update_indexes != TU_None)) - recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, estate, true, false, + recheckIndexes = ExecUpdateIndexTuples(resultRelInfo, + slot, + oldSlot, + estate, + 
false, NULL, NIL, (update_indexes == TU_Summarizing)); @@ -633,7 +653,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, { bool skip_tuple = false; Relation rel = resultRelInfo->ri_RelationDesc; - ItemPointer tid = &searchslot->tts_tid; + Datum tupleid = slot_get_tupleid(rel, searchslot); CheckCmdReplicaIdentity(rel, CMD_DELETE); @@ -642,19 +662,21 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_delete_before_row) { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - PointerGetDatum(tid), NULL, NULL, NULL, NULL); + tupleid, NULL, NULL, NULL, NULL); } if (!skip_tuple) { TupleTableSlot *oldSlot = NULL; - if (resultRelInfo->ri_TrigDesc && - resultRelInfo->ri_TrigDesc->trig_delete_after_row) - oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); /* OK, delete the tuple */ - simple_table_tuple_delete(rel, tid, estate->es_snapshot, oldSlot); + simple_table_tuple_delete(rel, tupleid, estate->es_snapshot, oldSlot); + + /* delete index entries if necessary */ + if (resultRelInfo->ri_NumIndices > 0) + ExecDeleteIndexTuples(resultRelInfo, oldSlot, estate); /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 1bff6c92dda..222bae1afd0 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -2419,9 +2419,8 @@ apply_handle_insert(StringInfo s) /* Initialize the executor state. 
*/ edata = create_edata_for_relation(rel); estate = edata->estate; - remoteslot = ExecInitExtraTupleSlot(estate, - RelationGetDescr(rel->localrel), - &TTSOpsVirtual); + remoteslot = table_slot_create(rel->localrel, + &estate->es_tupleTable); /* Process and store remote tuple in the slot */ oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); @@ -2579,9 +2578,8 @@ apply_handle_update(StringInfo s) /* Initialize the executor state. */ edata = create_edata_for_relation(rel); estate = edata->estate; - remoteslot = ExecInitExtraTupleSlot(estate, - RelationGetDescr(rel->localrel), - &TTSOpsVirtual); + remoteslot = table_slot_create(rel->localrel, + &estate->es_tupleTable); /* * Populate updatedCols so that per-column triggers can fire, and so @@ -2759,9 +2757,8 @@ apply_handle_delete(StringInfo s) /* Initialize the executor state. */ edata = create_edata_for_relation(rel); estate = edata->estate; - remoteslot = ExecInitExtraTupleSlot(estate, - RelationGetDescr(rel->localrel), - &TTSOpsVirtual); + remoteslot = table_slot_create(rel->localrel, + &estate->es_tupleTable); /* Build the search tuple. 
*/ oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 25748822386..5c55a5f78a5 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -2098,10 +2098,10 @@ table_tuple_is_current(Relation rel, TupleTableSlot *slot) */ extern void simple_table_tuple_insert(Relation rel, TupleTableSlot *slot); -extern void simple_table_tuple_delete(Relation rel, ItemPointer tid, +extern void simple_table_tuple_delete(Relation rel, Datum tupleid, Snapshot snapshot, TupleTableSlot *oldSlot); -extern void simple_table_tuple_update(Relation rel, ItemPointer otid, +extern void simple_table_tuple_update(Relation rel, Datum tupleid, TupleTableSlot *slot, Snapshot snapshot, TU_UpdateIndexes *update_indexes, TupleTableSlot *oldSlot); From 8f90c69c557ca8a7ced0f10018b5cabce71c9609 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 27 Sep 2024 14:26:40 +0200 Subject: [PATCH 39/79] New csn snapshot format Added xlogptr and xmin to determine right order of transactions when decoding on replica --- src/backend/utils/time/snapmgr.c | 10 +++++++--- src/include/utils/snapshot.h | 8 +++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index f498d9fb42e..f3d47159fa6 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -205,7 +205,7 @@ typedef struct SerializedSnapshotData CommandId curcid; TimestampTz whenTaken; XLogRecPtr lsn; - CommitSeqNo snapshotcsn; + CSNSnapshotData csnSnapshotData; uint64 undoRegularLocation; uint64 undoRegularXmin; uint64 undoSystemLocation; @@ -1763,7 +1763,9 @@ SerializeSnapshot(Snapshot snapshot, char *start_address) serialized_snapshot.curcid = snapshot->curcid; serialized_snapshot.whenTaken = snapshot->whenTaken; serialized_snapshot.lsn = snapshot->lsn; - serialized_snapshot.snapshotcsn = snapshot->snapshotcsn; + 
serialized_snapshot.csnSnapshotData.xmin = snapshot->csnSnapshotData.xmin; + serialized_snapshot.csnSnapshotData.snapshotcsn = snapshot->csnSnapshotData.snapshotcsn; + serialized_snapshot.csnSnapshotData.xlogptr = snapshot->csnSnapshotData.xlogptr; serialized_snapshot.undoRegularXmin = snapshot->undoRegularLocationPhNode.xmin; serialized_snapshot.undoRegularLocation = snapshot->undoRegularLocationPhNode.undoLocation; serialized_snapshot.undoSystemXmin = snapshot->undoSystemLocationPhNode.xmin; @@ -1843,7 +1845,9 @@ RestoreSnapshot(char *start_address) snapshot->whenTaken = serialized_snapshot.whenTaken; snapshot->lsn = serialized_snapshot.lsn; snapshot->snapXactCompletionCount = 0; - snapshot->snapshotcsn = serialized_snapshot.snapshotcsn; + snapshot->csnSnapshotData.xmin = serialized_snapshot.csnSnapshotData.xmin; + snapshot->csnSnapshotData.snapshotcsn = serialized_snapshot.csnSnapshotData.snapshotcsn; + snapshot->csnSnapshotData.xlogptr = serialized_snapshot.csnSnapshotData.xlogptr; snapshot->undoRegularLocationPhNode.xmin = serialized_snapshot.undoRegularXmin; snapshot->undoRegularLocationPhNode.undoLocation = serialized_snapshot.undoRegularLocation; snapshot->undoSystemLocationPhNode.xmin = serialized_snapshot.undoSystemXmin; diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 49c913b12f8..6052c760056 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -129,6 +129,12 @@ typedef struct pairingheap_node ph_node; } RetainUndoLocationPHNode; +typedef struct CSNSnapshotData { + uint64 xmin; + CommitSeqNo snapshotcsn; + XLogRecPtr xlogptr; +} CSNSnapshotData; + /* * Struct representing all kind of possible snapshots. 
* @@ -224,7 +230,7 @@ typedef struct SnapshotData RetainUndoLocationPHNode undoRegularLocationPhNode; RetainUndoLocationPHNode undoSystemLocationPhNode; - CommitSeqNo snapshotcsn; + CSNSnapshotData csnSnapshotData; } SnapshotData; typedef void (*snapshot_hook_type) (Snapshot snapshot); From 8edff8b91d2be193d9493f188c08d4bf2a916cd8 Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Mon, 7 Oct 2024 18:55:44 +0400 Subject: [PATCH 40/79] expose functions that became private in PG17 due to ResourceOwner and SAOP changes in PG17 --- src/backend/access/nbtree/nbtutils.c | 3 +-- src/backend/utils/cache/catcache.c | 5 +++-- src/include/access/nbtree.h | 1 + src/include/utils/resowner_private.h | 33 ++++++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 src/include/utils/resowner_private.h diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index a448a547082..206a5d81432 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -73,7 +73,6 @@ static int _bt_binsrch_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, BTArrayKeyInfo *array, ScanKey cur, int32 *set_elem_result); -static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, TupleDesc tupdesc, int tupnatts, @@ -1377,7 +1376,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) * On false result, the scankeys stay the same, and the array keys are not * advanced (every array remains at its final element for scan direction).
*/ -static bool +bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 5d5bf4fd806..4134bae67e6 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -39,6 +39,7 @@ #include "utils/memutils.h" #include "utils/rel.h" #include "utils/resowner.h" +#include "utils/resowner_private.h" #include "utils/syscache.h" /* @@ -159,7 +160,7 @@ static const ResourceOwnerDesc catlistref_resowner_desc = }; /* Convenience wrappers over ResourceOwnerRemember/Forget */ -static inline void +void ResourceOwnerRememberCatCacheRef(ResourceOwner owner, HeapTuple tuple) { ResourceOwnerRemember(owner, PointerGetDatum(tuple), &catcache_resowner_desc); @@ -169,7 +170,7 @@ ResourceOwnerForgetCatCacheRef(ResourceOwner owner, HeapTuple tuple) { ResourceOwnerForget(owner, PointerGetDatum(tuple), &catcache_resowner_desc); } -static inline void +void ResourceOwnerRememberCatCacheListRef(ResourceOwner owner, CatCList *list) { ResourceOwnerRemember(owner, PointerGetDatum(list), &catlistref_resowner_desc); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 049ebf72b7b..9ba149aa47d 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1288,6 +1288,7 @@ extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); extern void _bt_freestack(BTStack stack); extern bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); +extern bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); extern void _bt_preprocess_keys(IndexScanDesc scan); extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts); diff --git a/src/include/utils/resowner_private.h b/src/include/utils/resowner_private.h new file mode 
100644 index 00000000000..d32a3a42ef0 --- /dev/null +++ b/src/include/utils/resowner_private.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * resowner_private.h + * POSTGRES resource owner private definitions. + * + * See utils/resowner/README for more info. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/resowner_private.h + * + *------------------------------------------------------------------------- + */ +#ifndef RESOWNER_PRIVATE_H +#define RESOWNER_PRIVATE_H + +#include "storage/dsm.h" +#include "storage/fd.h" +#include "storage/lock.h" +#include "utils/catcache.h" +#include "utils/plancache.h" +#include "utils/resowner.h" +#include "utils/snapshot.h" + + +extern void ResourceOwnerRememberCatCacheRef(ResourceOwner owner, + HeapTuple tuple); +extern void ResourceOwnerRememberCatCacheListRef(ResourceOwner owner, + CatCList *list); + +#endif /* RESOWNER_PRIVATE_H */ From e7fc0dda0398191e8f1572ba9b1d699e8eec3759 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 27 Sep 2024 14:26:40 +0200 Subject: [PATCH 41/79] New CSN snapshot format * Add xlogptr and xmin to determine right order of transactions when decoding on replica. * Add CSN snapshot data to snapshot builder. * Record CSN to the running xids and restore it during logical decoding to the snapshot builder. * Add function to update CSN snapshot data in snapshot builder. * Update CSN snapshot LSN in snapshot building after each transaction commit. * Restore CSN snapshot data in SnapBuildBuildSnapshot(). 
--- src/backend/replication/logical/snapbuild.c | 16 ++++++++++++++++ src/backend/storage/ipc/procarray.c | 1 + src/backend/storage/ipc/standby.c | 1 + src/include/replication/snapbuild.h | 2 ++ src/include/storage/standby.h | 1 + src/include/storage/standbydefs.h | 1 + src/include/utils/snapshot.h | 3 ++- 7 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 2972156c071..be024cb586c 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -219,6 +219,8 @@ struct SnapBuild */ TransactionId next_phase_at; + CSNSnapshotData csnSnapshotData; + /* * Array of transactions which could have catalog changes that committed * between xmin and xmax. @@ -576,6 +578,8 @@ SnapBuildBuildSnapshot(SnapBuild *builder) snapshot->regd_count = 0; snapshot->snapXactCompletionCount = 0; + snapshot->csnSnapshotData = builder->csnSnapshotData; + return snapshot; } @@ -1097,6 +1101,8 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, TransactionId xmax = xid; + builder->csnSnapshotData.xlogptr = lsn; + /* * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor * will they be part of a snapshot. So we don't need to record anything. @@ -1314,6 +1320,9 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact * we hit fast paths in heapam_visibility.c. 
*/ builder->xmin = running->oldestRunningXid; + builder->csnSnapshotData.snapshotcsn = running->csn; + builder->csnSnapshotData.xmin = 0; + builder->csnSnapshotData.xlogptr = lsn; /* Remove transactions we don't need to keep track off anymore */ SnapBuildPurgeOlderTxn(builder); @@ -2232,3 +2241,10 @@ SnapBuildSnapshotExists(XLogRecPtr lsn) return ret == 0; } + +void +SnapBuildUpdateCSNSnaphot(SnapBuild *builder, + CSNSnapshotData *csnSnapshotData) +{ + builder->csnSnapshotData = *csnSnapshotData; +} diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 593cf8f9252..fd53e6b9df6 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -2858,6 +2858,7 @@ GetRunningTransactionData(void) CurrentRunningXacts->oldestRunningXid = oldestRunningXid; CurrentRunningXacts->oldestDatabaseRunningXid = oldestDatabaseRunningXid; CurrentRunningXacts->latestCompletedXid = latestCompletedXid; + CurrentRunningXacts->csn = pg_atomic_read_u64(&TransamVariables->nextCommitSeqNo); Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid)); Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid)); diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 872679ca447..17ddeb893c6 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -1353,6 +1353,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) xlrec.nextXid = CurrRunningXacts->nextXid; xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid; xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; + xlrec.csn = CurrRunningXacts->csn; /* Header */ XLogBeginInsert(); diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h index 6eee98557ad..4a74c89c358 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -92,6 +92,8 @@ extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, 
extern void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, struct xl_running_xacts *running); extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn); +extern void SnapBuildUpdateCSNSnaphot(SnapBuild *builder, + CSNSnapshotData *csnSnapshotData); extern bool SnapBuildSnapshotExists(XLogRecPtr lsn); diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index cce0bc521e7..4a42f9a767b 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -93,6 +93,7 @@ typedef struct RunningTransactionsData TransactionId oldestDatabaseRunningXid; /* same as above, but within the * current database */ TransactionId latestCompletedXid; /* so we can set xmax */ + CommitSeqNo csn; /* current csn */ TransactionId *xids; /* array of (sub)xids still running */ } RunningTransactionsData; diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h index fe12f463a86..394bc42052f 100644 --- a/src/include/storage/standbydefs.h +++ b/src/include/storage/standbydefs.h @@ -52,6 +52,7 @@ typedef struct xl_running_xacts TransactionId nextXid; /* xid from TransamVariables->nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ TransactionId latestCompletedXid; /* so we can set xmax */ + CommitSeqNo csn; /* current csn */ TransactionId xids[FLEXIBLE_ARRAY_MEMBER]; } xl_running_xacts; diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 6052c760056..9eec035622d 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -129,7 +129,8 @@ typedef struct pairingheap_node ph_node; } RetainUndoLocationPHNode; -typedef struct CSNSnapshotData { +typedef struct CSNSnapshotData +{ uint64 xmin; CommitSeqNo snapshotcsn; XLogRecPtr xlogptr; From 39bb781eba09246ea02450e9e1f1d6960fa9e678 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 3 Oct 2024 13:12:01 +0300 Subject: [PATCH 42/79] Restart archiver during PM_SHUTDOWN postmaster stage That 
allows S3 mode to finish WAL archiving if needed. --- src/backend/postmaster/postmaster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index d0ae33c08f7..f3ac478dedf 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -436,7 +436,7 @@ static void MaybeStartSlotSyncWorker(void); * even during recovery. */ #define PgArchStartupAllowed() \ - (((XLogArchivingActive() && pmState == PM_RUN) || \ + (((XLogArchivingActive() && (pmState == PM_RUN || pmState == PM_SHUTDOWN)) || \ (XLogArchivingAlways() && \ (pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY))) && \ PgArchCanRestart()) From 0d29f800fda3ba99a2aa27f44d392846ec24465b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Tue, 8 Oct 2024 21:31:33 +0300 Subject: [PATCH 43/79] Add handling of CSN snapshot in some places of snapbuild.c --- src/backend/replication/logical/snapbuild.c | 5 +++++ src/backend/utils/time/snapmgr.c | 1 + 2 files changed, 6 insertions(+) diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index be024cb586c..3b4d16445b5 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -677,6 +677,7 @@ SnapBuildInitialSnapshot(SnapBuild *builder) snap->snapshot_type = SNAPSHOT_MVCC; snap->xcnt = newxcnt; snap->xip = newxip; + snap->csnSnapshotData = builder->csnSnapshotData; return snap; } @@ -1293,6 +1294,10 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact ReorderBufferTXN *txn; TransactionId xmin; + builder->csnSnapshotData.snapshotcsn = running->csn; + builder->csnSnapshotData.xmin = 0; + builder->csnSnapshotData.xlogptr = lsn; + /* * If we're not consistent yet, inspect the record to see whether it * allows to get closer to being consistent. 
If we are consistent, dump diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index f3d47159fa6..4206c1bfd0b 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -516,6 +516,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, CurrentSnapshot->xmin = sourcesnap->xmin; CurrentSnapshot->xmax = sourcesnap->xmax; CurrentSnapshot->xcnt = sourcesnap->xcnt; + CurrentSnapshot->csnSnapshotData = sourcesnap->csnSnapshotData; Assert(sourcesnap->xcnt <= GetMaxSnapshotXidCount()); if (sourcesnap->xcnt > 0) memcpy(CurrentSnapshot->xip, sourcesnap->xip, From 7e7de4624af990949fce47b4393b309cc85c3b61 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 14 Oct 2024 16:22:14 +0300 Subject: [PATCH 44/79] Move CheckPoint_hook() call after CheckPointBuffers() That allows to process flushed buffers in CheckPoint_hook(). --- src/backend/access/transam/xlog.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1963127bb16..14f33568b3c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7520,8 +7520,6 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { - if (CheckPoint_hook) - CheckPoint_hook(checkPointRedo, flags); CheckPointRelationMap(); CheckPointReplicationSlots(flags & CHECKPOINT_IS_SHUTDOWN); CheckPointSnapBuild(); @@ -7538,6 +7536,9 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointPredicate(); CheckPointBuffers(flags); + if (CheckPoint_hook) + CheckPoint_hook(checkPointRedo, flags); + /* Perform all queued up fsyncs */ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); CheckpointStats.ckpt_sync_t = GetCurrentTimestamp(); From ad8c377ed4148a463670375cf4be5aff0ce447f7 Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Tue, 15 Oct 2024 22:39:08 
+0400 Subject: [PATCH 45/79] Restore GetIndexAmRoutine signature for compatibility with other callers Use GetIndexAmRoutineExtended instead for all Orioledb extensibility. --- src/backend/access/index/amapi.c | 11 ++++++++--- src/backend/commands/indexcmds.c | 2 +- src/backend/utils/adt/ruleutils.c | 2 +- src/backend/utils/cache/relcache.c | 2 +- src/include/access/amapi.h | 3 ++- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index 40fb78e71d2..a8f1c580acd 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -47,7 +47,6 @@ GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler) return routine; } - /* * GetIndexAmRoutine - call the specified access method handler routine to get * its IndexAmRoutine struct, which will be palloc'd in the caller's context. @@ -57,7 +56,13 @@ GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler) * indexes for the system catalogs. relcache.c relies on that. */ IndexAmRoutine * -GetIndexAmRoutine(Oid indoid, Oid amhandler) +GetIndexAmRoutine(Oid amhandler) +{ + return GetIndexAmRoutineExtended(InvalidOid, amhandler); +} + +IndexAmRoutine * +GetIndexAmRoutineExtended(Oid indoid, Oid amhandler) { HeapTuple ht_idx; HeapTuple ht_tblrel; @@ -146,7 +151,7 @@ GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror) ReleaseSysCache(tuple); /* And finally, call the handler function to get the API struct. 
*/ - return GetIndexAmRoutine(indoid, amhandler); + return GetIndexAmRoutineExtended(indoid, amhandler); } diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index e6598e40317..0782ba10a66 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -222,7 +222,7 @@ CheckIndexCompatible(Oid oldId, accessMethodName))); accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); accessMethodId = accessMethodForm->oid; - amRoutine = GetIndexAmRoutine(oldId, accessMethodForm->amhandler); + amRoutine = GetIndexAmRoutineExtended(oldId, accessMethodForm->amhandler); ReleaseSysCache(tuple); amcanorder = amRoutine->amcanorder; diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index ede7cf34314..d90f7747ae5 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1323,7 +1323,7 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, amrec = (Form_pg_am) GETSTRUCT(ht_am); /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutine(indexrelid, amrec->amhandler); + amroutine = GetIndexAmRoutineExtended(indexrelid, amrec->amhandler); /* * Get the index expressions, if any. (NOTE: we do not use the relcache diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index d1b5c9a65b0..4c0950bc8f0 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1413,7 +1413,7 @@ InitIndexAmRoutine(Relation relation) * Call the amhandler in current, short-lived memory context, just in case * it leaks anything (it probably won't, but let's be paranoid). */ - tmp = GetIndexAmRoutine(relation->rd_id, relation->rd_amhandler); + tmp = GetIndexAmRoutineExtended(relation->rd_id, relation->rd_amhandler); /* OK, now transfer the data into relation's rd_indexcxt. 
*/ cached = (IndexAmRoutine *) MemoryContextAlloc(relation->rd_indexcxt, diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 86a93090180..4542444bc83 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -321,7 +321,8 @@ typedef struct IndexAmRoutine /* Functions in access/index/amapi.c */ extern IndexAmRoutine *GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler); -extern IndexAmRoutine *GetIndexAmRoutine(Oid indoid, Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutine(Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutineExtended(Oid indoid, Oid amhandler); extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror); typedef IndexAmRoutine *(*IndexAMRoutineHookType) (Oid tamoid, Oid amhandler); From 5087dca601f8ace9564580e91698c07c69587ede Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Wed, 16 Oct 2024 19:23:07 +0400 Subject: [PATCH 46/79] Make index insert compatible with outside callers We split aminsert method to aminsert and aminsertextended. aminsert is a method for indexes implemented in other extensions, it accepts ItemPointer tupleid. aminsertextended is for internal Postgres indexes and Orioledb, it accepts Datum tupleid. They are not supposed to call aminsert method, so that it is set NULL for them. We can not rely that extensions are aware of aminsertextended, so index_insert() calls aminsert if it's not NULL preferentially. Signature of index_insert() is reverted so that it could be called by other extensions. Datum tupleid is confined inside index_insert method. 
--- contrib/bloom/blutils.c | 3 ++- doc/src/sgml/indexam.sgml | 1 + src/backend/access/brin/brin.c | 3 ++- src/backend/access/common/toast_internals.c | 2 +- src/backend/access/gin/ginutil.c | 3 ++- src/backend/access/gist/gist.c | 3 ++- src/backend/access/hash/hash.c | 3 ++- src/backend/access/heap/heapam_handler.c | 2 +- src/backend/access/index/indexam.c | 21 ++++++++++++++++--- src/backend/access/nbtree/nbtree.c | 3 ++- src/backend/access/spgist/spgutils.c | 3 ++- src/backend/catalog/indexing.c | 2 +- src/backend/commands/constraint.c | 2 +- src/backend/executor/execIndexing.c | 20 ++++++++---------- src/include/access/amapi.h | 12 +++++++++++ src/include/access/genam.h | 2 +- .../modules/dummy_index_am/dummy_index_am.c | 3 ++- src/tools/pgindent/typedefs.list | 1 + 18 files changed, 62 insertions(+), 27 deletions(-) diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index 6836129c90d..9b72303c895 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -131,7 +131,8 @@ blhandler(PG_FUNCTION_ARGS) amroutine->ambuild = blbuild; amroutine->ambuildempty = blbuildempty; - amroutine->aminsert = blinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = blinsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = blbulkdelete; amroutine->amvacuumcleanup = blvacuumcleanup; diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index d0dbf017a9e..ca401f7031a 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -141,6 +141,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; + aminsert_extended_function aminsertextended; aminsertcleanup_function aminsertcleanup; /* can be NULL */ ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 944157612cb..1e264145051 100644 --- a/src/backend/access/brin/brin.c +++ 
b/src/backend/access/brin/brin.c @@ -273,7 +273,8 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->ambuild = brinbuild; amroutine->ambuildempty = brinbuildempty; - amroutine->aminsert = brininsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = brininsert; amroutine->aminsertcleanup = brininsertcleanup; amroutine->ambulkdelete = brinbulkdelete; amroutine->amvacuumcleanup = brinvacuumcleanup; diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 2b4fa1fb25a..538a554c917 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -338,7 +338,7 @@ toast_save_datum(Relation rel, Datum value, /* Only index relations marked as ready can be updated */ if (toastidxs[i]->rd_index->indisready) index_insert(toastidxs[i], t_values, t_isnull, - ItemPointerGetDatum(&(toasttup->t_self)), + &(toasttup->t_self), toastrel, toastidxs[i]->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 5747ae6a4ca..68ce032f150 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -63,7 +63,8 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->ambuild = ginbuild; amroutine->ambuildempty = ginbuildempty; - amroutine->aminsert = gininsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = gininsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = ginbulkdelete; amroutine->amvacuumcleanup = ginvacuumcleanup; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 06a5a4c62ee..0117b62831e 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -85,7 +85,8 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->ambuild = gistbuild; amroutine->ambuildempty = gistbuildempty; - amroutine->aminsert = gistinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = gistinsert; 
amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = gistbulkdelete; amroutine->amvacuumcleanup = gistvacuumcleanup; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 1dc15d2a53b..557c7a3f316 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -83,7 +83,8 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->ambuild = hashbuild; amroutine->ambuildempty = hashbuildempty; - amroutine->aminsert = hashinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = hashinsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = hashbulkdelete; amroutine->amvacuumcleanup = hashvacuumcleanup; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 6f0464896c0..7d6828db403 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2313,7 +2313,7 @@ heapam_index_validate_scan(Relation heapRelation, index_insert(indexRelation, values, isnull, - ItemPointerGetDatum(&rootTuple), + &rootTuple, heapRelation, indexInfo->ii_Unique ? 
UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index fe1efe283c2..4668d7159ae 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -213,24 +213,39 @@ bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - Datum tupleid, + ItemPointer tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) { RELATION_CHECKS; - CHECK_REL_PROCEDURE(aminsert); + + if (indexRelation->rd_indam->aminsertextended == NULL && indexRelation->rd_indam->aminsert == NULL ) + elog(ERROR, "at least one function aminsert or aminsertextended should be defined for index \"%s\"", \ + RelationGetRelationName(indexRelation)); if (!(indexRelation->rd_indam->ampredlocks)) CheckForSerializableConflictIn(indexRelation, (ItemPointer) NULL, InvalidBlockNumber); - return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, + if (indexRelation->rd_indam->aminsert) + { + /* compatibility method for extension AM's not aware of aminsertextended */ + return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, tupleid, heapRelation, checkUnique, indexUnchanged, indexInfo); + } + else + { + /* index insert method for internal AM's and Orioledb that are aware of aminsertextended */ + return indexRelation->rd_indam->aminsertextended(indexRelation, values, isnull, + ItemPointerGetDatum(tupleid), heapRelation, + checkUnique, indexUnchanged, + indexInfo); + } } /* ------------------------- diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 4acb3c73089..b661adb689e 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -127,7 +127,8 @@ bthandler(PG_FUNCTION_ARGS) amroutine->ambuild = btbuild; amroutine->ambuildempty = btbuildempty; - amroutine->aminsert = btinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = btinsert; 
amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = btbulkdelete; amroutine->amvacuumcleanup = btvacuumcleanup; diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 76b80146ff0..c1228ed2c01 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -70,7 +70,8 @@ spghandler(PG_FUNCTION_ARGS) amroutine->ambuild = spgbuild; amroutine->ambuildempty = spgbuildempty; - amroutine->aminsert = spginsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = spginsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = spgbulkdelete; amroutine->amvacuumcleanup = spgvacuumcleanup; diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index cd78b1ea55e..d0d1abda58a 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -170,7 +170,7 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple, index_insert(index, /* index relation */ values, /* array of index Datums */ isnull, /* is-null flags */ - ItemPointerGetDatum(&(heapTuple->t_self)), /* tid of heap tuple */ + &(heapTuple->t_self), /* tid of heap tuple */ heapRelation, index->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 43618646861..ea5a1f365b1 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -171,7 +171,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) * the row is now dead, because that is the TID the index will know * about. 
*/ - index_insert(indexRel, values, isnull, ItemPointerGetDatum(&checktid), + index_insert(indexRel, values, isnull, &checktid, trigdata->tg_relation, UNIQUE_CHECK_EXISTING, false, indexInfo); diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 859cdd7147d..f93bfe71d20 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -313,19 +313,19 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, ExprContext *econtext; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; - Datum tupleid; + ItemPointer tupleid; if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) { bool isnull; - tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + tupleid = DatumGetItemPointer(slot_getsysattr(slot, RowIdAttributeNumber, &isnull)); Assert(!isnull); } else { Assert(ItemPointerIsValid(&slot->tts_tid)); - tupleid = PointerGetDatum(&slot->tts_tid); + tupleid = &slot->tts_tid; } /* @@ -473,7 +473,6 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, { bool violationOK; CEOUC_WAIT_MODE waitMode; - ItemPointer raw_tupleid = DatumGetItemPointer(tupleid); if (applyNoDupErr) { @@ -494,7 +493,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, satisfiesConstraint = check_exclusion_or_unique_constraint(heapRelation, indexRelation, indexInfo, - raw_tupleid, values, isnull, + tupleid, values, isnull, estate, false, waitMode, violationOK, NULL); } @@ -537,18 +536,18 @@ ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, ExprContext *econtext; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; - Datum tupleid; + ItemPointer tupleid; if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) { bool isnull; - tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + tupleid = DatumGetItemPointer(slot_getsysattr(slot, RowIdAttributeNumber, &isnull)); Assert(!isnull); } else { Assert(ItemPointerIsValid(&slot->tts_tid)); - tupleid = 
PointerGetDatum(&slot->tts_tid); + tupleid = &slot->tts_tid; } /* @@ -717,7 +716,7 @@ ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, old_valid, values, /* array of index Datums */ isnull, /* null flags */ - tupleid, /* tid of heap tuple */ + ItemPointerGetDatum(tupleid), /* tid of heap tuple */ valuesOld, isnullOld, oldTupleid, @@ -768,7 +767,6 @@ ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, { bool violationOK; CEOUC_WAIT_MODE waitMode; - ItemPointer raw_tupleid = DatumGetItemPointer(tupleid); if (applyNoDupErr) { @@ -789,7 +787,7 @@ ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, satisfiesConstraint = check_exclusion_or_unique_constraint(heapRelation, indexRelation, indexInfo, - raw_tupleid, values, isnull, + tupleid, values, isnull, estate, false, waitMode, violationOK, NULL); } diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 4542444bc83..1addc03701d 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -109,6 +109,16 @@ typedef void (*ambuildempty_function) (Relation indexRelation); /* insert this tuple */ typedef bool (*aminsert_function) (Relation indexRelation, + Datum *values, + bool *isnull, + ItemPointer tupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); + +/* extended version of aminsert taking Datum tupleid */ +typedef bool (*aminsert_extended_function) (Relation indexRelation, Datum *values, bool *isnull, Datum tupleid, @@ -116,6 +126,7 @@ typedef bool (*aminsert_function) (Relation indexRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); + /* update this tuple */ typedef bool (*amupdate_function) (Relation indexRelation, bool new_valid, @@ -292,6 +303,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; + aminsert_extended_function aminsertextended; aminsertcleanup_function aminsertcleanup; /* can be NULL */ 
amupdate_function amupdate; amdelete_function amdelete; diff --git a/src/include/access/genam.h b/src/include/access/genam.h index ab34a7726ff..92664eb69c6 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -144,7 +144,7 @@ extern void index_close(Relation relation, LOCKMODE lockmode); extern bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - Datum tupleid, + ItemPointer tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index 80c6668666a..1c6825f391a 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -303,7 +303,8 @@ dihandler(PG_FUNCTION_ARGS) amroutine->ambuild = dibuild; amroutine->ambuildempty = dibuildempty; - amroutine->aminsert = diinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = diinsert; amroutine->ambulkdelete = dibulkdelete; amroutine->amvacuumcleanup = divacuumcleanup; amroutine->amcanreturn = NULL; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 72e6d3a9865..5c2719500dd 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3268,6 +3268,7 @@ amgetbitmap_function amgettuple_function aminitparallelscan_function aminsert_function +aminsert_extended_function aminsertcleanup_function ammarkpos_function amoptions_function From b7af1bbaf1606a745b97392aece85ab858f2bf57 Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Fri, 18 Oct 2024 14:31:16 +0400 Subject: [PATCH 47/79] Fix warning in pg_rewind --- src/bin/pg_rewind/pg_rewind.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 016c332f406..02643cf1c05 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -229,6 +229,7 @@ main(int argc, char **argv) case 6: if 
(!parse_sync_method(optarg, &sync_method)) exit(1); + break; case 'e': /* -e or --extension */ simple_string_list_append(&extensions, optarg); From 30bfd3e9fe9f3fb8833ca6af56a8c55edd84035b Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Tue, 22 Oct 2024 12:32:59 +0400 Subject: [PATCH 48/79] Fix active snapshot checks --- src/backend/executor/execMain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 6f5a572be62..053dd3191a0 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -147,7 +147,7 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) Assert(queryDesc->estate == NULL); /* caller must ensure the query's snapshot is active */ - Assert(GetActiveSnapshot() == queryDesc->snapshot); + Assert((ActiveSnapshotSet() ? GetActiveSnapshot() : InvalidSnapshot) == queryDesc->snapshot); /* * If the transaction is read-only, we need to check if any writes are @@ -325,7 +325,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY)); /* caller must ensure the query's snapshot is active */ - Assert(GetActiveSnapshot() == estate->es_snapshot); + Assert((ActiveSnapshotSet() ? GetActiveSnapshot() : InvalidSnapshot) == estate->es_snapshot); /* * Switch into per-query memory context From 0a6427f75fc3a17c2137ccd218dfde4570f3e73b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 23 Sep 2024 14:03:54 +0300 Subject: [PATCH 49/79] Transform OR-clauses to SAOP's during index matching Replace "(indexkey op C1) OR (indexkey op C2) ... (indexkey op CN)" with "indexkey op ANY(ARRAY[C1, C2, ...])" (ScalarArrayOpExpr node) during matching a clause to index. 
Here Ci is an i-th constant or parameters expression, 'expr' is non-constant expression, 'op' is an operator which returns boolean result and has a commuter (for the case of reverse order of constant and non-constant parts of the expression, like 'Cn op expr'). This transformation allows handling long OR-clauses with single IndexScan avoiding slower bitmap scans. Discussion: https://postgr.es/m/567ED6CA.2040504%40sigaev.ru Author: Alena Rybakina Author: Andrey Lepikhov Reviewed-by: Peter Geoghegan Reviewed-by: Ranier Vilela Reviewed-by: Alexander Korotkov Reviewed-by: Robert Haas Reviewed-by: Jian He Reviewed-by: Tom Lane Reviewed-by: Nikolay Shaplov --- src/backend/optimizer/path/indxpath.c | 281 ++++++++++++++++++++- src/test/regress/expected/create_index.out | 270 ++++++++++++++++++-- src/test/regress/expected/join.out | 57 ++++- src/test/regress/expected/rowsecurity.out | 7 + src/test/regress/expected/uuid.out | 31 +++ src/test/regress/sql/create_index.sql | 69 +++++ src/test/regress/sql/join.sql | 9 + src/test/regress/sql/rowsecurity.sql | 1 + src/test/regress/sql/uuid.sql | 12 + 9 files changed, 714 insertions(+), 23 deletions(-) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 7c043c53133..d5ba0c5cd68 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -20,6 +20,7 @@ #include "access/stratnum.h" #include "access/sysattr.h" #include "catalog/pg_am.h" +#include "catalog/pg_amop.h" #include "catalog/pg_operator.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" @@ -32,8 +33,10 @@ #include "optimizer/paths.h" #include "optimizer/prep.h" #include "optimizer/restrictinfo.h" +#include "utils/array.h" #include "utils/lsyscache.h" #include "utils/selfuncs.h" +#include "utils/syscache.h" /* XXX see PartCollMatchesExprColl */ @@ -166,6 +169,10 @@ static IndexClause *match_rowcompare_to_indexcol(PlannerInfo *root, RestrictInfo *rinfo, int indexcol, IndexOptInfo 
*index); +static IndexClause *match_orclause_to_indexcol(PlannerInfo *root, + RestrictInfo *rinfo, + int indexcol, + IndexOptInfo *index); static IndexClause *expand_indexqual_rowcompare(PlannerInfo *root, RestrictInfo *rinfo, int indexcol, @@ -2138,7 +2145,10 @@ match_clause_to_index(PlannerInfo *root, * (3) must match the collation of the index, if collation is relevant. * * Our definition of "const" is exceedingly liberal: we allow anything that - * doesn't involve a volatile function or a Var of the index's relation. + * doesn't involve a volatile function or a Var of the index's relation + * except for a boolean OR expression input: due to a trade-off between the + * expected execution speedup and planning complexity, we limit or->saop + * transformation by obvious cases when an index scan can get a profit. * In particular, Vars belonging to other relations of the query are * accepted here, since a clause of that form can be used in a * parameterized indexscan. It's the responsibility of higher code levels @@ -2168,6 +2178,10 @@ match_clause_to_index(PlannerInfo *root, * It is also possible to match ScalarArrayOpExpr clauses to indexes, when * the clause is of the form "indexkey op ANY (arrayconst)". * + * It is also possible to match a list of OR clauses if it might be + * transformed into a single ScalarArrayOpExpr clause. On success, + * the returning index clause will contain a transformed clause. + * * For boolean indexes, it is also possible to match the clause directly * to the indexkey; or perhaps the clause is (NOT indexkey). * @@ -2217,9 +2231,9 @@ match_clause_to_indexcol(PlannerInfo *root, } /* - * Clause must be an opclause, funcclause, ScalarArrayOpExpr, or - * RowCompareExpr. Or, if the index supports it, we can handle IS - * NULL/NOT NULL clauses. + * Clause must be an opclause, funcclause, ScalarArrayOpExpr, + * RowCompareExpr, or OR-clause that could be converted to SAOP.
Or, if + * the index supports it, we can handle IS NULL/NOT NULL clauses. */ if (IsA(clause, OpExpr)) { @@ -2237,6 +2251,10 @@ match_clause_to_indexcol(PlannerInfo *root, { return match_rowcompare_to_indexcol(root, rinfo, indexcol, index); } + else if (restriction_is_or_clause(rinfo)) + { + return match_orclause_to_indexcol(root, rinfo, indexcol, index); + } else if (index->amsearchnulls && IsA(clause, NullTest)) { NullTest *nt = (NullTest *) clause; @@ -2760,6 +2778,261 @@ match_rowcompare_to_indexcol(PlannerInfo *root, return NULL; } +/* + * match_orclause_to_indexcol() + * Handles the OR-expr case for match_clause_to_indexcol() in the case + * when it could be transformed to ScalarArrayOpExpr. + * + * Given a list of OR-clause args, attempts to transform this BoolExpr into + * a single SAOP expression. On success, returns an IndexClause, containing + * the transformed expression, or NULL on failure. + */ +static IndexClause * +match_orclause_to_indexcol(PlannerInfo *root, + RestrictInfo *rinfo, + int indexcol, + IndexOptInfo *index) +{ + ListCell *lc; + BoolExpr *orclause = (BoolExpr *) rinfo->orclause; + Node *indexExpr = NULL; + List *consts = NIL; + Node *arrayNode = NULL; + ScalarArrayOpExpr *saopexpr = NULL; + Oid matchOpno = InvalidOid; + IndexClause *iclause; + Oid consttype = InvalidOid; + Oid arraytype = InvalidOid; + Oid inputcollid = InvalidOid; + bool firstTime = true; + bool have_param = false; + + Assert(IsA(orclause, BoolExpr)); + Assert(orclause->boolop == OR_EXPR); + + /* + * Try to convert a list of OR-clauses to a single SAOP expression. Each + * OR entry must be in the form: (indexkey operator constant) or (constant + * operator indexkey). Operators of all the entries must match. Constant + * might be either Const or Param. To be effective, give up on the first + * non-matching entry. Exit is implemented as a break from the loop, which + * is caught afterwards.
+ */ + foreach(lc, orclause->args) + { + RestrictInfo *subRinfo; + OpExpr *subClause; + Oid opno; + Node *leftop, + *rightop; + Node *constExpr; + + if (!IsA(lfirst(lc), RestrictInfo)) + break; + + subRinfo = (RestrictInfo *) lfirst(lc); + + /* Only operator clauses can match */ + if (!IsA(subRinfo->clause, OpExpr)) + break; + + subClause = (OpExpr *) subRinfo->clause; + opno = subClause->opno; + + /* Only binary operators can match */ + if (list_length(subClause->args) != 2) + break; + + /* + * The parameters below must match between sub-rinfo and its parent as + * make_restrictinfo() fills them with the same values, and further + * modifications are also the same for the whole subtree. However, + * still make a sanity check. + */ + Assert(subRinfo->is_pushed_down == rinfo->is_pushed_down); + Assert(subRinfo->is_clone == rinfo->is_clone); + Assert(subRinfo->security_level == rinfo->security_level); + Assert(bms_equal(subRinfo->incompatible_relids, rinfo->incompatible_relids)); + Assert(bms_equal(subRinfo->outer_relids, rinfo->outer_relids)); + + /* + * Also, check that required_relids in sub-rinfo is subset of parent's + * required_relids. + */ + Assert(bms_is_subset(subRinfo->required_relids, rinfo->required_relids)); + + /* Only operator returning boolean suits the transformation */ + if (get_op_rettype(opno) != BOOLOID) + break; + + /* + * Check for clauses of the form: (indexkey operator constant) or + * (constant operator indexkey). Determine indexkey side first, check + * the constant later. 
+ */ + leftop = (Node *) linitial(subClause->args); + rightop = (Node *) lsecond(subClause->args); + if (match_index_to_operand(leftop, indexcol, index)) + { + indexExpr = leftop; + constExpr = rightop; + } + else if (match_index_to_operand(rightop, indexcol, index)) + { + opno = get_commutator(opno); + if (!OidIsValid(opno)) + { + /* commutator doesn't exist, we can't reverse the order */ + break; + } + indexExpr = rightop; + constExpr = leftop; + } + else + { + break; + } + + /* + * Ignore any RelabelType node above the operands. This is needed to + * be able to apply indexscanning in binary-compatible-operator cases. + * Note: we can assume there is at most one RelabelType node; + * eval_const_expressions() will have simplified if more than one. + */ + if (IsA(constExpr, RelabelType)) + constExpr = (Node *) ((RelabelType *) constExpr)->arg; + if (IsA(indexExpr, RelabelType)) + indexExpr = (Node *) ((RelabelType *) indexExpr)->arg; + + /* We allow constant to be Const or Param */ + if (!IsA(constExpr, Const) && !IsA(constExpr, Param)) + break; + + /* Forbid transformation for composite types, records. */ + if (type_is_rowtype(exprType(constExpr)) || + type_is_rowtype(exprType(indexExpr))) + break; + + /* + * Save information about the operator, type, and collation for the + * first matching qual. Then, check that subsequent quals match the + * first. + */ + if (firstTime) + { + matchOpno = opno; + consttype = exprType(constExpr); + arraytype = get_array_type(consttype); + inputcollid = subClause->inputcollid; + + /* + * Check that the operator is presented in the opfamily and that + * the expression collation matches the index collation. Also, + * there must be an array type to construct an array later. 
+ */ + if (!IndexCollMatchesExprColl(index->indexcollations[indexcol], inputcollid) || + !op_in_opfamily(matchOpno, index->opfamily[indexcol]) || + !OidIsValid(arraytype)) + break; + firstTime = false; + } + else + { + if (opno != matchOpno || + inputcollid != subClause->inputcollid || + consttype != exprType(constExpr)) + break; + } + + if (IsA(constExpr, Param)) + have_param = true; + consts = lappend(consts, constExpr); + } + + /* + * Catch the break from the loop above. Normally, a foreach() loop ends + * up with a NULL list cell. A non-NULL list cell indicates a break from + * the foreach() loop. Free the consts list and return NULL then. + */ + if (lc != NULL) + { + list_free(consts); + return NULL; + } + + /* + * Assemble an array from the list of constants. It seems more profitable + * to build a const array. But in the presence of parameters, we don't + * have a specific value here and must employ an ArrayExpr instead. + */ + + if (have_param) + { + ArrayExpr *arrayExpr = makeNode(ArrayExpr); + + /* array_collid will be set by parse_collate.c */ + arrayExpr->element_typeid = consttype; + arrayExpr->array_typeid = arraytype; + arrayExpr->multidims = false; + arrayExpr->elements = consts; + arrayExpr->location = -1; + + arrayNode = (Node *) arrayExpr; + } + else + { + int16 typlen; + bool typbyval; + char typalign; + Datum *elems; + int i = 0; + ArrayType *arrayConst; + + get_typlenbyvalalign(consttype, &typlen, &typbyval, &typalign); + + elems = (Datum *) palloc(sizeof(Datum) * list_length(consts)); + foreach(lc, consts) + elems[i++] = ((Const *) lfirst(lc))->constvalue; + + arrayConst = construct_array(elems, i, consttype, + typlen, typbyval, typalign); + arrayNode = (Node *) makeConst(arraytype, -1, inputcollid, + -1, PointerGetDatum(arrayConst), + false, false); + + pfree(elems); + list_free(consts); + } + + /* Build the SAOP expression node */ + saopexpr = makeNode(ScalarArrayOpExpr); + saopexpr->opno = matchOpno; + saopexpr->opfuncid = 
get_opcode(matchOpno); + saopexpr->hashfuncid = InvalidOid; + saopexpr->negfuncid = InvalidOid; + saopexpr->useOr = true; + saopexpr->inputcollid = inputcollid; + saopexpr->args = list_make2(indexExpr, arrayNode); + saopexpr->location = -1; + + /* + * Finally, build an IndexClause based on the SAOP node. Use + * make_simple_restrictinfo() to get RestrictInfo with clean selectivity + * estimations because it may differ from the estimation made for an OR + * clause. Although it is not a lossy expression, keep the old version of + * rinfo in iclause->rinfo to detect duplicates and recheck the original + * clause. + */ + iclause = makeNode(IndexClause); + iclause->rinfo = rinfo; + iclause->indexquals = list_make1(make_simple_restrictinfo(root, + &saopexpr->xpr)); + iclause->lossy = false; + iclause->indexcol = indexcol; + iclause->indexcols = NIL; + return iclause; +} + /* * expand_indexqual_rowcompare --- expand a single indexqual condition * that is a RowCompareExpr diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index b1df05593c4..e73a0525cf4 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1844,18 +1844,67 @@ DROP TABLE onek_with_null; EXPLAIN (COSTS OFF) SELECT * FROM tenk1 WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------ - Bitmap Heap Scan on tenk1 - Recheck Cond: (((thousand = 42) AND (tenthous = 1)) OR ((thousand = 42) AND (tenthous = 3)) OR ((thousand = 42) AND (tenthous = 42))) - -> BitmapOr - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: ((thousand = 42) AND (tenthous = 1)) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: ((thousand = 42) AND (tenthous = 3)) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: ((thousand = 42) AND (tenthous = 
42)) -(9 rows) + QUERY PLAN +------------------------------------------------------------------------------ + Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3,42}'::integer[]))) +(2 rows) + +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 +---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- + 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx +(1 row) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); + QUERY PLAN +---------------------------------------------------------------------------------------- + Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = 42) AND (tenthous = ANY (ARRAY[1, (InitPlan 1).col1, 42]))) + InitPlan 1 + -> Result +(4 rows) + +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); + unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 +---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- + 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + QUERY PLAN +--------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 99))) + -> 
BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = ANY ('{42,99}'::integer[])) +(8 rows) + +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + count +------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + QUERY PLAN +------------------------------------------------------------------------------ + Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3,42}'::integer[]))) +(2 rows) SELECT * FROM tenk1 WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); @@ -1864,6 +1913,27 @@ SELECT * FROM tenk1 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx (1 row) +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::numeric OR tenthous = 3::int4 OR tenthous = 42::numeric); + QUERY PLAN +------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (thousand = 42) + Filter: (((tenthous)::numeric = '1'::numeric) OR (tenthous = 3) OR ((tenthous)::numeric = '42'::numeric)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE tenthous = 1::numeric OR tenthous = 3::int4 OR tenthous = 42::numeric; + QUERY PLAN +------------------------------------------------------------------------------------------------------------- + Seq Scan on tenk1 + Filter: (((tenthous)::numeric = '1'::numeric) OR (tenthous = 3) OR ((tenthous)::numeric = '42'::numeric)) +(2 rows) + EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); @@ -1872,6 +1942,102 @@ SELECT count(*) FROM tenk1 Aggregate -> Bitmap Heap Scan on tenk1 Recheck 
Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 99))) + -> BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = ANY ('{42,99}'::integer[])) +(8 rows) + +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + count +------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand < 42 OR thousand < 99 OR 43 > thousand OR 42 > thousand); + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((hundred = 42) AND ((thousand < 42) OR (thousand < 99) OR (43 > thousand) OR (42 > thousand))) + -> BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand < ANY ('{42,99,43,42}'::integer[])) +(8 rows) + +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand < 42 OR thousand < 99 OR 43 > thousand OR 42 > thousand); + count +------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3) OR thousand = 41; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (((thousand = 42) AND ((tenthous = 1) OR (tenthous = 3))) OR (thousand = 41)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3}'::integer[]))) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 41) +(8 rows) + +SELECT count(*) FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3) OR thousand = 41; + count +------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR 
thousand = 99 OR tenthous < 2) OR thousand = 41; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (((hundred = 42) AND ((thousand = 42) OR (thousand = 99) OR (tenthous < 2))) OR (thousand = 41)) + -> BitmapOr + -> BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 99) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (tenthous < 2) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 41) +(16 rows) + +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; + count +------- + 20 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 41) OR ((thousand = 99) AND (tenthous = 2)))) -> BitmapAnd -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 42) @@ -1879,16 +2045,90 @@ SELECT count(*) FROM tenk1 -> Bitmap Index Scan on tenk1_thous_tenthous Index Cond: (thousand = 42) -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 99) -(11 rows) + Index Cond: (thousand = 41) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 99) AND (tenthous = 2)) +(13 rows) SELECT count(*) FROM tenk1 - WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); count ------- 10 (1 row) +EXPLAIN 
(COSTS OFF) +SELECT count(*) FROM tenk1, tenk2 + WHERE tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk1.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Aggregate + -> Nested Loop + Join Filter: ((tenk2.thousand = 42) OR (tenk1.thousand = 41) OR (tenk2.tenthous = 2)) + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (hundred = 42) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> Materialize + -> Bitmap Heap Scan on tenk2 + Recheck Cond: (hundred = 42) + -> Bitmap Index Scan on tenk2_hundred + Index Cond: (hundred = 42) +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1, tenk2 + WHERE tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + QUERY PLAN +------------------------------------------------------------------------------ + Aggregate + -> Nested Loop + -> Bitmap Heap Scan on tenk2 + Recheck Cond: (hundred = 42) + Filter: ((thousand = 42) OR (thousand = 41) OR (tenthous = 2)) + -> Bitmap Index Scan on tenk2_hundred + Index Cond: (hundred = 42) + -> Index Only Scan using tenk1_hundred on tenk1 + Index Cond: (hundred = 42) +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 JOIN tenk2 ON + tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + QUERY PLAN +------------------------------------------------------------------------------ + Aggregate + -> Nested Loop + -> Bitmap Heap Scan on tenk2 + Recheck Cond: (hundred = 42) + Filter: ((thousand = 42) OR (thousand = 41) OR (tenthous = 2)) + -> Bitmap Index Scan on tenk2_hundred + Index Cond: (hundred = 42) + -> Index Only Scan using tenk1_hundred on tenk1 + Index Cond: (hundred = 42) +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 LEFT JOIN tenk2 ON + tenk1.hundred = 42 AND 
(tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Left Join + Join Filter: (tenk1.hundred = 42) + -> Index Only Scan using tenk1_hundred on tenk1 + -> Memoize + Cache Key: tenk1.hundred + Cache Mode: logical + -> Index Scan using tenk2_hundred on tenk2 + Index Cond: (hundred = tenk1.hundred) + Filter: ((thousand = 42) OR (thousand = 41) OR (tenthous = 2)) +(10 rows) + -- -- Check behavior with duplicate index column contents -- diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 2eaadceed0d..a6359614b24 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -4278,15 +4278,64 @@ select * from tenk1 a join tenk1 b on Index Cond: (hundred = 4) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) + Recheck Cond: ((unique1 = 1) OR ((unique2 = 3) OR (unique2 = 7))) + Filter: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) -> BitmapOr -> Bitmap Index Scan on tenk1_unique1 Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 - Index Cond: (unique2 = 3) + Index Cond: (unique2 = ANY ('{3,7}'::integer[])) +(18 rows) + +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred 
= 4) + -> Materialize + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 = 1) OR ((unique2 = 3) OR (unique2 = 7))) + Filter: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 - Index Cond: (unique2 = 7) -(19 rows) + Index Cond: (unique2 = ANY ('{3,7}'::integer[])) +(18 rows) + +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 < 20 or a.unique1 = 3 or a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: ((a.unique1 < 20) OR (a.unique1 = 3) OR ((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) + -> Seq Scan on tenk1 b + -> Materialize + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 < 20) OR (unique1 = 3) OR (unique1 = 1) OR ((unique2 = 3) OR (unique2 = 7))) + Filter: ((unique1 < 20) OR (unique1 = 3) OR (unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 < 20) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 3) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = ANY ('{3,7}'::integer[])) +(16 rows) -- -- test placement of movable quals in a parameterized join tree diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out index 2e23dcee6da..2cae751e12c 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ -4492,6 +4492,13 @@ SELECT * FROM rls_tbl WHERE a <<< 1000; --- (0 rows) +EXPLAIN (COSTS OFF) SELECT * FROM rls_tbl WHERE a <<< 1000 or a <<< 900; + QUERY PLAN 
+-------------------------- + Result + One-Time Filter: false +(2 rows) + RESET SESSION AUTHORIZATION; CREATE TABLE rls_child_tbl () INHERITS (rls_tbl); INSERT INTO rls_child_tbl SELECT x/10 FROM generate_series(1, 100) x; diff --git a/src/test/regress/expected/uuid.out b/src/test/regress/expected/uuid.out index 6026e15ed31..8f4ef0d7a6a 100644 --- a/src/test/regress/expected/uuid.out +++ b/src/test/regress/expected/uuid.out @@ -129,6 +129,37 @@ CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field); CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field); -- unique index test CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field); +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field <> '11111111111111111111111111111111' OR + guid_field <> '3f3e3c3b-3a30-3938-3736-353433a2313e'; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------ + Aggregate + -> Seq Scan on guid1 + Filter: ((guid_field <> '11111111-1111-1111-1111-111111111111'::uuid) OR (guid_field <> '3f3e3c3b-3a30-3938-3736-353433a2313e'::uuid)) +(3 rows) + +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field <= '22222222-2222-2222-2222-222222222222' OR + guid_field <= '11111111111111111111111111111111' OR + guid_field <= '3f3e3c3b-3a30-3938-3736-353433a2313e'; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Seq Scan on guid1 + Filter: ((guid_field <= '22222222-2222-2222-2222-222222222222'::uuid) OR (guid_field <= '11111111-1111-1111-1111-111111111111'::uuid) OR (guid_field <= '3f3e3c3b-3a30-3938-3736-353433a2313e'::uuid)) +(3 rows) + +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field = '3f3e3c3b-3a30-3938-3736-353433a2313e' OR + guid_field = 
'11111111111111111111111111111111'; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Seq Scan on guid1 + Filter: ((guid_field = '3f3e3c3b-3a30-3938-3736-353433a2313e'::uuid) OR (guid_field = '11111111-1111-1111-1111-111111111111'::uuid)) +(3 rows) + -- should fail INSERT INTO guid1(guid_field) VALUES('11111111-1111-1111-1111-111111111111'); ERROR: duplicate key value violates unique constraint "guid1_unique_btree" diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index 0c292cd660b..a4ca4cdfda1 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -732,12 +732,81 @@ SELECT * FROM tenk1 SELECT * FROM tenk1 WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); + EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::numeric OR tenthous = 3::int4 OR tenthous = 42::numeric); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE tenthous = 1::numeric OR tenthous = 3::int4 OR tenthous = 42::numeric; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); 
+ +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand < 42 OR thousand < 99 OR 43 > thousand OR 42 > thousand); +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand < 42 OR thousand < 99 OR 43 > thousand OR 42 > thousand); + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3) OR thousand = 41; +SELECT count(*) FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3) OR thousand = 41; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1, tenk2 + WHERE tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk1.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1, tenk2 + WHERE tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 JOIN tenk2 ON + tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 LEFT JOIN tenk2 ON + tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; -- -- Check behavior with duplicate index column contents -- diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index dcc94c0715d..e7343af6d05 100644 --- 
a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1433,6 +1433,15 @@ select * from tenk1 a join tenk1 b on (a.unique1 = 1 and b.unique1 = 2) or ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 < 20 or a.unique1 = 3 or a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); + -- -- test placement of movable quals in a parameterized join tree -- diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index 6e71dc7236b..86eb30e24f0 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -2177,6 +2177,7 @@ CREATE FUNCTION op_leak(int, int) RETURNS bool CREATE OPERATOR <<< (procedure = op_leak, leftarg = int, rightarg = int, restrict = scalarltsel); SELECT * FROM rls_tbl WHERE a <<< 1000; +EXPLAIN (COSTS OFF) SELECT * FROM rls_tbl WHERE a <<< 1000 or a <<< 900; RESET SESSION AUTHORIZATION; CREATE TABLE rls_child_tbl () INHERITS (rls_tbl); diff --git a/src/test/regress/sql/uuid.sql b/src/test/regress/sql/uuid.sql index c88f6d087a7..75ee966ded0 100644 --- a/src/test/regress/sql/uuid.sql +++ b/src/test/regress/sql/uuid.sql @@ -63,6 +63,18 @@ CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field); -- unique index test CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field); + +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field <> '11111111111111111111111111111111' OR + guid_field <> '3f3e3c3b-3a30-3938-3736-353433a2313e'; +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field <= '22222222-2222-2222-2222-222222222222' OR + guid_field <= '11111111111111111111111111111111' OR + guid_field <= '3f3e3c3b-3a30-3938-3736-353433a2313e'; +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM 
guid1 WHERE guid_field = '3f3e3c3b-3a30-3938-3736-353433a2313e' OR + guid_field = '11111111111111111111111111111111'; + -- should fail INSERT INTO guid1(guid_field) VALUES('11111111-1111-1111-1111-111111111111'); From d4c9a2e74430610fee7191b84af2c19962f649a0 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 23 Sep 2024 14:04:03 +0300 Subject: [PATCH 50/79] Teach bitmap path generation about transforming OR-clauses to SAOP's When the optimizer generates bitmap paths, it considers breaking OR-clause arguments one-by-one. But now, a group of similar OR-clauses can be transformed into SAOP during index matching. So, bitmap paths should keep up. This commit teaches the bitmap path generation machinery to group similar OR-clauses into dedicated RestrictInfos. Those RestrictInfos are considered both to match the index as a whole (as SAOP), or to match as a set of individual OR-clause arguments one-by-one (the old way). Therefore, bitmap path generation will take advantage of OR-clauses to SAOP's transformation. The old way of handling them is also considered. So, there shouldn't be a planning regression. 
Discussion: https://postgr.es/m/567ED6CA.2040504%40sigaev.ru Reviewed-by: Alexander Korotkov --- src/backend/optimizer/path/indxpath.c | 439 ++++++++++++++++++++- src/backend/optimizer/util/restrictinfo.c | 107 +++-- src/include/optimizer/restrictinfo.h | 11 + src/test/regress/expected/create_index.out | 125 +++++- src/test/regress/expected/join.out | 56 ++- src/test/regress/sql/create_index.sql | 38 ++ src/tools/pgindent/typedefs.list | 1 + 7 files changed, 670 insertions(+), 107 deletions(-) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index d5ba0c5cd68..cae5f3d7e63 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -1162,6 +1162,383 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, return result; } +/* + * Utility structure used to group similar OR-clause arguments in + * group_similar_or_args(). It represents information about the OR-clause + * argument and its matching index key. + */ +typedef struct +{ + int indexnum; /* index of the matching index, or -1 if no + * matching index */ + int colnum; /* index of the matching column, or -1 if no + * matching index */ + Oid opno; /* OID of the OpClause operator, or InvalidOid + * if not an OpExpr */ + Oid inputcollid; /* OID of the OpClause input collation */ + int argindex; /* index of the clause in the list of + * arguments */ +} OrArgIndexMatch; + +/* + * Comparison function for OrArgIndexMatch which provides sort order placing + * similar OR-clause arguments together. 
+ */ +static int +or_arg_index_match_cmp(const void *a, const void *b) +{ + const OrArgIndexMatch *match_a = (const OrArgIndexMatch *) a; + const OrArgIndexMatch *match_b = (const OrArgIndexMatch *) b; + + if (match_a->indexnum < match_b->indexnum) + return -1; + else if (match_a->indexnum > match_b->indexnum) + return 1; + + if (match_a->colnum < match_b->colnum) + return -1; + else if (match_a->colnum > match_b->colnum) + return 1; + + if (match_a->opno < match_b->opno) + return -1; + else if (match_a->opno > match_b->opno) + return 1; + + if (match_a->inputcollid < match_b->inputcollid) + return -1; + else if (match_a->inputcollid > match_b->inputcollid) + return 1; + + if (match_a->argindex < match_b->argindex) + return -1; + else if (match_a->argindex > match_b->argindex) + return 1; + + return 0; +} + +/* + * group_similar_or_args + * Transform incoming OR-restrictinfo into a list of sub-restrictinfos, + * each of them containing a subset of OR-clauses from the source rinfo + * matching the same index column with the same operator and collation. + * It may be employed later, during match_clause_to_indexcol(), to + * transform a whole OR-sub-rinfo into an SAOP clause. + * + * Similar arguments are clauses of the form "indexkey op constant" having + * the same indexkey, operator, and collation. The constant may be either + * a Const or a Param. + * + * Returns the processed list of arguments. + */ +static List * +group_similar_or_args(PlannerInfo *root, RelOptInfo *rel, RestrictInfo *rinfo) +{ + int n; + int i; + int group_start; + OrArgIndexMatch *matches; + bool matched = false; + ListCell *lc; + ListCell *lc2; + List *orargs; + List *result = NIL; + + Assert(IsA(rinfo->orclause, BoolExpr)); + orargs = ((BoolExpr *) rinfo->orclause)->args; + n = list_length(orargs); + + /* + * To avoid N^2 behavior, take a utility pass along the list of OR-clause + * arguments. 
For each argument, fill the OrArgIndexMatch structure, + * which will be used to sort these arguments at the next step. + */ + i = -1; + matches = (OrArgIndexMatch *) palloc(sizeof(OrArgIndexMatch) * n); + foreach(lc, orargs) + { + Node *arg = lfirst(lc); + RestrictInfo *argrinfo; + OpExpr *clause; + Oid opno; + Node *leftop, + *rightop; + Node *nonConstExpr; + int indexnum; + int colnum; + + i++; + matches[i].argindex = i; + matches[i].indexnum = -1; + matches[i].colnum = -1; + matches[i].opno = InvalidOid; + matches[i].inputcollid = InvalidOid; + + if (!IsA(arg, RestrictInfo)) + continue; + + argrinfo = castNode(RestrictInfo, arg); + + /* Only operator clauses can match */ + if (!IsA(argrinfo->clause, OpExpr)) + continue; + + clause = (OpExpr *) argrinfo->clause; + opno = clause->opno; + + /* Only binary operators can match */ + if (list_length(clause->args) != 2) + continue; + + /* + * Ignore any RelabelType node above the operands. This is needed to + * be able to apply indexscanning in binary-compatible-operator cases. + * Note: we can assume there is at most one RelabelType node; + * eval_const_expressions() will have simplified if more than one. + */ + leftop = get_leftop(clause); + if (IsA(leftop, RelabelType)) + leftop = (Node *) ((RelabelType *) leftop)->arg; + + rightop = get_rightop(clause); + if (IsA(rightop, RelabelType)) + rightop = (Node *) ((RelabelType *) rightop)->arg; + + /* + * Check for clauses of the form: (indexkey operator constant) or + * (constant operator indexkey). But we don't know a particular index + * yet. First check for a constant, which must be Const or Param. + * That's cheaper than search for an index key among all indexes. 
+ */ + if (IsA(leftop, Const) || IsA(leftop, Param)) + { + opno = get_commutator(opno); + + if (!OidIsValid(opno)) + { + /* commutator doesn't exist, we can't reverse the order */ + continue; + } + nonConstExpr = rightop; + } + else if (IsA(rightop, Const) || IsA(rightop, Param)) + { + nonConstExpr = leftop; + } + else + { + continue; + } + + /* + * Match non-constant part to the index key. It's possible that a + * single non-constant part matches multiple index keys. It's OK, we + * just stop with first matching index key. Given that this choice is + * determined the same for every clause, we will group similar clauses + * together anyway. + */ + indexnum = 0; + foreach(lc2, rel->indexlist) + { + IndexOptInfo *index = (IndexOptInfo *) lfirst(lc2); + + /* Ignore index if it doesn't support bitmap scans */ + if (!index->amhasgetbitmap) + continue; + + for (colnum = 0; colnum < index->nkeycolumns; colnum++) + { + if (match_index_to_operand(nonConstExpr, colnum, index)) + { + matches[i].indexnum = indexnum; + matches[i].colnum = colnum; + matches[i].opno = opno; + matches[i].inputcollid = clause->inputcollid; + matched = true; + break; + } + } + + /* + * Stop looping through the indexes, if we managed to match + * nonConstExpr to any index column. + */ + if (matches[i].indexnum >= 0) + break; + indexnum++; + } + } + + /* + * Fast-path check: if no clause is matching to the index column, we can + * just give up at this stage and return the clause list as-is. + */ + if (!matched) + { + pfree(matches); + return orargs; + } + + /* Sort clauses to make similar clauses go together */ + qsort(matches, n, sizeof(OrArgIndexMatch), or_arg_index_match_cmp); + + /* + * Group similar clauses into single sub-restrictinfo. Side effect: the + * resulting list of restrictions will be sorted by indexnum and colnum. 
+ */ + group_start = 0; + for (i = 1; i <= n; i++) + { + /* Check if it's a group boundary */ + if (group_start >= 0 && + (i == n || + matches[i].indexnum != matches[group_start].indexnum || + matches[i].colnum != matches[group_start].colnum || + matches[i].opno != matches[group_start].opno || + matches[i].inputcollid != matches[group_start].inputcollid || + matches[i].indexnum == -1)) + { + /* + * One clause in group: add it "as is" to the upper-level OR. + */ + if (i - group_start == 1) + { + result = lappend(result, + list_nth(orargs, + matches[group_start].argindex)); + } + else + { + /* + * Two or more clauses in a group: create a nested OR. + */ + List *args = NIL; + List *rargs = NIL; + RestrictInfo *subrinfo; + int j; + + Assert(i - group_start >= 2); + + /* Construct the list of nested OR arguments */ + for (j = group_start; j < i; j++) + { + Node *arg = list_nth(orargs, matches[j].argindex); + + rargs = lappend(rargs, arg); + if (IsA(arg, RestrictInfo)) + args = lappend(args, ((RestrictInfo *) arg)->clause); + else + args = lappend(args, arg); + } + + /* Construct the nested OR and wrap it with RestrictInfo */ + subrinfo = make_plain_restrictinfo(root, + make_orclause(args), + make_orclause(rargs), + rinfo->is_pushed_down, + rinfo->has_clone, + rinfo->is_clone, + rinfo->pseudoconstant, + rinfo->security_level, + rinfo->required_relids, + rinfo->incompatible_relids, + rinfo->outer_relids); + result = lappend(result, subrinfo); + } + + group_start = i; + } + } + pfree(matches); + return result; +} + +/* + * make_bitmap_paths_for_or_group + * Generate bitmap paths for a group of similar OR-clause arguments + * produced by group_similar_or_args(). + * + * This function considers two cases: (1) matching a group of clauses to + * the index as a whole, and (2) matching the individual clauses one-by-one. + * (1) typically comprises an optimal solution. If not, (2) typically + * comprises fair alternative. 
+ * + * Ideally, we could consider all arbitrary splits of arguments into + * subgroups, but that could lead to unacceptable computational complexity. + * This is why we only consider two cases of above. + */ +static List * +make_bitmap_paths_for_or_group(PlannerInfo *root, RelOptInfo *rel, + RestrictInfo *ri, List *other_clauses) +{ + List *jointlist = NIL; + List *splitlist = NIL; + ListCell *lc; + List *orargs; + List *args = ((BoolExpr *) ri->orclause)->args; + Cost jointcost = 0.0, + splitcost = 0.0; + Path *bitmapqual; + List *indlist; + + /* + * First, try to match the whole group to the one index. + */ + orargs = list_make1(ri); + indlist = build_paths_for_OR(root, rel, + orargs, + other_clauses); + if (indlist != NIL) + { + bitmapqual = choose_bitmap_and(root, rel, indlist); + jointcost = bitmapqual->total_cost; + jointlist = list_make1(bitmapqual); + } + + /* + * If we manage to find a bitmap scan, which uses the group of OR-clause + * arguments as a whole, we can skip matching OR-clause arguments + * one-by-one as long as there are no other clauses, which can bring more + * efficiency to one-by-one case. + */ + if (jointlist != NIL && other_clauses == NIL) + return jointlist; + + /* + * Also try to match all containing clauses one-by-one. + */ + foreach(lc, args) + { + orargs = list_make1(lfirst(lc)); + + indlist = build_paths_for_OR(root, rel, + orargs, + other_clauses); + + if (indlist == NIL) + { + splitlist = NIL; + break; + } + + bitmapqual = choose_bitmap_and(root, rel, indlist); + splitcost += bitmapqual->total_cost; + splitlist = lappend(splitlist, bitmapqual); + } + + /* + * Pick the best option. + */ + if (splitlist == NIL) + return jointlist; + else if (jointlist == NIL) + return splitlist; + else + return (jointcost < splitcost) ? 
jointlist : splitlist; +} + + /* * generate_bitmap_or_paths * Look through the list of clauses to find OR clauses, and generate @@ -1192,6 +1569,8 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, List *pathlist; Path *bitmapqual; ListCell *j; + List *groupedArgs; + List *inner_other_clauses = NIL; /* Ignore RestrictInfos that aren't ORs */ if (!restriction_is_or_clause(rinfo)) @@ -1202,7 +1581,28 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, * the OR, else we can't use it. */ pathlist = NIL; - foreach(j, ((BoolExpr *) rinfo->orclause)->args) + + /* + * Group the similar OR-clause argument into dedicated RestrictInfos, + * because those RestrictInfos might match to the index as a whole. + */ + groupedArgs = group_similar_or_args(root, rel, rinfo); + + if (groupedArgs != ((BoolExpr *) rinfo->orclause)->args) + { + /* + * Some parts of the rinfo were grouped. In this case, we have a + * set of sub-rinfos that together are an exact duplicate of + * rinfo. Thus, we need to remove the rinfo from other clauses. + * match_clauses_to_index detects duplicated iclauses by comparing + * pointers to original rinfos that would be different. So, we + * must delete rinfo to avoid de-facto duplicated clauses in the + * index clauses list. + */ + inner_other_clauses = list_delete(list_copy(all_clauses), rinfo); + } + + foreach(j, groupedArgs) { Node *orarg = (Node *) lfirst(j); List *indlist; @@ -1222,12 +1622,34 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, andargs, all_clauses)); } + else if (restriction_is_or_clause(castNode(RestrictInfo, orarg))) + { + RestrictInfo *ri = castNode(RestrictInfo, orarg); + + /* + * Generate bitmap paths for the group of similar OR-clause + * arguments. 
+ */ + indlist = make_bitmap_paths_for_or_group(root, + rel, ri, + inner_other_clauses); + + if (indlist == NIL) + { + pathlist = NIL; + break; + } + else + { + pathlist = list_concat(pathlist, indlist); + continue; + } + } else { RestrictInfo *ri = castNode(RestrictInfo, orarg); List *orargs; - Assert(!restriction_is_or_clause(ri)); orargs = list_make1(ri); indlist = build_paths_for_OR(root, rel, @@ -1253,6 +1675,9 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, pathlist = lappend(pathlist, bitmapqual); } + if (inner_other_clauses != NIL) + list_free(inner_other_clauses); + /* * If we have a match for every arm, then turn them into a * BitmapOrPath, and add to result list. @@ -2430,7 +2855,7 @@ match_opclause_to_indexcol(PlannerInfo *root, /* * Check for clauses of the form: (indexkey operator constant) or - * (constant operator indexkey). See match_clause_to_indexcol's notes + * (constant operator indexkey). See match_clause_to_indexcol()'s notes * about const-ness. * * Note that we don't ask the support function about clauses that don't @@ -2991,8 +3416,12 @@ match_orclause_to_indexcol(PlannerInfo *root, get_typlenbyvalalign(consttype, &typlen, &typbyval, &typalign); elems = (Datum *) palloc(sizeof(Datum) * list_length(consts)); - foreach(lc, consts) - elems[i++] = ((Const *) lfirst(lc))->constvalue; + foreach_node(Const, value, consts) + { + Assert(!value->constisnull && value->constvalue); + + elems[i++] = value->constvalue; + } arrayConst = construct_array(elems, i, consttype, typlen, typbyval, typalign); diff --git a/src/backend/optimizer/util/restrictinfo.c b/src/backend/optimizer/util/restrictinfo.c index 0b406e93342..9e1458401c2 100644 --- a/src/backend/optimizer/util/restrictinfo.c +++ b/src/backend/optimizer/util/restrictinfo.c @@ -21,17 +21,6 @@ #include "optimizer/restrictinfo.h" -static RestrictInfo *make_restrictinfo_internal(PlannerInfo *root, - Expr *clause, - Expr *orclause, - bool is_pushed_down, - bool has_clone, - bool 
is_clone, - bool pseudoconstant, - Index security_level, - Relids required_relids, - Relids incompatible_relids, - Relids outer_relids); static Expr *make_sub_restrictinfos(PlannerInfo *root, Expr *clause, bool is_pushed_down, @@ -90,36 +79,38 @@ make_restrictinfo(PlannerInfo *root, /* Shouldn't be an AND clause, else AND/OR flattening messed up */ Assert(!is_andclause(clause)); - return make_restrictinfo_internal(root, - clause, - NULL, - is_pushed_down, - has_clone, - is_clone, - pseudoconstant, - security_level, - required_relids, - incompatible_relids, - outer_relids); + return make_plain_restrictinfo(root, + clause, + NULL, + is_pushed_down, + has_clone, + is_clone, + pseudoconstant, + security_level, + required_relids, + incompatible_relids, + outer_relids); } /* - * make_restrictinfo_internal + * make_plain_restrictinfo * - * Common code for the main entry points and the recursive cases. + * Common code for the main entry points and the recursive cases. Also, + * useful while constructing RestrictInfos above an OR clause, which already has + * RestrictInfos above its subclauses. 
*/ -static RestrictInfo * -make_restrictinfo_internal(PlannerInfo *root, - Expr *clause, - Expr *orclause, - bool is_pushed_down, - bool has_clone, - bool is_clone, - bool pseudoconstant, - Index security_level, - Relids required_relids, - Relids incompatible_relids, - Relids outer_relids) +RestrictInfo * +make_plain_restrictinfo(PlannerInfo *root, + Expr *clause, + Expr *orclause, + bool is_pushed_down, + bool has_clone, + bool is_clone, + bool pseudoconstant, + Index security_level, + Relids required_relids, + Relids incompatible_relids, + Relids outer_relids) { RestrictInfo *restrictinfo = makeNode(RestrictInfo); Relids baserels; @@ -296,17 +287,17 @@ make_sub_restrictinfos(PlannerInfo *root, NULL, incompatible_relids, outer_relids)); - return (Expr *) make_restrictinfo_internal(root, - clause, - make_orclause(orlist), - is_pushed_down, - has_clone, - is_clone, - pseudoconstant, - security_level, - required_relids, - incompatible_relids, - outer_relids); + return (Expr *) make_plain_restrictinfo(root, + clause, + make_orclause(orlist), + is_pushed_down, + has_clone, + is_clone, + pseudoconstant, + security_level, + required_relids, + incompatible_relids, + outer_relids); } else if (is_andclause(clause)) { @@ -328,17 +319,17 @@ make_sub_restrictinfos(PlannerInfo *root, return make_andclause(andlist); } else - return (Expr *) make_restrictinfo_internal(root, - clause, - NULL, - is_pushed_down, - has_clone, - is_clone, - pseudoconstant, - security_level, - required_relids, - incompatible_relids, - outer_relids); + return (Expr *) make_plain_restrictinfo(root, + clause, + NULL, + is_pushed_down, + has_clone, + is_clone, + pseudoconstant, + security_level, + required_relids, + incompatible_relids, + outer_relids); } /* diff --git a/src/include/optimizer/restrictinfo.h b/src/include/optimizer/restrictinfo.h index 1b42c832c59..b77bf7ddfe9 100644 --- a/src/include/optimizer/restrictinfo.h +++ b/src/include/optimizer/restrictinfo.h @@ -22,6 +22,17 @@ 
make_restrictinfo(root, clause, true, false, false, false, 0, \ NULL, NULL, NULL) +extern RestrictInfo *make_plain_restrictinfo(PlannerInfo *root, + Expr *clause, + Expr *orclause, + bool is_pushed_down, + bool has_clone, + bool is_clone, + bool pseudoconstant, + Index security_level, + Relids required_relids, + Relids incompatible_relids, + Relids outer_relids); extern RestrictInfo *make_restrictinfo(PlannerInfo *root, Expr *clause, bool is_pushed_down, diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index e73a0525cf4..7397c98f623 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1875,6 +1875,60 @@ SELECT * FROM tenk1 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx (1 row) +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous IS NULL); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (((thousand = 42) AND (tenthous IS NULL)) OR ((thousand = 42) AND ((tenthous = 1) OR (tenthous = 3) OR (tenthous = 42)))) + Filter: ((tenthous = 1) OR (tenthous = 3) OR (tenthous = 42) OR (tenthous IS NULL)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 42) AND (tenthous IS NULL)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3,42}'::integer[]))) +(8 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous::int2 = 3::int8 OR tenthous = 42::int8); + QUERY PLAN +------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (thousand = 42) + Filter: ((tenthous = '1'::smallint) OR 
((tenthous)::smallint = '3'::bigint) OR (tenthous = '42'::bigint)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous::int2 = 3::int8 OR tenthous::int2 = 42::int8); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (thousand = 42) + Filter: ((tenthous = '1'::smallint) OR ((tenthous)::smallint = '3'::bigint) OR ((tenthous)::smallint = '42'::bigint)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous = 3::int8 OR tenthous = 42::int8); + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (((thousand = 42) AND ((tenthous = '3'::bigint) OR (tenthous = '42'::bigint))) OR ((thousand = 42) AND (tenthous = '1'::smallint))) + Filter: ((tenthous = '1'::smallint) OR (tenthous = '3'::bigint) OR (tenthous = '42'::bigint)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{3,42}'::bigint[]))) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 42) AND (tenthous = '1'::smallint)) +(8 rows) + EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); @@ -2003,25 +2057,24 @@ SELECT count(*) FROM tenk1 EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN 
+-------------------------------------------------------------------------------------------------------------------------- Aggregate -> Bitmap Heap Scan on tenk1 - Recheck Cond: (((hundred = 42) AND ((thousand = 42) OR (thousand = 99) OR (tenthous < 2))) OR (thousand = 41)) + Recheck Cond: (((hundred = 42) AND (((thousand = 42) OR (thousand = 99)) OR (tenthous < 2))) OR (thousand = 41)) + Filter: (((hundred = 42) AND ((thousand = 42) OR (thousand = 99) OR (tenthous < 2))) OR (thousand = 41)) -> BitmapOr -> BitmapAnd -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 42) -> BitmapOr -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 42) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 99) + Index Cond: (thousand = ANY ('{42,99}'::integer[])) -> Bitmap Index Scan on tenk1_thous_tenthous Index Cond: (tenthous < 2) -> Bitmap Index Scan on tenk1_thous_tenthous Index Cond: (thousand = 41) -(16 rows) +(15 rows) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; @@ -2033,22 +2086,21 @@ SELECT count(*) FROM tenk1 EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- Aggregate -> Bitmap Heap Scan on tenk1 - Recheck Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 41) OR ((thousand = 99) AND (tenthous = 2)))) + Recheck Cond: ((hundred = 42) AND (((thousand = 99) AND (tenthous = 2)) OR ((thousand = 42) OR (thousand = 41)))) + Filter: ((thousand = 42) OR (thousand = 41) OR ((thousand = 99) AND (tenthous = 2))) -> BitmapAnd -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 42) -> BitmapOr 
- -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 42) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 41) -> Bitmap Index Scan on tenk1_thous_tenthous Index Cond: ((thousand = 99) AND (tenthous = 2)) -(13 rows) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = ANY ('{42,41}'::integer[])) +(12 rows) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); @@ -3144,6 +3196,49 @@ SELECT b.relname, (2 rows) DROP TABLE concur_temp_tab_1, concur_temp_tab_2, reindex_temp_before; +-- Check bitmap scan can consider similar OR arguments separately without +-- grouping them into SAOP. +CREATE TABLE bitmap_split_or (a int NOT NULL, b int NOT NULL, c int NOT NULL); +INSERT INTO bitmap_split_or (SELECT 1, 1, i FROM generate_series(1, 1000) i); +INSERT INTO bitmap_split_or (select i, 2, 2 FROM generate_series(1, 1000) i); +VACUUM ANALYZE bitmap_split_or; +CREATE INDEX t_b_partial_1_idx ON bitmap_split_or (b) WHERE a = 1; +CREATE INDEX t_b_partial_2_idx ON bitmap_split_or (b) WHERE a = 2; +EXPLAIN (COSTS OFF) +SELECT * FROM bitmap_split_or WHERE (a = 1 OR a = 2) AND b = 2; + QUERY PLAN +------------------------------------------------------------------ + Bitmap Heap Scan on bitmap_split_or + Recheck Cond: (((b = 2) AND (a = 1)) OR ((b = 2) AND (a = 2))) + -> BitmapOr + -> Bitmap Index Scan on t_b_partial_1_idx + Index Cond: (b = 2) + -> Bitmap Index Scan on t_b_partial_2_idx + Index Cond: (b = 2) +(7 rows) + +DROP INDEX t_b_partial_1_idx; +DROP INDEX t_b_partial_2_idx; +CREATE INDEX t_a_b_idx ON bitmap_split_or (a, b); +CREATE INDEX t_b_c_idx ON bitmap_split_or (b, c); +CREATE STATISTICS t_a_b_stat (mcv) ON a, b FROM bitmap_split_or; +CREATE STATISTICS t_b_c_stat (mcv) ON b, c FROM bitmap_split_or; +ANALYZE bitmap_split_or; +EXPLAIN (COSTS OFF) +SELECT * FROM bitmap_split_or WHERE a = 1 AND (b = 1 OR b = 2) AND c = 2; + QUERY PLAN 
+------------------------------------------------------------------ + Bitmap Heap Scan on bitmap_split_or + Recheck Cond: (((b = 1) AND (c = 2)) OR ((a = 1) AND (b = 2))) + Filter: ((a = 1) AND (c = 2)) + -> BitmapOr + -> Bitmap Index Scan on t_b_c_idx + Index Cond: ((b = 1) AND (c = 2)) + -> Bitmap Index Scan on t_a_b_idx + Index Cond: ((a = 1) AND (b = 2)) +(8 rows) + +DROP TABLE bitmap_split_or; -- -- REINDEX SCHEMA -- diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index a6359614b24..ae3fdff7ec3 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -4226,20 +4226,20 @@ select * from tenk1 a join tenk1 b on Nested Loop Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR ((a.unique2 = 3) AND (b.hundred = 4))) -> Bitmap Heap Scan on tenk1 b - Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + Recheck Cond: ((hundred = 4) OR (unique1 = 2)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 2) -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 4) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + Recheck Cond: ((unique2 = 3) OR (unique1 = 1)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 Index Cond: (unique2 = 3) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) (17 rows) explain (costs off) @@ -4253,12 +4253,12 @@ select * from tenk1 a join tenk1 b on Filter: ((unique1 = 2) OR (ten = 4)) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + Recheck Cond: ((unique2 = 3) OR (unique1 = 1)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 Index Cond: (unique2 = 3) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) (12 rows) explain 
(costs off) @@ -4270,21 +4270,21 @@ select * from tenk1 a join tenk1 b on Nested Loop Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) -> Bitmap Heap Scan on tenk1 b - Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + Recheck Cond: ((hundred = 4) OR (unique1 = 2)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 2) -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 4) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR ((unique2 = 3) OR (unique2 = 7))) + Recheck Cond: (((unique2 = 3) OR (unique2 = 7)) OR (unique1 = 1)) Filter: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 Index Cond: (unique2 = ANY ('{3,7}'::integer[])) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) (18 rows) explain (costs off) @@ -4296,21 +4296,21 @@ select * from tenk1 a join tenk1 b on Nested Loop Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) -> Bitmap Heap Scan on tenk1 b - Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + Recheck Cond: ((hundred = 4) OR (unique1 = 2)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 2) -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 4) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR ((unique2 = 3) OR (unique2 = 7))) + Recheck Cond: (((unique2 = 3) OR (unique2 = 7)) OR (unique1 = 1)) Filter: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 Index Cond: (unique2 = ANY ('{3,7}'::integer[])) + -> Bitmap 
Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) (18 rows) explain (costs off) @@ -4324,18 +4324,16 @@ select * from tenk1 a join tenk1 b on -> Seq Scan on tenk1 b -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 < 20) OR (unique1 = 3) OR (unique1 = 1) OR ((unique2 = 3) OR (unique2 = 7))) + Recheck Cond: (((unique2 = 3) OR (unique2 = 7)) OR ((unique1 = 3) OR (unique1 = 1)) OR (unique1 < 20)) Filter: ((unique1 < 20) OR (unique1 = 3) OR (unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 < 20) - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 3) - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 Index Cond: (unique2 = ANY ('{3,7}'::integer[])) -(16 rows) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = ANY ('{3,1}'::integer[])) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 < 20) +(14 rows) -- -- test placement of movable quals in a parameterized join tree diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index a4ca4cdfda1..58270d7fd9d 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -738,6 +738,23 @@ SELECT * FROM tenk1 SELECT * FROM tenk1 WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous IS NULL); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous::int2 = 3::int8 OR tenthous = 42::int8); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous::int2 = 3::int8 OR tenthous::int2 = 42::int8); + + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous = 3::int8 OR tenthous = 
42::int8); + EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); @@ -1321,6 +1338,27 @@ SELECT b.relname, ORDER BY 1; DROP TABLE concur_temp_tab_1, concur_temp_tab_2, reindex_temp_before; +-- Check bitmap scan can consider similar OR arguments separately without +-- grouping them into SAOP. +CREATE TABLE bitmap_split_or (a int NOT NULL, b int NOT NULL, c int NOT NULL); +INSERT INTO bitmap_split_or (SELECT 1, 1, i FROM generate_series(1, 1000) i); +INSERT INTO bitmap_split_or (select i, 2, 2 FROM generate_series(1, 1000) i); +VACUUM ANALYZE bitmap_split_or; +CREATE INDEX t_b_partial_1_idx ON bitmap_split_or (b) WHERE a = 1; +CREATE INDEX t_b_partial_2_idx ON bitmap_split_or (b) WHERE a = 2; +EXPLAIN (COSTS OFF) +SELECT * FROM bitmap_split_or WHERE (a = 1 OR a = 2) AND b = 2; +DROP INDEX t_b_partial_1_idx; +DROP INDEX t_b_partial_2_idx; +CREATE INDEX t_a_b_idx ON bitmap_split_or (a, b); +CREATE INDEX t_b_c_idx ON bitmap_split_or (b, c); +CREATE STATISTICS t_a_b_stat (mcv) ON a, b FROM bitmap_split_or; +CREATE STATISTICS t_b_c_stat (mcv) ON b, c FROM bitmap_split_or; +ANALYZE bitmap_split_or; +EXPLAIN (COSTS OFF) +SELECT * FROM bitmap_split_or WHERE a = 1 AND (b = 1 OR b = 2) AND c = 2; +DROP TABLE bitmap_split_or; + -- -- REINDEX SCHEMA -- diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 5c2719500dd..5ddc2508508 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1762,6 +1762,7 @@ OprCacheKey OprInfo OprProofCacheEntry OprProofCacheKey +OrArgIndexMatch OuterJoinClauseInfo OutputPluginCallbacks OutputPluginOptions From 84b14ca07678237c12efa874750502797efc506c Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 25 Nov 2024 09:05:26 +0200 Subject: [PATCH 51/79] Remove the wrong assertion from match_orclause_to_indexcol() Obviously, the constant could be zero. Also, add the relevant check to regression tests. 
Reported-by: Richard Guo Discussion: https://postgr.es/m/CAMbWs4-siKJdtWhcbqk4Y-xG12do2Ckm1qw672GNsSnDqL9FQg%40mail.gmail.com --- src/backend/optimizer/path/indxpath.c | 2 +- src/test/regress/expected/create_index.out | 10 +++++----- src/test/regress/sql/create_index.sql | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index cae5f3d7e63..31315f28ef1 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -3418,7 +3418,7 @@ match_orclause_to_indexcol(PlannerInfo *root, elems = (Datum *) palloc(sizeof(Datum) * list_length(consts)); foreach_node(Const, value, consts) { - Assert(!value->constisnull && value->constvalue); + Assert(!value->constisnull); elems[i++] = value->constvalue; } diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 7397c98f623..07da307efb4 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1843,15 +1843,15 @@ DROP TABLE onek_with_null; -- EXPLAIN (COSTS OFF) SELECT * FROM tenk1 - WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); - QUERY PLAN ------------------------------------------------------------------------------- + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous = 0); + QUERY PLAN +-------------------------------------------------------------------------------- Index Scan using tenk1_thous_tenthous on tenk1 - Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3,42}'::integer[]))) + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3,42,0}'::integer[]))) (2 rows) SELECT * FROM tenk1 - WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous = 0); unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | 
fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 ---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index 58270d7fd9d..3a30f58230e 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -728,9 +728,9 @@ DROP TABLE onek_with_null; EXPLAIN (COSTS OFF) SELECT * FROM tenk1 - WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous = 0); SELECT * FROM tenk1 - WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous = 0); EXPLAIN (COSTS OFF) SELECT * FROM tenk1 From bb7f54f3f76813338593f139af2b0a16cf4805dd Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Fri, 29 Nov 2024 01:46:43 +0200 Subject: [PATCH 52/79] Skip not SOAP-supported indexes while transforming an OR clause into SAOP There is no point in transforming OR-clauses into SAOP's if the target index doesn't support SAOP scans anyway. This commit adds corresponding checks to match_orclause_to_indexcol() and group_similar_or_args(). The first check fixes the actual bug, while the second just saves some cycles. 
Reported-by: Alexander Lakhin Discussion: https://postgr.es/m/8174de69-9e1a-0827-0e81-ef97f56a5939%40gmail.com Author: Alena Rybakina Reviewed-by: Ranier Vilela, Alexander Korotkov --- src/backend/optimizer/path/indxpath.c | 11 +++++++++-- src/test/regress/expected/create_index.out | 18 ++++++++++++++++++ src/test/regress/sql/create_index.sql | 6 ++++++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 31315f28ef1..a698f888d71 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -1343,8 +1343,11 @@ group_similar_or_args(PlannerInfo *root, RelOptInfo *rel, RestrictInfo *rinfo) { IndexOptInfo *index = (IndexOptInfo *) lfirst(lc2); - /* Ignore index if it doesn't support bitmap scans */ - if (!index->amhasgetbitmap) + /* + * Ignore index if it doesn't support bitmap scans or SAOP + * clauses. + */ + if (!index->amhasgetbitmap || !index->amsearcharray) continue; for (colnum = 0; colnum < index->nkeycolumns; colnum++) @@ -3235,6 +3238,10 @@ match_orclause_to_indexcol(PlannerInfo *root, Assert(IsA(orclause, BoolExpr)); Assert(orclause->boolop == OR_EXPR); + /* Ignore index if it doesn't support SAOP clauses */ + if(!index->amsearcharray) + return NULL; + /* * Try to convert a list of OR-clauses to a single SAOP expression. Each * OR entry must be in the form: (indexkey operator constant) or (constant diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 07da307efb4..47471eeab50 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1233,6 +1233,24 @@ SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; 14 (1 row) +-- OR-clauses shouldn't be transformed into SAOP because hash indexes don't +-- support SAOP scans. 
+SET enable_seqscan = off; +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM tenk1 WHERE stringu1 = 'TVAAAA' OR stringu1 = 'TVAAAB'; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((stringu1 = 'TVAAAA'::name) OR (stringu1 = 'TVAAAB'::name)) + -> BitmapOr + -> Bitmap Index Scan on hash_tuplesort_idx + Index Cond: (stringu1 = 'TVAAAA'::name) + -> Bitmap Index Scan on hash_tuplesort_idx + Index Cond: (stringu1 = 'TVAAAB'::name) +(8 rows) + +RESET enable_seqscan; DROP INDEX hash_tuplesort_idx; RESET maintenance_work_mem; -- diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index 3a30f58230e..4c5682fce9c 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -372,6 +372,12 @@ CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fi EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; +-- OR-clauses shouldn't be transformed into SAOP because hash indexes don't +-- support SAOP scans. +SET enable_seqscan = off; +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM tenk1 WHERE stringu1 = 'TVAAAA' OR stringu1 = 'TVAAAB'; +RESET enable_seqscan; DROP INDEX hash_tuplesort_idx; RESET maintenance_work_mem; From 983fdf503c1d8c0a4db4319feb75cbc18c10e8df Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Wed, 5 Feb 2025 13:07:34 +0200 Subject: [PATCH 53/79] Update SnapshotData: split row-level and page-level undo locations Also, get xmin out of RetainUndoLocationPHNode. 
--- src/backend/utils/time/snapmgr.c | 15 ++++++--------- src/include/utils/snapshot.h | 4 ++-- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 4206c1bfd0b..b15ec90f637 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -206,10 +206,9 @@ typedef struct SerializedSnapshotData TimestampTz whenTaken; XLogRecPtr lsn; CSNSnapshotData csnSnapshotData; - uint64 undoRegularLocation; - uint64 undoRegularXmin; + uint64 undoRegularRowLocation; + uint64 undoRegularPageLocation; uint64 undoSystemLocation; - uint64 undoSystemXmin; } SerializedSnapshotData; /* @@ -1767,9 +1766,8 @@ SerializeSnapshot(Snapshot snapshot, char *start_address) serialized_snapshot.csnSnapshotData.xmin = snapshot->csnSnapshotData.xmin; serialized_snapshot.csnSnapshotData.snapshotcsn = snapshot->csnSnapshotData.snapshotcsn; serialized_snapshot.csnSnapshotData.xlogptr = snapshot->csnSnapshotData.xlogptr; - serialized_snapshot.undoRegularXmin = snapshot->undoRegularLocationPhNode.xmin; - serialized_snapshot.undoRegularLocation = snapshot->undoRegularLocationPhNode.undoLocation; - serialized_snapshot.undoSystemXmin = snapshot->undoSystemLocationPhNode.xmin; + serialized_snapshot.undoRegularRowLocation = snapshot->undoRegularRowLocationPhNode.undoLocation; + serialized_snapshot.undoRegularPageLocation = snapshot->undoRegularPageLocationPhNode.undoLocation; serialized_snapshot.undoSystemLocation = snapshot->undoSystemLocationPhNode.undoLocation; /* @@ -1849,9 +1847,8 @@ RestoreSnapshot(char *start_address) snapshot->csnSnapshotData.xmin = serialized_snapshot.csnSnapshotData.xmin; snapshot->csnSnapshotData.snapshotcsn = serialized_snapshot.csnSnapshotData.snapshotcsn; snapshot->csnSnapshotData.xlogptr = serialized_snapshot.csnSnapshotData.xlogptr; - snapshot->undoRegularLocationPhNode.xmin = serialized_snapshot.undoRegularXmin; - snapshot->undoRegularLocationPhNode.undoLocation = 
serialized_snapshot.undoRegularLocation; - snapshot->undoSystemLocationPhNode.xmin = serialized_snapshot.undoSystemXmin; + snapshot->undoRegularRowLocationPhNode.undoLocation = serialized_snapshot.undoRegularRowLocation; + snapshot->undoRegularPageLocationPhNode.undoLocation = serialized_snapshot.undoRegularPageLocation; snapshot->undoSystemLocationPhNode.undoLocation = serialized_snapshot.undoSystemLocation; /* Copy XIDs, if present. */ diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 9eec035622d..68464b66127 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -125,7 +125,6 @@ typedef struct SnapshotData *Snapshot; typedef struct { uint64 undoLocation; /* undo log location retained by this snapshot */ - uint64 xmin; pairingheap_node ph_node; } RetainUndoLocationPHNode; @@ -229,7 +228,8 @@ typedef struct SnapshotData */ uint64 snapXactCompletionCount; - RetainUndoLocationPHNode undoRegularLocationPhNode; + RetainUndoLocationPHNode undoRegularRowLocationPhNode; + RetainUndoLocationPHNode undoRegularPageLocationPhNode; RetainUndoLocationPHNode undoSystemLocationPhNode; CSNSnapshotData csnSnapshotData; } SnapshotData; From 0600177b40472cea8afc55414a00f97d5f5e2316 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Thu, 3 Apr 2025 14:31:26 +0200 Subject: [PATCH 54/79] Added hint bits horizon --- src/backend/access/heap/heapam_visibility.c | 13 +++++++++++++ src/include/access/heapam.h | 2 ++ 2 files changed, 15 insertions(+) diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index 9243feed01f..bd06c87e0de 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -78,6 +78,8 @@ #include "utils/builtins.h" #include "utils/snapmgr.h" +static TransactionId hint_bit_horizon = InvalidTransactionId; + /* * SetHintBits() @@ -127,6 +129,11 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, } } + if 
(TransactionIdIsValid(hint_bit_horizon) && + TransactionIdIsValid(xid) && + TransactionIdFollows(xid, hint_bit_horizon)) + return; + tuple->t_infomask |= infomask; MarkBufferDirtyHint(buffer, true); } @@ -1786,3 +1793,9 @@ HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer) return false; /* keep compiler quiet */ } + +void +SetHintBitsHorizon(TransactionId new_horizon) +{ + hint_bit_horizon = new_horizon; +} diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index ad71a220ed9..c4d3bdc5bd2 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -420,6 +420,8 @@ extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple); extern bool HeapTupleIsSurelyDead(HeapTuple htup, struct GlobalVisState *vistest); +extern void SetHintBitsHorizon(TransactionId new_horizon); + /* * To avoid leaking too much knowledge about reorderbuffer implementation * details this is implemented in reorderbuffer.c not heapam_visibility.c From 4c226adc13bd3e9197f4f7dfb387bded5fead2ab Mon Sep 17 00:00:00 2001 From: Egor Ivkov Date: Tue, 15 Apr 2025 19:47:27 +0300 Subject: [PATCH 55/79] Regress check with orioledb storage engine --- patches/test_setup_enable_oriole.diff | 27 +++++ src/test/regress/GNUmakefile | 4 + src/test/regress/parallel_schedule_oriole | 129 ++++++++++++++++++++++ src/test/regress/pg_regress.c | 4 +- 4 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 patches/test_setup_enable_oriole.diff create mode 100644 src/test/regress/parallel_schedule_oriole diff --git a/patches/test_setup_enable_oriole.diff b/patches/test_setup_enable_oriole.diff new file mode 100644 index 00000000000..302f616810a --- /dev/null +++ b/patches/test_setup_enable_oriole.diff @@ -0,0 +1,27 @@ +diff --git a/src/test/regress/expected/test_setup.out b/src/test/regress/expected/test_setup.out +index 3d0eeec996..70ce94e21a 100644 +--- a/src/test/regress/expected/test_setup.out ++++ b/src/test/regress/expected/test_setup.out 
+@@ -21,6 +21,8 @@ GRANT ALL ON SCHEMA public TO public; + -- Create a tablespace we can use in tests. + SET allow_in_place_tablespaces = true; + CREATE TABLESPACE regress_tblspace LOCATION ''; ++-- Enable orioledb extension ++CREATE EXTENSION orioledb; + -- + -- These tables have traditionally been referenced by many tests, + -- so create and populate them. Insert only non-error values here. +diff --git a/src/test/regress/sql/test_setup.sql b/src/test/regress/sql/test_setup.sql +index 06b0e2121f..867ad6a2df 100644 +--- a/src/test/regress/sql/test_setup.sql ++++ b/src/test/regress/sql/test_setup.sql +@@ -27,6 +27,9 @@ GRANT ALL ON SCHEMA public TO public; + SET allow_in_place_tablespaces = true; + CREATE TABLESPACE regress_tblspace LOCATION ''; + ++-- Enable orioledb extension ++CREATE EXTENSION orioledb; ++ + -- + -- These tables have traditionally been referenced by many tests, + -- so create and populate them. Insert only non-error values here. diff --git a/src/test/regress/GNUmakefile b/src/test/regress/GNUmakefile index 090e49ea55b..f93eb33e11a 100644 --- a/src/test/regress/GNUmakefile +++ b/src/test/regress/GNUmakefile @@ -123,6 +123,10 @@ check-tests: all | temp-install installcheck: all $(pg_regress_installcheck) $(REGRESS_OPTS) --schedule=$(srcdir)/parallel_schedule --max-connections=1 $(EXTRA_TESTS) +# Run tests that work with oriole +installcheck-oriole: all + $(pg_regress_installcheck) $(REGRESS_OPTS) --schedule=$(srcdir)/parallel_schedule_oriole --max-connections=1 $(EXTRA_TESTS) + installcheck-parallel: all $(pg_regress_installcheck) $(REGRESS_OPTS) --schedule=$(srcdir)/parallel_schedule $(MAXCONNOPT) $(EXTRA_TESTS) diff --git a/src/test/regress/parallel_schedule_oriole b/src/test/regress/parallel_schedule_oriole new file mode 100644 index 00000000000..dfd35775913 --- /dev/null +++ b/src/test/regress/parallel_schedule_oriole @@ -0,0 +1,129 @@ +# ---------- +# src/test/regress/parallel_schedule +# +# Most test scripts can be run after running just 
test_setup and possibly +# create_index. Exceptions to this rule are documented below. +# +# By convention, we put no more than twenty tests in any one parallel group; +# this limits the number of connections needed to run the tests. +# ---------- + +# required setup steps +test: test_setup + +# ---------- +# The first group of parallel tests +# ---------- +test: boolean char name varchar text int2 int4 int8 oid float4 float8 bit numeric txid uuid enum money pg_lsn regproc + +# ---------- +# The second group of parallel tests +# multirangetypes depends on rangetypes +# multirangetypes shouldn't run concurrently with type_sanity +# ---------- +test: md5 numerology lseg line path circle date time timetz timestamp timestamptz interval inet macaddr macaddr8 + +# ---------- +# Another group of parallel tests +# geometry depends on point, lseg, line, box, path, polygon, circle +# horology depends on date, time, timetz, timestamp, timestamptz, interval +# ---------- +test: horology tstypes regex comments expressions unicode xid mvcc + +# ---------- +# Load huge amounts of data +# We should split the data files into single files and then +# execute two copy tests in parallel, to check that copy itself +# is concurrent safe. +# ---------- +test: copy copyselect copydml + +# ---------- +# More groups of parallel tests +# Note: many of the tests in later groups depend on create_index +# ---------- +test: create_function_c create_misc create_operator create_procedure create_type create_schema create_view + +# ---------- +# Another group of parallel tests +# ---------- +test: create_aggregate create_function_sql create_cast typed_table drop_if_exists + +# ---------- +# sanity_check does a vacuum, affecting the sort order of SELECT * +# results. So it should not run parallel to other tests. 
+# ---------- + +# ---------- +# Another group of parallel tests +# aggregates depends on create_aggregate +# join depends on create_misc +# ---------- +test: select_into select_implicit select_having random delete + +# ---------- +# Another group of parallel tests +# ---------- +test: init_privs security_label collate object_address drop_operator password + +# ---------- +# Additional BRIN tests +# ---------- + +# ---------- +# Another group of parallel tests +# psql depends on create_am +# amutils depends on geometry, create_index_spgist, hash_index, brin +# ---------- +test: alter_generic alter_operator misc async dbsize sysviews tsrf create_role + +# collate.linux.utf8 and collate.icu.utf8 tests cannot be run in parallel with each other + +# ---------- +# Run these alone so they don't run out of parallel workers +# select_parallel depends on create_misc +# ---------- + +# no relation related tests can be put in this group +test: subscription + +# ---------- +# Another group of parallel tests +# select_views depends on create_view +# ---------- +test: select_views portals_p2 dependency guc bitmapops xmlmap functional_deps advisory_lock + +# ---------- +# Another group of parallel tests (JSON related) +# ---------- +test: json_encoding jsonpath jsonpath_encoding jsonb_jsonpath sqljson sqljson_queryfuncs sqljson_jsontable + +# ---------- +# Another group of parallel tests +# with depends on create_misc +# NB: temp.sql does a reconnect which transiently uses 2 connections, +# so keep this parallel group to at most 19 tests +# ---------- +test: plancache rangefuncs prepare sequence polymorphism largeobject xml + +# ---------- +# Another group of parallel tests +# +# The stats test resets stats, so nothing else needing stats access can be in +# this group. 
+# ---------- +test: hash_part predicate + +# event_trigger depends on create_am and cannot run concurrently with +# any test that runs DDL +# oidjoins is read-only, though, and should run late for best coverage +test: oidjoins + +# event_trigger_login cannot run concurrently with any other tests because +# on-login event handling could catch connection of a concurrent test. +test: event_trigger_login + +# this test also uses event triggers, so likewise run it by itself + +# run tablespace test at the end because it drops the tablespace created during +# setup that other tests may use. diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index 53435c47420..fc8ad4c36dc 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -62,8 +62,8 @@ static char *shellprog = SHELLPROG; * Windows-style newlines, but the comparison files might or might not. */ #ifndef WIN32 -const char *basic_diff_opts = ""; -const char *pretty_diff_opts = "-U3"; +const char *basic_diff_opts = "-I \"NOTICE\" -I \"DETAIL\" -I \"WARNING\""; +const char *pretty_diff_opts = "-I \"NOTICE\" -I \"DETAIL\" -I \"WARNING\" -U3"; #else const char *basic_diff_opts = "--strip-trailing-cr"; const char *pretty_diff_opts = "--strip-trailing-cr -U3"; From 402cd385ef4e55e8982f59a9123d2745429cca15 Mon Sep 17 00:00:00 2001 From: Artur Zakirov Date: Mon, 28 Apr 2025 14:11:41 +0200 Subject: [PATCH 56/79] Use Ubuntu 24.04 image Github Actions deprecated usage of Ubuntu 20.04 --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c6f1bef64aa..f22a5232baf 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,14 +7,14 @@ on: jobs: test: runs-on: - - ubuntu-20.04 + - ubuntu-24.04 strategy: fail-fast: false matrix: compiler: [clang, gcc] check_type: [normal, debug] env: - LLVM_VER: 10 + LLVM_VER: 19 COMPILER: ${{ matrix.compiler }} 
CHECK_TYPE: ${{ matrix.check_type }} steps: From 333be23a11f6fb423443780e3b50d450f7ed02ac Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Fri, 4 Apr 2025 16:52:43 +0400 Subject: [PATCH 57/79] Add wait event for rewind worker --- src/backend/utils/activity/wait_event_names.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 16144c2b72d..d638bbdeff0 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -55,6 +55,7 @@ ARCHIVER_MAIN "Waiting in main loop of archiver process." AUTOVACUUM_MAIN "Waiting in main loop of autovacuum launcher process." BGWRITER_HIBERNATE "Waiting in background writer process, hibernating." BGWRITER_MAIN "Waiting in main loop of background writer process." +REWIND_WORKER_MAIN "Waiting in main loop of rewind worker process." CHECKPOINTER_MAIN "Waiting in main loop of checkpointer process." LOGICAL_APPLY_MAIN "Waiting in main loop of logical replication apply process." LOGICAL_LAUNCHER_MAIN "Waiting in main loop of logical replication launcher process." 
From a2f2291c53f97e4bb1c90c4a3165d371829e70e6 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Wed, 21 May 2025 03:12:58 +0300 Subject: [PATCH 58/79] Add VacuumHorizonHook instead of just hint bits horizon --- src/backend/access/heap/heapam_visibility.c | 20 +++++++++----------- src/backend/commands/vacuum.c | 16 ++++++++++++++++ src/include/access/heapam.h | 6 ++++-- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index bd06c87e0de..61bf5437786 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -78,7 +78,7 @@ #include "utils/builtins.h" #include "utils/snapmgr.h" -static TransactionId hint_bit_horizon = InvalidTransactionId; +VacuumHorizonHookType VacuumHorizonHook = NULL; /* @@ -129,10 +129,14 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, } } - if (TransactionIdIsValid(hint_bit_horizon) && - TransactionIdIsValid(xid) && - TransactionIdFollows(xid, hint_bit_horizon)) - return; + if (TransactionIdIsValid(xid) && VacuumHorizonHook) + { + TransactionId horizon = VacuumHorizonHook(); + + if (TransactionIdIsValid(horizon) && + TransactionIdFollows(xid, horizon)) + return; + } tuple->t_infomask |= infomask; MarkBufferDirtyHint(buffer, true); @@ -1793,9 +1797,3 @@ HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer) return false; /* keep compiler quiet */ } - -void -SetHintBitsHorizon(TransactionId new_horizon) -{ - hint_bit_horizon = new_horizon; -} diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index a2132ecedaf..6833564a85f 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1117,6 +1117,14 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams *params, * any time, and that each vacuum is always an independent transaction. 
*/ cutoffs->OldestXmin = GetOldestNonRemovableTransactionId(rel); + if (VacuumHorizonHook) + { + TransactionId horizon = VacuumHorizonHook(); + + if (TransactionIdIsValid(horizon) && + TransactionIdFollows(cutoffs->OldestXmin, horizon)) + cutoffs->OldestXmin = horizon; + } Assert(TransactionIdIsNormal(cutoffs->OldestXmin)); @@ -1614,6 +1622,14 @@ vac_update_datfrozenxid(void) * cannot produce a wrong minimum by starting with this. */ newFrozenXid = GetOldestNonRemovableTransactionId(NULL); + if (VacuumHorizonHook) + { + TransactionId horizon = VacuumHorizonHook(); + + if (TransactionIdIsValid(horizon) && + TransactionIdFollows(newFrozenXid, horizon)) + newFrozenXid = horizon; + } /* * Similarly, initialize the MultiXact "min" with the value that would be diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index c4d3bdc5bd2..d158bf3b557 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -406,6 +406,10 @@ extern void heap_vacuum_rel(Relation rel, struct VacuumParams *params, BufferAccessStrategy bstrategy); /* in heap/heapam_visibility.c */ +typedef TransactionId (*VacuumHorizonHookType) (void); + +extern VacuumHorizonHookType VacuumHorizonHook; + extern bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer); extern TM_Result HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, @@ -420,8 +424,6 @@ extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple); extern bool HeapTupleIsSurelyDead(HeapTuple htup, struct GlobalVisState *vistest); -extern void SetHintBitsHorizon(TransactionId new_horizon); - /* * To avoid leaking too much knowledge about reorderbuffer implementation * details this is implemented in reorderbuffer.c not heapam_visibility.c From dc11bad7df22909ba0bf212fd73cd67f7ad1b426 Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Thu, 22 May 2025 17:19:03 +0400 Subject: [PATCH 59/79] Modify CountOtherDBBackends/TerminateOtherDBBackends to process whole cluster by providing 
invalid databaseId For bgworkers CountOtherDBBackends doesn't add it to nbackends, but check if they are prepared transactions to add to nprepared --- src/backend/storage/ipc/procarray.c | 37 +++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index fd53e6b9df6..752edc8fe7e 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -3753,6 +3753,8 @@ CountUserBackends(Oid roleid) * backend startup. The caller should normally hold an exclusive lock on the * target DB before calling this, which is one reason we mustn't wait * indefinitely. + * + * If databaseId is InvalidOid, count all non-bgworker backends in a cluster. */ bool CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) @@ -3782,8 +3784,22 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) PGPROC *proc = &allProcs[pgprocno]; uint8 statusFlags = ProcGlobal->statusFlags[index]; - if (proc->databaseId != databaseId) - continue; + if (databaseId != InvalidOid) + { + if (proc->databaseId != databaseId) + continue; + } + else + { + if (proc->isBackgroundWorker) + { + if (proc->pid == 0) + (*nprepared)++; + + continue; /* do not count background workers */ + } + } + if (proc == MyProc) continue; @@ -3831,6 +3847,8 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) * * If the target database has a prepared transaction or permissions checks * fail for a connection, this fails without terminating anything. + * + * If databaseId is InvalidOid, terminate all backends in a cluster. 
*/ void TerminateOtherDBBackends(Oid databaseId) @@ -3847,7 +3865,7 @@ TerminateOtherDBBackends(Oid databaseId) int pgprocno = arrayP->pgprocnos[i]; PGPROC *proc = &allProcs[pgprocno]; - if (proc->databaseId != databaseId) + if (databaseId != InvalidOid && proc->databaseId != databaseId) continue; if (proc == MyProc) continue; @@ -3861,7 +3879,9 @@ TerminateOtherDBBackends(Oid databaseId) LWLockRelease(ProcArrayLock); if (nprepared > 0) - ereport(ERROR, + { + if (databaseId != InvalidOid) + ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), errmsg("database \"%s\" is being used by prepared transactions", get_database_name(databaseId)), @@ -3869,6 +3889,15 @@ TerminateOtherDBBackends(Oid databaseId) "There are %d prepared transactions using the database.", nprepared, nprepared))); + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("cluster is being used by prepared transactions"), + errdetail_plural("There is %d prepared transaction using the cluster.", + "There are %d prepared transactions using the cluster.", + nprepared, + nprepared))); + } if (pids) { From 3124cf9f35f91e909b654632ab28b0ac2ddb2b53 Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Mon, 26 May 2025 20:51:29 +0400 Subject: [PATCH 60/79] Disable assert in clog This is required to have rewind functionality on heap tables --- src/backend/access/transam/clog.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index e6f79320e94..478668238e6 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -688,10 +688,10 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i * Current state change should be from 0 or subcommitted to target state * or we should already be there when replaying changes during recovery. 
*/ - Assert(curval == 0 || - (curval == TRANSACTION_STATUS_SUB_COMMITTED && - status != TRANSACTION_STATUS_IN_PROGRESS) || - curval == status); +// Assert(curval == 0 || +// (curval == TRANSACTION_STATUS_SUB_COMMITTED && +// status != TRANSACTION_STATUS_IN_PROGRESS) || +// curval == status); /* note this assumes exclusive access to the clog page */ byteval = *byteptr; From 37b1ee90313029abf7c0b324b3d87bb2aef863cb Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 29 Jun 2025 17:45:18 +0300 Subject: [PATCH 61/79] Use VacuumHorizonHook in more places --- src/backend/storage/ipc/procarray.c | 40 ++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 752edc8fe7e..493f19194cf 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -47,6 +47,7 @@ #include +#include "access/heapam.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/twophase.h" @@ -2008,23 +2009,36 @@ TransactionId GetOldestNonRemovableTransactionId(Relation rel) { ComputeXidHorizonsResult horizons; + TransactionId result = InvalidTransactionId; ComputeXidHorizons(&horizons); switch (GlobalVisHorizonKindForRel(rel)) { case VISHORIZON_SHARED: - return horizons.shared_oldest_nonremovable; + result = horizons.shared_oldest_nonremovable; + break; case VISHORIZON_CATALOG: - return horizons.catalog_oldest_nonremovable; + result = horizons.catalog_oldest_nonremovable; + break; case VISHORIZON_DATA: - return horizons.data_oldest_nonremovable; + result = horizons.data_oldest_nonremovable; + break; case VISHORIZON_TEMP: - return horizons.temp_oldest_nonremovable; + result = horizons.temp_oldest_nonremovable; + break; } - /* just to prevent compiler warnings */ - return InvalidTransactionId; + if (VacuumHorizonHook) + { + TransactionId horizon = VacuumHorizonHook(); + + if (TransactionIdIsValid(horizon) && + 
TransactionIdFollows(result, horizon)) + result = horizon; + } + + return result; } /* @@ -4164,6 +4178,20 @@ GlobalVisTestFor(Relation rel) break; } + if (VacuumHorizonHook) + { + TransactionId horizon = VacuumHorizonHook(); + if (TransactionIdIsValid(horizon)) + { + FullTransactionId fullHorizon = FullXidRelativeTo(state->definitely_needed, horizon); + + if (FullTransactionIdFollows(state->definitely_needed, fullHorizon)) + state->definitely_needed = fullHorizon; + if (FullTransactionIdFollows(state->maybe_needed, fullHorizon)) + state->maybe_needed = fullHorizon; + } + } + Assert(FullTransactionIdIsValid(state->definitely_needed) && FullTransactionIdIsValid(state->maybe_needed)); From 85bdfd05c2f6c93e591336959e8ef88425f5b9ee Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Wed, 28 May 2025 13:55:49 +0200 Subject: [PATCH 62/79] Serialize reftype for RTE_RELATION Fixes "rules" regress test for running with orioledb --- src/backend/nodes/outfuncs.c | 1 + src/backend/nodes/readfuncs.c | 1 + 2 files changed, 2 insertions(+) diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 3337b77ae6d..9cb04a56156 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -507,6 +507,7 @@ _outRangeTblEntry(StringInfo str, const RangeTblEntry *node) WRITE_OID_FIELD(relid); WRITE_BOOL_FIELD(inh); WRITE_CHAR_FIELD(relkind); + WRITE_INT_FIELD(reftype); WRITE_INT_FIELD(rellockmode); WRITE_UINT_FIELD(perminfoindex); WRITE_NODE_FIELD(tablesample); diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index c4d01a441a0..41a2d341ec1 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -358,6 +358,7 @@ _readRangeTblEntry(void) READ_OID_FIELD(relid); READ_BOOL_FIELD(inh); READ_CHAR_FIELD(relkind); + READ_INT_FIELD(reftype); READ_INT_FIELD(rellockmode); READ_UINT_FIELD(perminfoindex); READ_NODE_FIELD(tablesample); From 3cbacbd8c7d4e7547d72e7d902056efc0b7c04c0 Mon Sep 17 00:00:00 2001 From: Artur 
Zakirov Date: Fri, 22 Aug 2025 19:11:27 +0200 Subject: [PATCH 63/79] Issue #487: Add TableSupportsBackwardScan() and TableAmRoutine->amcanbackward OrioleDB table AM doesn't support BACKWARD SCAN in all cases. A user needs to declare a cursor with SCROLL option. --- src/backend/access/heap/heapam_handler.c | 1 + src/backend/commands/portalcmds.c | 2 +- src/backend/executor/execAmi.c | 51 +++++++++++++++++++++--- src/backend/executor/spi.c | 3 +- src/backend/optimizer/plan/planner.c | 2 +- src/include/access/tableam.h | 2 + src/include/executor/executor.h | 2 +- 7 files changed, 54 insertions(+), 9 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 7d6828db403..354f1f225c3 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2964,6 +2964,7 @@ heapam_reloptions(char relkind, Datum reloptions, bool validate) static const TableAmRoutine heapam_methods = { .type = T_TableAmRoutine, + .amcanbackward = true, .slot_callbacks = heapam_slot_callbacks, .get_row_ref_type = heapam_get_row_ref_type, diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index 4f6acf67198..afee706025a 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -134,7 +134,7 @@ PerformCursorOpen(ParseState *pstate, DeclareCursorStmt *cstmt, ParamListInfo pa if (!(portal->cursorOptions & (CURSOR_OPT_SCROLL | CURSOR_OPT_NO_SCROLL))) { if (plan->rowMarks == NIL && - ExecSupportsBackwardScan(plan->planTree)) + ExecSupportsBackwardScan(plan->planTree, plan->rtable)) portal->cursorOptions |= CURSOR_OPT_SCROLL; else portal->cursorOptions |= CURSOR_OPT_NO_SCROLL; diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 1a7f6ae2c9b..491d8fc0416 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -14,6 +14,7 @@ #include "access/amapi.h" #include "access/htup_details.h" +#include 
"access/tableam.h" #include "catalog/pg_class.h" #include "executor/nodeAgg.h" #include "executor/nodeAppend.h" @@ -60,9 +61,11 @@ #include "executor/nodeWorktablescan.h" #include "nodes/extensible.h" #include "nodes/pathnodes.h" +#include "parser/parsetree.h" #include "utils/syscache.h" static bool IndexSupportsBackwardScan(Oid indexid); +static bool TableSupportsBackwardScan(Oid tableid); /* @@ -507,7 +510,7 @@ ExecSupportsMarkRestore(Path *pathnode) * children do. Therefore, this routine must be passed a complete plan tree. */ bool -ExecSupportsBackwardScan(Plan *node) +ExecSupportsBackwardScan(Plan *node, List *rtable) { if (node == NULL) return false; @@ -524,7 +527,7 @@ ExecSupportsBackwardScan(Plan *node) { case T_Result: if (outerPlan(node) != NULL) - return ExecSupportsBackwardScan(outerPlan(node)); + return ExecSupportsBackwardScan(outerPlan(node), rtable); else return false; @@ -538,7 +541,7 @@ ExecSupportsBackwardScan(Plan *node) foreach(l, ((Append *) node)->appendplans) { - if (!ExecSupportsBackwardScan((Plan *) lfirst(l))) + if (!ExecSupportsBackwardScan((Plan *) lfirst(l), rtable)) return false; } /* need not check tlist because Append doesn't evaluate it */ @@ -559,7 +562,7 @@ ExecSupportsBackwardScan(Plan *node) return IndexSupportsBackwardScan(((IndexOnlyScan *) node)->indexid); case T_SubqueryScan: - return ExecSupportsBackwardScan(((SubqueryScan *) node)->subplan); + return ExecSupportsBackwardScan(((SubqueryScan *) node)->subplan, rtable); case T_CustomScan: if (((CustomScan *) node)->flags & CUSTOMPATH_SUPPORT_BACKWARD_SCAN) @@ -569,6 +572,16 @@ ExecSupportsBackwardScan(Plan *node) case T_SeqScan: case T_TidScan: case T_TidRangeScan: + { + RangeTblEntry *rte; + + Assert(((Scan *) node)->scanrelid > 0 && + ((Scan *) node)->scanrelid <= list_length(rtable)); + + rte = rt_fetch(((Scan *) node)->scanrelid, rtable); + return TableSupportsBackwardScan(rte->relid); + } + case T_FunctionScan: case T_ValuesScan: case T_CteScan: @@ -587,7 +600,7 @@ 
ExecSupportsBackwardScan(Plan *node) case T_LockRows: case T_Limit: - return ExecSupportsBackwardScan(outerPlan(node)); + return ExecSupportsBackwardScan(outerPlan(node), rtable); default: return false; @@ -623,6 +636,34 @@ IndexSupportsBackwardScan(Oid indexid) return result; } +/* + * An SeqScan, TidScan or TidRangeScan node supports backward scan only if the + * table's AM does. + */ +static bool +TableSupportsBackwardScan(Oid tableid) +{ + bool result; + HeapTuple ht_tabrel; + Form_pg_class tabrelrec; + const TableAmRoutine *amroutine; + + /* Fetch the pg_class tuple of the index relation */ + ht_tabrel = SearchSysCache1(RELOID, ObjectIdGetDatum(tableid)); + if (!HeapTupleIsValid(ht_tabrel)) + elog(ERROR, "cache lookup failed for relation %u", tableid); + tabrelrec = (Form_pg_class) GETSTRUCT(ht_tabrel); + + /* Fetch the table AM's API struct */ + amroutine = GetTableAmRoutineByAmOid(tabrelrec->relam); + + result = amroutine->amcanbackward; + + ReleaseSysCache(ht_tabrel); + + return result; +} + /* * ExecMaterializesOutput - does a plan type materialize its output? 
* diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index cf07d47e958..ae4a8949313 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -1697,7 +1697,8 @@ SPI_cursor_open_internal(const char *name, SPIPlanPtr plan, if (list_length(stmt_list) == 1 && linitial_node(PlannedStmt, stmt_list)->commandType != CMD_UTILITY && linitial_node(PlannedStmt, stmt_list)->rowMarks == NIL && - ExecSupportsBackwardScan(linitial_node(PlannedStmt, stmt_list)->planTree)) + ExecSupportsBackwardScan(linitial_node(PlannedStmt, stmt_list)->planTree, + linitial_node(PlannedStmt, stmt_list)->rtable)) portal->cursorOptions |= CURSOR_OPT_SCROLL; else portal->cursorOptions |= CURSOR_OPT_NO_SCROLL; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 80a1e353a44..ca26b778049 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -427,7 +427,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, */ if (cursorOptions & CURSOR_OPT_SCROLL) { - if (!ExecSupportsBackwardScan(top_plan)) + if (!ExecSupportsBackwardScan(top_plan, root->parse->rtable)) top_plan = materialize_finished_plan(top_plan); } diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 5c55a5f78a5..09175483dce 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -312,6 +312,8 @@ typedef struct TableAmRoutine /* this must be set to T_TableAmRoutine */ NodeTag type; + /* does AM support backward scanning? */ + bool amcanbackward; /* ------------------------------------------------------------------------ * Slot related callbacks. 
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index a044e76c437..da19d944f19 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -106,7 +106,7 @@ extern void ExecReScan(PlanState *node); extern void ExecMarkPos(PlanState *node); extern void ExecRestrPos(PlanState *node); extern bool ExecSupportsMarkRestore(struct Path *pathnode); -extern bool ExecSupportsBackwardScan(Plan *node); +extern bool ExecSupportsBackwardScan(Plan *node, List *rtable); extern bool ExecMaterializesOutput(NodeTag plantype); /* From 5939a666be4d88ead62641f985ec65b9184c6543 Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Mon, 4 Aug 2025 20:50:59 +0400 Subject: [PATCH 64/79] Extend tableam->relation_size() functionality to use different calculation methods. This allows to output size metrics for relations and indexes provided by OrioleDB extension using existing PG functions. If extension function doesn't support the requested method it should output negative value and thus asking to fall back to using PG internal calculation e.g. - Orioledb relation_size outputs -1 for bridged indexes to fallback to counting them as PG indexes - PG table_block_relation_size outputs -1 for any method except the only DEFAULT_SIZE that it supports. This API relies on table AM extensibility and doesn't use index AM extensibility yet, which could look more logical, but also more complicated. --- src/backend/access/table/tableam.c | 12 +++- src/backend/utils/adt/dbsize.c | 106 ++++++++++++++++++++++++++++- src/include/access/tableam.h | 16 ++++- 3 files changed, 127 insertions(+), 7 deletions(-) diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 865d30b61af..bc29bffa6fc 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -626,12 +626,18 @@ table_block_parallelscan_nextpage(Relation rel, * is stored, and if it uses them in the expected manner (e.g. 
the actual data * is in the main fork rather than some other), it can use this implementation * of the relation_size callback rather than implementing its own. + * + * Different counting methods is not supported for this function yet. It's expected + * DEFAULT_SIZE in all cases. */ -uint64 -table_block_relation_size(Relation rel, ForkNumber forkNumber) +int64 +table_block_relation_size(Relation rel, ForkNumber forkNumber, uint8 method) { uint64 nblocks = 0; + if (method != DEFAULT_SIZE) + return -1; + /* InvalidForkNumber indicates returning the size for all forks */ if (forkNumber == InvalidForkNumber) { @@ -641,7 +647,7 @@ table_block_relation_size(Relation rel, ForkNumber forkNumber) else nblocks = smgrnblocks(RelationGetSmgr(rel), forkNumber); - return nblocks * BLCKSZ; + return (int64) nblocks * BLCKSZ; } /* diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index b2d9cc27929..e6a29afe6a7 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -30,6 +30,7 @@ #include "utils/relfilenumbermap.h" #include "utils/relmapper.h" #include "utils/syscache.h" +#include "access/tableam.h" /* Divide by two and round away from zero */ #define half_rounded(x) (((x) + ((x) < 0 ? -1 : 1)) / 2) @@ -342,6 +343,61 @@ calculate_relation_size(RelFileLocator *rfn, ProcNumber backend, ForkNumber fork return totalsize; } +/* + * Try to get size using proper relation_size method in table am. 
Fallback to previous method + * if rd_tableam->relation_size doesn't support requested counting method or otherwise refuses + * to count (with negative output) + */ +static +int64 try_tableam_relation_size(Relation rel, ForkNumber forkNum, bool allforks, uint8 method) +{ + int64 size = 0; + + if (rel->rd_rel->relkind == RELKIND_INDEX) + { + /* For index check rd_tableam for parent relation */ + Relation tbl; + + tbl = relation_open(rel->rd_index->indrelid, AccessShareLock); + if(tbl->rd_tableam && tbl->rd_tableam->relation_size) + { + /* We call relation_size method for parent relation but provide index relation as an argument. + * Method for index is always RELATION_SIZE + */ + if (allforks) + { + for (ForkNumber i = 0; i <= MAX_FORKNUM; i++) + { + size += tbl->rd_tableam->relation_size(rel, i, RELATION_SIZE); + } + } + else + size = tbl->rd_tableam->relation_size(rel, forkNum, RELATION_SIZE); + + if (size >= 0) + { + relation_close(tbl, AccessShareLock); + return size; + } + } + relation_close(tbl, AccessShareLock); + } + else if (rel->rd_tableam && rel->rd_tableam->relation_size) + { + if (allforks) + { + for (ForkNumber i = 0; i <= MAX_FORKNUM; i++) + size += rel->rd_tableam->relation_size(rel, i, method); + } + else + size = rel->rd_tableam->relation_size(rel, forkNum, method); + + if (size >= 0) + return size; + } + return -1; +} + Datum pg_relation_size(PG_FUNCTION_ARGS) { @@ -362,6 +418,18 @@ pg_relation_size(PG_FUNCTION_ARGS) if (rel == NULL) PG_RETURN_NULL(); + /* + * Try to get size using proper relation_size method in table am. 
Fallback to previous method + * if rd_tableam->relation_size doesn't support requested counting method or otherwise refuses + * to count (with negative output) + */ + size = try_tableam_relation_size(rel, forkname_to_number(text_to_cstring(forkName)), false, RELATION_SIZE); + if (size >= 0) + { + relation_close(rel, AccessShareLock); + PG_RETURN_INT64(size); + } + size = calculate_relation_size(&(rel->rd_locator), rel->rd_backend, forkname_to_number(text_to_cstring(forkName))); @@ -494,6 +562,18 @@ pg_table_size(PG_FUNCTION_ARGS) if (rel == NULL) PG_RETURN_NULL(); + /* + * Try to get size using proper relation_size method in table am. Fallback to previous method + * if rd_tableam->relation_size doesn't support requested counting method or otherwise refuses + * to count (with negative output) + */ + size = try_tableam_relation_size(rel, InvalidForkNumber, true, TABLE_SIZE); + if (size >= 0) + { + relation_close(rel, AccessShareLock); + PG_RETURN_INT64(size); + } + size = calculate_table_size(rel); relation_close(rel, AccessShareLock); @@ -506,13 +586,25 @@ pg_indexes_size(PG_FUNCTION_ARGS) { Oid relOid = PG_GETARG_OID(0); Relation rel; - int64 size; + int64 size = 0; rel = try_relation_open(relOid, AccessShareLock); if (rel == NULL) PG_RETURN_NULL(); + if (rel->rd_tableam && rel->rd_tableam->relation_size) + { + for (ForkNumber forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) + size += rel->rd_tableam->relation_size(rel, forkNum , INDEXES_SIZE); + + if(size >= 0) + { + relation_close(rel, AccessShareLock); + PG_RETURN_INT64(size); + } + } + size = calculate_indexes_size(rel); relation_close(rel, AccessShareLock); @@ -555,6 +647,18 @@ pg_total_relation_size(PG_FUNCTION_ARGS) if (rel == NULL) PG_RETURN_NULL(); + /* + * Try to get size using proper relation_size method in table am. 
Fallback to previous method + * if rd_tableam->relation_size doesn't support requested counting method or otherwise refuses + * to count (with negative output) + */ + size = try_tableam_relation_size(rel, InvalidForkNumber, true, TOTAL_SIZE); + if (size >= 0) + { + relation_close(rel, AccessShareLock); + PG_RETURN_INT64(size); + } + size = calculate_total_relation_size(rel); relation_close(rel, AccessShareLock); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 09175483dce..b142c098d8c 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -287,6 +287,15 @@ typedef struct TM_IndexDeleteOp #define TABLE_MODIFY_FETCH_OLD_TUPLE 0x0002 #define TABLE_MODIFY_LOCK_UPDATED 0x0004 +/* "method" flag bits for relation_size */ +/* Default behavior implemented for heap AM */ +#define DEFAULT_SIZE (0) +/* Extended behavior that AM can provide */ +#define RELATION_SIZE (1) +#define TABLE_SIZE (2) +#define TOTAL_SIZE (3) +#define TOAST_TABLE_SIZE (4) +#define INDEXES_SIZE (5) /* Typedef for callback function for table_index_build_scan */ typedef void (*IndexBuildCallback) (Relation index, @@ -746,7 +755,7 @@ typedef struct TableAmRoutine * probable that we'll need to revise the interface for those at some * point. */ - uint64 (*relation_size) (Relation rel, ForkNumber forkNumber); + int64 (*relation_size) (Relation rel, ForkNumber forkNumber, uint8 method); /* @@ -1909,7 +1918,8 @@ table_index_validate_scan(Relation table_rel, static inline uint64 table_relation_size(Relation rel, ForkNumber forkNumber) { - return rel->rd_tableam->relation_size(rel, forkNumber); + int64 res = rel->rd_tableam->relation_size(rel, forkNumber, DEFAULT_SIZE); + return res >= 0 ? 
res : 0; } /* @@ -2132,7 +2142,7 @@ extern void table_block_parallelscan_startblock_init(Relation rel, * ---------------------------------------------------------------------------- */ -extern uint64 table_block_relation_size(Relation rel, ForkNumber forkNumber); +extern int64 table_block_relation_size(Relation rel, ForkNumber forkNumber, uint8 method); extern void table_block_relation_estimate_size(Relation rel, int32 *attr_widths, BlockNumber *pages, From 32e8790dd34c546f35190ce0b301b367760a9e10 Mon Sep 17 00:00:00 2001 From: Angus Dippenaar Date: Thu, 19 Dec 2024 00:30:47 +0100 Subject: [PATCH 65/79] fix pg_rewind docs --- doc/src/sgml/ref/pg_rewind.sgml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml index 0c8e7dd2cc3..063364f9702 100644 --- a/doc/src/sgml/ref/pg_rewind.sgml +++ b/doc/src/sgml/ref/pg_rewind.sgml @@ -302,11 +302,19 @@ PostgreSQL documentation This option has no effect when is used. + + + + + - Load shared library that performs custom rewind for postgres extension. The path may be full or relative to PKGLIBDIR. File extension is optional. Multiple extensions can be selected by multiple switches. + Load shared library that performs custom rewind for postgres extension. + The path may be full or + relative to PKGLIBDIR. File extension is optional. Multiple extensions + can be selected by multiple switches. From 1c4146bc4abe645a59f3ebb585c0f9150c6e4986 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Wed, 24 Sep 2025 01:41:41 +0300 Subject: [PATCH 66/79] Fix copying datum in ExecModifyTable() Save the alignment. 
--- src/backend/executor/nodeModifyTable.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index eb5934a9f12..83dc55e7b87 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3953,7 +3953,9 @@ ExecModifyTable(PlanState *pstate) if (isNull) elog(ERROR, "rowid is NULL"); - tupleid = datumCopy(datum, false, -1); + tupleid = PointerGetDatum(PG_DETOAST_DATUM(datum)); + if (tupleid == datum) + tupleid = datumCopy(datum, false, -1); } } From 29f34783810a847d2d76cb476f5cc6b188ea6d7f Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Tue, 30 Sep 2025 14:00:37 +0300 Subject: [PATCH 67/79] Add GetReplayXlogPtrHook allowing extensions to adjust replay LSN Useful when extension performs some async operations in the WAL record redo function. --- src/backend/access/transam/xlogfuncs.c | 2 +- src/backend/access/transam/xlogrecovery.c | 22 ++++++++++++++++++++++ src/backend/replication/walreceiver.c | 2 +- src/backend/replication/walreceiverfuncs.c | 2 +- src/include/access/xlogrecovery.h | 6 ++++++ 5 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 4e46baaebdf..803b3b42722 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -357,7 +357,7 @@ pg_last_wal_replay_lsn(PG_FUNCTION_ARGS) { XLogRecPtr recptr; - recptr = GetXLogReplayRecPtr(NULL); + recptr = GetEffectiveXlogReplayRecPtr(); if (recptr == 0) PG_RETURN_NULL(); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index cfcf4763b15..aa2a31a2d2c 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -4554,6 +4554,28 @@ GetXLogReplayRecPtr(TimeLineID *replayTLI) return recptr; } +GetReplayXlogPtrHookType GetReplayXlogPtrHook = NULL; + +/* + 
* Get effective latest redo apply position. + * + * Can be tuned by extensions processing WAL records asyncronously. + */ +XLogRecPtr +GetEffectiveXlogReplayRecPtr(void) +{ + XLogRecPtr recptr = InvalidXLogRecPtr; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + if (GetReplayXlogPtrHook) + recptr = GetReplayXlogPtrHook(); + if (recptr == InvalidXLogRecPtr) + recptr = XLogRecoveryCtl->lastReplayedEndRecPtr; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return recptr; +} + /* * Get position of last applied, or the record being applied. diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index acda5f68d9a..ef6b6e12145 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -1135,7 +1135,7 @@ XLogWalRcvSendReply(bool force, bool requestReply) /* Construct a new message */ writePtr = LogstreamResult.Write; flushPtr = LogstreamResult.Flush; - applyPtr = GetXLogReplayRecPtr(NULL); + applyPtr = GetEffectiveXlogReplayRecPtr(); resetStringInfo(&reply_message); pq_sendbyte(&reply_message, 'r'); diff --git a/src/backend/replication/walreceiverfuncs.c b/src/backend/replication/walreceiverfuncs.c index 85a19cdfa5c..5199e181d95 100644 --- a/src/backend/replication/walreceiverfuncs.c +++ b/src/backend/replication/walreceiverfuncs.c @@ -372,7 +372,7 @@ GetReplicationApplyDelay(void) receivePtr = walrcv->flushedUpto; SpinLockRelease(&walrcv->mutex); - replayPtr = GetXLogReplayRecPtr(NULL); + replayPtr = GetEffectiveXlogReplayRecPtr(); if (receivePtr == replayPtr) return 0; diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h index c423464e8bc..0fcbca03f02 100644 --- a/src/include/access/xlogrecovery.h +++ b/src/include/access/xlogrecovery.h @@ -48,6 +48,8 @@ typedef enum RecoveryPauseState RECOVERY_PAUSED, /* recovery is paused */ } RecoveryPauseState; +typedef XLogRecPtr (*GetReplayXlogPtrHookType) (void); + /* User-settable GUC parameters */ extern PGDLLIMPORT bool 
recoveryTargetInclusive; extern PGDLLIMPORT int recoveryTargetAction; @@ -76,6 +78,9 @@ extern PGDLLIMPORT bool reachedConsistency; /* Are we currently in standby mode? */ extern PGDLLIMPORT bool StandbyMode; +/* Hook for extensions to tune replay xlog pointer */ +extern PGDLLIMPORT GetReplayXlogPtrHookType GetReplayXlogPtrHook; + extern Size XLogRecoveryShmemSize(void); extern void XLogRecoveryShmemInit(void); @@ -137,6 +142,7 @@ extern void RemovePromoteSignalFiles(void); extern bool HotStandbyActive(void); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); +extern XLogRecPtr GetEffectiveXlogReplayRecPtr(void); extern RecoveryPauseState GetRecoveryPauseState(void); extern void SetRecoveryPause(bool recoveryPause); extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream); From 7698e716b7d3c038c08426b0506ae50f7a040afe Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 5 Oct 2025 22:14:58 +0300 Subject: [PATCH 68/79] Rework handling of running transactions by extensions. * RunningTransactionsExtension structure as a part of xl_running_xacts and RunningTransactionsData to be filled by extensions. * getRunningTransactionsExtension hook to fill RunningTransactionsExtension. * Add nextXid field to CSNSnapshotData. * SnapBuildGetCSNSnaphot() function to modify CSNSnapshotData in the SnapBuild. * waitSnapshotHook hook to wait for transactions inside the snapshot builder. 
--- src/backend/replication/logical/snapbuild.c | 29 ++++++++++++++------- src/backend/storage/ipc/procarray.c | 20 +++++++++++++- src/backend/storage/ipc/standby.c | 2 +- src/backend/utils/time/snapmgr.c | 8 ++---- src/include/replication/snapbuild.h | 8 ++++-- src/include/storage/standby.h | 6 ++++- src/include/storage/standbydefs.h | 13 ++++++++- src/include/utils/snapshot.h | 1 + 8 files changed, 65 insertions(+), 22 deletions(-) diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 3b4d16445b5..f7d8c14d13f 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -1294,8 +1294,9 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact ReorderBufferTXN *txn; TransactionId xmin; - builder->csnSnapshotData.snapshotcsn = running->csn; - builder->csnSnapshotData.xmin = 0; + builder->csnSnapshotData.snapshotcsn = running->extension.csn; + builder->csnSnapshotData.xmin = running->extension.runXmin; + builder->csnSnapshotData.nextXid = running->extension.nextXid; builder->csnSnapshotData.xlogptr = lsn; /* @@ -1325,9 +1326,6 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact * we hit fast paths in heapam_visibility.c. */ builder->xmin = running->oldestRunningXid; - builder->csnSnapshotData.snapshotcsn = running->csn; - builder->csnSnapshotData.xmin = 0; - builder->csnSnapshotData.xlogptr = lsn; /* Remove transactions we don't need to keep track off anymore */ SnapBuildPurgeOlderTxn(builder); @@ -1454,7 +1452,8 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn * NB: We might have already started to incrementally assemble a snapshot, * so we need to be careful to deal with that. 
*/ - if (running->oldestRunningXid == running->nextXid) + if (running->oldestRunningXid == running->nextXid && + running->extension.runXmin == running->extension.nextXid) { if (builder->start_decoding_at == InvalidXLogRecPtr || builder->start_decoding_at <= lsn) @@ -1583,6 +1582,11 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn return true; } +/* + * Hook for custom waits in SnapBuildWaitSnapshot() provided by extensions. + */ +WaitSnapshotHookType waitSnapshotHook = NULL; + /* --- * Iterate through xids in record, wait for all older than the cutoff to * finish. Then, if possible, log a new xl_running_xacts record. @@ -1617,6 +1621,12 @@ SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff) XactLockTableWait(xid, NULL, NULL, XLTW_None); } + /* + * Give extensions chance for their custom waits. + */ + if (waitSnapshotHook) + waitSnapshotHook(&running->extension); + /* * All transactions we needed to finish finished - try to ensure there is * another xl_running_xacts record in a timely manner, without having to @@ -2247,9 +2257,8 @@ SnapBuildSnapshotExists(XLogRecPtr lsn) return ret == 0; } -void -SnapBuildUpdateCSNSnaphot(SnapBuild *builder, - CSNSnapshotData *csnSnapshotData) +CSNSnapshotData * +SnapBuildGetCSNSnaphot(SnapBuild *builder) { - builder->csnSnapshotData = *csnSnapshotData; + return &builder->csnSnapshotData; } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 493f19194cf..e4d8f5336ba 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -2680,6 +2680,11 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) return result; } +/* + * A hook for filling RunningTransactionsExtension structure by extensions. + */ +GetRunningTransactionsExtensionHookType getRunningTransactionsExtension = NULL; + /* * GetRunningTransactionData -- returns information about running transactions. 
* @@ -2872,7 +2877,20 @@ GetRunningTransactionData(void) CurrentRunningXacts->oldestRunningXid = oldestRunningXid; CurrentRunningXacts->oldestDatabaseRunningXid = oldestDatabaseRunningXid; CurrentRunningXacts->latestCompletedXid = latestCompletedXid; - CurrentRunningXacts->csn = pg_atomic_read_u64(&TransamVariables->nextCommitSeqNo); + + /* + * Give extensions chance to fill their structs. + */ + if (getRunningTransactionsExtension) + { + getRunningTransactionsExtension(&CurrentRunningXacts->extension); + } + else + { + CurrentRunningXacts->extension.csn = 0; + CurrentRunningXacts->extension.nextXid = 0; + CurrentRunningXacts->extension.runXmin = 0; + } Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid)); Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid)); diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 17ddeb893c6..cf51f92d93c 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -1353,7 +1353,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) xlrec.nextXid = CurrRunningXacts->nextXid; xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid; xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; - xlrec.csn = CurrRunningXacts->csn; + xlrec.extension = CurrRunningXacts->extension; /* Header */ XLogBeginInsert(); diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index b15ec90f637..c8de31657b5 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -1763,9 +1763,7 @@ SerializeSnapshot(Snapshot snapshot, char *start_address) serialized_snapshot.curcid = snapshot->curcid; serialized_snapshot.whenTaken = snapshot->whenTaken; serialized_snapshot.lsn = snapshot->lsn; - serialized_snapshot.csnSnapshotData.xmin = snapshot->csnSnapshotData.xmin; - serialized_snapshot.csnSnapshotData.snapshotcsn = snapshot->csnSnapshotData.snapshotcsn; - serialized_snapshot.csnSnapshotData.xlogptr = 
snapshot->csnSnapshotData.xlogptr; + serialized_snapshot.csnSnapshotData = snapshot->csnSnapshotData; serialized_snapshot.undoRegularRowLocation = snapshot->undoRegularRowLocationPhNode.undoLocation; serialized_snapshot.undoRegularPageLocation = snapshot->undoRegularPageLocationPhNode.undoLocation; serialized_snapshot.undoSystemLocation = snapshot->undoSystemLocationPhNode.undoLocation; @@ -1844,9 +1842,7 @@ RestoreSnapshot(char *start_address) snapshot->whenTaken = serialized_snapshot.whenTaken; snapshot->lsn = serialized_snapshot.lsn; snapshot->snapXactCompletionCount = 0; - snapshot->csnSnapshotData.xmin = serialized_snapshot.csnSnapshotData.xmin; - snapshot->csnSnapshotData.snapshotcsn = serialized_snapshot.csnSnapshotData.snapshotcsn; - snapshot->csnSnapshotData.xlogptr = serialized_snapshot.csnSnapshotData.xlogptr; + snapshot->csnSnapshotData = serialized_snapshot.csnSnapshotData; snapshot->undoRegularRowLocationPhNode.undoLocation = serialized_snapshot.undoRegularRowLocation; snapshot->undoRegularPageLocationPhNode.undoLocation = serialized_snapshot.undoRegularPageLocation; snapshot->undoSystemLocationPhNode.undoLocation = serialized_snapshot.undoSystemLocation; diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h index 4a74c89c358..d38e340db92 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -13,6 +13,7 @@ #define SNAPBUILD_H #include "access/xlogdefs.h" +#include "storage/standbydefs.h" #include "utils/snapmgr.h" typedef enum @@ -57,6 +58,10 @@ struct ReorderBuffer; struct xl_heap_new_cid; struct xl_running_xacts; +typedef void (*WaitSnapshotHookType) (RunningTransactionsExtension *extension); + +extern PGDLLIMPORT WaitSnapshotHookType waitSnapshotHook; + extern void CheckPointSnapBuild(void); extern SnapBuild *AllocateSnapshotBuilder(struct ReorderBuffer *reorder, @@ -92,8 +97,7 @@ extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, extern void 
SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, struct xl_running_xacts *running); extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn); -extern void SnapBuildUpdateCSNSnaphot(SnapBuild *builder, - CSNSnapshotData *csnSnapshotData); +extern CSNSnapshotData *SnapBuildGetCSNSnaphot(SnapBuild *builder); extern bool SnapBuildSnapshotExists(XLogRecPtr lsn); diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index 4a42f9a767b..458f685ba52 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -93,13 +93,17 @@ typedef struct RunningTransactionsData TransactionId oldestDatabaseRunningXid; /* same as above, but within the * current database */ TransactionId latestCompletedXid; /* so we can set xmax */ - CommitSeqNo csn; /* current csn */ + RunningTransactionsExtension extension; TransactionId *xids; /* array of (sub)xids still running */ } RunningTransactionsData; typedef RunningTransactionsData *RunningTransactions; +typedef void (*GetRunningTransactionsExtensionHookType) (RunningTransactionsExtension *extension); + +extern PGDLLIMPORT GetRunningTransactionsExtensionHookType getRunningTransactionsExtension; + extern void LogAccessExclusiveLock(Oid dbOid, Oid relOid); extern void LogAccessExclusiveLockPrepare(void); diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h index 394bc42052f..9149e77de63 100644 --- a/src/include/storage/standbydefs.h +++ b/src/include/storage/standbydefs.h @@ -41,6 +41,17 @@ typedef struct xl_standby_locks xl_standby_lock locks[FLEXIBLE_ARRAY_MEMBER]; } xl_standby_locks; +/* + * A part of xl_running_xacts and RunningTransactionsData to be filled by + * extensions. + */ +typedef struct +{ + uint64 nextXid; + uint64 runXmin; + CommitSeqNo csn; /* current csn */ +} RunningTransactionsExtension; + /* * When we write running xact data to WAL, we use this structure. 
*/ @@ -52,7 +63,7 @@ typedef struct xl_running_xacts TransactionId nextXid; /* xid from TransamVariables->nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ TransactionId latestCompletedXid; /* so we can set xmax */ - CommitSeqNo csn; /* current csn */ + RunningTransactionsExtension extension; TransactionId xids[FLEXIBLE_ARRAY_MEMBER]; } xl_running_xacts; diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 68464b66127..27bf09654ee 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -131,6 +131,7 @@ typedef struct typedef struct CSNSnapshotData { uint64 xmin; + uint64 nextXid; CommitSeqNo snapshotcsn; XLogRecPtr xlogptr; } CSNSnapshotData; From e42c6aa695cc1e02e9d82c01bb09dfd07aa7d164 Mon Sep 17 00:00:00 2001 From: Egor Ivkov Date: Mon, 6 Oct 2025 18:17:04 +0300 Subject: [PATCH 69/79] TableAM: add relation_reindex --- src/backend/access/heap/heapam_handler.c | 7 +++++ src/backend/catalog/index.c | 34 +++++++++++------------- src/backend/commands/cluster.c | 17 +++++++++--- src/backend/commands/indexcmds.c | 26 ++++++++++++++++-- src/backend/commands/tablecmds.c | 7 ++--- src/include/access/tableam.h | 11 ++++++++ src/include/catalog/index.h | 2 +- 7 files changed, 74 insertions(+), 30 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 354f1f225c3..0f4bbb556a1 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -979,6 +979,12 @@ heapam_relation_nontransactional_truncate(Relation rel) RelationTruncate(rel, 0); } +static bool +heapam_relation_reindex(Relation rel, const ReindexStmt *stmt, int flags, const ReindexParams *params) +{ + return reindex_relation(stmt, rel, flags, params); +} + static void heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) { @@ -3002,6 +3008,7 @@ static const TableAmRoutine heapam_methods = { .relation_set_new_filelocator = 
heapam_relation_set_new_filelocator, .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate, + .relation_reindex = heapam_relation_reindex, .relation_copy_data = heapam_relation_copy_data, .relation_copy_for_cluster = heapam_relation_copy_for_cluster, .relation_vacuum = heap_vacuum_rel, diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index b447a080da5..f5c9297d368 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -3878,10 +3878,9 @@ reindex_index(const ReindexStmt *stmt, Oid indexId, * index rebuild. */ bool -reindex_relation(const ReindexStmt *stmt, Oid relid, int flags, +reindex_relation(const ReindexStmt *stmt, Relation rel, int flags, const ReindexParams *params) { - Relation rel; Oid toast_relid; List *indexIds; char persistence; @@ -3889,15 +3888,6 @@ reindex_relation(const ReindexStmt *stmt, Oid relid, int flags, ListCell *indexId; int i; - /* - * Open and lock the relation. ShareLock is sufficient since we only need - * to prevent schema and data changes in it. The lock level used here - * should match ReindexTable(). - */ - if ((params->options & REINDEXOPT_MISSING_OK) != 0) - rel = try_table_open(relid, ShareLock); - else - rel = table_open(relid, ShareLock); /* if relation is gone, leave */ if (!rel) @@ -3955,10 +3945,21 @@ reindex_relation(const ReindexStmt *stmt, Oid relid, int flags, * This rule is enforced by setting tablespaceOid to InvalidOid. */ ReindexParams newparams = *params; - + /* + * Open and lock the relation. ShareLock is sufficient since we only need + * to prevent schema and data changes in it. The lock level used here + * should match ReindexTable(). 
+ */ + Relation toast_rel = table_open(toast_relid, ShareLock); + newparams.options &= ~(REINDEXOPT_MISSING_OK); newparams.tablespaceOid = InvalidOid; - result |= reindex_relation(stmt, toast_relid, flags, &newparams); + result |= reindex_relation(stmt, toast_rel, flags, &newparams); + + /* + * Close rel, but continue to hold the lock. + */ + table_close(toast_rel, NoLock); } /* @@ -4016,12 +4017,7 @@ reindex_relation(const ReindexStmt *stmt, Oid relid, int flags, i); i++; } - - /* - * Close rel, but continue to hold the lock. - */ - table_close(rel, NoLock); - + result |= (indexIds != NIL); return result; diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 78f96789b0e..ec94ea612cc 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -1449,6 +1449,7 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, int reindex_flags; ReindexParams reindex_params = {0}; int i; + Relation oldHeap; /* Report that we are now swapping relation files */ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, @@ -1504,9 +1505,19 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, /* Report that we are now reindexing relations */ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, PROGRESS_CLUSTER_PHASE_REBUILD_INDEX); - - reindex_relation(NULL, OIDOldHeap, reindex_flags, &reindex_params); - + + /* + * Open and lock the relation. ShareLock is sufficient since we only need + * to prevent schema and data changes in it. The lock level used here + * should match ReindexTable(). + */ + oldHeap = table_open(OIDOldHeap, ShareLock); + table_relation_reindex(oldHeap, NULL, reindex_flags, &reindex_params); + /* + * Close rel, but continue to hold the lock. 
+ */ + table_close(oldHeap, NoLock); + /* Report that we are now doing clean up */ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 0782ba10a66..b5fa81de709 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -2950,12 +2950,23 @@ ReindexTable(const ReindexStmt *stmt, const ReindexParams *params, bool isTopLev else { ReindexParams newparams = *params; + Relation rel; newparams.options |= REINDEXOPT_REPORT_PROGRESS; - result = reindex_relation(stmt, heapOid, + + if ((newparams.options & REINDEXOPT_MISSING_OK) != 0) + rel = try_table_open(heapOid, ShareLock); + else + rel = table_open(heapOid, ShareLock); + result = table_relation_reindex(rel, stmt, REINDEX_REL_PROCESS_TOAST | REINDEX_REL_CHECK_CONSTRAINTS, &newparams); + /* + * Close rel, but continue to hold the lock. + */ + table_close(rel, NoLock); + if (!result) ereport(NOTICE, (errmsg("table \"%s\" has no indexes to reindex", @@ -3386,13 +3397,24 @@ ReindexMultipleInternal(const ReindexStmt *stmt, const List *relids, const Reind { bool result; ReindexParams newparams = *params; + Relation rel; newparams.options |= REINDEXOPT_REPORT_PROGRESS | REINDEXOPT_MISSING_OK; - result = reindex_relation(stmt, relid, + /* + * Open and lock the relation. ShareLock is sufficient since we only need + * to prevent schema and data changes in it. The lock level used here + * should match ReindexTable(). + */ + rel = try_table_open(relid, ShareLock); + result = table_relation_reindex(rel, stmt, REINDEX_REL_PROCESS_TOAST | REINDEX_REL_CHECK_CONSTRAINTS, &newparams); + /* + * Close rel, but continue to hold the lock. 
+ */ + table_close(rel, NoLock); if (result && (params->options & REINDEXOPT_VERBOSE) != 0) ereport(INFO, diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index eb68bd8b552..f2925c5b2a1 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -2145,7 +2145,6 @@ ExecuteTruncateGuts(List *explicit_rels, } else { - Oid heap_relid; Oid toast_relid; ReindexParams reindex_params = {0}; @@ -2166,8 +2165,6 @@ ExecuteTruncateGuts(List *explicit_rels, */ RelationSetNewRelfilenumber(rel, rel->rd_rel->relpersistence); - heap_relid = RelationGetRelid(rel); - /* * The same for the toast table, if any. */ @@ -2185,8 +2182,8 @@ ExecuteTruncateGuts(List *explicit_rels, /* * Reconstruct the indexes to match, and we're done. */ - reindex_relation(NULL, heap_relid, REINDEX_REL_PROCESS_TOAST, - &reindex_params); + table_relation_reindex(rel, NULL, REINDEX_REL_PROCESS_TOAST, + &reindex_params); } pgstat_count_truncate(rel); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index b142c098d8c..085d16b7257 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -28,6 +28,7 @@ #include "utils/guc.h" #include "utils/rel.h" #include "utils/snapshot.h" +#include "catalog/index.h" #define DEFAULT_TABLE_ACCESS_METHOD "heap" @@ -647,6 +648,9 @@ typedef struct TableAmRoutine */ void (*relation_nontransactional_truncate) (Relation rel); + /* See reindex_relation for reference about parameters */ + bool (*relation_reindex) (Relation rel, const ReindexStmt *stmt, int flags, const ReindexParams *params); + /* * See table_relation_copy_data(). 
* @@ -1692,6 +1696,13 @@ table_relation_nontransactional_truncate(Relation rel) rel->rd_tableam->relation_nontransactional_truncate(rel); } +/* See reindex_relation for reference about parameters */ +static inline bool +table_relation_reindex(Relation rel, const ReindexStmt *stmt, int flags, const ReindexParams *params) +{ + return rel->rd_tableam->relation_reindex(rel, stmt, flags, params); +} + /* * Copy data from `rel` into the new relfilelocator `newrlocator`. The new * relfilelocator may not have storage associated before this function is diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 0beab397c79..44cf86aa515 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -161,7 +161,7 @@ extern void reindex_index(const ReindexStmt *stmt, Oid indexId, #define REINDEX_REL_FORCE_INDEXES_UNLOGGED 0x08 #define REINDEX_REL_FORCE_INDEXES_PERMANENT 0x10 -extern bool reindex_relation(const ReindexStmt *stmt, Oid relid, int flags, +extern bool reindex_relation(const ReindexStmt *stmt, Relation rel, int flags, const ReindexParams *params); extern bool ReindexIsProcessingHeap(Oid heapOid); From 1b6365bc8eeba0fa155fd39604a633207137fe22 Mon Sep 17 00:00:00 2001 From: Egor Ivkov Date: Fri, 10 Oct 2025 20:11:29 +0300 Subject: [PATCH 70/79] IndexAM: Add amreuse --- contrib/bloom/blutils.c | 1 + src/backend/access/brin/brin.c | 1 + src/backend/access/gin/ginutil.c | 1 + src/backend/access/gist/gist.c | 1 + src/backend/access/hash/hash.c | 1 + src/backend/access/nbtree/nbtree.c | 1 + src/backend/access/spgist/spgutils.c | 1 + src/backend/commands/tablecmds.c | 42 +++++++++++++++++++ src/include/access/amapi.h | 4 ++ .../modules/dummy_index_am/dummy_index_am.c | 1 + src/tools/pgindent/typedefs.list | 1 + 11 files changed, 55 insertions(+) diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index 9b72303c895..b7024747d08 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -130,6 +130,7 @@ 
blhandler(PG_FUNCTION_ARGS) amroutine->amkeytype = InvalidOid; amroutine->ambuild = blbuild; + amroutine->amreuse = NULL; amroutine->ambuildempty = blbuildempty; amroutine->aminsert = NULL; amroutine->aminsertextended = blinsert; diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 1e264145051..b2593a99223 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -272,6 +272,7 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->amkeytype = InvalidOid; amroutine->ambuild = brinbuild; + amroutine->amreuse = NULL; amroutine->ambuildempty = brinbuildempty; amroutine->aminsert = NULL; amroutine->aminsertextended = brininsert; diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 68ce032f150..1d138051593 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -62,6 +62,7 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->amkeytype = InvalidOid; amroutine->ambuild = ginbuild; + amroutine->amreuse = NULL; amroutine->ambuildempty = ginbuildempty; amroutine->aminsert = NULL; amroutine->aminsertextended = gininsert; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 0117b62831e..9b3fa4f98b6 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -84,6 +84,7 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->amkeytype = InvalidOid; amroutine->ambuild = gistbuild; + amroutine->amreuse = NULL; amroutine->ambuildempty = gistbuildempty; amroutine->aminsert = NULL; amroutine->aminsertextended = gistinsert; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 557c7a3f316..fb2cc76f547 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -82,6 +82,7 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->amkeytype = INT4OID; amroutine->ambuild = hashbuild; + amroutine->amreuse = NULL; amroutine->ambuildempty = hashbuildempty; amroutine->aminsert = NULL; amroutine->aminsertextended 
= hashinsert; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index b661adb689e..d978fd2d542 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -126,6 +126,7 @@ bthandler(PG_FUNCTION_ARGS) amroutine->amkeytype = InvalidOid; amroutine->ambuild = btbuild; + amroutine->amreuse = NULL; amroutine->ambuildempty = btbuildempty; amroutine->aminsert = NULL; amroutine->aminsertextended = btinsert; diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index c1228ed2c01..094d8d7c200 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -69,6 +69,7 @@ spghandler(PG_FUNCTION_ARGS) amroutine->amkeytype = InvalidOid; amroutine->ambuild = spgbuild; + amroutine->amreuse = NULL; amroutine->ambuildempty = spgbuildempty; amroutine->aminsert = NULL; amroutine->aminsertextended = spginsert; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index f2925c5b2a1..8263dea7d6f 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -94,6 +94,7 @@ #include "tcop/utility.h" #include "utils/acl.h" #include "utils/builtins.h" +#include "utils/elog.h" #include "utils/fmgroids.h" #include "utils/inval.h" #include "utils/lsyscache.h" @@ -14308,9 +14309,50 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt) /* If it's a partitioned index, there is no storage to share. 
*/ if (irel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) { + HeapTuple tuple; + Form_pg_am accessMethodForm; + IndexAmRoutine *amRoutine; + char *accessMethodName; + Oid heapRelId = IndexGetRelation(oldId, false); + Relation heapRel = table_open(heapRelId, ShareLock); + stmt->oldNumber = irel->rd_locator.relNumber; stmt->oldCreateSubid = irel->rd_createSubid; stmt->oldFirstRelfilelocatorSubid = irel->rd_firstRelfilelocatorSubid; + + + /* + * look up the access method to call amreuse + */ + accessMethodName = stmt->accessMethod; + tuple = SearchSysCache1(AMNAME, PointerGetDatum(accessMethodName)); + if (!HeapTupleIsValid(tuple)) + { + /* + * Hack to provide more-or-less-transparent updating of old RTREE + * indexes to GiST: if RTREE is requested and not found, use GIST. + */ + if (strcmp(accessMethodName, "rtree") == 0) + { + ereport(NOTICE, + (errmsg("substituting access method \"gist\" for obsolete method \"rtree\""))); + accessMethodName = "gist"; + tuple = SearchSysCache1(AMNAME, PointerGetDatum(accessMethodName)); + } + + if (!HeapTupleIsValid(tuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("access method \"%s\" does not exist", + accessMethodName))); + } + accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); + amRoutine = GetIndexAmRoutineWithTableAM(heapRel->rd_rel->relam, accessMethodForm->amhandler); + table_close(heapRel, NoLock); + + if(amRoutine->amreuse) { + (*amRoutine->amreuse)(irel); + } } index_close(irel, NoLock); } diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 1addc03701d..c8cf2b2b2bd 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -104,6 +104,9 @@ typedef IndexBuildResult *(*ambuild_function) (Relation heapRelation, Relation indexRelation, struct IndexInfo *indexInfo); +/* reuse current index - don't drop it */ +typedef void (*amreuse_function) (Relation indexRelation); + /* build empty index */ typedef void (*ambuildempty_function) (Relation indexRelation); @@ -301,6 
+304,7 @@ typedef struct IndexAmRoutine /* interface functions */ ambuild_function ambuild; + amreuse_function amreuse; ambuildempty_function ambuildempty; aminsert_function aminsert; aminsert_extended_function aminsertextended; diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index 1c6825f391a..0924f13dc3b 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -302,6 +302,7 @@ dihandler(PG_FUNCTION_ARGS) amroutine->amkeytype = InvalidOid; amroutine->ambuild = dibuild; + amroutine->amreuse = NULL; amroutine->ambuildempty = dibuildempty; amroutine->aminsert = NULL; amroutine->aminsertextended = diinsert; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 5ddc2508508..5760d479504 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3258,6 +3258,7 @@ allocfunc amadjustmembers_function ambeginscan_function ambuild_function +amreuse_function ambuildempty_function ambuildphasename_function ambulkdelete_function From fd5baeb1b4af0affa698d480f46380aecae37ad1 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Wed, 22 Oct 2025 18:23:43 +0200 Subject: [PATCH 71/79] Revert "Merge pull request #25 from e-ivkov/truncate-reindex" This reverts commit d7ccd3d4c403626ed16409ed7e5eea9f71bfde64, reversing changes made to 7698e716b7d3c038c08426b0506ae50f7a040afe. 
--- src/backend/access/heap/heapam_handler.c | 7 ----- src/backend/catalog/index.c | 34 +++++++++++++----------- src/backend/commands/cluster.c | 17 +++--------- src/backend/commands/indexcmds.c | 26 ++---------------- src/backend/commands/tablecmds.c | 7 +++-- src/include/access/tableam.h | 11 -------- src/include/catalog/index.h | 2 +- 7 files changed, 30 insertions(+), 74 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 0f4bbb556a1..354f1f225c3 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -979,12 +979,6 @@ heapam_relation_nontransactional_truncate(Relation rel) RelationTruncate(rel, 0); } -static bool -heapam_relation_reindex(Relation rel, const ReindexStmt *stmt, int flags, const ReindexParams *params) -{ - return reindex_relation(stmt, rel, flags, params); -} - static void heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) { @@ -3008,7 +3002,6 @@ static const TableAmRoutine heapam_methods = { .relation_set_new_filelocator = heapam_relation_set_new_filelocator, .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate, - .relation_reindex = heapam_relation_reindex, .relation_copy_data = heapam_relation_copy_data, .relation_copy_for_cluster = heapam_relation_copy_for_cluster, .relation_vacuum = heap_vacuum_rel, diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index f5c9297d368..b447a080da5 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -3878,9 +3878,10 @@ reindex_index(const ReindexStmt *stmt, Oid indexId, * index rebuild. 
*/ bool -reindex_relation(const ReindexStmt *stmt, Relation rel, int flags, +reindex_relation(const ReindexStmt *stmt, Oid relid, int flags, const ReindexParams *params) { + Relation rel; Oid toast_relid; List *indexIds; char persistence; @@ -3888,6 +3889,15 @@ reindex_relation(const ReindexStmt *stmt, Relation rel, int flags, ListCell *indexId; int i; + /* + * Open and lock the relation. ShareLock is sufficient since we only need + * to prevent schema and data changes in it. The lock level used here + * should match ReindexTable(). + */ + if ((params->options & REINDEXOPT_MISSING_OK) != 0) + rel = try_table_open(relid, ShareLock); + else + rel = table_open(relid, ShareLock); /* if relation is gone, leave */ if (!rel) @@ -3945,21 +3955,10 @@ reindex_relation(const ReindexStmt *stmt, Relation rel, int flags, * This rule is enforced by setting tablespaceOid to InvalidOid. */ ReindexParams newparams = *params; - /* - * Open and lock the relation. ShareLock is sufficient since we only need - * to prevent schema and data changes in it. The lock level used here - * should match ReindexTable(). - */ - Relation toast_rel = table_open(toast_relid, ShareLock); - + newparams.options &= ~(REINDEXOPT_MISSING_OK); newparams.tablespaceOid = InvalidOid; - result |= reindex_relation(stmt, toast_rel, flags, &newparams); - - /* - * Close rel, but continue to hold the lock. - */ - table_close(toast_rel, NoLock); + result |= reindex_relation(stmt, toast_relid, flags, &newparams); } /* @@ -4017,7 +4016,12 @@ reindex_relation(const ReindexStmt *stmt, Relation rel, int flags, i); i++; } - + + /* + * Close rel, but continue to hold the lock. 
+ */ + table_close(rel, NoLock); + result |= (indexIds != NIL); return result; diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index ec94ea612cc..78f96789b0e 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -1449,7 +1449,6 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, int reindex_flags; ReindexParams reindex_params = {0}; int i; - Relation oldHeap; /* Report that we are now swapping relation files */ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, @@ -1505,19 +1504,9 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, /* Report that we are now reindexing relations */ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, PROGRESS_CLUSTER_PHASE_REBUILD_INDEX); - - /* - * Open and lock the relation. ShareLock is sufficient since we only need - * to prevent schema and data changes in it. The lock level used here - * should match ReindexTable(). - */ - oldHeap = table_open(OIDOldHeap, ShareLock); - table_relation_reindex(oldHeap, NULL, reindex_flags, &reindex_params); - /* - * Close rel, but continue to hold the lock. 
- */ - table_close(oldHeap, NoLock); - + + reindex_relation(NULL, OIDOldHeap, reindex_flags, &reindex_params); + /* Report that we are now doing clean up */ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index b5fa81de709..0782ba10a66 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -2950,23 +2950,12 @@ ReindexTable(const ReindexStmt *stmt, const ReindexParams *params, bool isTopLev else { ReindexParams newparams = *params; - Relation rel; newparams.options |= REINDEXOPT_REPORT_PROGRESS; - - if ((newparams.options & REINDEXOPT_MISSING_OK) != 0) - rel = try_table_open(heapOid, ShareLock); - else - rel = table_open(heapOid, ShareLock); - result = table_relation_reindex(rel, stmt, + result = reindex_relation(stmt, heapOid, REINDEX_REL_PROCESS_TOAST | REINDEX_REL_CHECK_CONSTRAINTS, &newparams); - /* - * Close rel, but continue to hold the lock. - */ - table_close(rel, NoLock); - if (!result) ereport(NOTICE, (errmsg("table \"%s\" has no indexes to reindex", @@ -3397,24 +3386,13 @@ ReindexMultipleInternal(const ReindexStmt *stmt, const List *relids, const Reind { bool result; ReindexParams newparams = *params; - Relation rel; newparams.options |= REINDEXOPT_REPORT_PROGRESS | REINDEXOPT_MISSING_OK; - /* - * Open and lock the relation. ShareLock is sufficient since we only need - * to prevent schema and data changes in it. The lock level used here - * should match ReindexTable(). - */ - rel = try_table_open(relid, ShareLock); - result = table_relation_reindex(rel, stmt, + result = reindex_relation(stmt, relid, REINDEX_REL_PROCESS_TOAST | REINDEX_REL_CHECK_CONSTRAINTS, &newparams); - /* - * Close rel, but continue to hold the lock. 
- */ - table_close(rel, NoLock); if (result && (params->options & REINDEXOPT_VERBOSE) != 0) ereport(INFO, diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 8263dea7d6f..12c51389e0d 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -2146,6 +2146,7 @@ ExecuteTruncateGuts(List *explicit_rels, } else { + Oid heap_relid; Oid toast_relid; ReindexParams reindex_params = {0}; @@ -2166,6 +2167,8 @@ ExecuteTruncateGuts(List *explicit_rels, */ RelationSetNewRelfilenumber(rel, rel->rd_rel->relpersistence); + heap_relid = RelationGetRelid(rel); + /* * The same for the toast table, if any. */ @@ -2183,8 +2186,8 @@ ExecuteTruncateGuts(List *explicit_rels, /* * Reconstruct the indexes to match, and we're done. */ - table_relation_reindex(rel, NULL, REINDEX_REL_PROCESS_TOAST, - &reindex_params); + reindex_relation(NULL, heap_relid, REINDEX_REL_PROCESS_TOAST, + &reindex_params); } pgstat_count_truncate(rel); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 085d16b7257..b142c098d8c 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -28,7 +28,6 @@ #include "utils/guc.h" #include "utils/rel.h" #include "utils/snapshot.h" -#include "catalog/index.h" #define DEFAULT_TABLE_ACCESS_METHOD "heap" @@ -648,9 +647,6 @@ typedef struct TableAmRoutine */ void (*relation_nontransactional_truncate) (Relation rel); - /* See reindex_relation for reference about parameters */ - bool (*relation_reindex) (Relation rel, const ReindexStmt *stmt, int flags, const ReindexParams *params); - /* * See table_relation_copy_data(). 
* @@ -1696,13 +1692,6 @@ table_relation_nontransactional_truncate(Relation rel) rel->rd_tableam->relation_nontransactional_truncate(rel); } -/* See reindex_relation for reference about parameters */ -static inline bool -table_relation_reindex(Relation rel, const ReindexStmt *stmt, int flags, const ReindexParams *params) -{ - return rel->rd_tableam->relation_reindex(rel, stmt, flags, params); -} - /* * Copy data from `rel` into the new relfilelocator `newrlocator`. The new * relfilelocator may not have storage associated before this function is diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 44cf86aa515..0beab397c79 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -161,7 +161,7 @@ extern void reindex_index(const ReindexStmt *stmt, Oid indexId, #define REINDEX_REL_FORCE_INDEXES_UNLOGGED 0x08 #define REINDEX_REL_FORCE_INDEXES_PERMANENT 0x10 -extern bool reindex_relation(const ReindexStmt *stmt, Relation rel, int flags, +extern bool reindex_relation(const ReindexStmt *stmt, Oid relid, int flags, const ReindexParams *params); extern bool ReindexIsProcessingHeap(Oid heapOid); From e949bf36a5e74706bbcc49b69d1b3fec9802e4b3 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sat, 25 Oct 2025 19:50:06 +0300 Subject: [PATCH 72/79] Correctly extract ORIOLEDB_PATCHSET_VERSION from git * Use patchset number if there is an exact match to the tag * Use commit hash otherwise --- configure | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configure b/configure index ae1e42f3046..ea1a86e2833 100755 --- a/configure +++ b/configure @@ -19378,7 +19378,7 @@ _ACEOF # Needed to check postgresql patches git tag during orioledb extension build -ORIOLEDB_PATCHSET_VERSION=`git describe --tags | cut -d'_' -f2` +ORIOLEDB_PATCHSET_VERSION=`tag=$(git describe --tags --exact-match 2>/dev/null) && echo "$tag" | cut -d'_' -f2 || git rev-parse HEAD` # If we are inserting PG_SYSROOT into CPPFLAGS, do so
symbolically not diff --git a/configure.ac b/configure.ac index 74149784c30..a8a7a90d827 100644 --- a/configure.ac +++ b/configure.ac @@ -2461,7 +2461,7 @@ AC_DEFINE_UNQUOTED(PG_VERSION_NUM, $PG_VERSION_NUM, [PostgreSQL version as a num AC_SUBST(PG_VERSION_NUM) # Needed to check postgresql patches git tag during orioledb extension build -[ORIOLEDB_PATCHSET_VERSION=`git describe --tags | cut -d'_' -f2`] +[ORIOLEDB_PATCHSET_VERSION=`tag=$(git describe --tags --exact-match 2>/dev/null) && echo "$tag" | cut -d'_' -f2 || git rev-parse HEAD`] AC_SUBST(ORIOLEDB_PATCHSET_VERSION) # If we are inserting PG_SYSROOT into CPPFLAGS, do so symbolically not From abfba239d6057acd271261c4e630c7f1cc69a15a Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sat, 25 Oct 2025 21:04:34 +0300 Subject: [PATCH 73/79] Allow skipping transactions for ReorderBufferAbortOld() Transactions need to be marked as RBTXN_DISTR_SKIP_CLEANUP to be skipped in ReorderBufferAbortOld(). --- src/backend/replication/logical/reorderbuffer.c | 3 ++- src/include/replication/reorderbuffer.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 4bd1f7af061..ee103bf5f34 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -3028,7 +3028,8 @@ ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid) txn = dlist_container(ReorderBufferTXN, node, it.cur); - if (TransactionIdPrecedes(txn->xid, oldestRunningXid)) + if (!(txn->txn_flags & RBTXN_DISTR_SKIP_CLEANUP) && + TransactionIdPrecedes(txn->xid, oldestRunningXid)) { elog(DEBUG2, "aborting old transaction %u", txn->xid); diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h index 0cfa9005141..05fa54da999 100644 --- a/src/include/replication/reorderbuffer.h +++ b/src/include/replication/reorderbuffer.h @@ -169,6 +169,7 @@ typedef 
struct ReorderBufferChange #define RBTXN_SKIPPED_PREPARE 0x0080 #define RBTXN_HAS_STREAMABLE_CHANGE 0x0100 #define RBTXN_DISTR_INVAL_OVERFLOWED 0x0200 +#define RBTXN_DISTR_SKIP_CLEANUP 0x0400 /* Does the transaction have catalog changes? */ #define rbtxn_has_catalog_changes(txn) \ From 00aff1ddc299330310c57ddc087e83b1f819e191 Mon Sep 17 00:00:00 2001 From: Egor Ivkov Date: Mon, 20 Oct 2025 17:47:53 +0300 Subject: [PATCH 74/79] Enable running subscription tests with oriole --- patches/subscription_enable_oriole.diff | 20 ++++++++++++++++++++ src/test/subscription/Makefile | 22 ++++++++++++++++++++++ src/test/subscription/oriole_tests.txt | 12 ++++++++++++ src/test/subscription/orioledb.conf | 2 ++ 4 files changed, 56 insertions(+) create mode 100644 patches/subscription_enable_oriole.diff create mode 100644 src/test/subscription/oriole_tests.txt create mode 100644 src/test/subscription/orioledb.conf diff --git a/patches/subscription_enable_oriole.diff b/patches/subscription_enable_oriole.diff new file mode 100644 index 00000000000..fa020b8f2f2 --- /dev/null +++ b/patches/subscription_enable_oriole.diff @@ -0,0 +1,20 @@ +diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm +index b95ba721f71..ea04f1c0b1b 100644 +--- a/src/test/perl/PostgreSQL/Test/Cluster.pm ++++ b/src/test/perl/PostgreSQL/Test/Cluster.pm +@@ -1060,6 +1060,15 @@ sub start + } + + $self->_update_pid(1); ++ ++ if ((! -e $self->data_dir . '/standby.signal') && (! -e $self->data_dir . 
'/recovery.signal')) ++ { ++ $self->safe_psql( ++ 'template1', qq{CREATE EXTENSION IF NOT EXISTS orioledb;}); ++ $self->safe_psql( ++ 'postgres', qq{CREATE EXTENSION IF NOT EXISTS orioledb;}); ++ } ++ + return 1; + } + diff --git a/src/test/subscription/Makefile b/src/test/subscription/Makefile index ce1ca430095..e4696a7cbd8 100644 --- a/src/test/subscription/Makefile +++ b/src/test/subscription/Makefile @@ -23,5 +23,27 @@ check: installcheck: $(prove_installcheck) +# Run OrioleDB-specific subscription tests +# Tests to run are listed in oriole_tests.txt +installcheck-oriole: + @if [ ! -f oriole_tests.txt ]; then \ + echo "Error: oriole_tests.txt not found"; \ + echo "Create oriole_tests.txt with a list of test files to run"; \ + exit 1; \ + fi + @if [ ! -f orioledb.conf ]; then \ + echo "Error: orioledb.conf not found"; \ + exit 1; \ + fi + @ORIOLE_TESTS=$$(grep -v '^#' oriole_tests.txt | grep -v '^$$' | tr '\n' ' '); \ + if [ -z "$$ORIOLE_TESTS" ]; then \ + echo "No tests found in oriole_tests.txt"; \ + exit 1; \ + fi; \ + $(MAKE) installcheck \ + TEMP_CONFIG="$(CURDIR)/orioledb.conf" \ + PG_TEST_INITDB_EXTRA_OPTS="--locale=C" \ + PROVE_TESTS="$$ORIOLE_TESTS" + clean distclean: rm -rf tmp_check diff --git a/src/test/subscription/oriole_tests.txt b/src/test/subscription/oriole_tests.txt new file mode 100644 index 00000000000..2d31289033c --- /dev/null +++ b/src/test/subscription/oriole_tests.txt @@ -0,0 +1,12 @@ +# List of subscription tests to run with OrioleDB +# Add test files here, one per line + +t/005_encoding.pl +t/006_rewrite.pl +t/007_ddl.pl +t/008_diff_schema.pl +t/010_truncate.pl +t/011_generated.pl +t/020_messages.pl +t/024_add_drop_pub.pl +t/026_stats.pl diff --git a/src/test/subscription/orioledb.conf b/src/test/subscription/orioledb.conf new file mode 100644 index 00000000000..a24cdcbc305 --- /dev/null +++ b/src/test/subscription/orioledb.conf @@ -0,0 +1,2 @@ +default_table_access_method = 'orioledb' +shared_preload_libraries = 'orioledb' From 
916a946c7fbdd0405a21651cdde9a41965cce945 Mon Sep 17 00:00:00 2001 From: Egor Ivkov Date: Wed, 5 Nov 2025 14:44:04 +0300 Subject: [PATCH 75/79] Fix warning: resource was not closed: cache pg_am --- src/backend/commands/tablecmds.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 12c51389e0d..b3a39917d54 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -14351,6 +14351,7 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt) } accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); amRoutine = GetIndexAmRoutineWithTableAM(heapRel->rd_rel->relam, accessMethodForm->amhandler); + ReleaseSysCache(tuple); table_close(heapRel, NoLock); if(amRoutine->amreuse) { From 65f77f49d79156c3132f51a908c0896ad67b0185 Mon Sep 17 00:00:00 2001 From: Egor Ivkov Date: Wed, 5 Nov 2025 14:49:03 +0300 Subject: [PATCH 76/79] Fix formatting for amreuse call --- src/backend/commands/tablecmds.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b3a39917d54..fc745542a63 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -14316,14 +14316,14 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt) Form_pg_am accessMethodForm; IndexAmRoutine *amRoutine; char *accessMethodName; - Oid heapRelId = IndexGetRelation(oldId, false); - Relation heapRel = table_open(heapRelId, ShareLock); - + Oid heapRelId = IndexGetRelation(oldId, false); + Relation heapRel = table_open(heapRelId, ShareLock); + stmt->oldNumber = irel->rd_locator.relNumber; stmt->oldCreateSubid = irel->rd_createSubid; stmt->oldFirstRelfilelocatorSubid = irel->rd_firstRelfilelocatorSubid; - - + + /* * look up the access method to call amreuse */ @@ -14332,8 +14332,9 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt) if (!HeapTupleIsValid(tuple)) { /* - * Hack to provide more-or-less-transparent updating of old RTREE 
- * indexes to GiST: if RTREE is requested and not found, use GIST. + * Hack to provide more-or-less-transparent updating of old + * RTREE indexes to GiST: if RTREE is requested and not found, + * use GIST. */ if (strcmp(accessMethodName, "rtree") == 0) { @@ -14353,9 +14354,10 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt) amRoutine = GetIndexAmRoutineWithTableAM(heapRel->rd_rel->relam, accessMethodForm->amhandler); ReleaseSysCache(tuple); table_close(heapRel, NoLock); - - if(amRoutine->amreuse) { - (*amRoutine->amreuse)(irel); + + if (amRoutine->amreuse) + { + (*amRoutine->amreuse) (irel); } } index_close(irel, NoLock); From 932d7bdb2036b37311ebde9aa2484a9ce124237d Mon Sep 17 00:00:00 2001 From: Artur Zakirov Date: Tue, 4 Nov 2025 14:28:54 +0100 Subject: [PATCH 77/79] Add RecoveryStopsHook for extensions to handle custom WAL records --- src/backend/access/transam/xlogrecovery.c | 24 ++++++++++++++++++++++- src/include/access/xlogrecovery.h | 10 ++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index aa2a31a2d2c..2cd9acf9dc4 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -2617,8 +2617,28 @@ recoveryStopsBefore(XLogReaderState *record) return true; } + /* Check RecoveryStopsHook for custom records */ + if (RmgrIdIsCustom(XLogRecGetRmid(record)) && (RecoveryStopsBeforeHook != NULL)) + { + stopsHere = RecoveryStopsBeforeHook(record, &recordXid, &recordXtime); + + if (stopsHere) + { + recoveryStopAfter = false; + recoveryStopXid = recordXid; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopTime = recordXtime; + recoveryStopName[0] = '\0'; + + ereport(LOG, + (errmsg("recovery stopping by hook before transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + return true; + } + } /* Otherwise we only consider stopping before COMMIT or ABORT records. 
*/ - if (XLogRecGetRmid(record) != RM_XACT_ID) + else if (XLogRecGetRmid(record) != RM_XACT_ID) return false; xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; @@ -4556,6 +4576,8 @@ GetXLogReplayRecPtr(TimeLineID *replayTLI) GetReplayXlogPtrHookType GetReplayXlogPtrHook = NULL; +RecoveryStopsBeforeHookType RecoveryStopsBeforeHook = NULL; + /* * Get effective latest redo apply position. * diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h index 0fcbca03f02..8966177915b 100644 --- a/src/include/access/xlogrecovery.h +++ b/src/include/access/xlogrecovery.h @@ -50,6 +50,10 @@ typedef enum RecoveryPauseState typedef XLogRecPtr (*GetReplayXlogPtrHookType) (void); +typedef bool (*RecoveryStopsBeforeHookType) (XLogReaderState *record, + TransactionId *recordXid, + TimestampTz *recordXtime); + /* User-settable GUC parameters */ extern PGDLLIMPORT bool recoveryTargetInclusive; extern PGDLLIMPORT int recoveryTargetAction; @@ -81,6 +85,12 @@ extern PGDLLIMPORT bool StandbyMode; /* Hook for extensions to tune replay xlog pointer */ extern PGDLLIMPORT GetReplayXlogPtrHookType GetReplayXlogPtrHook; +/* + * Hook for extensions to be able to decide whether to stop applying the WAL files + * based on custom WAL records.
+ */ +extern PGDLLIMPORT RecoveryStopsBeforeHookType RecoveryStopsBeforeHook; + extern Size XLogRecoveryShmemSize(void); extern void XLogRecoveryShmemInit(void); From 4c429002c81f457ae31f84a455d7d6633c61caf3 Mon Sep 17 00:00:00 2001 From: Egor Ivkov Date: Mon, 24 Nov 2025 16:24:17 +0300 Subject: [PATCH 78/79] Enable subscription test 004_sync.pl --- src/test/subscription/oriole_tests.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/subscription/oriole_tests.txt b/src/test/subscription/oriole_tests.txt index 2d31289033c..f3aac24b3bd 100644 --- a/src/test/subscription/oriole_tests.txt +++ b/src/test/subscription/oriole_tests.txt @@ -1,6 +1,7 @@ # List of subscription tests to run with OrioleDB # Add test files here, one per line +t/004_sync.pl t/005_encoding.pl t/006_rewrite.pl t/007_ddl.pl From 94a5a4ff2cc1f4062d988094c1da74ad9edf8672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=BB=D0=B8=D0=B7=D0=B0=D0=B2=D0=B5=D1=82=D0=B0=20?= =?UTF-8?q?=D0=A0=D0=B5=D1=83=D1=82=D1=82?= Date: Thu, 27 Nov 2025 15:35:58 +0300 Subject: [PATCH 79/79] Ignore RowRefType field in RangeTblEntry on jumble query Added RowRefType field in RangeTblEntry should not change queryid calculated in jumble query. It is an internal field which does not depend on query parsing. --- src/include/nodes/parsenodes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 82443390a85..c328de51078 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -1103,7 +1103,7 @@ typedef struct RangeTblEntry Index perminfoindex pg_node_attr(query_jumble_ignore); /* sampling info, or NULL */ struct TableSampleClause *tablesample; - RowRefType reftype; + RowRefType reftype pg_node_attr(query_jumble_ignore); /* * Fields valid for a subquery RTE (else NULL):