diff --git a/.abi-check/7.1.0/postgres.symbols.ignore b/.abi-check/7.1.0/postgres.symbols.ignore index 848dbf2841d..d42d77c4039 100644 --- a/.abi-check/7.1.0/postgres.symbols.ignore +++ b/.abi-check/7.1.0/postgres.symbols.ignore @@ -1 +1,12 @@ pgarch_start +ConfigureNamesInt_gp +child_triggers +has_update_triggers +ConfigureNamesBool_gp +aocs_beginscan +AppendOnlyBlockDirectory_GetEntry +ConfigureNamesString_gp +gp_pause_on_restore_point_replay +ConfigureNamesReal_gp +TableAmRoutine +MainLWLockNames diff --git a/GNUmakefile.in b/GNUmakefile.in index e6333e39bec..7b44f13dbf5 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -212,6 +212,11 @@ installcheck-gpcheckcat: $(call recurse,installcheck-world,gpcontrib/gp_replica_check,installcheck) $(call recurse,installcheck-world,src/bin/pg_upgrade,check) +.PHONY: installcheck-hot-standby +installcheck-hot-standby: submake-generated-headers + $(MAKE) -C src/test/regress installcheck-hot-standby + $(MAKE) -C src/test/isolation2 installcheck-hot-standby + # Run mock tests, that don't require a running server. Arguably these should # be part of [install]check-world, but we treat them more like part of # compilation than regression testing, in the CI. But they are too heavy-weight diff --git a/gpMgmt/bin/gpstart b/gpMgmt/bin/gpstart index 6937d86ac51..2ce947885d5 100755 --- a/gpMgmt/bin/gpstart +++ b/gpMgmt/bin/gpstart @@ -55,7 +55,8 @@ class GpStart: skip_heap_checksum_validation=False, fts_hosts=None, etcd_hosts=None, - is_external_fts=False + is_external_fts=False, + segment_config_file=None ): assert (specialMode in [None, 'maintenance']) self.specialMode = specialMode @@ -78,6 +79,7 @@ class GpStart: self.etcd_hosts = etcd_hosts self.is_external_fts = is_external_fts self.singlenodemode = False + self.segment_config_file = segment_config_file # # Some variables that are set during execution @@ -510,7 +512,11 @@ class GpStart: logger.info("Obtaining Segment details from coordinator...") self.dburl = dbconn.DbURL(port=self.port, dbname='template1') - self.gparray = GpArray.initFromCatalog(self.dburl, utility=True) + if self.segment_config_file: + self.gparray = GpArray.initFromFile(self.segment_config_file) + self.gparray.is_singlenode= False + else: + self.gparray = GpArray.initFromCatalog(self.dburl, utility=True) logger.info("Setting new coordinator era") e = GpEraFile(self.coordinator_datadir, logger=get_logger_if_verbose()) @@ -876,6 +882,8 @@ class GpStart: addTo.add_option('-E', dest='etcd_hosts', type='string',default=None , help='specify the file that contains all etcd hosts.If this argument is set, `gpstart` will attempt' 'to start all etcd in the specified hosts') + addTo.add_option('-f', '--segment_config_file', dest='segment_config_file', type='string', default=None, + help='specify the gp_segment_configuration file to load for this cluster') parser.set_defaults(verbose=False, filters=[], slice=(None, None)) @@ -922,7 +930,8 @@ class GpStart: skip_heap_checksum_validation=options.skip_heap_checksum_validation, fts_hosts=options.fts_hosts, etcd_hosts=options.etcd_hosts, - is_external_fts=external_fts + is_external_fts=external_fts, + segment_config_file=options.segment_config_file ) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 048ce9231a9..0003425b79f 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -530,6 +530,14 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) LockBuffer(buffer, BUFFER_LOCK_UNLOCK); +#ifdef FAULT_INJECTOR + 
FaultInjector_InjectFaultIfSet(
+		"heapgetpage_after_unlock_buffer",
+		DDLNotSpecified,
+		"",		/* databaseName */
+		RelationGetRelationName(scan->rs_base.rs_rd)); /* tableName */
+#endif
+
 	Assert(ntup <= MaxHeapTuplesPerPage);
 	scan->rs_ntuples = ntup;
 }
diff --git a/src/backend/access/rmgrdesc/standbydesc.c b/src/backend/access/rmgrdesc/standbydesc.c
index 01ee7ac6d2c..899c621b240 100644
--- a/src/backend/access/rmgrdesc/standbydesc.c
+++ b/src/backend/access/rmgrdesc/standbydesc.c
@@ -66,6 +66,14 @@ standby_desc(StringInfo buf, XLogReaderState *record)
 				 xlrec->dbId, xlrec->tsId,
 				 xlrec->relcacheInitFileInval);
 	}
+	else if (info == XLOG_LATESTCOMPLETED_GXID)
+	{
+		DistributedTransactionId gxid;
+
+		gxid = *((DistributedTransactionId *) rec);
+		appendStringInfo(buf, UINT64_FORMAT, gxid);
+	}
+
 }
 
 const char *
@@ -84,6 +92,9 @@ standby_identify(uint8 info)
 		case XLOG_INVALIDATIONS:
 			id = "INVALIDATIONS";
 			break;
+		case XLOG_LATESTCOMPLETED_GXID:
+			id = "XLOG_LATESTCOMPLETED_GXID";
+			break;
 	}
 
 	return id;
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index f3112ff3070..efac0cb505e 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -897,3 +897,48 @@ yet simplifies emulation of subtransactions considerably.
 
 Further details on locking mechanics in recovery are given in comments
 with the Lock rmgr code.
+
+Distributed Transaction Emulation during Recovery
+-------------------------------------------------
+
+In GPDB, the MVCC snapshot also includes distributed transactions (aka dtx).
+Accordingly, on a hot standby we also emulate running dtx. The way to do that
+is to re-use the shmCommittedGxidArray which has been used on a primary for dtx
+recovery: it tracks all the 2PC dtx that have their PREPARE phase done,
+but for which the COMMIT phase hasn't finished (i.e. window between the
+XLOG_XACT_DISTRIBUTED_COMMIT record being written and the
+XLOG_XACT_DISTRIBUTED_FORGET record being written on the QD). On a hot standby,
+any dtx shown in that array is regarded as in-progress. The MVCC snapshot does
+not really need to account for dtx not in that array: for a dtx that hasn't
+done PREPARE, we know no segment has committed any data yet; for a dtx that
+has finished the COMMIT phase, we know all segments have committed their data.
+
+Note: dtxes that are preparing will not be tracked in this array, and thus will
+not be included in this snapshot. This is slightly different from a primary QD,
+where such transactions would have been included in the distributed snapshot's
+inProgressXidArray (as we construct the inProgressXidArray from the PGXACTs that
+would contain the dummy entries for prepared transactions). However, as
+mentioned in CreateDistributedSnapshot, including these is not a requirement for
+correctness.
+
+Note: aborted/aborting dtxes are not accounted for by the standby either. Those
+are the dtxes that encountered an error while preparing. As with the previous
+point, the standby does not need to be aware of them for correctness. It is also
+worth noting that if a dtx encounters an error after being prepared, it cannot
+be aborted anymore and must be committed by the dtx recovery process. Until
+committed, such a dtx will be seen as in-progress by the standby.
+
+For 1PC dtx, however, there is a known limitation where the hot standby won't
+see the last 1PC (or the last few 1PCs if they are all 1PC). This is because
+a 1PC dtx does not write any WAL on the QD, so the standby QD won't advance its
+latestCompletedGxid, and its distributed snapshot horizon does not include the
+last 1PC - it would view the last 1PC as not yet started, or at best as still
+in progress. Only when another 2PC arrives does the standby advance its
+latestCompletedGxid so that its distributed snapshot includes the previous 1PC.
+
+We don't emulate the full architecture of "running transaction" for dtx because
+that is unnecessary, at least at the moment. For example, we don't create a
+dtx-version of XLOG_RUNNING_XACTS, because we already have that information as
+part of the extended checkpoint (see TMGXACT_CHECKPOINT). We also don't need to
+emulate other members in RunningTransactionsData, like subxid or xid-pruning
+related variables because those do not apply to dtx.
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index f3f2a035281..ed655baf989 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2475,11 +2475,10 @@ StartTransaction(void)
 	/*
 	 * Transactions may be started while recovery is in progress, if
-	 * hot standby is enabled. This mode is not supported in
-	 * Cloudberry yet.
+	 * hot standby is enabled.
 	 */
 	AssertImply(DistributedTransactionContext != DTX_CONTEXT_LOCAL_ONLY,
-				!s->startedInRecovery);
+				EnableHotStandby || !s->startedInRecovery);
 
 	/*
 	 * MPP Modification
 	 *
@@ -2526,20 +2525,39 @@ StartTransaction(void)
 
 		case DTX_CONTEXT_QE_TWO_PHASE_EXPLICIT_WRITER:
 		case DTX_CONTEXT_QE_TWO_PHASE_IMPLICIT_WRITER:
+			/*
+			 * Sanity check for the global xid.
+			 *
+			 * Note for hot standby dispatch: the standby QEs are still
+			 * writers, just like primary QEs for SELECT queries. But
+			 * hot standby dispatch never has a valid gxid, so we skip
+			 * the gxid checks for the standby QEs.
+			 */
+			if (!IS_HOT_STANDBY_QE())
+			{
+				if (QEDtxContextInfo.distributedXid == InvalidDistributedTransactionId)
+					elog(ERROR,
+						 "distributed transaction id is invalid in context %s",
+						 DtxContextToString(DistributedTransactionContext));
+
+				/*
+				 * Update distributed XID info, this is only used for
+				 * debugging.
+				 */
+				LocalDistribXactData *ele = &MyProc->localDistribXactData;
+				ele->distribXid = QEDtxContextInfo.distributedXid;
+				ele->state = LOCALDISTRIBXACT_STATE_ACTIVE;
+			}
+			else
+				Assert(QEDtxContextInfo.distributedXid == InvalidDistributedTransactionId);
+
+			/* fall through */
 		case DTX_CONTEXT_QE_AUTO_COMMIT_IMPLICIT:
 			{
 				/* If we're running in test-mode insert a delay in writer. */
 				if (gp_enable_slow_writer_testmode)
 					pg_usleep(500000);
 
-				if (DistributedTransactionContext != DTX_CONTEXT_QE_AUTO_COMMIT_IMPLICIT &&
-					QEDtxContextInfo.distributedXid == InvalidDistributedTransactionId)
-				{
-					elog(ERROR,
-						 "distributed transaction id is invalid in context %s",
-						 DtxContextToString(DistributedTransactionContext));
-				}
-
 				/*
 				 * Snapshot must not be created before setting transaction
 				 * isolation level.
@@ -2552,28 +2570,14 @@ StartTransaction(void)
 				XactReadOnly = isMppTxOptions_ReadOnly(
 					QEDtxContextInfo.distributedTxnOptions);
 
+				/* a hot standby transaction must be read-only */
+				AssertImply(IS_HOT_STANDBY_QE(), XactReadOnly);
+
 				/*
 				 * MPP: we're a QE Writer.
*/ MyTmGxact->gxid = QEDtxContextInfo.distributedXid; - if (DistributedTransactionContext == - DTX_CONTEXT_QE_TWO_PHASE_EXPLICIT_WRITER || - DistributedTransactionContext == - DTX_CONTEXT_QE_TWO_PHASE_IMPLICIT_WRITER) - { - Assert(QEDtxContextInfo.distributedXid != - InvalidDistributedTransactionId); - - /* - * Update distributed XID info, this is only used for - * debugging. - */ - LocalDistribXactData *ele = &MyProc->localDistribXactData; - ele->distribXid = QEDtxContextInfo.distributedXid; - ele->state = LOCALDISTRIBXACT_STATE_ACTIVE; - } - if (SharedLocalSnapshotSlot != NULL) { LWLockAcquire(SharedLocalSnapshotSlot->slotLock, LW_EXCLUSIVE); @@ -6880,8 +6884,8 @@ XactLogCommitRecord(TimestampTz commit_time, xl_xact_distrib xl_distrib; xl_xact_deldbs xl_deldbs; XLogRecPtr recptr; - bool isOnePhaseQE = (Gp_role == GP_ROLE_EXECUTE && MyTmGxactLocal->isOnePhaseCommit); bool isDtxPrepared = isPreparedDtxTransaction(); + DistributedTransactionId distrib_xid = getDistributedTransactionId(); uint8 info; @@ -6971,10 +6975,11 @@ XactLogCommitRecord(TimestampTz commit_time, xl_origin.origin_timestamp = replorigin_session_origin_timestamp; } - if (isDtxPrepared || isOnePhaseQE) + /* include distributed xid if there's one */ + if (distrib_xid != InvalidDistributedTransactionId) { xl_xinfo.xinfo |= XACT_XINFO_HAS_DISTRIB; - xl_distrib.distrib_xid = getDistributedTransactionId(); + xl_distrib.distrib_xid = distrib_xid; } #if 0 diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 3fb9f121b93..be73d8fae2d 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -133,7 +133,14 @@ bool track_wal_io_timing = false; int FileEncryptionEnabled = false; /* GPDB specific */ -bool gp_pause_on_restore_point_replay = false; +char *gp_pause_on_restore_point_replay = ""; + +/* + * GPDB: Have we reached a specific continuous recovery target? We set this to + * true if WAL replay has found a restore point matching the GPDB-specific GUC + * gp_pause_on_restore_point_replay and a promotion has been requested. + */ +static bool reachedContinuousRecoveryTarget = false; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -6012,6 +6019,59 @@ recoveryStopsBefore(XLogReaderState *record) return stopsHere; } +/* + * GPDB: Restore point records can act as a point of synchronization to ensure + * cluster-wide consistency during WAL replay. If a restore point is specified + * in the gp_pause_on_restore_point_replay GUC, WAL replay will be paused at + * that restore point until replay is explicitly resumed. + */ +static void +pauseRecoveryOnRestorePoint(XLogReaderState *record) +{ + uint8 info; + uint8 rmid; + + /* + * Ignore recovery target settings when not in archive recovery (meaning + * we are in crash recovery). 
+ */ + if (!ArchiveRecoveryRequested) + return; + + info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + rmid = XLogRecGetRmid(record); + + if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) + { + xl_restore_point *recordRestorePointData; + + recordRestorePointData = (xl_restore_point *) XLogRecGetData(record); + + if (strcmp(recordRestorePointData->rp_name, gp_pause_on_restore_point_replay) == 0) + { + ereport(LOG, + (errmsg("setting recovery pause at restore point \"%s\", time %s", + recordRestorePointData->rp_name, + timestamptz_to_str(recordRestorePointData->rp_time)))); + + SetRecoveryPause(true); + recoveryPausesHere(false); + + /* + * If we've unpaused and there is a promotion request, then we've + * reached our continuous recovery target and need to immediately + * promote. We piggyback on the existing recovery target logic to + * do this. See recoveryStopsAfter(). + */ + if (CheckForStandbyTrigger()) + { + reachedContinuousRecoveryTarget = true; + recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE; + } + } + } +} + /* * Same as recoveryStopsBefore, but called after applying the record. * @@ -6039,15 +6099,19 @@ recoveryStopsAfter(XLogReaderState *record) /* * There can be many restore points that share the same name; we stop at * the first one. + * + * GPDB: If we've reached the continuous recovery target, we'll use the + * below logic to immediately stop recovery. */ - if (recoveryTarget == RECOVERY_TARGET_NAME && + if ((reachedContinuousRecoveryTarget || recoveryTarget == RECOVERY_TARGET_NAME) && rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) { xl_restore_point *recordRestorePointData; recordRestorePointData = (xl_restore_point *) XLogRecGetData(record); - if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0) + if (reachedContinuousRecoveryTarget || + strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0) { recoveryStopAfter = true; recoveryStopXid = InvalidTransactionId; @@ -6565,6 +6629,16 @@ UpdateCatalogForStandbyPromotion(void) /* I am privileged */ InitializeSessionUserIdStandalone(); gp_activate_standby(); + + if (gp_segment_configuration_file && access(gp_segment_configuration_file, F_OK) == 0) + { + write_gp_segment_configuration(); + } + else + { + elog(DEBUG1, "Skipping write_gp_segment_configuration: file not found or not configured"); + } + /* close the transaction we started above */ CommitTransactionCommand(); Gp_role = old_role; @@ -7900,6 +7974,9 @@ StartupXLOG(void) WalSndWakeup(); } + if (gp_pause_on_restore_point_replay) + pauseRecoveryOnRestorePoint(xlogreader); + /* Exit loop if we reached inclusive recovery target */ if (recoveryStopsAfter(xlogreader)) { @@ -8331,6 +8408,8 @@ StartupXLOG(void) */ InRecovery = false; + SIMPLE_FAULT_INJECTOR("out_of_recovery_in_startupxlog"); + /* * Hook for plugins to do additional startup works. * @@ -9801,8 +9880,11 @@ CreateCheckPoint(int flags) * recovery we don't need to write running xact data. */ if (!shutdown && XLogStandbyInfoActive()) + { LogStandbySnapshot(); + } + SIMPLE_FAULT_INJECTOR("checkpoint_after_redo_calculated"); START_CRIT_SECTION(); @@ -11126,14 +11208,7 @@ xlog_redo(XLogReaderState *record) } else if (info == XLOG_RESTORE_POINT) { - /* - * GPDB: Restore point records can act as a point of - * synchronization to ensure cluster-wide consistency during WAL - * replay. WAL replay is paused at each restore point until it is - * explicitly resumed. 
- */ - if (gp_pause_on_restore_point_replay) - SetRecoveryPause(true); + /* nothing to do here */ } else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) { diff --git a/src/backend/catalog/.gitignore b/src/backend/catalog/.gitignore index 6c4c6d228db..3912b022a03 100644 --- a/src/backend/catalog/.gitignore +++ b/src/backend/catalog/.gitignore @@ -8,3 +8,4 @@ /pg_*_d.h /gp_*_d.h /bki-stamp +/system_views_gp.sql diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 8a58b8e5897..260bd608d50 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -56,6 +56,9 @@ OBJS += pg_extprotocol.o \ gp_matview_aux.o \ pg_directory_table.o storage_directory_table.o +GP_SYSVIEW_IN = system_views_gp.in +GP_SYSVIEW_SQL = system_views_gp.sql + CATALOG_JSON:= $(addprefix $(top_srcdir)/gpMgmt/bin/gppylib/data/, $(addsuffix .json,$(GP_MAJORVERSION))) include $(top_srcdir)/src/backend/common.mk @@ -133,7 +136,7 @@ POSTGRES_BKI_DATA += $(addprefix $(top_srcdir)/src/include/catalog/,\ $(top_builddir)/src/include/catalog/gp_version_at_initdb.dat -all: distprep generated-header-symlinks +all: distprep generated-header-symlinks $(GP_SYSVIEW_SQL) distprep: bki-stamp @@ -197,6 +200,7 @@ ifeq ($(USE_INTERNAL_FTS_FOUND), false) endif $(INSTALL_DATA) $(srcdir)/system_functions.sql '$(DESTDIR)$(datadir)/system_functions.sql' $(INSTALL_DATA) $(srcdir)/system_views.sql '$(DESTDIR)$(datadir)/system_views.sql' + $(INSTALL_DATA) $(srcdir)/$(GP_SYSVIEW_SQL) '$(DESTDIR)$(datadir)/$(GP_SYSVIEW_SQL)' $(INSTALL_DATA) $(srcdir)/information_schema.sql '$(DESTDIR)$(datadir)/information_schema.sql' $(INSTALL_DATA) $(call vpathsearch,cdb_schema.sql) '$(DESTDIR)$(datadir)/cdb_init.d/cdb_schema.sql' $(INSTALL_DATA) $(srcdir)/sql_features.txt '$(DESTDIR)$(datadir)/sql_features.txt' @@ -216,4 +220,4 @@ endif clean: maintainer-clean: clean - rm -f bki-stamp postgres.bki system_constraints.sql $(GENERATED_HEADERS) + rm -f bki-stamp postgres.bki system_constraints.sql $(GENERATED_HEADERS) $(GP_SYSVIEW_SQL) diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 6b0b604ab5e..d5b7b81e8a2 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1098,6 +1098,7 @@ $$ $$ LANGUAGE SQL EXECUTE ON ALL SEGMENTS; +-- This view has an additional column than pg_stat_replication so cannot be generated using system_views_gp.in CREATE VIEW gp_stat_replication AS SELECT *, pg_catalog.gp_replication_error() AS sync_error FROM pg_catalog.gp_stat_get_master_replication() AS R @@ -1498,6 +1499,10 @@ rq.oid=rc.resqueueid AND rc.restypid = rt.restypid ORDER BY rsqname, restypid ; +-- FIXME: we have a cluster-wide view gp_stat_database_conflicts, but that is +-- only showing conflicts of every segment. Some conflict might be encountered +-- on just part of the segments. Ideally we should have a view like +-- gp_stat_database_conflicts_summary that prints the overall conflicts and types. 
CREATE VIEW pg_stat_database_conflicts AS SELECT D.oid AS datid, @@ -1801,11 +1806,6 @@ UNION ALL SELECT gp_segment_id, gp_get_suboverflowed_backends() FROM gp_dist_random('gp_id') order by 1; -CREATE OR REPLACE VIEW gp_stat_archiver AS - SELECT -1 AS gp_segment_id, * FROM pg_stat_archiver - UNION - SELECT gp_execution_segment() AS gp_segment_id, * FROM gp_dist_random('pg_stat_archiver'); - CREATE FUNCTION gp_get_session_endpoints (OUT gp_segment_id int, OUT auth_token text, OUT cursorname text, OUT sessionid int, OUT hostname varchar(64), OUT port int, OUT username text, OUT state text, diff --git a/src/backend/catalog/system_views_gp.in b/src/backend/catalog/system_views_gp.in new file mode 100644 index 00000000000..d46dde3191e --- /dev/null +++ b/src/backend/catalog/system_views_gp.in @@ -0,0 +1,48 @@ +# This file lists all the PG system views 'pg_%' that we would like to create an +# MPP-aware view 'gp_%' out of. The generated 'gp_%' view definitions will be placed +# in system_views_gp.sql, and initialized at the same time as system_views.sql. +#pg_backend_memory_contexts +pg_config +pg_cursors +pg_file_settings +pg_replication_origin_status +pg_replication_slots +pg_settings +pg_stat_activity +pg_stat_archiver +pg_stat_bgwriter +#pg_stat_database +pg_stat_database_conflicts +pg_stat_gssapi +pg_stat_operations +#pg_stat_progress_analyze +#pg_stat_progress_basebackup +#pg_stat_progress_cluster +#pg_stat_progress_copy +#pg_stat_progress_create_index +#pg_stat_progress_vacuum +pg_stat_slru +pg_stat_ssl +pg_stat_subscription +pg_stat_sys_indexes +pg_stat_sys_tables +pg_stat_user_functions +pg_stat_user_indexes +pg_stat_user_tables +#pg_stat_wal +pg_stat_wal_receiver +pg_stat_xact_all_tables +pg_stat_xact_sys_tables +pg_stat_xact_user_functions +pg_stat_xact_user_tables +pg_statio_all_indexes +pg_statio_all_sequences +pg_statio_all_tables +pg_statio_sys_indexes +pg_statio_sys_sequences +pg_statio_sys_tables +pg_statio_user_indexes +pg_statio_user_sequences +pg_statio_user_tables +#pg_stats ERROR: column "most_common_vals" has pseudo-type anyarray +pg_stats_ext diff --git a/src/backend/cdb/cdbdtxcontextinfo.c b/src/backend/cdb/cdbdtxcontextinfo.c index 1a3c1b8f295..2994821f8df 100644 --- a/src/backend/cdb/cdbdtxcontextinfo.c +++ b/src/backend/cdb/cdbdtxcontextinfo.c @@ -60,7 +60,7 @@ DtxContextInfo_CreateOnMaster(DtxContextInfo *dtxContextInfo, bool inCursor, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("cannot have more than 2^32-2 commands in a session"))); - AssertImply(inCursor, + AssertImply(inCursor && !IS_HOT_STANDBY_QD(), dtxContextInfo->distributedXid != InvalidDistributedTransactionId && gp_command_count == MySessionState->latestCursorCommandId); diff --git a/src/backend/cdb/cdbdtxrecovery.c b/src/backend/cdb/cdbdtxrecovery.c index 186b01ff214..605ce323ddb 100644 --- a/src/backend/cdb/cdbdtxrecovery.c +++ b/src/backend/cdb/cdbdtxrecovery.c @@ -202,6 +202,11 @@ recoverInDoubtTransactions(void) for (i = 0; i < *shmNumCommittedGxacts; i++) { + /* + * No need to acquire CommittedGxidArrayLock since dtx recovery + * only happens on primary, but not hot standby where concurrent + * access to this array is possible from CreateDistributedSnapshot. + */ DistributedTransactionId gxid = shmCommittedGxidArray[i]; char gid[TMGIDSIZE]; @@ -486,7 +491,12 @@ void redoDistributedCommitRecord(DistributedTransactionId gxid) { int i; + bool is_hot_standby_qd = IS_HOT_STANDBY_QD(); + /* + * Only the startup process can be modifying shmNumCommittedGxacts + * and shmCommittedGxidArray. 
So should be OK reading the value w/o lock. + */ for (i = 0; i < *shmNumCommittedGxacts; i++) { if (gxid == shmCommittedGxidArray[i]) @@ -526,7 +536,18 @@ redoDistributedCommitRecord(DistributedTransactionId gxid) "around this issue and then report a bug"))); } + /* + * only on hot standby there might be backends that call CreateDistributedSnapshot() + * to access the committed gxid array concurrently. + */ + if (is_hot_standby_qd) + LWLockAcquire(CommittedGxidArrayLock, LW_EXCLUSIVE); + shmCommittedGxidArray[(*shmNumCommittedGxacts)++] = gxid; + + if (is_hot_standby_qd) + LWLockRelease(CommittedGxidArrayLock); + elog((Debug_print_full_dtm ? LOG : DEBUG5), "Crash recovery redo added committed distributed transaction gid = "UINT64_FORMAT, gxid); } @@ -539,7 +560,13 @@ void redoDistributedForgetCommitRecord(DistributedTransactionId gxid) { int i; - + bool is_hot_standby_qd = IS_HOT_STANDBY_QD(); + + SIMPLE_FAULT_INJECTOR("redoDistributedForgetCommitRecord"); + /* + * Only the startup process can be modifying shmNumCommittedGxacts + * and shmCommittedGxidArray. So should be OK reading the value w/o lock. + */ for (i = 0; i < *shmNumCommittedGxacts; i++) { if (gxid == shmCommittedGxidArray[i]) @@ -550,13 +577,27 @@ redoDistributedForgetCommitRecord(DistributedTransactionId gxid) gxid); /* - * there's no concurrent access to shmCommittedGxidArray during - * recovery + * only on hot standby there might be backends that call CreateDistributedSnapshot() + * to access the committed gxid array concurrently. */ + if (is_hot_standby_qd) + LWLockAcquire(CommittedGxidArrayLock, LW_EXCLUSIVE); + (*shmNumCommittedGxacts)--; if (i != *shmNumCommittedGxacts) shmCommittedGxidArray[i] = shmCommittedGxidArray[*shmNumCommittedGxacts]; + if (is_hot_standby_qd) + LWLockRelease(CommittedGxidArrayLock); + + /* on the hot standby, we rely on the forget record to advance latestCompletedGxid */ + if (is_hot_standby_qd) + { + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + if (gxid > ShmemVariableCache->latestCompletedGxid) + ShmemVariableCache->latestCompletedGxid = gxid; + LWLockRelease(ProcArrayLock); + } return; } } diff --git a/src/backend/cdb/cdbfts.c b/src/backend/cdb/cdbfts.c index 754d3054cbb..de268b6f662 100644 --- a/src/backend/cdb/cdbfts.c +++ b/src/backend/cdb/cdbfts.c @@ -84,6 +84,10 @@ FtsNotifyProber(void) int32 started; int32 done; + /* Ignore if we don't have a FTS probe process, like a standby QD in a mirrored cluster. */ + if (FtsProbePID() == 0) + return; + if (am_ftsprobe) return; diff --git a/src/backend/cdb/cdbtm.c b/src/backend/cdb/cdbtm.c index f0cd5fcb3f6..37550261149 100644 --- a/src/backend/cdb/cdbtm.c +++ b/src/backend/cdb/cdbtm.c @@ -264,6 +264,21 @@ currentDtxActivate(void) { bool signal_dtx_recovery; + /* + * A hot standby transaction does not have a valid gxid, so can skip + * most of the things in this function. We still explicitly set some + * fields that are irrelevant to hot standby for cleanness. 
+	 */
+	if (IS_HOT_STANDBY_QD())
+	{
+		/* standby QD will stay in this state until transaction completed */
+		setCurrentDtxState(DTX_STATE_ACTIVE_DISTRIBUTED);
+		MyTmGxact->sessionId = gp_session_id;
+		MyTmGxact->gxid = InvalidDistributedTransactionId;
+		MyTmGxact->includeInCkpt = false;
+		return;
+	}
+
 	if (ShmemVariableCache->GxidCount <= GXID_PRETCH_THRESHOLD &&
 		(GetDtxRecoveryEvent() & DTX_RECOVERY_EVENT_BUMP_GXID) == 0)
 	{
@@ -1644,7 +1659,7 @@ isDtxQueryDispatcher(void)
 	isSharedLocalSnapshotSlotPresent = (SharedLocalSnapshotSlot != NULL);
 
 	return (Gp_role == GP_ROLE_DISPATCH &&
-			isDtmStarted &&
+			(isDtmStarted || EnableHotStandby) &&
 			isSharedLocalSnapshotSlotPresent);
 }
 
@@ -2047,6 +2062,8 @@ sendDtxExplicitBegin(void)
 static void
 performDtxProtocolPrepare(const char *gid)
 {
+	SIMPLE_FAULT_INJECTOR("qe_start_prepared");
+
 	StartTransactionCommand();
 	elog(DTM_DEBUG5, "performDtxProtocolCommand going to call PrepareTransactionBlock for distributed transaction (id = '%s')", gid);
@@ -2126,6 +2143,7 @@ performDtxProtocolCommitOnePhase(const char *gid)
 static void
 performDtxProtocolCommitPrepared(const char *gid, bool raiseErrorIfNotFound)
 {
+	SIMPLE_FAULT_INJECTOR("qe_start_commit_prepared");
 	Assert(Gp_role == GP_ROLE_EXECUTE);
 
 	elog(DTM_DEBUG5,
@@ -2158,6 +2176,7 @@ performDtxProtocolCommitPrepared(const char *gid, bool raiseErrorIfNotFound)
 	sendWaitGxidsToQD(waitGxids);
 
 	finishDistributedTransactionContext("performDtxProtocolCommitPrepared -- Commit Prepared", false);
+	SIMPLE_FAULT_INJECTOR("finish_commit_prepared");
 }
 
 /**
diff --git a/src/backend/cdb/cdbutil.c b/src/backend/cdb/cdbutil.c
index 1671b17223b..888b5b08708 100644
--- a/src/backend/cdb/cdbutil.c
+++ b/src/backend/cdb/cdbutil.c
@@ -37,6 +37,7 @@
 #include "utils/memutils.h"
 #include "catalog/gp_id.h"
 #include "catalog/indexing.h"
+#include "catalog/heap.h"
 #include "cdb/cdbhash.h"
 #include "cdb/cdbutil.h"
 #include "cdb/cdbmotion.h"
@@ -60,6 +61,9 @@
 #include "catalog/gp_indexing.h"
 #include "utils/etcd.h"
 #include "common/etcdutils.h"
+#include "storage/sinvaladt.h"
+#include "storage/bufmgr.h"
+#include "utils/syscache.h"
 
 #include "catalog/gp_indexing.h"
 
@@ -79,6 +83,7 @@
 
 MemoryContext CdbComponentsContext = NULL;
 static CdbComponentDatabases *cdb_component_dbs = NULL;
+char *gp_segment_configuration_file = NULL;
 
 #ifdef USE_INTERNAL_FTS
 
@@ -92,6 +97,7 @@ static int	CdbComponentDatabaseInfoCompare(const void *p1, const void *p2);
 
 static GpSegConfigEntry * readGpSegConfigFromCatalog(int *total_dbs);
 static GpSegConfigEntry * readGpSegConfigFromFTSFiles(int *total_dbs);
+static GpSegConfigEntry * readGpSegConfigFromExtFile(int *total_dbs);
 
 static void getAddressesForDBid(GpSegConfigEntry *c, int elevel);
 static HTAB *hostPrimaryCountHashTableInit(void);
@@ -372,7 +378,14 @@ getCdbComponentInfo(void)
 
 	HTAB	   *hostPrimaryCountHash = hostPrimaryCountHashTableInit();
 
-	if (IsTransactionState())
+	/* On a hot standby, if gp_segment_configuration_file is configured, try
+	 * to load configs from it. Since the hot standby may be created from a
+	 * basebackup, the gp_segment_configuration table is backed up from
+	 * the source cluster and cannot be modified in read-replica mode.
+	 */
+	if (EnableHotStandby && gp_segment_configuration_file)
+		configs = readGpSegConfigFromExtFile(&total_dbs);
+	else if (IsTransactionState())
 		configs = readGpSegConfigFromCatalog(&total_dbs);
 	else
 		configs = readGpSegConfigFromFTSFiles(&total_dbs);
@@ -565,7 +578,7 @@ getCdbComponentInfo(void)
 	{
 		cdbInfo = &component_databases->segment_db_info[i];
 
-		if (cdbInfo->config->role != GP_SEGMENT_CONFIGURATION_ROLE_PRIMARY)
+		if (!IS_HOT_STANDBY_QD() && cdbInfo->config->role != GP_SEGMENT_CONFIGURATION_ROLE_PRIMARY)
 			continue;
 
 		hsEntry = (HostPrimaryCountEntry *) hash_search(hostPrimaryCountHash, cdbInfo->config->hostname, HASH_FIND, &found);
@@ -577,7 +590,7 @@ getCdbComponentInfo(void)
 	{
 		cdbInfo = &component_databases->entry_db_info[i];
 
-		if (cdbInfo->config->role != GP_SEGMENT_CONFIGURATION_ROLE_PRIMARY)
+		if (!IS_HOT_STANDBY_QD() && cdbInfo->config->role != GP_SEGMENT_CONFIGURATION_ROLE_PRIMARY)
 			continue;
 
 		hsEntry = (HostPrimaryCountEntry *) hash_search(hostPrimaryCountHash, cdbInfo->config->hostname, HASH_FIND, &found);
@@ -1005,7 +1018,16 @@ cdbcomponent_getComponentInfo(int contentId)
 	/* entry db */
 	if (contentId == -1)
 	{
-		cdbInfo = &cdbs->entry_db_info[0];
+		Assert(cdbs->total_entry_dbs == 1 || cdbs->total_entry_dbs == 2);
+		/*
+		 * For a standby QD, get the last entry db which can be the first (on
+		 * a replica cluster) or the second (on a mirrored cluster) entry.
+		 */
+		if (IS_HOT_STANDBY_QD())
+			cdbInfo = &cdbs->entry_db_info[cdbs->total_entry_dbs - 1];
+		else
+			cdbInfo = &cdbs->entry_db_info[0];
+
 		return cdbInfo;
 	}
 
@@ -1022,10 +1044,10 @@ cdbcomponent_getComponentInfo(int contentId)
 	Assert(cdbs->total_segment_dbs == cdbs->total_segments * 2);
 	cdbInfo = &cdbs->segment_db_info[2 * contentId];
 
-	if (!SEGMENT_IS_ACTIVE_PRIMARY(cdbInfo))
-	{
+	/* use the other segment if it is not what the QD wants */
+	if ((IS_HOT_STANDBY_QD() && SEGMENT_IS_ACTIVE_PRIMARY(cdbInfo))
+		|| (!IS_HOT_STANDBY_QD() && !SEGMENT_IS_ACTIVE_PRIMARY(cdbInfo)))
 		cdbInfo = &cdbs->segment_db_info[2 * contentId + 1];
-	}
 
 	return cdbInfo;
 }
@@ -1124,10 +1146,21 @@ cdb_setup(void)
 	 *
 	 * Ignore background worker because bgworker_should_start_mpp() already did
 	 * the check.
+	 *
+	 * Ignore if we are the standby coordinator started in hot standby mode.
+	 * We don't expect dtx recovery to have finished, as dtx recovery is
+	 * performed at the end of startup. In hot standby, we are recovering
+	 * continuously and should allow queries much earlier. Since a hot standby
+	 * won't run dtx, it is not required to wait for recovery of the dtxes
+	 * that have been prepared but not committed (i.e. to commit them); on the
+	 * other hand, the recovery of any in-doubt transactions (i.e. not prepared)
+	 * won't bother a hot standby either, just as they can be recovered in the
+	 * background when a primary instance is running.
*/ if (!IsBackgroundWorker && Gp_role == GP_ROLE_DISPATCH && - !*shmDtmStarted) + !*shmDtmStarted && + !IS_HOT_STANDBY_QD()) { ereport(FATAL, (errcode(ERRCODE_CANNOT_CONNECT_NOW), @@ -1978,7 +2011,172 @@ gp_get_suboverflowed_backends(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -#else +void +add_segment_config_entry(GpSegConfigEntry *i) +{ + Relation rel = table_open(GpSegmentConfigRelationId, AccessExclusiveLock); + Datum values[Natts_gp_segment_configuration]; + bool nulls[Natts_gp_segment_configuration]; + HeapTuple tuple; + + MemSet(nulls, false, sizeof(nulls)); + + values[Anum_gp_segment_configuration_dbid - 1] = Int16GetDatum(i->dbid); + values[Anum_gp_segment_configuration_content - 1] = Int16GetDatum(i->segindex); + values[Anum_gp_segment_configuration_role - 1] = CharGetDatum(i->role); + values[Anum_gp_segment_configuration_preferred_role - 1] = + CharGetDatum(i->preferred_role); + values[Anum_gp_segment_configuration_mode - 1] = + CharGetDatum(i->mode); + values[Anum_gp_segment_configuration_status - 1] = + CharGetDatum(i->status); + values[Anum_gp_segment_configuration_port - 1] = + Int32GetDatum(i->port); + values[Anum_gp_segment_configuration_hostname - 1] = + CStringGetTextDatum(i->hostname); + values[Anum_gp_segment_configuration_address - 1] = + CStringGetTextDatum(i->address); + values[Anum_gp_segment_configuration_datadir - 1] = + CStringGetTextDatum(i->datadir); + values[Anum_gp_segment_configuration_warehouseid - 1] = + ObjectIdGetDatum(i->warehouseid); + + tuple = heap_form_tuple(RelationGetDescr(rel), values, nulls); + + /* insert a new tuple */ + CatalogTupleInsert(rel, tuple); + + table_close(rel, NoLock); +} + +void +remove_segment_config_entry(int16 dbid) +{ + int numDel = 0; + ScanKeyData scankey; + SysScanDesc sscan; + HeapTuple tuple; + Relation rel; + + rel = table_open(GpSegmentConfigRelationId, RowExclusiveLock); + + ScanKeyInit(&scankey, + Anum_gp_segment_configuration_dbid, + BTEqualStrategyNumber, F_INT2EQ, + Int16GetDatum(dbid)); + sscan = systable_beginscan(rel, GpSegmentConfigDbidWarehouseIndexId, true, + NULL, 1, &scankey); + while ((tuple = systable_getnext(sscan)) != NULL) + { + Datum attr; + bool isNull; + Oid warehouseid = InvalidOid; + + attr = heap_getattr(tuple, Anum_gp_segment_configuration_warehouseid, + RelationGetDescr(rel), &isNull); + Assert(!isNull); + warehouseid = DatumGetObjectId(attr); + + if (!OidIsValid(warehouseid) || warehouseid == GetCurrentWarehouseId()) + { + CatalogTupleDelete(rel, &tuple->t_self); + numDel++; + } + } + systable_endscan(sscan); + + Assert(numDel > 0); + + table_close(rel, NoLock); +} + +static GpSegConfigEntry* +readGpSegConfigFromExtFile(int *total_dbs) +{ + FILE *fd; + int idx = 0; + int array_size = 500; + GpSegConfigEntry *configs = NULL; + GpSegConfigEntry *config = NULL; + + char hostname[MAXHOSTNAMELEN]; + char address[MAXHOSTNAMELEN]; + char datadir[1000]; + char buf[MAXHOSTNAMELEN * 2 + 32 + 2000]; + + Assert(gp_segment_configuration_file && strcmp(gp_segment_configuration_file, "") != 0); + + /* notify and wait FTS to finish a probe and update the dump file */ + + fd = AllocateFile(gp_segment_configuration_file, "r"); + + if (!fd) + elog(ERROR, "could not open gp_segment_configutation dump file:%s:%m", gp_segment_configuration_file); + + configs = palloc0(sizeof (GpSegConfigEntry) * array_size); + while (fgets(buf, sizeof(buf), fd)) + { + config = &configs[idx]; + + if (sscanf(buf, "%d %d %c %c %c %c %d %s %s %s", (int *)&config->dbid, (int *)&config->segindex, + &config->role, &config->preferred_role, 
&config->mode, &config->status, + &config->port, hostname, address, datadir) != 10) + { + FreeFile(fd); + elog(ERROR, "invalid data in gp_segment_configuration dump file: %s:%m", gp_segment_configuration_file); + } + + config->hostname = pstrdup(hostname); + config->address = pstrdup(address); + config->datadir = pstrdup(datadir); + + idx++; + /* + * Expand CdbComponentDatabaseInfo array if we've used up + * currently allocated space + */ + if (idx >= array_size) + { + array_size = array_size * 2; + configs = (GpSegConfigEntry *) + repalloc(configs, sizeof(GpSegConfigEntry) * array_size); + } + } + + FreeFile(fd); + + *total_dbs = idx; + return configs; +} + +void +write_gp_segment_configuration(void) +{ + Relation rel; + GpSegConfigEntry *configs; + int total_dbs; + SysScanDesc sscan; + HeapTuple tuple; + + rel = table_open(GpSegmentConfigRelationId, RowExclusiveLock); + sscan = systable_beginscan(rel, GpSegmentConfigDbidWarehouseIndexId, true, + NULL, 0, NULL); + while ((tuple = systable_getnext(sscan)) != NULL) + { + CatalogTupleDelete(rel, &tuple->t_self); + } + systable_endscan(sscan); + + /* insert new configs into gp_segment_configuration table */ + configs = readGpSegConfigFromExtFile(&total_dbs); + for (int i = 0; i < total_dbs; i++) { + GpSegConfigEntry config = configs[i]; + add_segment_config_entry(&config); + } + table_close(rel, RowExclusiveLock); +} + +#else bool am_ftshandler = false; diff --git a/src/backend/cdb/dispatcher/cdbdisp_query.c b/src/backend/cdb/dispatcher/cdbdisp_query.c index 99f5179e756..fb29bd9fa14 100644 --- a/src/backend/cdb/dispatcher/cdbdisp_query.c +++ b/src/backend/cdb/dispatcher/cdbdisp_query.c @@ -867,6 +867,7 @@ buildGpQueryString(DispatchCommandQueryParms *pQueryParms, { const char *command = pQueryParms->strCommand; int command_len; + int is_hs_dispatch = IS_HOT_STANDBY_QD() ? 
1 : 0; const char *plantree = pQueryParms->serializedPlantree; int plantree_len = pQueryParms->serializedPlantreelen; const char *sddesc = pQueryParms->serializedQueryDispatchDesc; @@ -921,6 +922,7 @@ buildGpQueryString(DispatchCommandQueryParms *pQueryParms, sizeof(outerUserId) /* outerUserIsSuper */ + sizeof(currentUserId) + sizeof(n32) * 2 /* currentStatementStartTimestamp */ + + sizeof(is_hs_dispatch) + sizeof(command_len) + sizeof(plantree_len) + sizeof(sddesc_len) + @@ -976,6 +978,10 @@ buildGpQueryString(DispatchCommandQueryParms *pQueryParms, memcpy(pos, &n32, sizeof(n32)); pos += sizeof(n32); + tmp = htonl(is_hs_dispatch); + memcpy(pos, &tmp, sizeof(is_hs_dispatch)); + pos += sizeof(is_hs_dispatch); + tmp = htonl(command_len); memcpy(pos, &tmp, sizeof(command_len)); pos += sizeof(command_len); diff --git a/src/backend/cdb/dispatcher/cdbgang.c b/src/backend/cdb/dispatcher/cdbgang.c index 780ddef0f42..87ce88504b0 100644 --- a/src/backend/cdb/dispatcher/cdbgang.c +++ b/src/backend/cdb/dispatcher/cdbgang.c @@ -698,8 +698,7 @@ getCdbProcessesForQD(int isPrimary) qdinfo = cdbcomponent_getComponentInfo(MASTER_CONTENT_ID); - Assert(qdinfo->config->segindex == -1); - Assert(SEGMENT_IS_ACTIVE_PRIMARY(qdinfo)); + Assert((qdinfo->config->segindex == -1 && SEGMENT_IS_ACTIVE_PRIMARY(qdinfo)) || IS_HOT_STANDBY_QD()); Assert(qdinfo->config->hostip != NULL); proc = makeNode(CdbProcess); diff --git a/src/backend/cdb/dispatcher/test/cdbdisp_query_test.c b/src/backend/cdb/dispatcher/test/cdbdisp_query_test.c new file mode 100644 index 00000000000..6e07aebcc96 --- /dev/null +++ b/src/backend/cdb/dispatcher/test/cdbdisp_query_test.c @@ -0,0 +1,341 @@ +#include +#include +#include +#include "cmockery.h" +#include "postgres.h" + +#include "storage/ipc.h" +#include "storage/proc.h" + +#include "../cdbdisp_query.c" + + +#undef PG_RE_THROW +#define PG_RE_THROW() siglongjmp(*PG_exception_stack, 1) + + +int __wrap_errmsg(const char *fmt,...); +int __wrap_errcode(int sqlerrcode); +bool __wrap_errstart(int elevel, const char *filename, int lineno, + const char *funcname, const char *domain); +void __wrap_errfinish(int dummy __attribute__((unused)),...); +Gang *__wrap_cdbgang_createGang_async(List *segments, SegmentType segmentType); +int __wrap_pqPutMsgStart(char msg_type, bool force_len, PGconn *conn); +int __wrap_PQcancel(PGcancel *cancel, char *errbuf, int errbufsize); +char *__wrap_serializeNode(Node *node, int *size, int *uncompressed_size_out); +char *__wrap_qdSerializeDtxContextInfo(int *size, bool wantSnapshot, bool inCursor, int txnOptions, char *debugCaller); +void __wrap_VirtualXactLockTableInsert(VirtualTransactionId vxid); +void __wrap_AcceptInvalidationMessages(void); +static void terminate_process(); + + +int +__wrap_errmsg(const char *fmt,...) +{ + check_expected(fmt); + optional_assignment(fmt); + return (int) mock(); +} + + +int +__wrap_errcode(int sqlerrcode) +{ + check_expected(sqlerrcode); + return (int) mock(); +} + + +bool +__wrap_errstart(int elevel, const char *filename, int lineno, + const char *funcname, const char *domain) +{ + if (elevel < LOG) + return false; + + check_expected(elevel); + check_expected(filename); + check_expected(lineno); + check_expected(funcname); + check_expected(domain); + optional_assignment(filename); + optional_assignment(funcname); + optional_assignment(domain); + return (bool) mock(); +} + + +void +__wrap_errfinish(int dummy __attribute__((unused)),...) 
+{ + PG_RE_THROW(); +} + + +static void +expect_ereport(int expect_elevel) +{ + expect_any(__wrap_errmsg, fmt); + will_be_called(__wrap_errmsg); + + expect_any(__wrap_errcode, sqlerrcode); + will_be_called(__wrap_errcode); + + expect_value(__wrap_errstart, elevel, expect_elevel); + expect_any(__wrap_errstart, filename); + expect_any(__wrap_errstart, lineno); + expect_any(__wrap_errstart, funcname); + expect_any(__wrap_errstart, domain); + if (expect_elevel < ERROR) + { + will_return(__wrap_errstart, false); + } + else + { + will_return(__wrap_errstart, true); + } +} + + +Gang * +__wrap_cdbgang_createGang_async(List *segments, SegmentType segmentType) +{ + MemoryContext oldContext = MemoryContextSwitchTo(DispatcherContext); + Gang *gang = buildGangDefinition(segments, segmentType); + + MemoryContextSwitchTo(oldContext); + + PGconn *conn = (PGconn *) malloc(sizeof(PGconn)); + + MemSet(conn, 0, sizeof(PGconn)); + initPQExpBuffer(&conn->errorMessage); + initPQExpBuffer(&conn->workBuffer); + gang->db_descriptors[0]->conn = conn; + + return gang; +} + + +int +__wrap_pqPutMsgStart(char msg_type, bool force_len, PGconn *conn) +{ + if (conn->outBuffer_shared) + fail_msg("Mustn't send something else during dispatch!"); + check_expected(msg_type); + check_expected(force_len); + check_expected(conn); + optional_assignment(conn); + return (int) mock(); +} + + +int +__wrap_PQcancel(PGcancel *cancel, char *errbuf, int errbufsize) +{ + return (int) mock(); +} + + +char * +__wrap_serializeNode(Node *node, int *size, int *uncompressed_size_out) +{ + const int alloc_size = 1024; + + if (size != NULL) + *size = alloc_size; + if (uncompressed_size_out != NULL) + *uncompressed_size_out = alloc_size; + + return (char *) palloc(alloc_size); +} + + +char * +__wrap_qdSerializeDtxContextInfo(int *size, bool wantSnapshot, bool inCursor, int txnOptions, char *debugCaller) +{ + const int alloc_size = 1024; + + assert_int_not_equal(size, NULL); + *size = alloc_size; + + return (char *) palloc(alloc_size); +} + + +void +__wrap_VirtualXactLockTableInsert(VirtualTransactionId vxid) +{ + mock(); +} + +void +__wrap_AcceptInvalidationMessages(void) +{ + mock(); +} + + +static void +terminate_process() +{ + die(SIGTERM); +} + +/* + * Test query may be interrupted during plan dispatching + */ +static void +test__CdbDispatchPlan_may_be_interrupted(void **state) +{ + PlannedStmt *plannedstmt = (PlannedStmt *) palloc(sizeof(PlannedStmt)); + + /* slice table is needed to allocate gang */ + plannedstmt->slices = palloc0(sizeof(PlanSlice)); + plannedstmt->numSlices = 1; + PlanSlice *slice = &plannedstmt->slices[0]; + + slice->sliceIndex = 1; + slice->gangType = GANGTYPE_PRIMARY_READER; + slice->numsegments = 1; + slice->parentIndex = -1; + slice->segindex = 0; + + QueryDesc *queryDesc = (QueryDesc *) palloc(sizeof(QueryDesc)); + + queryDesc->plannedstmt = plannedstmt; + /* ddesc->secContext is filled in cdbdisp_buildPlanQueryParms() */ + queryDesc->ddesc = (QueryDispatchDesc *) palloc(sizeof(QueryDispatchDesc)); + /* source text is required for buildGpQueryString() */ + queryDesc->sourceText = "select a from t1;"; + + queryDesc->estate = CreateExecutorState(); + + /* will be called multiple times in e.g. 
FtsNotifyProber/getCdbComponentInfo */ + will_return_count(RecoveryInProgress, false, -1); + + /* cdbcomponent_getCdbComponents() mocks */ + will_be_called(FtsNotifyProber); + will_return(getFtsVersion, 1); + will_return(GetGpExpandVersion, 1); + + /* StartTransactionCommand() mocks */ + will_be_called(__wrap_VirtualXactLockTableInsert); + will_be_called(__wrap_AcceptInvalidationMessages); + will_be_called(initialize_wal_bytes_written); + + /* + * cdbdisp_dispatchToGang() + * + * start sending MPP query to QE inside PQsendGpQuery_shared() replace + * connection buffer with the shared one + */ + expect_any(PQsendQueryStart, conn); + will_return(PQsendQueryStart, true); + + /* first try to flush MPP query inside PQsendGpQuery_shared() */ + expect_any(pqFlushNonBlocking, conn); + will_return(pqFlushNonBlocking, 1); + + /* + * cdbdisp_waitDispatchFinish() + * + * query will be interrupted before poll() + */ + expect_any_count(ResetWaitEventSet, pset, 2); + expect_any_count(ResetWaitEventSet, context, 2); + expect_any_count(ResetWaitEventSet, nevents, 2); + will_be_called_count(ResetWaitEventSet, 2); + + expect_any(pqFlushNonBlocking, conn); + will_return_with_sideeffect(pqFlushNonBlocking, 1, &terminate_process, NULL); + + expect_any(SetLatch, latch); + will_be_called(SetLatch); + + expect_any(AddWaitEventToSet, set); + expect_any(AddWaitEventToSet, events); + expect_any(AddWaitEventToSet, fd); + expect_any(AddWaitEventToSet, latch); + expect_any(AddWaitEventToSet, user_data); + will_be_called(AddWaitEventToSet); + + will_return(IsLogicalLauncher, false); + + /* process was terminated by administrative command */ + expect_ereport(FATAL); + + /* QD will trying to cancel queries on QEs */ + will_return(__wrap_PQcancel, true); + + /* during close and free connection */ + expect_any_count(pqClearAsyncResult, conn, 2); + will_be_called_count(pqClearAsyncResult, 2); + + /* + * BUT! pqPutMsgStart mustn't be called + * + * we can't send termination message (X) until shared message isn't sent + * out the buffer completely + */ + + /* + * dirty hack. 
cluster topology needed to allocate gangs is loaded from + * gpsegconfig_dump outside of transaction + */ + cdbcomponent_getCdbComponents(); + + StartTransactionCommand(); + + PG_TRY(); + { + queryDesc->estate->es_sliceTable = InitSliceTable(queryDesc->estate, plannedstmt); + + CdbDispatchPlan(queryDesc, queryDesc->estate->es_param_exec_vals, + false, false); + fail(); + } + PG_CATCH(); + { + /* + * SIGTERM handling emulation gpdb bail out from CheckDispatchResult + * without flushing unsent messages in case of process exit in + * progress AtAbort_DispatcherState will be called during transaction + * abort + */ + proc_exit_inprogress = true; + + AtAbort_DispatcherState(); + } + PG_END_TRY(); +} + +int +main(int argc, char *argv[]) +{ + cmockery_parse_arguments(argc, argv); + + const UnitTest tests[] = + { + unit_test(test__CdbDispatchPlan_may_be_interrupted) + }; + + Gp_role = GP_ROLE_DISPATCH; + /* to start transaction */ + PGPROC proc; + + MyBackendId = 7; + proc.backendId = MyBackendId; + MyProc = &proc; + /* to build cdb components info */ + GpIdentity.dbid = 1; + GpIdentity.segindex = -1; + + MemoryContextInit(); + + /* to avoid mocking cdbtm.c functions */ + MyTmGxactLocal = (TMGXACTLOCAL *) MemoryContextAllocZero(TopMemoryContext, sizeof(TMGXACTLOCAL)); + + SetSessionUserId(1000, true); + + return run_tests(tests); +} diff --git a/src/backend/fts/fts.c b/src/backend/fts/fts.c index 719e8fbca1c..c7c1711e97f 100644 --- a/src/backend/fts/fts.c +++ b/src/backend/fts/fts.c @@ -102,7 +102,7 @@ sigIntHandler(SIGNAL_ARGS) pid_t FtsProbePID(void) { - return *shmFtsProbePID; + return shmFtsProbePID ? *shmFtsProbePID : 0; } bool diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 1a835983222..68524222d71 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -371,6 +371,9 @@ standby_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) * XLOG_XACT_INVALIDATIONS. So we don't need to do anything here. */ break; + case XLOG_LATESTCOMPLETED_GXID: + /* FIXME: need to decode this part? */ + break; default: elog(ERROR, "unexpected RM_STANDBY_ID record type: %u", info); } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 73a53822b3d..e2953686b8e 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -1712,6 +1712,7 @@ OpenTemporaryFile(bool interXact, const char *filePrefix) if (!interXact) RegisterTemporaryFile(file); + SIMPLE_FAULT_INJECTOR("after_open_temp_file"); return file; } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 3154caba1bd..57c03cce7d9 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -2530,8 +2530,10 @@ getDtxCheckPointInfo(char **result, int *result_size) gxid_array = &gxact_checkpoint->committedGxidArray[0]; actual = 0; + LWLockAcquire(CommittedGxidArrayLock, LW_SHARED); for (; actual < *shmNumCommittedGxacts; actual++) gxid_array[actual] = shmCommittedGxidArray[actual]; + LWLockRelease(CommittedGxidArrayLock); SIMPLE_FAULT_INJECTOR("checkpoint_dtx_info"); @@ -2609,7 +2611,8 @@ CreateDistributedSnapshot(DistributedSnapshot *ds) ProcArrayStruct *arrayP = procArray; Assert(LWLockHeldByMe(ProcArrayLock)); - if (*shmNumCommittedGxacts != 0) + /* Hot standby accepts query while constantly replaying dtx, so this ERROR doesn't apply. 
*/ + if (!IS_HOT_STANDBY_QD() && *shmNumCommittedGxacts != 0) elog(ERROR, "Create distributed snapshot before DTM recovery finish"); xmin = xmax = ShmemVariableCache->latestCompletedGxid + 1; @@ -2623,9 +2626,45 @@ CreateDistributedSnapshot(DistributedSnapshot *ds) Assert(ds->inProgressXidArray != NULL); + /* + * For a hot standby QD, check shmCommittedGxidArray to build the knowledge. + * Need to acquire shared lock to access the committed gxid array as the + * startup process might modify it. + */ + if (IS_HOT_STANDBY_QD()) + { + LWLockAcquire(CommittedGxidArrayLock, LW_SHARED); + for (i = 0; i < *shmNumCommittedGxacts; i++) + { + DistributedTransactionId gxid; + + gxid = shmCommittedGxidArray[i]; + + if (gxid == InvalidDistributedTransactionId || gxid >= xmax) + continue; + + if (gxid < xmin) + xmin = gxid; + + ds->inProgressXidArray[count++] = gxid; + } + LWLockRelease(CommittedGxidArrayLock); + } + /* * Gather up current in-progress global transactions for the distributed * snapshot. + * + * Note: The inProgressXidArray built below may contain transactions that + * have been prepared on some/all segments, and for which the QD hasn't + * begun the COMMIT phase (by writing a XLOG_XACT_DISTRIBUTED_COMMIT record). + * The gxids of these transactions don't necessarily have to be placed into + * inProgressXidArray, for correctness. This is because for visibility + * checks on the QEs, a state of DISTRIBUTEDSNAPSHOT_COMMITTED_UNKNOWN will + * be encountered for such txs, prompting a local check. The local check will + * always find these txs in progress (due to the dummy PGXACTs being + * recorded for prepared txs). So, hypothetically we could exclude these txs + * here, but we don't currently track them on the QD, so we can't. */ for (i = 0; i < arrayP->numProcs; i++) { diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 687ce03767d..13dc551ca54 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -21,6 +21,7 @@ #include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" +#include "cdb/cdbvars.h" #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" @@ -29,6 +30,7 @@ #include "storage/procarray.h" #include "storage/sinvaladt.h" #include "storage/standby.h" +#include "utils/faultinjector.h" #include "utils/hsearch.h" #include "utils/memutils.h" #include "utils/ps_status.h" @@ -848,6 +850,8 @@ SendRecoveryConflictWithBufferPin(ProcSignalReason reason) * SIGUSR1 handling in each backend decide their own fate. */ CancelDBBackends(InvalidOid, reason, false); + + SIMPLE_FAULT_INJECTOR("recovery_conflict_bufferpin_signal_sent"); } /* @@ -1148,6 +1152,23 @@ standby_redo(XLogReaderState *record) xlrec->dbId, xlrec->tsId); } + else if (info == XLOG_LATESTCOMPLETED_GXID) + { + /* + * This record is only logged by coordinator. But the segment in + * some situation might see it too (e.g. gpexpand), but segment + * doesn't need to update latestCompletedGxid. 
+ */ + if (IS_QUERY_DISPATCHER()) + { + DistributedTransactionId gxid; + + gxid = *((DistributedTransactionId *) XLogRecGetData(record)); + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + ShmemVariableCache->latestCompletedGxid = gxid; + LWLockRelease(ProcArrayLock); + } + } else elog(PANIC, "standby_redo: unknown op code %u", info); } @@ -1265,6 +1286,21 @@ LogStandbySnapshot(void) /* GetRunningTransactionData() acquired XidGenLock, we must release it */ LWLockRelease(XidGenLock); + if (IS_QUERY_DISPATCHER()) + { + /* + * GPDB: write latestCompletedGxid too, because the standby needs this + * value for creating distributed snapshot. The standby cannot rely on + * the nextGxid value to set latestCompletedGxid during restart (which + * the primary does) because nextGxid was bumped in the checkpoint. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + DistributedTransactionId lcgxid = ShmemVariableCache->latestCompletedGxid; + LWLockRelease(ProcArrayLock); + XLogBeginInsert(); + XLogRegisterData((char *) (&lcgxid), sizeof(lcgxid)); + recptr = XLogInsert(RM_STANDBY_ID, XLOG_LATESTCOMPLETED_GXID); + } return recptr; } diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index c8f283198ce..c3583b146d7 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -75,3 +75,4 @@ LoginFailedControlLock 65 LoginFailedSharedMemoryLock 66 GPIVMResLock 67 DirectoryTableLock 68 +CommittedGxidArrayLock 69 diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index a174e981b1f..37d917a1f3e 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -354,17 +354,9 @@ InitProcess(void) * WAL sender, etc are marked as GP_ROLE_UTILITY to prevent unwanted * GP_ROLE_DISPATCH MyProc settings such as mppSessionId being valid and * mppIsWriter set to true. - * - * RecoveryInProgress() to see if we are in hot standby, because - * HotStandbyActive() is still true after promotion. 
*/ - if (am_walsender || am_ftshandler || am_faulthandler || - (GpIdentity.segindex == -1 && RecoveryInProgress())) - { + if (am_walsender || am_ftshandler || am_faulthandler) Gp_role = GP_ROLE_UTILITY; - if (GpIdentity.segindex == -1 && RecoveryInProgress()) - elog(WARNING, "Force to run in utility mode in hot standby"); - } /* * ProcGlobal should be set up already (if we are a backend, we inherit diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index f29c9c2e606..62ded58aafb 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1529,6 +1529,8 @@ exec_mpp_dtx_protocol_command(DtxProtocolCommand dtxProtocolCommand, qc.commandTag = GetCommandTagEnum(loggingStr); qc.nprocessed = 1; + SIMPLE_FAULT_INJECTOR("exec_dtx_protocol_start"); + if (log_statement == LOGSTMT_ALL) elog(LOG,"DTM protocol command '%s' for gid = %s", loggingStr, gid); @@ -5714,6 +5716,7 @@ PostgresMain(int argc, char *argv[], const char *serializedQueryDispatchDesc = NULL; const char *resgroupInfoBuf = NULL; + int is_hs_dispatch; int query_string_len = 0; int serializedDtxContextInfolen = 0; int serializedPlantreelen = 0; @@ -5750,6 +5753,20 @@ PostgresMain(int argc, char *argv[], cuid = pq_getmsgint(&input_message, 4); statementStart = pq_getmsgint64(&input_message); + + /* check if the message is from standby QD and is expected */ + is_hs_dispatch = pq_getmsgint(&input_message, 4); + if (is_hs_dispatch == 0 && IS_HOT_STANDBY_QE()) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("mirror segments can only process MPP protocol messages from standby QD"), + errhint("Exit the current session and re-connect."))); + else if (is_hs_dispatch != 0 && !IS_HOT_STANDBY_QE()) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("primary segments can only process MPP protocol messages from primary QD"), + errhint("Exit the current session and re-connect."))); + query_string_len = pq_getmsgint(&input_message, 4); serializedPlantreelen = pq_getmsgint(&input_message, 4); serializedQueryDispatchDesclen = pq_getmsgint(&input_message, 4); diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 728d12c604a..532690f1d51 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -617,6 +617,8 @@ PortalStart(Portal portal, ParamListInfo params, needDistributedSnapshot = false; } + SIMPLE_FAULT_INJECTOR("select_before_qd_create_snapshot"); + /* Must set snapshot before starting executor. 
*/ if (snapshot) PushActiveSnapshot(snapshot); @@ -626,6 +628,8 @@ PortalStart(Portal portal, ParamListInfo params, /* reset value */ needDistributedSnapshot = true; + SIMPLE_FAULT_INJECTOR("select_after_qd_create_snapshot"); + /* * We could remember the snapshot in portal->portalSnapshot, * but presently there seems no need to, as this code path diff --git a/src/backend/utils/gp/segadmin.c b/src/backend/utils/gp/segadmin.c index e8b9b309cb6..8d06ae50b48 100644 --- a/src/backend/utils/gp/segadmin.c +++ b/src/backend/utils/gp/segadmin.c @@ -181,87 +181,12 @@ static void remove_segment_config(int16 dbid) { #ifdef USE_INTERNAL_FTS - int numDel = 0; - ScanKeyData scankey; - SysScanDesc sscan; - HeapTuple tuple; - Relation rel; - - rel = table_open(GpSegmentConfigRelationId, RowExclusiveLock); - - ScanKeyInit(&scankey, - Anum_gp_segment_configuration_dbid, - BTEqualStrategyNumber, F_INT2EQ, - Int16GetDatum(dbid)); - sscan = systable_beginscan(rel, GpSegmentConfigDbidWarehouseIndexId, true, - NULL, 1, &scankey); - while ((tuple = systable_getnext(sscan)) != NULL) - { - Datum attr; - bool isNull; - Oid warehouseid = InvalidOid; - - attr = heap_getattr(tuple, Anum_gp_segment_configuration_warehouseid, - RelationGetDescr(rel), &isNull); - Assert(!isNull); - warehouseid = DatumGetObjectId(attr); - - if (!OidIsValid(warehouseid) || warehouseid == GetCurrentWarehouseId()) - { - CatalogTupleDelete(rel, &tuple->t_self); - numDel++; - } - } - systable_endscan(sscan); - - Assert(numDel > 0); - - table_close(rel, NoLock); + remove_segment_config_entry(dbid); #else delSegment(dbid); #endif } -#ifdef USE_INTERNAL_FTS -static void -add_segment_config_entry(GpSegConfigEntry *i) -{ - Relation rel = table_open(GpSegmentConfigRelationId, AccessExclusiveLock); - Datum values[Natts_gp_segment_configuration]; - bool nulls[Natts_gp_segment_configuration]; - HeapTuple tuple; - - MemSet(nulls, false, sizeof(nulls)); - - values[Anum_gp_segment_configuration_dbid - 1] = Int16GetDatum(i->dbid); - values[Anum_gp_segment_configuration_content - 1] = Int16GetDatum(i->segindex); - values[Anum_gp_segment_configuration_role - 1] = CharGetDatum(i->role); - values[Anum_gp_segment_configuration_preferred_role - 1] = - CharGetDatum(i->preferred_role); - values[Anum_gp_segment_configuration_mode - 1] = - CharGetDatum(i->mode); - values[Anum_gp_segment_configuration_status - 1] = - CharGetDatum(i->status); - values[Anum_gp_segment_configuration_port - 1] = - Int32GetDatum(i->port); - values[Anum_gp_segment_configuration_hostname - 1] = - CStringGetTextDatum(i->hostname); - values[Anum_gp_segment_configuration_address - 1] = - CStringGetTextDatum(i->address); - values[Anum_gp_segment_configuration_datadir - 1] = - CStringGetTextDatum(i->datadir); - values[Anum_gp_segment_configuration_warehouseid - 1] = - ObjectIdGetDatum(i->warehouseid); - - tuple = heap_form_tuple(RelationGetDescr(rel), values, nulls); - - /* insert a new tuple */ - CatalogTupleInsert(rel, tuple); - - table_close(rel, NoLock); -} -#endif - static void add_segment(GpSegConfigEntry *new_segment_information) { diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c index bd6ae4300da..247935c5945 100644 --- a/src/backend/utils/misc/guc_gp.c +++ b/src/backend/utils/misc/guc_gp.c @@ -3081,17 +3081,6 @@ struct config_bool ConfigureNamesBool_gp[] = false, NULL, NULL, NULL }, - - { - {"gp_pause_on_restore_point_replay", PGC_SIGHUP, DEVELOPER_OPTIONS, - gettext_noop("Pause recovery when a restore point is replayed."), - NULL, - GUC_NO_SHOW_ALL | 
GUC_NOT_IN_SAMPLE - }, - &gp_pause_on_restore_point_replay, - false, - NULL, NULL, NULL - }, { {"gp_autostats_allow_nonowner", PGC_SUSET, DEVELOPER_OPTIONS, gettext_noop("Allow automatic stats collection on tables even for users who are not the owner of the relation."), @@ -5029,6 +5018,27 @@ struct config_string ConfigureNamesString_gp[] = "udpifc", check_gp_interconnect_type, assign_gp_interconnect_type, show_gp_interconnect_type }, + { + {"gp_pause_on_restore_point_replay", PGC_SUSET, DEVELOPER_OPTIONS, + gettext_noop("Specifies the restore point to pause replay on."), + gettext_noop("Unlike recovery_target_name, this can be used to continuously set/reset " + "how much a standby should replay up to."), + GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE + }, + &gp_pause_on_restore_point_replay, + "", + NULL, NULL, NULL + }, + { + {"gp_segment_configuration_file", PGC_SUSET, DEVELOPER_OPTIONS, + gettext_noop("Specifies the recovery cluster gp_segment_configuration file"), + gettext_noop(""), + GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE + }, + &gp_segment_configuration_file, + "", + NULL, NULL, NULL + }, /* End-of-list marker */ { diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 42c22a0690a..0ae8ddf27f1 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -174,6 +174,7 @@ static char *external_fts_files; #endif static char *system_functions_file; static char *system_views_file; +static char *system_views_gp_file; static bool success = false; static bool made_new_pgdata = false; static bool found_existing_pgdata = false; @@ -2831,6 +2832,7 @@ setup_data_file_paths(void) set_input(&system_constraints_file, "system_constraints.sql"); set_input(&system_functions_file, "system_functions.sql"); set_input(&system_views_file, "system_views.sql"); + set_input(&system_views_gp_file, "system_views_gp.sql"); set_input(&cdb_init_d_dir, "cdb_init.d"); @@ -2864,6 +2866,7 @@ setup_data_file_paths(void) #endif check_input(system_functions_file); check_input(system_views_file); + check_input(system_views_gp_file); } @@ -3231,6 +3234,7 @@ initialize_data_directory(void) */ setup_run_file(cmdfd, system_views_file); + setup_run_file(cmdfd, system_views_gp_file); setup_description(cmdfd); diff --git a/src/include/access/transam.h b/src/include/access/transam.h index cec3e5f4cb7..687799bec9f 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -301,7 +301,7 @@ extern int xid_stop_limit; extern int xid_warn_limit; /* GPDB-specific */ -extern bool gp_pause_on_restore_point_replay; +extern char *gp_pause_on_restore_point_replay; /* hook for plugins to assign new relfilenode */ typedef Oid (*NewSegRelfilenode_assign_hook_type)(void); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 2dfad411b7a..e8a73ceb201 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -11,6 +11,8 @@ #ifndef XLOG_H #define XLOG_H +#include "postgres.h" /* for Datum */ + #include "access/rmgr.h" #include "access/xlogdefs.h" #include "access/xloginsert.h" diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 86910a0dada..026192b3674 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -56,6 +56,6 @@ */ /* 3yyymmddN */ -#define CATALOG_VERSION_NO 302502091 +#define CATALOG_VERSION_NO 302506101 #endif diff --git a/src/include/cdb/cdbtm.h b/src/include/cdb/cdbtm.h index 951b9013c00..2bf259a8744 100644 --- a/src/include/cdb/cdbtm.h +++ b/src/include/cdb/cdbtm.h @@ -35,8 +35,12 @@ typedef enum 
DTX_STATE_NONE = 0, /** - * The distributed transaction is active and requires distributed coordination - * (because it is explicit or an implicit writer transaction) + * The distributed transaction is active. + * For a primary, this state means the transaction requires distributed + * coordination (because it is explicit or an implicit writer transaction), + * and it will switch to other dtx states in different phases. + * For a hot standby, there is no coordination necessary so transaction + * will stay in this state until the end of the commit. */ DTX_STATE_ACTIVE_DISTRIBUTED, @@ -232,6 +236,7 @@ typedef struct TMGXACTLOCAL { /* * Memory only fields. + * If we are in hot standby, only 'state' is relevant. */ DtxState state; diff --git a/src/include/cdb/cdbutil.h b/src/include/cdb/cdbutil.h index 22c3cc782d8..d3711ca3ff8 100644 --- a/src/include/cdb/cdbutil.h +++ b/src/include/cdb/cdbutil.h @@ -37,6 +37,8 @@ extern char *gp_etcd_cluster_id; extern char *gp_etcd_namespace; extern char *gp_etcd_endpoints; +extern char *gp_segment_configuration_file; + typedef struct GpSegConfigEntryForUDF { GpSegConfigEntry * config_entry; @@ -132,6 +134,9 @@ extern char *getDnsAddress(char *name, int port, int elevel); #ifdef USE_INTERNAL_FTS extern void writeGpSegConfigToFTSFiles(void); +extern void add_segment_config_entry(GpSegConfigEntry *i); +extern void remove_segment_config_entry(int16 dbid); +extern void write_gp_segment_configuration(void); #else GpSegConfigEntry * readGpSegConfig(char * buff, int *total_dbs); diff --git a/src/include/cdb/cdbvars.h b/src/include/cdb/cdbvars.h index 90af5177ce0..2393384ec3a 100644 --- a/src/include/cdb/cdbvars.h +++ b/src/include/cdb/cdbvars.h @@ -19,6 +19,7 @@ #ifndef CDBVARS_H #define CDBVARS_H +#include "access/xlog.h" /*RecoveryInProgress*/ #include "access/xlogdefs.h" /*XLogRecPtr*/ #include "cdb/cdbutil.h" /* MASTER_CONTENT_ID */ #ifdef USE_INTERNAL_FTS @@ -757,8 +758,10 @@ extern GpId GpIdentity; #define UNINITIALIZED_GP_IDENTITY_VALUE (-10000) #define IS_QUERY_DISPATCHER() (GpIdentity.segindex == MASTER_CONTENT_ID) +#define IS_HOT_STANDBY_QD() (EnableHotStandby && IS_QUERY_DISPATCHER() && RecoveryInProgress()) #define IS_QUERY_EXECUTOR_BACKEND() (Gp_role == GP_ROLE_EXECUTE && gp_session_id > 0) +#define IS_HOT_STANDBY_QE() (EnableHotStandby && IS_QUERY_EXECUTOR_BACKEND() && RecoveryInProgress()) /* Stores the listener port that this process uses to listen for incoming * Interconnect connections from other Motion nodes. 
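A note on the two macros added to cdbvars.h above: a hot-standby QD still satisfies IS_QUERY_DISPATCHER() (its segindex is MASTER_CONTENT_ID), and a hot-standby QE still satisfies IS_QUERY_EXECUTOR_BACKEND(), so any caller that cares about the distinction has to test the hot-standby variants first. Below is a minimal sketch of such a caller, assuming the usual backend headers; the helper name classify_mpp_backend is illustrative only and is not part of this patch.

#include "postgres.h"
#include "access/xlog.h"	/* EnableHotStandby, RecoveryInProgress() */
#include "cdb/cdbvars.h"	/* IS_HOT_STANDBY_QD(), IS_HOT_STANDBY_QE() */

/* Illustrative helper (not in the patch): name the role this backend plays. */
static const char *
classify_mpp_backend(void)
{
	/* Hot-standby checks must come first; the plain macros also match. */
	if (IS_HOT_STANDBY_QD())
		return "standby QD: read-only dispatcher on the mirrored coordinator";
	if (IS_QUERY_DISPATCHER())
		return "primary QD";
	if (IS_HOT_STANDBY_QE())
		return "QE on a mirror segment, serving a standby QD";
	if (IS_QUERY_EXECUTOR_BACKEND())
		return "primary QE";
	return "utility or non-MPP backend";
}

The same ordering concern is what the is_hs_dispatch check in postgres.c enforces on the wire: a QE rejects a dispatch whose flag does not match its own IS_HOT_STANDBY_QE() state, so a gang created before a promotion cannot silently keep serving the wrong coordinator (see the failover test output later in this patch).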
diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h index d99e6f40c6d..f007fe25245 100644 --- a/src/include/storage/standbydefs.h +++ b/src/include/storage/standbydefs.h @@ -34,6 +34,7 @@ extern void standby_desc_invalidations(StringInfo buf, #define XLOG_STANDBY_LOCK 0x00 #define XLOG_RUNNING_XACTS 0x10 #define XLOG_INVALIDATIONS 0x20 +#define XLOG_LATESTCOMPLETED_GXID 0xF0 typedef struct xl_standby_locks { diff --git a/src/include/utils/unsync_guc_name.h b/src/include/utils/unsync_guc_name.h index 959d2a89a73..abf189263bb 100644 --- a/src/include/utils/unsync_guc_name.h +++ b/src/include/utils/unsync_guc_name.h @@ -241,6 +241,7 @@ "gp_max_slices", "gp_motion_cost_per_row", "gp_pause_on_restore_point_replay", + "gp_segment_configuration_file", "gp_predicate_pushdown_sample_rows", "gp_print_create_gang_time", "gp_qd_hostname", diff --git a/src/test/isolation2/Makefile b/src/test/isolation2/Makefile index 759b2855513..bc1e0f66be0 100644 --- a/src/test/isolation2/Makefile +++ b/src/test/isolation2/Makefile @@ -90,3 +90,6 @@ installcheck-cbdb-parallel: install export PGOPTIONS='-c optimizer=off -c enable_parallel=true'; \ $(pg_isolation2_regress_installcheck) --init-file=$(top_builddir)/src/test/regress/init_file --init-file=./init_file_isolation2 --schedule=$(srcdir)/isolation2_schedule \ ) + +installcheck-hot-standby: install + $(pg_isolation2_regress_installcheck) $(EXTRA_REGRESS_OPTS) --init-file=$(top_builddir)/src/test/regress/init_file --init-file=./init_file_isolation2 --schedule=$(srcdir)/hot_standby_schedule --dbname=isolation2-hot-standby diff --git a/src/test/isolation2/expected/hot_standby/basic.out b/src/test/isolation2/expected/hot_standby/basic.out new file mode 100644 index 00000000000..5318a35d7d7 --- /dev/null +++ b/src/test/isolation2/expected/hot_standby/basic.out @@ -0,0 +1,242 @@ +-- Tests for basic query dispatch on a hot standy. + +-- hot standby must show on and the sync mode is remote_apply for the tests to make sense +-1S: show hot_standby; + hot_standby +------------- + on +(1 row) +-1S: show synchronous_commit; + synchronous_commit +-------------------- + remote_apply +(1 row) + +-- will be checking if QD/QE info looks good +-1S: select id, type, content, port from gp_backend_info(); + id | type | content | port +----+------+---------+------ + -1 | Q | -1 | 7001 +(1 row) + +---------------------------------------------------------------- +-- Test: basic query dispatch +---------------------------------------------------------------- +create table hs_t1(a int); +CREATE +create table hs_t2(a int); +CREATE + +-- standby should see the results for 2pc immediately. +insert into hs_t1 select * from generate_series(1,10); +INSERT 10 +-1S: select * from hs_t1; + a +---- + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 + 1 +(10 rows) +-- standby won't see results for the last 1pc immediately because the standby QD +-- isn't aware of of it so its distributed snapshot doesn't include the 1pc, but +-- as long as another 2pc comes it will be able to see the previous 1pc. Wee +-- tolerate this case in the mirrored cluster setup. +insert into hs_t2 values(1); +INSERT 1 +-1S: select * from hs_t2; + a +--- +(0 rows) +-- any following 2pc will make the 1pc visible +create temp table tt(a int); +CREATE +-1S: select * from hs_t2; + a +--- + 1 +(1 row) + +-- we have three QEs launched on the mirror segments. +-- note that the first QE on a segment is still a "writer" because we +-- need it to manage locks, same as read-only queries on a primary QD. 
+-1S: select id, type, content, port from gp_backend_info(); + id | type | content | port +----+------+---------+------ + -1 | Q | -1 | 7001 + 0 | w | 0 | 7005 + 1 | w | 1 | 7006 + 2 | w | 2 | 7007 +(4 rows) + +-- should have parallel readers launched +-1S: select * from hs_t1 join (select * from hs_t2) hs_t2 on (hs_t1 = hs_t2); + a | a +---+--- + 1 | 1 +(1 row) +-1S: select id, type, content, port from gp_backend_info(); + id | type | content | port +----+------+---------+------ + -1 | Q | -1 | 7001 + 0 | w | 0 | 7005 + 1 | w | 1 | 7006 + 2 | w | 2 | 7007 + 3 | r | 0 | 7005 + 4 | r | 1 | 7006 + 5 | r | 2 | 7007 +(7 rows) + +-- now a singleton reader added too +-1S: select * from hs_t1 join (select oid::int from pg_class) hs_t2 on (hs_t1 = hs_t2); + a | oid +---+----- +(0 rows) +-1S: select id, type, content, port from gp_backend_info(); + id | type | content | port +----+------+---------+------ + -1 | Q | -1 | 7001 + 0 | w | 0 | 7005 + 1 | w | 1 | 7006 + 2 | w | 2 | 7007 + 3 | r | 0 | 7005 + 4 | r | 1 | 7006 + 5 | r | 2 | 7007 + 6 | R | -1 | 7001 +(8 rows) + +-- un-committed result should not be seen by the standby +begin; +BEGIN +insert into hs_t1 select * from generate_series(11,20); +INSERT 10 + +-- standby should only see 1...10 +-1S: select * from hs_t1; + a +---- + 5 + 6 + 9 + 10 + 2 + 3 + 4 + 7 + 8 + 1 +(10 rows) + +end; +END + +-- standby should see 1...20 now +-1S: select * from hs_t1; + a +---- + 2 + 3 + 4 + 7 + 8 + 16 + 18 + 19 + 1 + 12 + 15 + 20 + 5 + 6 + 9 + 10 + 11 + 13 + 14 + 17 +(20 rows) + +---------------------------------------------------------------- +-- Test: other things that a hot standby can do. +-- +-- More refer to regress test 'hs_standby_allowed'. +---------------------------------------------------------------- +-- set/reset and show GUC +-1S: set optimizer = on; +SET +-1S: show optimizer; + optimizer +----------- + on +(1 row) +-1S: reset optimizer; +RESET +-- copy command +-1S: copy hs_t1 to '/tmp/hs_copyto.csv' csv null ''; +COPY 20 +-- query catalogs +-1S: select count(*) from pg_class where relname = 'hs_t1'; + count +------- + 1 +(1 row) +-1S: select dbid,content,role,preferred_role,mode,status from gp_segment_configuration where dbid = current_setting('gp_dbid')::integer; + dbid | content | role | preferred_role | mode | status +------+---------+------+----------------+------+-------- + 8 | -1 | m | m | s | u +(1 row) +-- checkpoint is allowed on standby but a restart point is created instead +-1S: checkpoint; +CHECKPOINT + +---------------------------------------------------------------- +-- Test: things that can't be done on a hot standby: +-- no DML, DDL or anything that generates WAL. +-- +-- More refer to regress test 'hs_standby_disallowed'. +---------------------------------------------------------------- +-1S: insert into hs_t1 values(1); +ERROR: cannot execute INSERT in a read-only transaction +-1S: delete from hs_t1; +ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress +LINE 1: delete from hs_t1; + ^ +HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery. +-1S: update hs_t1 set a = 0; +ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress +LINE 1: update hs_t1 set a = 0; + ^ +HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery. 
+-1S: create table hs_t2(a int); +ERROR: cannot execute CREATE TABLE in a read-only transaction +-1S: create database hs_db; +ERROR: cannot execute CREATE DATABASE in a read-only transaction +-1S: vacuum hs_t1; +ERROR: cannot execute VACUUM during recovery + +-- +-- No hintbit WAL generation in SELECT. +-- +create table hs_nohintbit(a int) distributed by (a); +CREATE +insert into hs_nohintbit select generate_series (1, 10); +INSERT 10 +-- flush the data to disk +checkpoint; +CHECKPOINT + +-1S: set gp_disable_tuple_hints=off; +SET +-- no WAL is being generated (otherwise an error would occur "cannot make new WAL entries during recovery") +-1S: SELECT count(*) FROM hs_nohintbit; + count +------- + 10 +(1 row) + diff --git a/src/test/isolation2/expected/hot_standby/faults.out b/src/test/isolation2/expected/hot_standby/faults.out new file mode 100644 index 00000000000..39f3a06cca6 --- /dev/null +++ b/src/test/isolation2/expected/hot_standby/faults.out @@ -0,0 +1,326 @@ +-- Test system faults scenarios + +-- start_matchsubs +-- +-- m/Is the server running on host.*/ +-- s/Is the server running on host "\d+.\d+.\d+.\d+" and accepting/Is the server running on host and accepting/ +-- m/(seg\d+ \d+.\d+.\d+.\d+:\d+)/ +-- s/(.*)/(seg IP:PORT)/ +-- m/ERROR: connection to dbid 1 .*:7000 failed .*/ +-- s/ERROR: connection to dbid 1 .*:7000 failed .*/ERROR: connection to dbid 1 :7000 failed/ +-- +-- end_matchsubs + +-- Let FTS detect/declare failure sooner +!\retcode gpconfig -c gp_fts_probe_interval -v 10 --coordinatoronly; +(exited with code 0) +!\retcode gpstop -u; +(exited with code 0) + +create table hs_failover(a int); +CREATE +insert into hs_failover select * from generate_series(1,10); +INSERT 10 +-1S: select * from hs_failover; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) + +---------------------------------------------------------------- +-- Mirror segment fails +---------------------------------------------------------------- +select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'm'; + pg_ctl +-------- + OK +(1 row) + +-- make sure mirror is detected down +create temp table hs_tt(a int); +CREATE +select gp_request_fts_probe_scan(); + gp_request_fts_probe_scan +--------------------------- + t +(1 row) + +-- will not succeed +-1S: select * from hs_failover; +ERROR: Error on receive from seg1 slice1 127.0.1.1:7006 pid=26942: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +-1Sq: ... 
+ +-- recovery +!\retcode gprecoverseg -aF; +(exited with code 0) + +-- sync-up +select wait_until_all_segments_synchronized(); + wait_until_all_segments_synchronized +-------------------------------------- + OK +(1 row) + +-- works now +-1S: select * from hs_failover; + a +---- + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 + 1 +(10 rows) + +---------------------------------------------------------------- +-- Primary segment fails +---------------------------------------------------------------- +-- inject a fault where the mirror gets out of recovery +select gp_inject_fault('out_of_recovery_in_startupxlog', 'skip', dbid) from gp_segment_configuration where content = 1 and role = 'm'; + gp_inject_fault +----------------- + Success: +(1 row) + +select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'p'; + pg_ctl +-------- + OK +(1 row) +select gp_request_fts_probe_scan(); + gp_request_fts_probe_scan +--------------------------- + t +(1 row) + +-- make sure failover happens +select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; + dbid | content | role | preferred_role | mode | status +------+---------+------+----------------+------+-------- + 3 | 1 | m | p | n | d + 6 | 1 | p | m | n | u +(2 rows) +select gp_wait_until_triggered_fault('out_of_recovery_in_startupxlog', 1, dbid) from gp_segment_configuration where content = 1 and role = 'p'; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +select gp_inject_fault('out_of_recovery_in_startupxlog', 'reset', dbid) from gp_segment_configuration where content = 1 and role = 'p'; + gp_inject_fault +----------------- + Success: +(1 row) + +-- On an existing standby connection, query will run but it is dispatched to the previous mirror +-- in an existing gang. That mirror is now a primary, so it will complain and the query fails. +-1S: select * from hs_failover; +ERROR: primary segments can only process MPP protocol messages from primary QD (seg1 slice1 127.0.1.1:7006 pid=14671) +HINT: Exit the current session and re-connect. +-1Sq: ... + +-- will fail due to downed mirror (previous primary) +-1S: select * from hs_failover; +ERROR: failed to acquire resources on one or more segments +DETAIL: connection to server at "10.13.9.74", port 7003 failed: Connection refused + Is the server running on that host and accepting TCP/IP connections? + (seg1 10.13.9.74:7003) +-1Sq: ... + +-- bring the downed mirror up +!\retcode gprecoverseg -aF; +(exited with code 0) +select wait_until_all_segments_synchronized(); + wait_until_all_segments_synchronized +-------------------------------------- + OK +(1 row) + +-- mirror is up +-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; + dbid | content | role | preferred_role | mode | status +------+---------+------+----------------+------+-------- + 6 | 1 | p | m | s | u + 3 | 1 | m | p | s | u +(2 rows) + +-- now the query will succeed +-1S: select * from hs_failover; + a +---- + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 + 1 +(10 rows) +-1Sq: ... 
+ +-- re-balance, bring the segments to their preferred roles +!\retcode gprecoverseg -ar; +(exited with code 0) +select wait_until_all_segments_synchronized(); + wait_until_all_segments_synchronized +-------------------------------------- + OK +(1 row) +-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; + dbid | content | role | preferred_role | mode | status +------+---------+------+----------------+------+-------- + 3 | 1 | p | p | s | u + 6 | 1 | m | m | s | u +(2 rows) + +-- query runs fine still +-1S: select * from hs_failover; + a +---- + 5 + 6 + 9 + 10 + 1 + 2 + 3 + 4 + 7 + 8 +(10 rows) + +---------------------------------------------------------------- +-- DTX recovery +---------------------------------------------------------------- +-- skip FTS probe to prevent unexpected mirror promotion +1: select gp_inject_fault_infinite('fts_probe', 'skip', dbid) from gp_segment_configuration where role='p' and content=-1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + +1: create table tt_hs_dtx(a int); +CREATE + +-- inject fault to repeatedly fail the COMMIT PREPARE phase of 2PC, which ensures that the dtx cannot finish even by the dtx recovery process. +select gp_inject_fault_infinite('finish_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + +-- session 1 on primary QD tries to commit a DTX, but cannot finish due to the fault on a QE +1&: insert into tt_hs_dtx select * from generate_series(1,10); + +-- inject a panic on primary QD, essentially restarts the primary QD +2: select gp_inject_fault('before_read_command', 'panic', dbid) from gp_segment_configuration where content=-1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +2: select 1; +PANIC: fault triggered, fault name:'before_read_command' fault type:'panic' +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. + +1<: <... completed> +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +1q: ... +2q: ... 
+ +-- standby QD can still run query +-1S: select * from hs_failover; + a +---- + 1 + 10 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +(10 rows) +-- it cannot see rows from the in-doubt DTX +-1S: select * from tt_hs_dtx; + a +--- +(0 rows) + +-- let the failed dtx be recovered, also make sure the standby replays the forget record which signals the completion of the dtx +-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'skip', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-1S: select gp_inject_fault_infinite('finish_commit_prepared', 'reset', dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) +-1S: select gp_wait_until_triggered_fault('redoDistributedForgetCommitRecord', 1, dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) + +-- standby should see the rows from the in-doubt DTX now +-1S: select * from tt_hs_dtx; + a +---- + 1 + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 +(10 rows) + +-1S: select wait_until_all_segments_synchronized(); + wait_until_all_segments_synchronized +-------------------------------------- + OK +(1 row) +1: select gp_inject_fault('before_read_command', 'reset', dbid) from gp_segment_configuration where content=-1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1: select gp_inject_fault('fts_probe', 'reset', dbid) from gp_segment_configuration where role='p' and content=-1; + gp_inject_fault +----------------- + Success: +(1 row) + diff --git a/src/test/isolation2/expected/hot_standby/setup.out b/src/test/isolation2/expected/hot_standby/setup.out new file mode 100644 index 00000000000..f8f1e02fe40 --- /dev/null +++ b/src/test/isolation2/expected/hot_standby/setup.out @@ -0,0 +1,14 @@ +-- setup for hot standby tests +!\retcode gpconfig -c hot_standby -v on; +(exited with code 0) +-- let primary wait for standby to apply changes, make test less flaky +!\retcode gpconfig -c synchronous_commit -v remote_apply; +(exited with code 0) +-- make it faster to handle query conflict +!\retcode gpconfig -c max_standby_streaming_delay -v 1000; +(exited with code 0) +-- disable autovacuum, to not affect the manual VACUUM in the tests +!\retcode gpconfig -c autovacuum -v off; +(exited with code 0) +!\retcode gpstop -ar; +(exited with code 0) diff --git a/src/test/isolation2/expected/hot_standby/teardown.out b/src/test/isolation2/expected/hot_standby/teardown.out new file mode 100644 index 00000000000..8b4e1271610 --- /dev/null +++ b/src/test/isolation2/expected/hot_standby/teardown.out @@ -0,0 +1,9 @@ +-- reset the setup for hot standby tests +!\retcode gpconfig -r hot_standby; +(exited with code 0) +!\retcode gpconfig -r synchronous_commit; +(exited with code 0) +!\retcode gpconfig -r max_standby_streaming_delay; +(exited with code 0) +!\retcode gpstop -ar; +(exited with code 0) diff --git a/src/test/isolation2/expected/hot_standby/transaction_isolation.out b/src/test/isolation2/expected/hot_standby/transaction_isolation.out new file mode 100644 index 00000000000..3990bd7cd56 --- /dev/null +++ b/src/test/isolation2/expected/hot_standby/transaction_isolation.out @@ -0,0 +1,984 @@ 
+---------------------------------------------------------------- +-- Test transaction isolation in general, not specific to dtx +---------------------------------------------------------------- +1: create table hs_tx(a int); +CREATE +1: insert into hs_tx select * from generate_series(1,10); +INSERT 10 + +1: begin; +BEGIN +1: insert into hs_tx select * from generate_series(11,20); +INSERT 10 +2: begin; +BEGIN +2: insert into hs_tx select * from generate_series(21,30); +INSERT 10 +2: abort; +ABORT + +-- standby should only see completed transactions, not in-progress transactions, nor aborted transactions +-1S: select * from hs_tx; + a +---- + 1 + 5 + 6 + 9 + 10 + 2 + 3 + 4 + 7 + 8 +(10 rows) + +1: end; +END +-1S: select * from hs_tx; + a +---- + 2 + 3 + 4 + 7 + 8 + 16 + 18 + 19 + 1 + 12 + 15 + 20 + 5 + 6 + 9 + 10 + 11 + 13 + 14 + 17 +(20 rows) + +---------------------------------------------------------------- +-- Test isolation between hot standby query and in-progress dtx +---------------------------------------------------------------- + +1: create table hs_dtx1(a int); +CREATE +1: create table hs_dtx2(a int); +CREATE + +-- inject two suspend faults: +-- 1. on seg0, suspend before PREPARE phase of 2PC +1: select gp_inject_fault('qe_start_prepared', 'suspend',dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1&: insert into hs_dtx1 select * from generate_series(1,10); +-- 2. on seg1, suspend before COMMIT phase of 2PC +2: select gp_inject_fault('qe_start_commit_prepared', 'suspend',dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +2&: insert into hs_dtx2 select * from generate_series(1,10); + +-- standby should not see any rows from either dtx +-1S: select * from hs_dtx1; + a +--- +(0 rows) +-1S: select * from hs_dtx2; + a +--- +(0 rows) + +-- reset +3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1<: <... completed> +INSERT 10 +2<: <... completed> +INSERT 10 + +-- standby should see the results from the dtx now +-1S: select * from hs_dtx1; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) +-1S: select * from hs_dtx2; + a +---- + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 + 1 +(10 rows) + +---------------------------------------------------------------- +-- Test DTX abort that happens in different phases +---------------------------------------------------------------- + +1: create table hs_abort_dtx1(a int); +CREATE +1: create table hs_abort_dtx2(a int); +CREATE + +-- inject two errors: +-- 1. on seg0, error out before PREPARE phase of 2PC +1: select gp_inject_fault('qe_start_prepared', 'error', dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1: insert into hs_abort_dtx1 select * from generate_series(1,10); +ERROR: fault triggered, fault name:'qe_start_prepared' fault type:'error' (seg0 127.0.1.1:7002 pid=343) +1: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +-- 2. 
on seg1, error out before COMMIT phase of 2PC +1: select gp_inject_fault('qe_start_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1: insert into hs_abort_dtx2 select * from generate_series(1,10); +INSERT 10 +1: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) + +-- standby should not see dtx1 which is aborted but should see dtx2 which is recovered +-1S: select * from hs_abort_dtx1; + a +--- +(0 rows) +-1S: select * from hs_abort_dtx2; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) + +---------------------------------------------------------------- +-- Test isolation between hot standby query and in-progress dtx, +-- but also run more queries in between +---------------------------------------------------------------- +1: create table hs_dtx3(a int); +CREATE + +-- inject faults to suspend segments in 2PC +1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1&: insert into hs_dtx3 select * from generate_series(1,10); +2: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +2&: insert into hs_dtx3 select * from generate_series(11,20); + +-- standby should not see rows in the in-progress dtx +-1S: select * from hs_dtx3; + a +--- +(0 rows) + +-- now run some dtx and completed +3: insert into hs_dtx3 values(99); +INSERT 1 +3: create table hs_dtx4(a int); +CREATE +3: insert into hs_dtx4 select * from generate_series(1,10); +INSERT 10 + +-- standby should still not see rows in the in-progress DTX, but should see the completed ones +-1S: select * from hs_dtx3; + a +---- + 99 +(1 row) +-1S: select * from hs_dtx4; + a +---- + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 + 1 +(10 rows) + +3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1<: <... completed> +INSERT 10 +2<: <... completed> +INSERT 10 + +-- standby should see all rows now +-1S: select * from hs_dtx3; + a +---- + 1 + 12 + 15 + 20 + 2 + 3 + 4 + 7 + 8 + 16 + 18 + 19 + 99 + 5 + 6 + 9 + 10 + 11 + 13 + 14 + 17 +(21 rows) + +---------------------------------------------------------------- +-- Test isolation between standby QD and in-progress dtx, +-- but after standby QD resets and gets running DTX from checkpoint. +---------------------------------------------------------------- +1: create table hs_t5(a int, b text); +CREATE +1: create table hs_t6(a int, b text); +CREATE + +-- inject fault to suspend a primary right before it conducts the commit phase of 2PC, +-- so in the subsequent INSERT, all local transactions will be committed but the dtx is not. 
+1: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1&: insert into hs_t5 select i, 'in-progress' from generate_series(1,10) i; + +-- now run some dtx and completed, and primary conducts a checkpoint +2: insert into hs_t5 values(1, 'commited'); +INSERT 1 +2: insert into hs_t6 select i, 'committed' from generate_series(1,10) i; +INSERT 10 +2: begin; +BEGIN +2: insert into hs_t5 values(99, 'aborted'); +INSERT 1 +2: abort; +ABORT +2: checkpoint; +CHECKPOINT + +-- now make the standby QD resets itself +-1S: select gp_inject_fault('exec_simple_query_start', 'panic', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-1S: select 1; +PANIC: fault triggered, fault name:'exec_simple_query_start' fault type:'panic' +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +-1Sq: ... + +-- standby should still not see rows in the in-progress DTX, but should see the completed ones +-1S: select * from hs_t5; + a | b +---+---------- + 1 | commited +(1 row) +-1S: select * from hs_t6; + a | b +----+----------- + 1 | committed + 2 | committed + 3 | committed + 4 | committed + 7 | committed + 8 | committed + 5 | committed + 6 | committed + 9 | committed + 10 | committed +(10 rows) + +2: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1<: <... completed> +INSERT 10 + +-- standby should see all rows now +-1S: select * from hs_t5; + a | b +----+------------- + 1 | in-progress + 1 | commited + 5 | in-progress + 6 | in-progress + 9 | in-progress + 10 | in-progress + 2 | in-progress + 3 | in-progress + 4 | in-progress + 7 | in-progress + 8 | in-progress +(11 rows) +-1S: select * from hs_t6; + a | b +----+----------- + 5 | committed + 6 | committed + 9 | committed + 10 | committed + 1 | committed + 2 | committed + 3 | committed + 4 | committed + 7 | committed + 8 | committed +(10 rows) + +-- standby should correctly see more in-progress dtx on the primary. +-- context: previously this would be fail because the standby updates latestCompletedGxid to the +-- bumped nextGxid from checkpoint, which is too far (so that it thinks the new dtx already completed). 
+1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1&: delete from hs_t5; +2: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +2&: delete from hs_t6; + +-- standby should not see the effect of the deletes +-1S: select * from hs_t5; + a | b +----+------------- + 2 | in-progress + 3 | in-progress + 4 | in-progress + 7 | in-progress + 8 | in-progress + 1 | in-progress + 1 | commited + 5 | in-progress + 6 | in-progress + 9 | in-progress + 10 | in-progress +(11 rows) +-1S: select * from hs_t6; + a | b +----+----------- + 1 | committed + 2 | committed + 3 | committed + 4 | committed + 7 | committed + 8 | committed + 5 | committed + 6 | committed + 9 | committed + 10 | committed +(10 rows) + +3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) + +1<: <... completed> +DELETE 11 +2<: <... completed> +DELETE 10 + +-- standby now see those deletes +-1S: select * from hs_t5; + a | b +---+--- +(0 rows) +-1S: select * from hs_t6; + a | b +---+--- +(0 rows) + +---------------------------------------------------------------- +-- Read-committed isolation: query on hot standby should not see dtx that completed after it +-- created distributed snapshot, but should see dtx that completed before that. +---------------------------------------------------------------- + +1: create table hs_rc(a int); +CREATE +1: insert into hs_rc select * from generate_series(1,10); +INSERT 10 + +-- case 1: suspend SELECT on the standby QD right after it created snapshot +-1S: select gp_inject_fault('select_after_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-1S&: select * from hs_rc; + +-- new INSERT or DELETE won't be observed by the standby +1: insert into hs_rc select * from generate_series(11,20); +INSERT 10 +1: delete from hs_rc where a < 5; +DELETE 4 +1: select gp_inject_fault('select_after_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) + +-- should only see the rows at the time when SELECT started (1...10). +-1S<: <... 
completed> + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) + +-- SELECT again, should see the effect from the INSERT and DELETE now +-1S: select * from hs_rc; + a +---- + 12 + 15 + 20 + 7 + 8 + 16 + 18 + 19 + 5 + 6 + 9 + 10 + 11 + 13 + 14 + 17 +(16 rows) + +-- case 2: suspend SELECT on the standby QD before creating snapshot +-1S: select gp_inject_fault('select_before_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-1S&: select * from hs_rc; + +1: insert into hs_rc select * from generate_series(21,30); +INSERT 10 +1: delete from hs_rc where a < 21; +DELETE 16 +1: select gp_inject_fault('select_before_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) + +-- standby should see the effect of the INSERT and DELETE +-1S<: <... completed> + a +---- + 23 + 26 + 30 + 22 + 24 + 27 + 29 + 21 + 25 + 28 +(10 rows) + +---------------------------------------------------------------- +-- Read-committed isolation in the BEGIN...END block +---------------------------------------------------------------- + +1: truncate hs_rc; +TRUNCATE +1: insert into hs_rc select * from generate_series(1,30); +INSERT 30 + +-1S: begin; +BEGIN +-1S: select count(*) from hs_rc; + count +------- + 30 +(1 row) + +-- have some concurrent sessions on primary QD: +-- 1. a completed transaction +1: delete from hs_rc where a <= 10; +DELETE 10 +-- 3. an aborted transaction +2: begin; +BEGIN +2: delete from hs_rc where a > 10 and a <= 20; +DELETE 10 +2: abort; +ABORT +-- 3. an ongoing transaction +3: begin; +BEGIN +3: delete from hs_rc where a > 20 and a <= 30; +DELETE 10 + +-- the standby should see results accordingly +-1S: select * from hs_rc; + a +---- + 12 + 15 + 20 + 23 + 26 + 30 + 11 + 13 + 14 + 17 + 21 + 25 + 28 + 16 + 18 + 19 + 22 + 24 + 27 + 29 +(20 rows) +-1S: end; +END + +3: end; +END +-1S: select * from hs_rc; + a +---- + 12 + 15 + 20 + 11 + 13 + 14 + 17 + 16 + 18 + 19 +(10 rows) + +---------------------------------------------------------------- +-- Repeatable-read isolation: distributed snapshot is created at time of the +-- first query in transaction block. All queries in the transaction block +-- should only see results committed before the distributed snapshot creation. 
+---------------------------------------------------------------- + +1: create table hs_rr(a int); +CREATE +1: insert into hs_rr select * from generate_series(1,10); +INSERT 10 + +-1S: begin isolation level repeatable read; +BEGIN +-- should see 10 +-1S: select count(*) from hs_rr; + count +------- + 10 +(1 row) + +-- do some more INSERT, DELETE and UPDATE +1: insert into hs_rr select * from generate_series(11,20); +INSERT 10 +1: delete from hs_rr where a <= 10; +DELETE 10 +1: update hs_rr set a = a + 100; +UPDATE 10 + +-- should still the initial rows {1...10} +-1S: select * from hs_rr; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) +-1S: end; +END + +-- should see the results from the INSERT, DELETE and UPDATE +-1S: begin isolation level repeatable read; +BEGIN +-1S: select * from hs_rr; + a +----- + 115 + 120 + 118 + 113 + 114 + 112 + 116 + 119 + 111 + 117 +(10 rows) + +-- standby won't see ongoing or aborted transactions either +1: begin; +BEGIN +1: insert into hs_rr select * from generate_series(1,10); +INSERT 10 +2: begin; +BEGIN +2: insert into hs_rr select * from generate_series(1,10); +INSERT 10 +2: abort; +ABORT + +-1S: select * from hs_rr; + a +----- + 114 + 115 + 120 + 118 + 113 + 112 + 116 + 119 + 111 + 117 +(10 rows) + +1: end; +END +-1S: end; +END + +---------------------------------------------------------------- +-- Transaction isolation is respected in subtransactions too +---------------------------------------------------------------- + +1: create table hs_subtrx(a int); +CREATE + +-- (1) read-committed +-1S: begin; +BEGIN +-1S: select count(*) from hs_subtrx; + count +------- + 0 +(1 row) +-1S: savepoint s1; +SAVEPOINT + +1: insert into hs_subtrx select * from generate_series(1,10); +INSERT 10 + +-1S: select count(*) from hs_subtrx; + count +------- + 10 +(1 row) +-1S: savepoint s2; +SAVEPOINT +-1S: select count(*) from hs_subtrx; + count +------- + 10 +(1 row) +-1S: rollback to savepoint s1; +ROLLBACK +-1S: select count(*) from hs_subtrx; + count +------- + 10 +(1 row) +-1S: end; +END + +-- (2) repeatable-read +-1S: begin isolation level repeatable read; +BEGIN +-1S: select * from hs_subtrx; + a +---- + 1 + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 +(10 rows) +-1S: savepoint s1; +SAVEPOINT + +1: insert into hs_subtrx select * from generate_series(11,20); +INSERT 10 +1: delete from hs_subtrx where a <= 10; +DELETE 10 +1: update hs_subtrx set a = a + 100; +UPDATE 10 + +-1S: select * from hs_subtrx; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) +-1S: savepoint s2; +SAVEPOINT +-1S: select * from hs_subtrx; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) +-1S: rollback to savepoint s1; +ROLLBACK +-1S: select * from hs_subtrx; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) +-1S: end; +END +-1S: select * from hs_subtrx; + a +----- + 114 + 115 + 120 + 118 + 113 + 112 + 116 + 119 + 111 + 117 +(10 rows) + +---------------------------------------------------------------- +-- Various isolation tests that involve AO/CO table. 
+---------------------------------------------------------------- +1: create table hs_ao(a int, id int unique) using ao_row; +CREATE +1: insert into hs_ao select 1,i from generate_series(1,10) i; +INSERT 10 +1: begin; +BEGIN +1: insert into hs_ao select 2,i from generate_series(11,20) i; +INSERT 10 + +-- standby sees the same AO metadata as primary +2: select * from gp_toolkit.__gp_aoseg('hs_ao'); + segment_id | segno | eof | tupcount | varblockcount | eof_uncompressed | modcount | formatversion | state +------------+-------+-----+----------+---------------+------------------+----------+---------------+------- + 0 | 1 | 128 | 5 | 1 | 128 | 1 | 3 | 1 + 1 | 1 | 40 | 1 | 1 | 40 | 1 | 3 | 1 + 2 | 1 | 104 | 4 | 1 | 104 | 1 | 3 | 1 +(3 rows) +-1S: select * from gp_toolkit.__gp_aoseg('hs_ao'); + segment_id | segno | eof | tupcount | varblockcount | eof_uncompressed | modcount | formatversion | state +------------+-------+-----+----------+---------------+------------------+----------+---------------+------- + 0 | 1 | 128 | 5 | 1 | 128 | 1 | 3 | 1 + 1 | 1 | 40 | 1 | 1 | 40 | 1 | 3 | 1 + 2 | 1 | 104 | 4 | 1 | 104 | 1 | 3 | 1 +(3 rows) +2: select (gp_toolkit.__gp_aoblkdir('hs_ao')).* from gp_dist_random('gp_id'); + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,2) | 1 | 0 | 0 | 1 | 0 | 4 + (0,2) | 1 | 0 | 0 | 1 | 0 | 1 + (0,2) | 1 | 0 | 0 | 1 | 0 | 5 +(3 rows) +-1S: select (gp_toolkit.__gp_aoblkdir('hs_ao')).* from gp_dist_random('gp_id'); + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,2) | 1 | 0 | 0 | 1 | 0 | 5 + (0,2) | 1 | 0 | 0 | 1 | 0 | 1 + (0,2) | 1 | 0 | 0 | 1 | 0 | 4 +(3 rows) + +-- standby sees correct table data +-1S: select * from hs_ao; + a | id +---+---- + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 7 + 1 | 8 + 1 | 1 + 1 | 5 + 1 | 6 + 1 | 9 + 1 | 10 +(10 rows) + +-- standby sees the effect of vacuum +1: end; +END +1: delete from hs_ao where a = 1; +DELETE 10 +1: vacuum hs_ao; +VACUUM +1: select * from gp_toolkit.__gp_aoseg('hs_ao'); + segment_id | segno | eof | tupcount | varblockcount | eof_uncompressed | modcount | formatversion | state +------------+-------+-----+----------+---------------+------------------+----------+---------------+------- + 2 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1 + 2 | 2 | 104 | 4 | 1 | 104 | 0 | 3 | 1 + 0 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1 + 0 | 2 | 88 | 3 | 1 | 88 | 0 | 3 | 1 + 1 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1 + 1 | 2 | 88 | 3 | 1 | 88 | 0 | 3 | 1 +(6 rows) +-1S: select * from gp_toolkit.__gp_aoseg('hs_ao'); + segment_id | segno | eof | tupcount | varblockcount | eof_uncompressed | modcount | formatversion | state +------------+-------+-----+----------+---------------+------------------+----------+---------------+------- + 2 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1 + 2 | 2 | 104 | 4 | 1 | 104 | 0 | 3 | 1 + 0 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1 + 0 | 2 | 88 | 3 | 1 | 88 | 0 | 3 | 1 + 1 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1 + 1 | 2 | 88 | 3 | 1 | 88 | 0 | 3 | 1 +(6 rows) +-1S: select * from hs_ao; + a | id +---+---- + 2 | 11 + 2 | 13 + 2 | 14 + 2 | 17 + 2 | 12 + 2 | 15 + 2 | 20 + 2 | 16 + 2 | 18 + 2 | 19 +(10 rows) diff --git a/src/test/isolation2/hot_standby_schedule b/src/test/isolation2/hot_standby_schedule new file mode 100644 index 00000000000..73e0f71a84c --- /dev/null +++ b/src/test/isolation2/hot_standby_schedule @@ -0,0 +1,6 @@ +test: 
hot_standby/setup +test: hot_standby/basic +test: hot_standby/transaction_isolation +test: hot_standby/query_conflict +test: hot_standby/faults +test: hot_standby/teardown diff --git a/src/test/isolation2/input/hot_standby/query_conflict.source b/src/test/isolation2/input/hot_standby/query_conflict.source new file mode 100644 index 00000000000..5f2aee3be53 --- /dev/null +++ b/src/test/isolation2/input/hot_standby/query_conflict.source @@ -0,0 +1,225 @@ +-- Tests for query conflict detection and cancellation on the hot standby. + +---------------------------------------------------------------- +-- Various query conflcit cases for hot standy. +-- +-- All cases are written in this pattern: +-- 1. Start a standby transaction that will be conflicted and cancelled; +-- 2. Start a primary transaction that will conflict it; +-- 3. Commit the primary transaction. Since we are using remote_apply, it will +-- wait until the WAL is applied on the standby, which would happen only +-- after the standby query is cancelled; +-- 4. Run something on the standby transaction and see the conflict error, which +-- in some cases it's ERROR, in others it's FATAL. +-- 5. Quit, establish a new connection, and re-run +-- 6. Check the system view gp_stat_database_conflicts to see that the conflict +-- has been recorded. Note that we print the max count among all segments +-- to avoid flakiness. +-- See https://www.postgresql.org/docs/12/hot-standby.html#HOT-STANDBY-CONFLICT for more details. +---------------------------------------------------------------- + +-- We assume we start the test with clean records +-1S: select max(confl_tablespace), max(confl_lock), max(confl_snapshot), max(confl_bufferpin), max(confl_deadlock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +--------------------------------------------------------------------- +-- Conflict with explicit lock +--------------------------------------------------------------------- +create table hs_qc_lock(a int); +insert into hs_qc_lock select * from generate_series(1,5); +-1S: begin; +-1S: select * from hs_qc_lock; +1: begin; +1: lock table hs_qc_lock in access exclusive mode; +1: end; +-1S: select * from hs_qc_lock; +-1Sq: +-1S: select * from hs_qc_lock; +-1S: select max(confl_lock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +--------------------------------------------------------------------- +-- Conflict with implicit lock +--------------------------------------------------------------------- +-1S: begin; +-1S: select * from hs_qc_lock; +1: alter table hs_qc_lock set access method ao_row; +-1S: select * from hs_qc_lock; +-1Sq: +-1S: select * from hs_qc_lock; +-1S: select max(confl_lock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +--------------------------------------------------------------------- +-- Conflict with drop database +--------------------------------------------------------------------- +1: create database hs_qc_dropdb; +-1Sq: +-1S:@db_name hs_qc_dropdb: select 1; +1: drop database hs_qc_dropdb; +-1S: select 1; +-1Sq: +-- Stats aren't counted for database conflicts. 
See: pgstat_recv_recoveryconflict + +--------------------------------------------------------------------- +-- Conflict with VACUUM (snapshot) +--------------------------------------------------------------------- +1: create table hs_qc_vac1(a int); +1: insert into hs_qc_vac1 select * from generate_series(1,10); +-1S: begin transaction isolation level repeatable read; +-1S: select count(*) from hs_qc_vac1; +1: delete from hs_qc_vac1; +1: vacuum hs_qc_vac1; +-1S: select count(*) from hs_qc_vac1; +-1Sq: +-1S: select max(confl_snapshot) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +--------------------------------------------------------------------- +-- Conflict with VACUUM (buffer pin) +-- VACUUM of page that the standby is still holding buffer pin on, the difference with +-- the previous case is that here the deleted row is already invisible to the standby. +--------------------------------------------------------------------- +1: create table hs_qc_vac2(a int); +1: insert into hs_qc_vac2 values(2); +1: delete from hs_qc_vac2; +-- run select once on the standby, so the next select will fetch data from buffer +-1S: select * from hs_qc_vac2; +-- suspend the standby at where it just unlocks the buffer but still holds the pin +1: select gp_inject_fault('heapgetpage_after_unlock_buffer', 'suspend','','','hs_qc_vac2',1,1,0,dbid) from gp_segment_configuration where content=0 and role='m'; +-- we'll also make sure the startup process has sent out the signal before we let the standby backend release the pin +1: select gp_inject_fault('recovery_conflict_bufferpin_signal_sent', 'skip',dbid) from gp_segment_configuration where content=0 and role='m'; +-1S&: select * from hs_qc_vac2; +1: vacuum hs_qc_vac2; +-- as mentioned before, make sure startup process has sent the signal, and then let the standby proceed +1: select gp_wait_until_triggered_fault('recovery_conflict_bufferpin_signal_sent', 1,dbid) from gp_segment_configuration where content=0 and role='m'; +1: select gp_inject_fault('recovery_conflict_bufferpin_signal_sent', 'reset',dbid) from gp_segment_configuration where content=0 and role='m'; +1: select gp_inject_fault('heapgetpage_after_unlock_buffer', 'reset',dbid) from gp_segment_configuration where content=0 and role='m'; +-- should see the conflict +-1S<: +-1Sq: +-- XXX: sometimes it shows the number is 2 instead of 1. It still validates the test but it would be nice to know why. +-1S: select max(confl_bufferpin) > 0 from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +--------------------------------------------------------------------- +-- Conflict with drop (temp) tablespace +-- Note: regular user tablespaces won't cause conflict on the standby since the standby cannot create any objects under them. 
+--------------------------------------------------------------------- +-- create tablespace +!\retcode mkdir -p @testtablespace@/hs_tablespace_directory; +create tablespace hs_ts location '@testtablespace@/hs_tablespace_directory'; + +-- some prepartion on the primary +create table hs_ts_foo (i int, j int) distributed by(i); +insert into hs_ts_foo select i, i from generate_series(1,800000)i; +analyze hs_ts_foo; + +-- make sure the standby won't run too fast and delete the temp files +select gp_inject_fault('after_open_temp_file', 'suspend',dbid) from gp_segment_configuration where content=1 and role='m'; + +-- on the standby, run some query that requires workfile, this example is taken +-- from regress/temp_tablespaces test +-1S: set temp_tablespaces = hs_ts; +-1S: set default_tablespace = hs_ts; +-1S: set statement_mem='2MB'; +-1S&: with a1 as (select * from hs_ts_foo), a2 as (select * from hs_ts_foo) select a1.i xx from a1 inner join a2 on a2.i = a1.i union all select count(a1.i) from a1 inner join a2 on a2.i = a1.i order by xx limit 5; + +-- drop tablespace, should see conflict on the hot standby +drop tablespace hs_ts; +select gp_inject_fault('after_open_temp_file', 'reset',dbid) from gp_segment_configuration where content=1 and role='m'; +-1S<: +-1Sq: + +-- conflict has been recorded. The query has multiple slices +-1S: select max(confl_tablespace) >= 1 from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +-- cleanup +!\retcode rm -rf @testtablespace@/hs_tablespace_directory; +-- Do one checkpoint. Otherwise if server restarts w/o doing checkpoint (some subsequent +-- tests might do that), the server would complain it cannot find the directory for hs_ts. +checkpoint; + +---------------------------------------------------------------- +-- Additional case to show that distributed transaction is not taken into +-- account w/o the help of restore-point-based distributed snapshot creation. +---------------------------------------------------------------- + +1: create table hs_qc_ds1(a int); +1: insert into hs_qc_ds1 select * from generate_series(1,10); +-- standby starts a repeatable read transaction, runs a local query that +-- creates a distributed snapshot w/o creating QE. +-1S: select count(*) from hs_qc_ds1; +-1S: begin transaction isolation level repeatable read; +-1S: select relname from pg_class where relname = 'hs_qc_ds1'; +-- primary runs VACUUM +1: delete from hs_qc_ds1; +1: vacuum hs_qc_ds1; +-- The standby query in theory should be cancelled, because it started before +-- the VACUUM. But in reality, it doesn't, and sees 0 rows, because the QE for the +-- SELECT below will create more recent local snapshot that does not conflict with +-- the VACUUM, and sees the result of DELETE+VACUUM. +-- Note: with the help of restore point, we would be able to create local snapshot +-- precisely corresponding to each distributed snapshot, and do conflict detection accordingly. 
+-1S: select count(*) from hs_qc_ds1; +-1S: end; + +---------------------------------------------------------------- +-- Test GUC hot_standby_feedback +---------------------------------------------------------------- +!\retcode gpconfig -c hot_standby_feedback -v on; +!\retcode gpstop -u; + +1: create table hs_qc_guc1(a int); +1: insert into hs_qc_guc1 select * from generate_series(1,10); + +-1S: begin transaction isolation level repeatable read; +-1S: select * from hs_qc_guc1; + +-- VACUUM won't cleanup this table since the standby still sees it +1: delete from hs_qc_guc1; +1: vacuum hs_qc_guc1; + +-- hot standby can still see those rows +-1S: select * from hs_qc_guc1; + +-- after the conflicting read transaction ends, the next VACUUM will successfully vacuum the table +-1S: end; +1: vacuum hs_qc_guc1; +-1S: select * from hs_qc_guc1; +-1Sq: + +!\retcode gpconfig -r hot_standby_feedback; +!\retcode gpstop -u; + +---------------------------------------------------------------- +-- Test GUC vacuum_defer_cleanup_age +---------------------------------------------------------------- +-- Use a GUC value that's not 0, so VACUUM does not clean up +-- recent dead rows that the hot standby might be still seeing. +!\retcode gpconfig -c vacuum_defer_cleanup_age -v 1; +!\retcode gpstop -u; + +1: create table hs_qc_guc2(a int); +1: insert into hs_qc_guc2 select * from generate_series(1,10); + +-1S: begin transaction isolation level repeatable read; +-1S: select count(*) from hs_qc_guc2; + +-- VACUUM won't cleanup this table since the DELETE is still within vacuum_defer_cleanup_age +1: delete from hs_qc_guc2; +1: vacuum hs_qc_guc2; + +-- showing all rows are deleted but not vacuumed +1: select count(*) from hs_qc_guc2; +1: set gp_select_invisible to on; +1: select count(*) from hs_qc_guc2; + +-- hot standby can still query the table +-1S: select count(*) from hs_qc_guc2; + +-- only if the age is reached, hot standby will see the same conflict as before +1: create temp table tt1(a int); +1: vacuum hs_qc_guc2; +-1S: select count(*) from hs_qc_guc2; +-1Sq: +-1S: select max(confl_snapshot) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +!\retcode gpconfig -r vacuum_defer_cleanup_age; +!\retcode gpstop -u; + diff --git a/src/test/isolation2/output/hot_standby/query_conflict.source b/src/test/isolation2/output/hot_standby/query_conflict.source new file mode 100644 index 00000000000..909d2532df3 --- /dev/null +++ b/src/test/isolation2/output/hot_standby/query_conflict.source @@ -0,0 +1,470 @@ +-- Tests for query conflict detection and cancellation on the hot standby. + +---------------------------------------------------------------- +-- Various query conflcit cases for hot standy. +-- +-- All cases are written in this pattern: +-- 1. Start a standby transaction that will be conflicted and cancelled; +-- 2. Start a primary transaction that will conflict it; +-- 3. Commit the primary transaction. Since we are using remote_apply, it will +-- wait until the WAL is applied on the standby, which would happen only +-- after the standby query is cancelled; +-- 4. Run something on the standby transaction and see the conflict error, which +-- in some cases it's ERROR, in others it's FATAL. +-- 5. Quit, establish a new connection, and re-run +-- 6. Check the system view gp_stat_database_conflicts to see that the conflict +-- has been recorded. Note that we print the max count among all segments +-- to avoid flakiness. 
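As an aside on step 6 of the pattern above: gp_stat_database_conflicts appears to be the cluster-wide counterpart of the stock per-database conflict view, which is why the tests take max() across segments. A minimal, hand-run sketch of the underlying counters on a single standby instance (standard PostgreSQL names, outside the isolation2 harness):

-- per-database recovery-conflict counters on this standby instance
select datname,
       confl_tablespace,   -- cancelled because a tablespace in use was dropped
       confl_lock,         -- cancelled after waiting too long for a relation lock
       confl_snapshot,     -- cancelled because needed row versions were vacuumed away
       confl_bufferpin,    -- cancelled while pinning a buffer the startup process needs
       confl_deadlock      -- cancelled to resolve a deadlock with the startup process
from pg_stat_database_conflicts
where datname = current_database();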
+-- See https://www.postgresql.org/docs/12/hot-standby.html#HOT-STANDBY-CONFLICT for more details. +---------------------------------------------------------------- + +-- We assume we start the test with clean records +-1S: select max(confl_tablespace), max(confl_lock), max(confl_snapshot), max(confl_bufferpin), max(confl_deadlock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + max | max | max | max | max +-----+-----+-----+-----+----- + 0 | 0 | 0 | 0 | 0 +(1 row) + +--------------------------------------------------------------------- +-- Conflict with explicit lock +--------------------------------------------------------------------- +create table hs_qc_lock(a int); +CREATE +insert into hs_qc_lock select * from generate_series(1,5); +INSERT 5 +-1S: begin; +BEGIN +-1S: select * from hs_qc_lock; + a +--- + 2 + 3 + 4 + 1 + 5 +(5 rows) +1: begin; +BEGIN +1: lock table hs_qc_lock in access exclusive mode; +LOCK +1: end; +END +-1S: select * from hs_qc_lock; +FATAL: terminating connection due to conflict with recovery +DETAIL: User was holding a relation lock for too long. +HINT: In a moment you should be able to reconnect to the database and repeat your command. +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +-1Sq: ... +-1S: select * from hs_qc_lock; + a +--- + 1 + 5 + 2 + 3 + 4 +(5 rows) +-1S: select max(confl_lock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + max +----- + 1 +(1 row) + +--------------------------------------------------------------------- +-- Conflict with implicit lock +--------------------------------------------------------------------- +-1S: begin; +BEGIN +-1S: select * from hs_qc_lock; + a +--- + 1 + 5 + 2 + 3 + 4 +(5 rows) +1: alter table hs_qc_lock set access method ao_row; +ALTER +-1S: select * from hs_qc_lock; +FATAL: terminating connection due to conflict with recovery +DETAIL: User was holding a relation lock for too long. +HINT: In a moment you should be able to reconnect to the database and repeat your command. +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +-1Sq: ... +-1S: select * from hs_qc_lock; + a +--- + 1 + 5 + 2 + 3 + 4 +(5 rows) +-1S: select max(confl_lock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + max +----- + 2 +(1 row) + +--------------------------------------------------------------------- +-- Conflict with drop database +--------------------------------------------------------------------- +1: create database hs_qc_dropdb; +CREATE +-1Sq: ... +-1S:@db_name hs_qc_dropdb: select 1; + ?column? +---------- + 1 +(1 row) +1: drop database hs_qc_dropdb; +DROP +-1S: select 1; +FATAL: terminating connection due to conflict with recovery +DETAIL: User was connected to a database that must be dropped. +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +-1Sq: ... +-- Stats aren't counted for database conflicts. 
See: pgstat_recv_recoveryconflict + +--------------------------------------------------------------------- +-- Conflict with VACUUM (snapshot) +--------------------------------------------------------------------- +1: create table hs_qc_vac1(a int); +CREATE +1: insert into hs_qc_vac1 select * from generate_series(1,10); +INSERT 10 +-1S: begin transaction isolation level repeatable read; +BEGIN +-1S: select count(*) from hs_qc_vac1; + count +------- + 10 +(1 row) +1: delete from hs_qc_vac1; +DELETE 10 +1: vacuum hs_qc_vac1; +VACUUM +-1S: select count(*) from hs_qc_vac1; +DETAIL: User query might have needed to see row versions that must be removed. +ERROR: terminating connection due to conflict with recovery +HINT: In a moment you should be able to reconnect to the database and repeat your command. +-1Sq: ... +-1S: select max(confl_snapshot) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + max +----- + 1 +(1 row) + +--------------------------------------------------------------------- +-- Conflict with VACUUM (buffer pin) +-- VACUUM of page that the standby is still holding buffer pin on, the difference with +-- the previous case is that here the deleted row is already invisible to the standby. +--------------------------------------------------------------------- +1: create table hs_qc_vac2(a int); +CREATE +1: insert into hs_qc_vac2 values(2); +INSERT 1 +1: delete from hs_qc_vac2; +DELETE 1 +-- run select once on the standby, so the next select will fetch data from buffer +-1S: select * from hs_qc_vac2; + a +--- +(0 rows) +-- suspend the standby at where it just unlocks the buffer but still holds the pin +1: select gp_inject_fault('heapgetpage_after_unlock_buffer', 'suspend','','','hs_qc_vac2',1,1,0,dbid) from gp_segment_configuration where content=0 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-- we'll also make sure the startup process has sent out the signal before we let the standby backend release the pin +1: select gp_inject_fault('recovery_conflict_bufferpin_signal_sent', 'skip',dbid) from gp_segment_configuration where content=0 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-1S&: select * from hs_qc_vac2; +1: vacuum hs_qc_vac2; +VACUUM +-- as mentioned before, make sure startup process has sent the signal, and then let the standby proceed +1: select gp_wait_until_triggered_fault('recovery_conflict_bufferpin_signal_sent', 1,dbid) from gp_segment_configuration where content=0 and role='m'; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +1: select gp_inject_fault('recovery_conflict_bufferpin_signal_sent', 'reset',dbid) from gp_segment_configuration where content=0 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +1: select gp_inject_fault('heapgetpage_after_unlock_buffer', 'reset',dbid) from gp_segment_configuration where content=0 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-- should see the conflict +-1S<: <... completed> +ERROR: canceling statement due to conflict with recovery (seg0 slice1 127.0.1.1:7005 pid=17044) +DETAIL: User was holding shared buffer pin for too long. +-1Sq: ... +-- XXX: sometimes it shows the number is 2 instead of 1. It still validates the test but it would be nice to know why. +-1S: select max(confl_bufferpin) > 0 from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + ?column? 
+---------- + t +(1 row) + +--------------------------------------------------------------------- +-- Conflict with drop (temp) tablespace +-- Note: regular user tablespaces won't cause conflict on the standby since the standby cannot create any objects under them. +--------------------------------------------------------------------- +-- create tablespace +!\retcode mkdir -p @testtablespace@/hs_tablespace_directory; +(exited with code 0) +create tablespace hs_ts location '@testtablespace@/hs_tablespace_directory'; +CREATE + +-- some prepartion on the primary +create table hs_ts_foo (i int, j int) distributed by(i); +CREATE +insert into hs_ts_foo select i, i from generate_series(1,800000)i; +INSERT 800000 +analyze hs_ts_foo; +ANALYZE + +-- make sure the standby won't run too fast and delete the temp files +select gp_inject_fault('after_open_temp_file', 'suspend',dbid) from gp_segment_configuration where content=1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) + +-- on the standby, run some query that requires workfile, this example is taken +-- from regress/temp_tablespaces test +-1S: set temp_tablespaces = hs_ts; +SET +-1S: set default_tablespace = hs_ts; +SET +-1S: set statement_mem='2MB'; +SET +-1S&: with a1 as (select * from hs_ts_foo), a2 as (select * from hs_ts_foo) select a1.i xx from a1 inner join a2 on a2.i = a1.i union all select count(a1.i) from a1 inner join a2 on a2.i = a1.i order by xx limit 5; + +-- drop tablespace, should see conflict on the hot standby +drop tablespace hs_ts; +DROP +select gp_inject_fault('after_open_temp_file', 'reset',dbid) from gp_segment_configuration where content=1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-1S<: <... completed> +ERROR: canceling statement due to conflict with recovery (seg1 slice3 127.0.1.1:7006 pid=990) +DETAIL: User was or might have been using tablespace that must be dropped. +-1Sq: ... + +-- conflict has been recorded. The query has multiple slices +-1S: select max(confl_tablespace) >= 1 from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + ?column? +---------- + t +(1 row) + +-- cleanup +!\retcode rm -rf @testtablespace@/hs_tablespace_directory; +GP_IGNORE:-- start_ignore +GP_IGNORE: +GP_IGNORE:-- end_ignore +(exited with code 0) +-- Do one checkpoint. Otherwise if server restarts w/o doing checkpoint (some subsequent +-- tests might do that), the server would complain it cannot find the directory for hs_ts. +checkpoint; +CHECKPOINT + +---------------------------------------------------------------- +-- Additional case to show that distributed transaction is not taken into +-- account w/o the help of restore-point-based distributed snapshot creation. +---------------------------------------------------------------- + +1: create table hs_qc_ds1(a int); +CREATE +1: insert into hs_qc_ds1 select * from generate_series(1,10); +INSERT 10 +-- standby starts a repeatable read transaction, runs a local query that +-- creates a distributed snapshot w/o creating QE. +-1S: select count(*) from hs_qc_ds1; + count +------- + 10 +(1 row) +-1S: begin transaction isolation level repeatable read; +BEGIN +-1S: select relname from pg_class where relname = 'hs_qc_ds1'; + relname +----------- + hs_qc_ds1 +(1 row) +-- primary runs VACUUM +1: delete from hs_qc_ds1; +DELETE 10 +1: vacuum hs_qc_ds1; +VACUUM +-- The standby query in theory should be cancelled, because it started before +-- the VACUUM. 
But in reality, it doesn't, and sees 0 rows, because the QE for the +-- SELECT below will create more recent local snapshot that does not conflict with +-- the VACUUM, and sees the result of DELETE+VACUUM. +-- Note: with the help of restore point, we would be able to create local snapshot +-- precisely corresponding to each distributed snapshot, and do conflict detection accordingly. +-1S: select count(*) from hs_qc_ds1; + count +------- + 0 +(1 row) +-1S: end; +END + +---------------------------------------------------------------- +-- Test GUC hot_standby_feedback +---------------------------------------------------------------- +!\retcode gpconfig -c hot_standby_feedback -v on; +(exited with code 0) +!\retcode gpstop -u; +(exited with code 0) + +1: create table hs_qc_guc1(a int); +CREATE +1: insert into hs_qc_guc1 select * from generate_series(1,10); +INSERT 10 + +-1S: begin transaction isolation level repeatable read; +BEGIN +-1S: select * from hs_qc_guc1; + a +---- + 1 + 10 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +(10 rows) + +-- VACUUM won't cleanup this table since the standby still sees it +1: delete from hs_qc_guc1; +DELETE 10 +1: vacuum hs_qc_guc1; +VACUUM + +-- hot standby can still see those rows +-1S: select * from hs_qc_guc1; + a +---- + 1 + 10 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +(10 rows) + +-- after the conflicting read transaction ends, the next VACUUM will successfully vacuum the table +-1S: end; +END +1: vacuum hs_qc_guc1; +VACUUM +-1S: select * from hs_qc_guc1; + a +--- +(0 rows) +-1Sq: ... + +!\retcode gpconfig -r hot_standby_feedback; +(exited with code 0) +!\retcode gpstop -u; +(exited with code 0) + +---------------------------------------------------------------- +-- Test GUC vacuum_defer_cleanup_age +---------------------------------------------------------------- +-- Use a GUC value that's not 0, so VACUUM does not clean up +-- recent dead rows that the hot standby might be still seeing. +!\retcode gpconfig -c vacuum_defer_cleanup_age -v 1; +(exited with code 0) +!\retcode gpstop -u; +(exited with code 0) + +1: create table hs_qc_guc2(a int); +CREATE +1: insert into hs_qc_guc2 select * from generate_series(1,10); +INSERT 10 + +-1S: begin transaction isolation level repeatable read; +BEGIN +-1S: select count(*) from hs_qc_guc2; + count +------- + 10 +(1 row) + +-- VACUUM won't cleanup this table since the DELETE is still within vacuum_defer_cleanup_age +1: delete from hs_qc_guc2; +DELETE 10 +1: vacuum hs_qc_guc2; +VACUUM + +-- showing all rows are deleted but not vacuumed +1: select count(*) from hs_qc_guc2; + count +------- + 0 +(1 row) +1: set gp_select_invisible to on; +SET +1: select count(*) from hs_qc_guc2; + count +------- + 10 +(1 row) + +-- hot standby can still query the table +-1S: select count(*) from hs_qc_guc2; + count +------- + 10 +(1 row) + +-- only if the age is reached, hot standby will see the same conflict as before +1: create temp table tt1(a int); +CREATE +1: vacuum hs_qc_guc2; +VACUUM +-1S: select count(*) from hs_qc_guc2; +ERROR: terminating connection due to conflict with recovery (seg0 slice1 127.0.1.1:7005 pid=18713) +DETAIL: User query might have needed to see row versions that must be removed. +HINT: In a moment you should be able to reconnect to the database and repeat your command. +-1Sq: ... 
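An aside on the two GUC cases above before the test's own counter check continues below: with hot_standby_feedback on, the standby reports its oldest snapshot xmin back to the primary, and that is what keeps VACUUM from removing the rows; vacuum_defer_cleanup_age instead holds cleanup back by a fixed number of transactions. A sketch of how one could observe the feedback on the primary (standard view, shown only for orientation):

-- on the primary: the xmin reported by the standby, which holds back VACUUM cleanup
select application_name, state, backend_xmin
from pg_stat_replication;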
+-1S: select max(confl_snapshot) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+ max
+-----
+ 2
+(1 row)
+
+!\retcode gpconfig -r vacuum_defer_cleanup_age;
+(exited with code 0)
+!\retcode gpstop -u;
+(exited with code 0)
+
diff --git a/src/test/isolation2/sql/.gitignore b/src/test/isolation2/sql/.gitignore
index 361b986e18d..bfc3709082c 100644
--- a/src/test/isolation2/sql/.gitignore
+++ b/src/test/isolation2/sql/.gitignore
@@ -7,6 +7,7 @@
 /pt_io_in_progress_deadlock.sql
 /distributed_snapshot.sql
 /local_directory_table_mixed.sql
+/hot_standby/query_conflict.sql

 # ignores including sub-directories
 autovacuum-analyze.sql
diff --git a/src/test/isolation2/sql/hot_standby/basic.sql b/src/test/isolation2/sql/hot_standby/basic.sql
new file mode 100644
index 00000000000..a900b38a29c
--- /dev/null
+++ b/src/test/isolation2/sql/hot_standby/basic.sql
@@ -0,0 +1,95 @@
+-- Tests for basic query dispatch on a hot standby.
+
+-- hot_standby must show 'on' and synchronous_commit must be remote_apply for the tests to make sense
+-1S: show hot_standby;
+-1S: show synchronous_commit;
+
+-- will be checking if QD/QE info looks good
+-1S: select id, type, content, port from gp_backend_info();
+
+----------------------------------------------------------------
+-- Test: basic query dispatch
+----------------------------------------------------------------
+create table hs_t1(a int);
+create table hs_t2(a int);
+
+-- standby should see the results for 2pc immediately.
+insert into hs_t1 select * from generate_series(1,10);
+-1S: select * from hs_t1;
+-- standby won't see results for the last 1pc immediately because the standby QD
+-- isn't aware of it so its distributed snapshot doesn't include the 1pc, but
+-- as long as another 2pc comes it will be able to see the previous 1pc. We
+-- tolerate this case in the mirrored cluster setup.
+insert into hs_t2 values(1);
+-1S: select * from hs_t2;
+-- any following 2pc will make the 1pc visible
+create temp table tt(a int);
+-1S: select * from hs_t2;
+
+-- we have three QEs launched on the mirror segments.
+-- note that the first QE on a segment is still a "writer" because we
+-- need it to manage locks, same as read-only queries on a primary QD.
+-1S: select id, type, content, port from gp_backend_info();
+
+-- should have parallel readers launched
+-1S: select * from hs_t1 join (select * from hs_t2) hs_t2 on (hs_t1 = hs_t2);
+-1S: select id, type, content, port from gp_backend_info();
+
+-- now a singleton reader added too
+-1S: select * from hs_t1 join (select oid::int from pg_class) hs_t2 on (hs_t1 = hs_t2);
+-1S: select id, type, content, port from gp_backend_info();
+
+-- un-committed result should not be seen by the standby
+begin;
+insert into hs_t1 select * from generate_series(11,20);
+
+-- standby should only see 1...10
+-1S: select * from hs_t1;
+
+end;
+
+-- standby should see 1...20 now
+-1S: select * from hs_t1;
+
+----------------------------------------------------------------
+-- Test: other things that a hot standby can do.
+--
+-- For more, refer to the regress test 'hs_standby_allowed'.
+---------------------------------------------------------------- +-- set/reset and show GUC +-1S: set optimizer = on; +-1S: show optimizer; +-1S: reset optimizer; +-- copy command +-1S: copy hs_t1 to '/tmp/hs_copyto.csv' csv null ''; +-- query catalogs +-1S: select count(*) from pg_class where relname = 'hs_t1'; +-1S: select dbid,content,role,preferred_role,mode,status from gp_segment_configuration where dbid = current_setting('gp_dbid')::integer; +-- checkpoint is allowed on standby but a restart point is created instead +-1S: checkpoint; + +---------------------------------------------------------------- +-- Test: things that can't be done on a hot standby: +-- no DML, DDL or anything that generates WAL. +-- +-- More refer to regress test 'hs_standby_disallowed'. +---------------------------------------------------------------- +-1S: insert into hs_t1 values(1); +-1S: delete from hs_t1; +-1S: update hs_t1 set a = 0; +-1S: create table hs_t2(a int); +-1S: create database hs_db; +-1S: vacuum hs_t1; + +-- +-- No hintbit WAL generation in SELECT. +-- +create table hs_nohintbit(a int) distributed by (a); +insert into hs_nohintbit select generate_series (1, 10); +-- flush the data to disk +checkpoint; + +-1S: set gp_disable_tuple_hints=off; +-- no WAL is being generated (otherwise an error would occur "cannot make new WAL entries during recovery") +-1S: SELECT count(*) FROM hs_nohintbit; + diff --git a/src/test/isolation2/sql/hot_standby/faults.sql b/src/test/isolation2/sql/hot_standby/faults.sql new file mode 100644 index 00000000000..6e25bcba272 --- /dev/null +++ b/src/test/isolation2/sql/hot_standby/faults.sql @@ -0,0 +1,125 @@ +-- Test system faults scenarios + +-- start_matchsubs +-- +-- m/Is the server running on host.*/ +-- s/Is the server running on host "\d+.\d+.\d+.\d+" and accepting/Is the server running on host and accepting/ +-- m/(seg\d+ \d+.\d+.\d+.\d+:\d+)/ +-- s/(.*)/(seg IP:PORT)/ +-- m/ERROR: connection to dbid 1 .*:7000 failed .*/ +-- s/ERROR: connection to dbid 1 .*:7000 failed .*/ERROR: connection to dbid 1 :7000 failed/ +-- +-- end_matchsubs + +-- Let FTS detect/declare failure sooner +!\retcode gpconfig -c gp_fts_probe_interval -v 10 --coordinatoronly; +!\retcode gpstop -u; + +create table hs_failover(a int); +insert into hs_failover select * from generate_series(1,10); +-1S: select * from hs_failover; + +---------------------------------------------------------------- +-- Mirror segment fails +---------------------------------------------------------------- +select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'm'; + +-- make sure mirror is detected down +create temp table hs_tt(a int); +select gp_request_fts_probe_scan(); + +-- will not succeed +-1S: select * from hs_failover; +-1Sq: + +-- recovery +!\retcode gprecoverseg -aF; + +-- sync-up +select wait_until_all_segments_synchronized(); + +-- works now +-1S: select * from hs_failover; + +---------------------------------------------------------------- +-- Primary segment fails +---------------------------------------------------------------- +-- inject a fault where the mirror gets out of recovery +select gp_inject_fault('out_of_recovery_in_startupxlog', 'skip', dbid) from gp_segment_configuration where content = 1 and role = 'm'; + +select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'p'; +select gp_request_fts_probe_scan(); + +-- make sure failover happens +select dbid, content, role, preferred_role, mode, status from 
gp_segment_configuration where content = 1; +select gp_wait_until_triggered_fault('out_of_recovery_in_startupxlog', 1, dbid) from gp_segment_configuration where content = 1 and role = 'p'; +select gp_inject_fault('out_of_recovery_in_startupxlog', 'reset', dbid) from gp_segment_configuration where content = 1 and role = 'p'; + +-- On an existing standby connection, query will run but it is dispatched to the previous mirror +-- in an existing gang. That mirror is now a primary, so it will complain and the query fails. +-1S: select * from hs_failover; +-1Sq: + +-- will fail due to downed mirror (previous primary) +-1S: select * from hs_failover; +-1Sq: + +-- bring the downed mirror up +!\retcode gprecoverseg -aF; +select wait_until_all_segments_synchronized(); + +-- mirror is up +-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; + +-- now the query will succeed +-1S: select * from hs_failover; +-1Sq: + +-- re-balance, bring the segments to their preferred roles +!\retcode gprecoverseg -ar; +select wait_until_all_segments_synchronized(); +-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; + +-- query runs fine still +-1S: select * from hs_failover; + +---------------------------------------------------------------- +-- DTX recovery +---------------------------------------------------------------- +-- skip FTS probe to prevent unexpected mirror promotion +1: select gp_inject_fault_infinite('fts_probe', 'skip', dbid) from gp_segment_configuration where role='p' and content=-1; + +1: create table tt_hs_dtx(a int); + +-- inject fault to repeatedly fail the COMMIT PREPARE phase of 2PC, which ensures that the dtx cannot finish even by the dtx recovery process. 
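(The fault-injection command for this follows right below.) As background, a dtx stuck in this window is an ordinary in-doubt prepared transaction on the affected segment; a minimal sketch of how to eyeball it by hand, assuming a direct utility-mode connection to that segment:

-- run while connected directly to the affected segment (utility mode);
-- a dtx stuck between PREPARE and COMMIT PREPARED shows up as an in-doubt
-- prepared transaction there:
select gid, prepared, owner, database from pg_prepared_xacts;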
+select gp_inject_fault_infinite('finish_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p'; + +-- session 1 on primary QD tries to commit a DTX, but cannot finish due to the fault on a QE +1&: insert into tt_hs_dtx select * from generate_series(1,10); + +-- inject a panic on primary QD, essentially restarts the primary QD +2: select gp_inject_fault('before_read_command', 'panic', dbid) from gp_segment_configuration where content=-1 and role='p'; +2: select 1; + +1<: +1q: +2q: + +-- standby QD can still run query +-1S: select * from hs_failover; +-- it cannot see rows from the in-doubt DTX +-1S: select * from tt_hs_dtx; + +-- let the failed dtx be recovered, also make sure the standby replays the forget record which signals the completion of the dtx +-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'skip', dbid) from gp_segment_configuration where content=-1 and role='m'; +-1S: select gp_inject_fault_infinite('finish_commit_prepared', 'reset', dbid) from gp_segment_configuration where content=1 and role='p'; +-1S: select gp_wait_until_triggered_fault('redoDistributedForgetCommitRecord', 1, dbid) from gp_segment_configuration where content=-1 and role='m'; +-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; + +-- standby should see the rows from the in-doubt DTX now +-1S: select * from tt_hs_dtx; + +-1S: select wait_until_all_segments_synchronized(); +1: select gp_inject_fault('before_read_command', 'reset', dbid) from gp_segment_configuration where content=-1 and role='p'; +1: select gp_inject_fault('fts_probe', 'reset', dbid) from gp_segment_configuration where role='p' and content=-1; + diff --git a/src/test/isolation2/sql/hot_standby/setup.sql b/src/test/isolation2/sql/hot_standby/setup.sql new file mode 100644 index 00000000000..aa15f468b7d --- /dev/null +++ b/src/test/isolation2/sql/hot_standby/setup.sql @@ -0,0 +1,9 @@ +-- setup for hot standby tests +!\retcode gpconfig -c hot_standby -v on; +-- let primary wait for standby to apply changes, make test less flaky +!\retcode gpconfig -c synchronous_commit -v remote_apply; +-- make it faster to handle query conflict +!\retcode gpconfig -c max_standby_streaming_delay -v 1000; +-- disable autovacuum, to not affect the manual VACUUM in the tests +!\retcode gpconfig -c autovacuum -v off; +!\retcode gpstop -ar; diff --git a/src/test/isolation2/sql/hot_standby/teardown.sql b/src/test/isolation2/sql/hot_standby/teardown.sql new file mode 100644 index 00000000000..af6fba50aed --- /dev/null +++ b/src/test/isolation2/sql/hot_standby/teardown.sql @@ -0,0 +1,5 @@ +-- reset the setup for hot standby tests +!\retcode gpconfig -r hot_standby; +!\retcode gpconfig -r synchronous_commit; +!\retcode gpconfig -r max_standby_streaming_delay; +!\retcode gpstop -ar; diff --git a/src/test/isolation2/sql/hot_standby/transaction_isolation.sql b/src/test/isolation2/sql/hot_standby/transaction_isolation.sql new file mode 100644 index 00000000000..68945228313 --- /dev/null +++ b/src/test/isolation2/sql/hot_standby/transaction_isolation.sql @@ -0,0 +1,319 @@ +---------------------------------------------------------------- +-- Test transaction isolation in general, not specific to dtx +---------------------------------------------------------------- +1: create table hs_tx(a int); +1: insert into hs_tx select * from generate_series(1,10); + +1: begin; +1: insert into hs_tx select * from generate_series(11,20); +2: begin; 
+2: insert into hs_tx select * from generate_series(21,30); +2: abort; + +-- standby should only see completed transactions, not in-progress transactions, nor aborted transactions +-1S: select * from hs_tx; + +1: end; +-1S: select * from hs_tx; + +---------------------------------------------------------------- +-- Test isolation between hot standby query and in-progress dtx +---------------------------------------------------------------- + +1: create table hs_dtx1(a int); +1: create table hs_dtx2(a int); + +-- inject two suspend faults: +-- 1. on seg0, suspend before PREPARE phase of 2PC +1: select gp_inject_fault('qe_start_prepared', 'suspend',dbid) from gp_segment_configuration where content=0 and role='p'; +1&: insert into hs_dtx1 select * from generate_series(1,10); +-- 2. on seg1, suspend before COMMIT phase of 2PC +2: select gp_inject_fault('qe_start_commit_prepared', 'suspend',dbid) from gp_segment_configuration where content=1 and role='p'; +2&: insert into hs_dtx2 select * from generate_series(1,10); + +-- standby should not see any rows from either dtx +-1S: select * from hs_dtx1; +-1S: select * from hs_dtx2; + +-- reset +3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; +3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; +1<: +2<: + +-- standby should see the results from the dtx now +-1S: select * from hs_dtx1; +-1S: select * from hs_dtx2; + +---------------------------------------------------------------- +-- Test DTX abort that happens in different phases +---------------------------------------------------------------- + +1: create table hs_abort_dtx1(a int); +1: create table hs_abort_dtx2(a int); + +-- inject two errors: +-- 1. on seg0, error out before PREPARE phase of 2PC +1: select gp_inject_fault('qe_start_prepared', 'error', dbid) from gp_segment_configuration where content=0 and role='p'; +1: insert into hs_abort_dtx1 select * from generate_series(1,10); +1: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; +-- 2. 
on seg1, error out before COMMIT phase of 2PC +1: select gp_inject_fault('qe_start_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p'; +1: insert into hs_abort_dtx2 select * from generate_series(1,10); +1: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; + +-- standby should not see dtx1 which is aborted but should see dtx2 which is recovered +-1S: select * from hs_abort_dtx1; +-1S: select * from hs_abort_dtx2; + +---------------------------------------------------------------- +-- Test isolation between hot standby query and in-progress dtx, +-- but also run more queries in between +---------------------------------------------------------------- +1: create table hs_dtx3(a int); + +-- inject faults to suspend segments in 2PC +1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; +1&: insert into hs_dtx3 select * from generate_series(1,10); +2: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=1 and role='p'; +2&: insert into hs_dtx3 select * from generate_series(11,20); + +-- standby should not see rows in the in-progress dtx +-1S: select * from hs_dtx3; + +-- now run some dtx and completed +3: insert into hs_dtx3 values(99); +3: create table hs_dtx4(a int); +3: insert into hs_dtx4 select * from generate_series(1,10); + +-- standby should still not see rows in the in-progress DTX, but should see the completed ones +-1S: select * from hs_dtx3; +-1S: select * from hs_dtx4; + +3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; +3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; +1<: +2<: + +-- standby should see all rows now +-1S: select * from hs_dtx3; + +---------------------------------------------------------------- +-- Test isolation between standby QD and in-progress dtx, +-- but after standby QD resets and gets running DTX from checkpoint. +---------------------------------------------------------------- +1: create table hs_t5(a int, b text); +1: create table hs_t6(a int, b text); + +-- inject fault to suspend a primary right before it conducts the commit phase of 2PC, +-- so in the subsequent INSERT, all local transactions will be committed but the dtx is not. 
+1: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p';
+1&: insert into hs_t5 select i, 'in-progress' from generate_series(1,10) i;
+
+-- now run some dtxes to completion, and have the primary conduct a checkpoint
+2: insert into hs_t5 values(1, 'commited');
+2: insert into hs_t6 select i, 'committed' from generate_series(1,10) i;
+2: begin;
+2: insert into hs_t5 values(99, 'aborted');
+2: abort;
+2: checkpoint;
+
+-- now make the standby QD reset itself
+-1S: select gp_inject_fault('exec_simple_query_start', 'panic', dbid) from gp_segment_configuration where content=-1 and role='m';
+-1S: select 1;
+-1Sq:
+
+-- standby should still not see rows in the in-progress DTX, but should see the completed ones
+-1S: select * from hs_t5;
+-1S: select * from hs_t6;
+
+2: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p';
+1<:
+
+-- standby should see all rows now
+-1S: select * from hs_t5;
+-1S: select * from hs_t6;
+
+-- standby should correctly see more in-progress dtx on the primary.
+-- context: previously this would fail because the standby updates latestCompletedGxid to the
+-- bumped nextGxid from checkpoint, which is too far (so that it thinks the new dtx already completed).
+1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p';
+1&: delete from hs_t5;
+2: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=1 and role='p';
+2&: delete from hs_t6;
+
+-- standby should not see the effect of the deletes
+-1S: select * from hs_t5;
+-1S: select * from hs_t6;
+
+3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p';
+3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p';
+
+1<:
+2<:
+
+-- standby now sees those deletes
+-1S: select * from hs_t5;
+-1S: select * from hs_t6;
+
+----------------------------------------------------------------
+-- Read-committed isolation: query on hot standby should not see dtx that completed after it
+-- created distributed snapshot, but should see dtx that completed before that.
+----------------------------------------------------------------
+
+1: create table hs_rc(a int);
+1: insert into hs_rc select * from generate_series(1,10);
+
+-- case 1: suspend SELECT on the standby QD right after it created snapshot
+-1S: select gp_inject_fault('select_after_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m';
+-1S&: select * from hs_rc;
+
+-- new INSERT or DELETE won't be observed by the standby
+1: insert into hs_rc select * from generate_series(11,20);
+1: delete from hs_rc where a < 5;
+1: select gp_inject_fault('select_after_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m';
+
+-- should only see the rows at the time when SELECT started (1...10).
+-1S<: + +-- SELECT again, should see the effect from the INSERT and DELETE now +-1S: select * from hs_rc; + +-- case 2: suspend SELECT on the standby QD before creating snapshot +-1S: select gp_inject_fault('select_before_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m'; +-1S&: select * from hs_rc; + +1: insert into hs_rc select * from generate_series(21,30); +1: delete from hs_rc where a < 21; +1: select gp_inject_fault('select_before_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; + +-- standby should see the effect of the INSERT and DELETE +-1S<: + +---------------------------------------------------------------- +-- Read-committed isolation in the BEGIN...END block +---------------------------------------------------------------- + +1: truncate hs_rc; +1: insert into hs_rc select * from generate_series(1,30); + +-1S: begin; +-1S: select count(*) from hs_rc; + +-- have some concurrent sessions on primary QD: +-- 1. a completed transaction +1: delete from hs_rc where a <= 10; +-- 3. an aborted transaction +2: begin; +2: delete from hs_rc where a > 10 and a <= 20; +2: abort; +-- 3. an ongoing transaction +3: begin; +3: delete from hs_rc where a > 20 and a <= 30; + +-- the standby should see results accordingly +-1S: select * from hs_rc; +-1S: end; + +3: end; +-1S: select * from hs_rc; + +---------------------------------------------------------------- +-- Repeatable-read isolation: distributed snapshot is created at time of the +-- first query in transaction block. All queries in the transaction block +-- should only see results committed before the distributed snapshot creation. +---------------------------------------------------------------- + +1: create table hs_rr(a int); +1: insert into hs_rr select * from generate_series(1,10); + +-1S: begin isolation level repeatable read; +-- should see 10 +-1S: select count(*) from hs_rr; + +-- do some more INSERT, DELETE and UPDATE +1: insert into hs_rr select * from generate_series(11,20); +1: delete from hs_rr where a <= 10; +1: update hs_rr set a = a + 100; + +-- should still the initial rows {1...10} +-1S: select * from hs_rr; +-1S: end; + +-- should see the results from the INSERT, DELETE and UPDATE +-1S: begin isolation level repeatable read; +-1S: select * from hs_rr; + +-- standby won't see ongoing or aborted transactions either +1: begin; +1: insert into hs_rr select * from generate_series(1,10); +2: begin; +2: insert into hs_rr select * from generate_series(1,10); +2: abort; + +-1S: select * from hs_rr; + +1: end; +-1S: end; + +---------------------------------------------------------------- +-- Transaction isolation is respected in subtransactions too +---------------------------------------------------------------- + +1: create table hs_subtrx(a int); + +-- (1) read-committed +-1S: begin; +-1S: select count(*) from hs_subtrx; +-1S: savepoint s1; + +1: insert into hs_subtrx select * from generate_series(1,10); + +-1S: select count(*) from hs_subtrx; +-1S: savepoint s2; +-1S: select count(*) from hs_subtrx; +-1S: rollback to savepoint s1; +-1S: select count(*) from hs_subtrx; +-1S: end; + +-- (2) repeatable-read +-1S: begin isolation level repeatable read; +-1S: select * from hs_subtrx; +-1S: savepoint s1; + +1: insert into hs_subtrx select * from generate_series(11,20); +1: delete from hs_subtrx where a <= 10; +1: update hs_subtrx set a = a + 100; + +-1S: select * from hs_subtrx; +-1S: savepoint s2; +-1S: select * from hs_subtrx; +-1S: 
rollback to savepoint s1; +-1S: select * from hs_subtrx; +-1S: end; +-1S: select * from hs_subtrx; + +---------------------------------------------------------------- +-- Various isolation tests that involve AO/CO table. +---------------------------------------------------------------- +1: create table hs_ao(a int, id int unique) using ao_row; +1: insert into hs_ao select 1,i from generate_series(1,10) i; +1: begin; +1: insert into hs_ao select 2,i from generate_series(11,20) i; + +-- standby sees the same AO metadata as primary +2: select * from gp_toolkit.__gp_aoseg('hs_ao'); +-1S: select * from gp_toolkit.__gp_aoseg('hs_ao'); +2: select (gp_toolkit.__gp_aoblkdir('hs_ao')).* from gp_dist_random('gp_id'); +-1S: select (gp_toolkit.__gp_aoblkdir('hs_ao')).* from gp_dist_random('gp_id'); + +-- standby sees correct table data +-1S: select * from hs_ao; + +-- standby sees the effect of vacuum +1: end; +1: delete from hs_ao where a = 1; +1: vacuum hs_ao; +1: select * from gp_toolkit.__gp_aoseg('hs_ao'); +-1S: select * from gp_toolkit.__gp_aoseg('hs_ao'); +-1S: select * from hs_ao; diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm index 262e4e74fbe..9e6d4c653b9 100644 --- a/src/test/perl/PostgresNode.pm +++ b/src/test/perl/PostgresNode.pm @@ -603,6 +603,50 @@ sub append_conf =pod +=item $node->adjust_conf(filename, setting, value, skip_equals) + +Modify the named config file setting with the value. If the value is undefined, +instead delete the setting. If the setting is not present no action is taken. + +This will write "$setting = $value\n" in place of the existing line, +unless skip_equals is true, in which case it will write +"$setting $value\n". If the value needs to be quoted it is the caller's +responsibility to do that. + +=cut + +sub adjust_conf +{ + my ($self, $filename, $setting, $value, $skip_equals) = @_; + + my $conffile = $self->data_dir . '/' . $filename; + + my $contents = PostgreSQL::Test::Utils::slurp_file($conffile); + my @lines = split(/\n/, $contents); + my @result; + my $eq = $skip_equals ? '' : '= '; + foreach my $line (@lines) + { + if ($line !~ /^$setting\W/) + { + push(@result, "$line\n"); + } + elsif (defined $value) + { + push(@result, "$setting $eq$value\n"); + } + } + open my $fh, ">", $conffile + or croak "could not write \"$conffile\": $!"; + print $fh @result; + close $fh; + + chmod($self->group_access() ? 0640 : 0600, $conffile) + or die("unable to set permissions for $conffile"); +} + +=pod + =item $node->backup(backup_name) Create a hot backup with B in subdirectory B of diff --git a/src/test/recovery/t/101_restore_point_and_startup_pause.pl b/src/test/recovery/t/101_restore_point_and_startup_pause.pl index cda572524c1..f59acffb7ad 100644 --- a/src/test/recovery/t/101_restore_point_and_startup_pause.pl +++ b/src/test/recovery/t/101_restore_point_and_startup_pause.pl @@ -1,48 +1,122 @@ -# test for pausing on startup and on a specified restore point +# Test for pausing and resuming recovery at specific restore points, +# both at initial startup and in a continuous fashion by advancing +# gp_pause_on_restore_point_replay. 
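Before the code, a rough SQL-level sketch of one pause/resume cycle that this test drives through safe_psql (restore point name 'rp1' as in the test; gp_pause_on_restore_point_replay is the GUC being exercised):

-- primary: mark a named restore point and finish the WAL segment so it reaches the standby
SELECT pg_create_restore_point('rp1');
SELECT pg_switch_wal();

-- standby, with gp_pause_on_restore_point_replay = 'rp1' set and the config reloaded:
SELECT pg_last_wal_replay_lsn();   -- advances up to the restore point, then stops
SELECT pg_is_wal_replay_paused();  -- t once 'rp1' has been replayed
SELECT pg_wal_replay_resume();     -- continue replay toward the next configured point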
+ use strict; use warnings; use PostgresNode; use TestLib; -use Test::More tests => 1; +use Test::More tests => 12; use File::Copy; -# Initialize primary node with WAL archiving setup +# Initialize and start primary node my $node_primary = get_new_node('primary'); -$node_primary->init( - has_archiving => 1, - allows_streaming => 1); -$node_primary->append_conf('postgresql.conf', "wal_level = 'replica'"); -$node_primary->append_conf('postgresql.conf', "max_wal_senders = 10"); -my $backup_name = 'my_backup'; - -# Start primary +$node_primary->init(has_archiving => 1, allows_streaming => 1); $node_primary->start; -# Initialize standby node from backup, fetching WAL from archives -$node_primary->backup($backup_name); -my $node_standby = get_new_node('standby'); -$node_standby->init_from_backup($node_primary, $backup_name, - has_restoring => 1); -$node_standby->append_conf('postgresql.conf', "gp_pause_on_restore_point_replay = on"); +my $node_standby = get_new_node("standby"); + +sub test_pause_in_recovery +{ + my ($restore_point, $test_lsn, $num_rows) = @_; + + # Wait until standby has replayed enough data + my $caughtup_query = "SELECT pg_last_wal_replay_lsn() = '$test_lsn'::pg_lsn"; + $node_standby->poll_query_until('postgres', $caughtup_query) + or die "Timed out while waiting for standby to catch up"; + + # Check data has been replayed + my $result = $node_standby->safe_psql('postgres', "SELECT count(*) FROM table_foo;"); + is($result, $num_rows, "check standby content for $restore_point"); + ok($node_standby->safe_psql('postgres', 'SELECT pg_is_wal_replay_paused();') eq 't', + "standby is paused in recovery on $restore_point"); +} + +# Create data before taking the backup +$node_primary->safe_psql('postgres', "CREATE TABLE table_foo AS SELECT generate_series(1,1000);"); +# Take backup from which all operations will be run +$node_primary->backup('my_backup'); +my $lsn0 = $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp0');"); +# Switching WAL guarantees that the restore point is available to the standby +$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();"); + +# Add more data, create restore points and switch wal to guarantee +# that the restore point is available to the standby + +# rp1 +$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(1001,2000))"); +my $lsn1 = $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp1');"); +$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();"); + +# rp2 +$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(2001, 3000))"); +my $lsn2 = $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp2');"); +$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();"); + +# rp3 +$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(3001, 4000))"); +$node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp3');"); +$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();"); -# Start standby +# rp4 +$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(4001, 5000))"); +my $lsn4 = $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp4');"); +$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();"); + +# rp5 +$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(5001, 6000))"); +$node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp5');"); 
+$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();"); + +# Restore the backup +$node_standby->init_from_backup($node_primary, 'my_backup', has_restoring => 1); +# Enable `hot_standby` +$node_standby->append_conf('postgresql.conf', qq(hot_standby = 'on')); + +# Set rp0 as a restore point to pause on start up +$node_standby->append_conf('postgresql.conf', qq(gp_pause_on_restore_point_replay = 'rp0')); +# Start the standby $node_standby->start; +test_pause_in_recovery('rp0', $lsn0, 1000); + +# Advance to rp1 +$node_standby->adjust_conf('postgresql.conf', 'gp_pause_on_restore_point_replay', "rp1"); +$node_standby->reload; +$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume();"); +test_pause_in_recovery('rp1', $lsn1, 2000); + +# Advance to rp2 +$node_standby->adjust_conf('postgresql.conf', 'gp_pause_on_restore_point_replay', "rp2"); +$node_standby->reload; +$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume();"); +test_pause_in_recovery('rp2', $lsn2, 3000); + +# Verify that a restart will bring us back to rp2 +$node_standby->restart; +test_pause_in_recovery('rp2', $lsn2, 3000); + +# Skip rp3 and advance to rp4 +$node_standby->adjust_conf('postgresql.conf', 'gp_pause_on_restore_point_replay', "rp4"); +$node_standby->reload; +$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume();"); +test_pause_in_recovery('rp4', $lsn4, 5000); + +# Do not advance to rp5; signal promote and then resume recovery +$node_standby->safe_psql('postgres', "SELECT pg_promote(false);"); +$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume();"); -# Create a restore point on the primary -my $restore_point_lsn = - $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp')"); +# Wait for standby to promote +$node_standby->poll_query_until('postgres', "SELECT NOT pg_is_in_recovery();") + or die "Timed out while waiting for standby to exit recovery"; -# Force archival of WAL file to make it present on standby -$node_primary->safe_psql('postgres', "SELECT pg_switch_wal()"); +# Check that we promoted with rp4's table count and not rp5's +my $result = $node_standby->safe_psql('postgres', "SELECT count(*) FROM table_foo;"); +is($result, 5000, "check standby content after promotion"); -# Wait until enough replay has been done on the standby before checking if replay -# is paused at the restore point -my $caughtup_query = - "SELECT '$restore_point_lsn'::pg_lsn <= pg_last_wal_replay_lsn()"; -$node_standby->poll_query_until('postgres', $caughtup_query) - or die "Timed out while waiting for standby to catch up"; +# Make sure the former standby is now writable +$node_standby->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(6001, 7000));"); +$result = $node_standby->safe_psql('postgres', "SELECT count(*) FROM table_foo;"); +is($result, 6000, "check standby is writable after promotion"); -my $paused_at_restore_point_query = - "SELECT pg_is_wal_replay_paused() and pg_last_wal_replay_lsn() = '$restore_point_lsn'::pg_lsn"; -my $result2 = $node_standby->safe_psql('postgres', $paused_at_restore_point_query); -is($result2, qq(t), 'check if WAL replay is paused at restore point'); +$node_primary->teardown_node; +$node_standby->teardown_node; diff --git a/src/test/regress/GNUmakefile b/src/test/regress/GNUmakefile index dffdba2572a..8271d5fb171 100644 --- a/src/test/regress/GNUmakefile +++ b/src/test/regress/GNUmakefile @@ -239,6 +239,16 @@ endif standbycheck: all $(pg_regress_installcheck) $(REGRESS_OPTS) --schedule=$(srcdir)/standby_schedule 
$(EXTRA_TESTS) +# GPDB: installcheck for hot standby. This is essentially same as the upstream 'standbycheck' +# above but we just make sure that we do the primary preparation and use the desired standby port. +# If no standby port is given, just use the demo cluster's standby port 7001. +ifeq ($(STANDBY_PGPORT),) + STANDBY_PGPORT = 7001 +endif +installcheck-hot-standby: all + $(pg_regress_installcheck) $(REGRESS_OPTS) hs_primary_setup + $(pg_regress_installcheck) $(REGRESS_OPTS) --port=$(STANDBY_PGPORT) --use-existing --schedule=$(srcdir)/standby_schedule $(EXTRA_TESTS) + # old interfaces follow... runcheck: check diff --git a/src/test/regress/expected/hs_primary_setup.out b/src/test/regress/expected/hs_primary_setup.out new file mode 100644 index 00000000000..0184b2b73e9 --- /dev/null +++ b/src/test/regress/expected/hs_primary_setup.out @@ -0,0 +1,19 @@ +-- +-- Hot Standby tests +-- +-- hs_primary_setup.sql +-- +drop table if exists hs1; +create table hs1 (col1 integer primary key); +insert into hs1 values (1); +drop table if exists hs2; +create table hs2 (col1 integer primary key); +insert into hs2 values (12); +insert into hs2 values (13); +drop table if exists hs3; +create table hs3 (col1 integer primary key); +insert into hs3 values (113); +insert into hs3 values (114); +insert into hs3 values (115); +DROP sequence if exists hsseq; +create sequence hsseq; diff --git a/src/test/regress/expected/hs_standby_allowed.out b/src/test/regress/expected/hs_standby_allowed.out index 00b8faf9eb6..e6b6514642f 100644 --- a/src/test/regress/expected/hs_standby_allowed.out +++ b/src/test/regress/expected/hs_standby_allowed.out @@ -164,31 +164,25 @@ show synchronous_commit; reset synchronous_commit; discard temp; discard all; +NOTICE: command without clusterwide effect +HINT: Consider alternatives as DEALLOCATE ALL, or DISCARD TEMP if a clusterwide effect is desired. -- CURSOR commands BEGIN; -DECLARE hsc CURSOR FOR select * from hs3; +DECLARE hsc CURSOR FOR select * from hs3 order by col1 asc; FETCH next from hsc; col1 ------ 113 (1 row) -fetch first from hsc; - col1 ------- - 113 -(1 row) - -fetch last from hsc; - col1 ------- - 115 -(1 row) - +-- GPDB: backward fetch isn't allowed, moved to hs_standby_disallowed +-- fetch first from hsc; +-- fetch last from hsc; fetch 1 from hsc; col1 ------ -(0 rows) + 114 +(1 row) CLOSE hsc; COMMIT; @@ -216,3 +210,5 @@ UNLISTEN *; -- ALLOWED COMMANDS CHECKPOINT; discard all; +NOTICE: command without clusterwide effect +HINT: Consider alternatives as DEALLOCATE ALL, or DISCARD TEMP if a clusterwide effect is desired. diff --git a/src/test/regress/expected/hs_standby_disallowed.out b/src/test/regress/expected/hs_standby_disallowed.out index 8d3cafa5cec..0a62e40e743 100644 --- a/src/test/regress/expected/hs_standby_disallowed.out +++ b/src/test/regress/expected/hs_standby_disallowed.out @@ -11,9 +11,15 @@ commit; WARNING: there is no transaction in progress -- SELECT select * from hs1 FOR SHARE; -ERROR: cannot execute SELECT FOR SHARE in a read-only transaction +ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress +LINE 1: select * from hs1 FOR SHARE; + ^ +HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery. 
select * from hs1 FOR UPDATE; -ERROR: cannot execute SELECT FOR UPDATE in a read-only transaction +ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress +LINE 1: select * from hs1 FOR UPDATE; + ^ +HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery. -- DML BEGIN; insert into hs1 values (37); @@ -21,11 +27,17 @@ ERROR: cannot execute INSERT in a read-only transaction ROLLBACK; BEGIN; delete from hs1 where col1 = 1; -ERROR: cannot execute DELETE in a read-only transaction +ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress +LINE 1: delete from hs1 where col1 = 1; + ^ +HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery. ROLLBACK; BEGIN; update hs1 set col1 = NULL where col1 > 0; -ERROR: cannot execute UPDATE in a read-only transaction +ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress +LINE 1: update hs1 set col1 = NULL where col1 > 0; + ^ +HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery. ROLLBACK; BEGIN; truncate hs3; @@ -131,3 +143,15 @@ REVOKE SELECT ON hs1 FROM PUBLIC; ERROR: cannot execute REVOKE in a read-only transaction GRANT SELECT ON hs1 TO PUBLIC; ERROR: cannot execute GRANT in a read-only transaction +-- GPDB: backward fetch is not supported, moved from hs_standby_allowed. +BEGIN; +DECLARE hsc CURSOR FOR select * from hs3 order by col1 asc; +fetch next from hsc; + col1 +------ + 113 +(1 row) + +fetch first from hsc; +ERROR: backward scan is not supported in this version of Apache Cloudberry +COMMIT; diff --git a/src/test/regress/expected/hs_standby_functions.out b/src/test/regress/expected/hs_standby_functions.out index ce846b758bf..48cb480f47a 100644 --- a/src/test/regress/expected/hs_standby_functions.out +++ b/src/test/regress/expected/hs_standby_functions.out @@ -27,13 +27,16 @@ select * from pg_prepared_xacts; -------------+-----+----------+-------+---------- (0 rows) --- just the startup process -select locktype, virtualxid, virtualtransaction, mode, granted +-- just the startup processes of all standby coordinator and segments, since pg_locks show cluster-wide view +select gp_segment_id, locktype, virtualxid, virtualtransaction, mode, granted from pg_locks where virtualxid = '1/1'; - locktype | virtualxid | virtualtransaction | mode | granted -------------+------------+--------------------+---------------+--------- - virtualxid | 1/1 | 1/0 | ExclusiveLock | t -(1 row) + gp_segment_id | locktype | virtualxid | virtualtransaction | mode | granted +---------------+------------+------------+--------------------+---------------+--------- + -1 | virtualxid | 1/1 | 1/0 | ExclusiveLock | t + 0 | virtualxid | 1/1 | 1/0 | ExclusiveLock | t + 1 | virtualxid | 1/1 | 1/0 | ExclusiveLock | t + 2 | virtualxid | 1/1 | 1/0 | ExclusiveLock | t +(4 rows) -- suicide is painless select pg_cancel_backend(pg_backend_pid()); diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index 3b9e91136d4..9320cf0aeec 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -3615,9 +3615,20 @@ cluster_healthy(void) return false; } + char *p; + /* skip if the instance is hot standby */ + psql_command_output("postgres", line, sizeof(line), + "SELECT pg_is_in_recovery();"); + p = &line[0]; + while (*p == ' ') + p++; + if (*p == 't') + { + return !halt_work; + } + i = 120; do { - char *p; /* check for the health for standby 
coordinator */ psql_command_output("postgres", line, sizeof(line), "SELECT sync_state FROM pg_stat_get_wal_senders();"); diff --git a/src/test/regress/sql/hs_primary_setup.sql b/src/test/regress/sql/hs_primary_setup.sql index eeb4421307f..83403299fd5 100644 --- a/src/test/regress/sql/hs_primary_setup.sql +++ b/src/test/regress/sql/hs_primary_setup.sql @@ -22,4 +22,11 @@ insert into hs3 values (115); DROP sequence if exists hsseq; create sequence hsseq; +-- start_ignore SELECT pg_switch_wal(); + +-- GPDB: enable hot_standby for this cluster +\! gpconfig -c hot_standby -v on; +\! gpstop -ari; + +-- end_ignore diff --git a/src/test/regress/sql/hs_standby_allowed.sql b/src/test/regress/sql/hs_standby_allowed.sql index 6debddc5e99..873f3ef8643 100644 --- a/src/test/regress/sql/hs_standby_allowed.sql +++ b/src/test/regress/sql/hs_standby_allowed.sql @@ -82,11 +82,12 @@ discard all; BEGIN; -DECLARE hsc CURSOR FOR select * from hs3; +DECLARE hsc CURSOR FOR select * from hs3 order by col1 asc; FETCH next from hsc; -fetch first from hsc; -fetch last from hsc; +-- GPDB: backward fetch isn't allowed, moved to hs_standby_disallowed +-- fetch first from hsc; +-- fetch last from hsc; fetch 1 from hsc; CLOSE hsc; diff --git a/src/test/regress/sql/hs_standby_disallowed.sql b/src/test/regress/sql/hs_standby_disallowed.sql index a470600eec8..72066e2d40b 100644 --- a/src/test/regress/sql/hs_standby_disallowed.sql +++ b/src/test/regress/sql/hs_standby_disallowed.sql @@ -101,3 +101,11 @@ REINDEX TABLE hs2; REVOKE SELECT ON hs1 FROM PUBLIC; GRANT SELECT ON hs1 TO PUBLIC; + +-- GPDB: backward fetch is not supported, moved from hs_standby_allowed. +BEGIN; +DECLARE hsc CURSOR FOR select * from hs3 order by col1 asc; +fetch next from hsc; +fetch first from hsc; +COMMIT; + diff --git a/src/test/regress/sql/hs_standby_functions.sql b/src/test/regress/sql/hs_standby_functions.sql index b57f67ff8b5..903c8f96037 100644 --- a/src/test/regress/sql/hs_standby_functions.sql +++ b/src/test/regress/sql/hs_standby_functions.sql @@ -16,8 +16,8 @@ select pg_stop_backup(); -- should return no rows select * from pg_prepared_xacts; --- just the startup process -select locktype, virtualxid, virtualtransaction, mode, granted +-- just the startup processes of all standby coordinator and segments, since pg_locks show cluster-wide view +select gp_segment_id, locktype, virtualxid, virtualtransaction, mode, granted from pg_locks where virtualxid = '1/1'; -- suicide is painless
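Closing note on the pg_locks expectation above: on a GPDB hot standby, pg_locks presents a cluster-wide view, so the startup (recovery) process of the standby coordinator and of every mirror segment appears, each holding its own virtualxid 1/1. A minimal query matching what the updated test checks:

-- one row per standby instance: the startup process's own virtual transaction lock
select gp_segment_id, locktype, virtualxid, virtualtransaction, mode, granted
from pg_locks
where virtualxid = '1/1';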