This is an automated email from the ASF dual-hosted git repository.
wangwn pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git
The following commit(s) were added to refs/heads/main by this push:
new 92e337463f2 Feat: Enable hot DR cluster
92e337463f2 is described below
commit 92e337463f2246280a7b4a82086d830e4dea4b59
Author: WANG Weinan <[email protected]>
AuthorDate: Thu Jul 31 17:54:03 2025 +0800
Feat: Enable hot DR cluster
Most of the feature is done by upstream, but the hot DR QD cannot
organize cdbcomponent from the `gp_segment_configuration` rel. Define
a boolean GUC named `hot_dr`; if `hot_dr` is enabled, read the
cluster info from the segconf file.
---
.github/workflows/build-cloudberry.yml | 3 ++
src/backend/access/transam/xlog.c | 10 ++++++
src/backend/cdb/cdbutil.c | 40 +++++++++++++++++-----
src/backend/utils/misc/guc_gp.c | 27 +++++++++++++++
src/include/access/xlog.h | 1 +
src/include/cdb/cdbutil.h | 1 +
src/include/cdb/cdbvars.h | 1 +
src/include/utils/unsync_guc_name.h | 1 +
.../isolation2/expected/hot_standby/faults.out | 3 +-
src/test/isolation2/sql/hot_standby/faults.sql | 3 +-
10 files changed, 80 insertions(+), 10 deletions(-)
diff --git a/.github/workflows/build-cloudberry.yml
b/.github/workflows/build-cloudberry.yml
index de702b5790d..fd2b9c73949 100644
--- a/.github/workflows/build-cloudberry.yml
+++ b/.github/workflows/build-cloudberry.yml
@@ -310,6 +310,9 @@ jobs:
{"test":"ic-isolation2",
"make_configs":["src/test/isolation2:installcheck-isolation2"]
},
+ {"test":"ic-isolation2-hot-standby",
+ "make_configs":["src/test/isolation2:installcheck-hot-standby"]
+ },
{"test":"ic-isolation2-crash",
"make_configs":["src/test/isolation2:installcheck-isolation2-crash"],
"enable_core_check":false
diff --git a/src/backend/access/transam/xlog.c
b/src/backend/access/transam/xlog.c
index ffc8714cf62..034aeb6473b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -114,6 +114,7 @@ int XLogArchiveTimeout = 0;
int XLogArchiveMode = ARCHIVE_MODE_OFF;
char *XLogArchiveCommand = NULL;
bool EnableHotStandby = false;
+bool EnableHotDR = false;
bool fullPageWrites = true;
bool wal_log_hints = false;
bool wal_compression = false;
@@ -7967,6 +7968,12 @@ StartupXLOG(void)
if (gp_pause_on_restore_point_replay)
pauseRecoveryOnRestorePoint(xlogreader);
+ /* Exit the recovery loop if a promotion is
triggered in pauseRecoveryOnRestorePoint() */
+ if (reachedContinuousRecoveryTarget &&
recoveryTargetAction == RECOVERY_TARGET_ACTION_PROMOTE){
+ reachedRecoveryTarget = true;
+ break;
+ }
+
/* Exit loop if we reached inclusive recovery
target */
if (recoveryStopsAfter(xlogreader))
{
@@ -10757,6 +10764,9 @@ XLogRestorePoint(const char *rpName)
xlrec.rp_time = GetCurrentTimestamp();
strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
+ /* LogHotStandby for the restore here */
+ LogStandbySnapshot();
+
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
diff --git a/src/backend/cdb/cdbutil.c b/src/backend/cdb/cdbutil.c
index a241549662d..fbf3f8900f2 100644
--- a/src/backend/cdb/cdbutil.c
+++ b/src/backend/cdb/cdbutil.c
@@ -92,6 +92,7 @@ static int CdbComponentDatabaseInfoCompare(const void *p1,
const void *p2);
static GpSegConfigEntry * readGpSegConfigFromCatalog(int *total_dbs);
static GpSegConfigEntry * readGpSegConfigFromFTSFiles(int *total_dbs);
+static GpSegConfigEntry * readGpSegConfigFromFiles(int *total_dbs);
static void getAddressesForDBid(GpSegConfigEntry *c, int elevel);
static HTAB *hostPrimaryCountHashTableInit(void);
@@ -131,6 +132,15 @@ typedef struct HostPrimaryCountEntry
*/
static GpSegConfigEntry *
readGpSegConfigFromFTSFiles(int *total_dbs)
+{
+ Assert(!IsTransactionState() && !IS_HOT_DR_CLUSTER());
+ /* notify and wait FTS to finish a probe and update the dump file */
+ FtsNotifyProber();
+ return readGpSegConfigFromFiles(total_dbs);
+}
+
+static GpSegConfigEntry *
+readGpSegConfigFromFiles(int *total_dbs)
{
FILE *fd;
int idx = 0;
@@ -142,11 +152,6 @@ readGpSegConfigFromFTSFiles(int *total_dbs)
char address[MAXHOSTNAMELEN];
char buf[MAXHOSTNAMELEN * 2 + 32];
- Assert(!IsTransactionState());
-
- /* notify and wait FTS to finish a probe and update the dump file */
- FtsNotifyProber();
-
fd = AllocateFile(GPSEGCONFIGDUMPFILE, "r");
if (!fd)
@@ -188,6 +193,18 @@ readGpSegConfigFromFTSFiles(int *total_dbs)
return configs;
}
+bool
+checkGpSegConfigFtsFiles()
+{
+ FILE *fd = AllocateFile(GPSEGCONFIGDUMPFILE, "r");
+
+ if (!fd)
+ return false;
+
+ FreeFile(fd);
+ return true;
+}
+
/*
* writeGpSegConfigToFTSFiles() dump gp_segment_configuration to the file
* GPSEGCONFIGDUMPFILE, in $PGDATA, only FTS process can use this function.
@@ -372,10 +389,17 @@ getCdbComponentInfo(void)
HTAB *hostPrimaryCountHash = hostPrimaryCountHashTableInit();
- if (IsTransactionState())
- configs = readGpSegConfigFromCatalog(&total_dbs);
+ if (EnableHotDR)
+ {
+ configs = readGpSegConfigFromFiles(&total_dbs);
+ }
else
- configs = readGpSegConfigFromFTSFiles(&total_dbs);
+ {
+ if (IsTransactionState())
+ configs = readGpSegConfigFromCatalog(&total_dbs);
+ else
+ configs = readGpSegConfigFromFTSFiles(&total_dbs);
+ }
component_databases = palloc0(sizeof(CdbComponentDatabases));
diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c
index c7bb596cb61..2c373e5a582 100644
--- a/src/backend/utils/misc/guc_gp.c
+++ b/src/backend/utils/misc/guc_gp.c
@@ -86,6 +86,7 @@ static bool check_optimizer(bool *newval, void **extra,
GucSource source);
static bool check_verify_gpfdists_cert(bool *newval, void **extra, GucSource
source);
static bool check_dispatch_log_stats(bool *newval, void **extra, GucSource
source);
static bool check_gp_workfile_compression(bool *newval, void **extra,
GucSource source);
+static bool check_hot_dr(bool *newval, void **extra, GucSource source);
/* Helper function for guc setter */
bool gpvars_check_gp_resqueue_priority_default_value(char **newval,
@@ -3331,6 +3332,16 @@ struct config_bool ConfigureNamesBool_gp[] =
NULL, NULL, NULL
},
+ {
+ {"hot_dr", PGC_POSTMASTER, REPLICATION_STANDBY,
+ gettext_noop("DR Cluster as well as allows connteions
and queries"),
+ NULL
+ },
+ &EnableHotDR,
+ false,
+ check_hot_dr, NULL, NULL
+ },
+
{
{"gp_enable_runtime_filter_pushdown", PGC_USERSET,
DEVELOPER_OPTIONS,
gettext_noop("Try to push the hash table of hash join
to the seqscan or AM as bloom filter."),
@@ -5455,6 +5466,22 @@ check_verify_gpfdists_cert(bool *newval, void **extra,
GucSource source)
return true;
}
+static bool
+check_hot_dr(bool *newval, void **extra, GucSource source)
+{
+ if (*newval && !EnableHotStandby)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("cannot enable \"hot_dr\" when
\"hot_standby\" is false")));
+
+ if (*newval && IS_QUERY_DISPATCHER() && !checkGpSegConfigFtsFiles())
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("cannot enable \"hot_dr\" since DR
cluster segment configuration file does not exits")));
+
+ return true;
+}
+
static bool
check_dispatch_log_stats(bool *newval, void **extra, GucSource source)
{
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index e8a73ceb201..6d1cc151ed2 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -123,6 +123,7 @@ extern int XLogArchiveTimeout;
extern int wal_retrieve_retry_interval;
extern char *XLogArchiveCommand;
extern bool EnableHotStandby;
+extern bool EnableHotDR;
extern bool fullPageWrites;
extern bool wal_log_hints;
diff --git a/src/include/cdb/cdbutil.h b/src/include/cdb/cdbutil.h
index 22c3cc782d8..0f638bbd521 100644
--- a/src/include/cdb/cdbutil.h
+++ b/src/include/cdb/cdbutil.h
@@ -132,6 +132,7 @@ extern char *getDnsAddress(char *name, int port, int
elevel);
#ifdef USE_INTERNAL_FTS
extern void writeGpSegConfigToFTSFiles(void);
+extern bool checkGpSegConfigFtsFiles(void);
#else
GpSegConfigEntry * readGpSegConfig(char * buff, int *total_dbs);
diff --git a/src/include/cdb/cdbvars.h b/src/include/cdb/cdbvars.h
index 2393384ec3a..534f957978d 100644
--- a/src/include/cdb/cdbvars.h
+++ b/src/include/cdb/cdbvars.h
@@ -757,6 +757,7 @@ extern GpId GpIdentity;
#define MAX_DBID_STRING_LENGTH 11
#define UNINITIALIZED_GP_IDENTITY_VALUE (-10000)
+#define IS_HOT_DR_CLUSTER() (EnableHotDR)
#define IS_QUERY_DISPATCHER() (GpIdentity.segindex == MASTER_CONTENT_ID)
#define IS_HOT_STANDBY_QD() (EnableHotStandby && IS_QUERY_DISPATCHER() &&
RecoveryInProgress())
diff --git a/src/include/utils/unsync_guc_name.h
b/src/include/utils/unsync_guc_name.h
index b26c5b43c7b..37f629e6e97 100644
--- a/src/include/utils/unsync_guc_name.h
+++ b/src/include/utils/unsync_guc_name.h
@@ -294,6 +294,7 @@
"gp_workfile_limit_per_segment",
"gp_workfile_max_entries",
"hba_file",
+ "hot_dr",
"hot_standby",
"hot_standby_feedback",
"huge_pages",
diff --git a/src/test/isolation2/expected/hot_standby/faults.out
b/src/test/isolation2/expected/hot_standby/faults.out
index 39f3a06cca6..2eb16b37229 100644
--- a/src/test/isolation2/expected/hot_standby/faults.out
+++ b/src/test/isolation2/expected/hot_standby/faults.out
@@ -133,7 +133,7 @@ select gp_inject_fault('out_of_recovery_in_startupxlog',
'reset', dbid) from gp_
ERROR: primary segments can only process MPP protocol messages from primary
QD (seg1 slice1 127.0.1.1:7006 pid=14671)
HINT: Exit the current session and re-connect.
-1Sq: ... <quitting>
-
+-- start_ignore
-- will fail due to downed mirror (previous primary)
-1S: select * from hs_failover;
ERROR: failed to acquire resources on one or more segments
@@ -141,6 +141,7 @@ DETAIL: connection to server at "10.13.9.74", port 7003
failed: Connection refu
Is the server running on that host and accepting TCP/IP connections?
(seg1 10.13.9.74:7003)
-1Sq: ... <quitting>
+-- end_ignore
-- bring the downed mirror up
!\retcode gprecoverseg -aF;
diff --git a/src/test/isolation2/sql/hot_standby/faults.sql
b/src/test/isolation2/sql/hot_standby/faults.sql
index 6e25bcba272..b1be240916a 100644
--- a/src/test/isolation2/sql/hot_standby/faults.sql
+++ b/src/test/isolation2/sql/hot_standby/faults.sql
@@ -59,10 +59,11 @@ select gp_inject_fault('out_of_recovery_in_startupxlog',
'reset', dbid) from gp_
-- in an existing gang. That mirror is now a primary, so it will complain and
the query fails.
-1S: select * from hs_failover;
-1Sq:
-
+-- start_ignore
-- will fail due to downed mirror (previous primary)
-1S: select * from hs_failover;
-1Sq:
+-- end_ignore
-- bring the downed mirror up
!\retcode gprecoverseg -aF;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]