Pg_upgrade is able to run in --check mode when the old server is still running. Unfortunately, all supported versions of pg_upgrade generate an incorrect (but harmless) "failure" message when doing this:
$ pg_upgrade -b /u/pgsql.old/bin -B /u/pgsql/bin \ -d /u/pgsql.old/data/ -D /u/pgsql/data --link --check --> *failure* --> Consult the last few lines of "pg_upgrade_server.log" for --> the probable cause of the failure. Performing Consistency Checks on Old Live Server ------------------------------------------------ Checking cluster versions ok Checking database user is the install user ok Checking database connection settings ok Checking for prepared transactions ok Checking for reg* data types in user tables ok Checking for contrib/isn with bigint-passing mismatch ok Checking for invalid "unknown" user columns ok Checking for hash indexes ok Checking for roles starting with "pg_" ok Checking for presence of required libraries ok Checking database user is the install user ok Checking for prepared transactions ok *Clusters are compatible* Embarrassingly, I found out about this bug when watching a presentation in Frankfurt: PostgreSQL-Upgrade: Best Practices https://www.ittage.informatik-aktuell.de/programm/2017/postgresql-upgrade-best-practices/ and there is a blog entry about it too: https://blog.dbi-services.com/does-pg_upgrade-in-check-mode-raises-a-failure-when-the-old-cluster-is-running/ (Yeah, it stinks to be me. :-) ) So, I looked into the code and it turns out that start_postmaster() needs to run exec_prog() in a way that allows it to fail and continue _without_ generating an error message. There are other places that need to run exec_prog() and allow it to fail and continue _and_ generate an error message. To fix this, I modified the exec_prog() API to separately control reporting and exiting-on errors. The attached patch does this and generates the proper output: $ pg_upgrade -b /u/pgsql.old/bin -B /u/pgsql/bin \ -d /u/pgsql.old/data/ -D /u/pgsql/data --link --check Performing Consistency Checks on Old Live Server ------------------------------------------------ Checking cluster versions ok Checking database user is the install user ok Checking database connection settings ok Checking for prepared transactions ok Checking for reg* data types in user tables ok Checking for contrib/isn with bigint-passing mismatch ok Checking for invalid "unknown" user columns ok Checking for hash indexes ok Checking for roles starting with "pg_" ok Checking for presence of required libraries ok Checking database user is the install user ok Checking for prepared transactions ok *Clusters are compatible* -- Bruce Momjian <br...@momjian.us> http://momjian.us EnterpriseDB http://enterprisedb.com + As you are, so once was I. As I am, so you will be. + + Ancient Roman grave inscription +
diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c new file mode 100644 index 5ed6b78..8a662e9 *** a/src/bin/pg_upgrade/dump.c --- b/src/bin/pg_upgrade/dump.c *************** generate_old_dump(void) *** 23,29 **** prep_status("Creating dump of global objects"); /* run new pg_dumpall binary for globals */ ! exec_prog(UTILITY_LOG_FILE, NULL, true, "\"%s/pg_dumpall\" %s --globals-only --quote-all-identifiers " "--binary-upgrade %s -f %s", new_cluster.bindir, cluster_conn_opts(&old_cluster), --- 23,29 ---- prep_status("Creating dump of global objects"); /* run new pg_dumpall binary for globals */ ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_dumpall\" %s --globals-only --quote-all-identifiers " "--binary-upgrade %s -f %s", new_cluster.bindir, cluster_conn_opts(&old_cluster), diff --git a/src/bin/pg_upgrade/exec.c b/src/bin/pg_upgrade/exec.c new file mode 100644 index ea45434..9122e27 *** a/src/bin/pg_upgrade/exec.c --- b/src/bin/pg_upgrade/exec.c *************** get_bin_version(ClusterInfo *cluster) *** 71,86 **** * and attempts to execute that command. If the command executes * successfully, exec_prog() returns true. * ! * If the command fails, an error message is saved to the specified log_file. ! * If throw_error is true, this raises a PG_FATAL error and pg_upgrade ! * terminates; otherwise it is just reported as PG_REPORT and exec_prog() ! * returns false. * * The code requires it be called first from the primary thread on Windows. */ bool exec_prog(const char *log_file, const char *opt_log_file, ! bool throw_error, const char *fmt,...) { int result = 0; int written; --- 71,84 ---- * and attempts to execute that command. If the command executes * successfully, exec_prog() returns true. * ! * If the command fails, an error message is optionally written to the specified ! * log_file, and the program optionally exits. * * The code requires it be called first from the primary thread on Windows. */ bool exec_prog(const char *log_file, const char *opt_log_file, ! bool report_error, bool exit_on_error, const char *fmt,...) { int result = 0; int written; *************** exec_prog(const char *log_file, const ch *** 173,179 **** #endif result = system(cmd); ! if (result != 0) { /* we might be in on a progress status line, so go to the next line */ report_status(PG_REPORT, "\n*failure*"); --- 171,177 ---- #endif result = system(cmd); ! if (result != 0 && report_error) { /* we might be in on a progress status line, so go to the next line */ report_status(PG_REPORT, "\n*failure*"); *************** exec_prog(const char *log_file, const ch *** 181,192 **** pg_log(PG_VERBOSE, "There were problems executing \"%s\"\n", cmd); if (opt_log_file) ! pg_log(throw_error ? PG_FATAL : PG_REPORT, "Consult the last few lines of \"%s\" or \"%s\" for\n" "the probable cause of the failure.\n", log_file, opt_log_file); else ! pg_log(throw_error ? PG_FATAL : PG_REPORT, "Consult the last few lines of \"%s\" for\n" "the probable cause of the failure.\n", log_file); --- 179,190 ---- pg_log(PG_VERBOSE, "There were problems executing \"%s\"\n", cmd); if (opt_log_file) ! pg_log(exit_on_error ? PG_FATAL : PG_REPORT, "Consult the last few lines of \"%s\" or \"%s\" for\n" "the probable cause of the failure.\n", log_file, opt_log_file); else ! pg_log(exit_on_error ? PG_FATAL : PG_REPORT, "Consult the last few lines of \"%s\" for\n" "the probable cause of the failure.\n", log_file); diff --git a/src/bin/pg_upgrade/parallel.c b/src/bin/pg_upgrade/parallel.c new file mode 100644 index cb1dc43..23f869f *** a/src/bin/pg_upgrade/parallel.c --- b/src/bin/pg_upgrade/parallel.c *************** parallel_exec_prog(const char *log_file, *** 78,85 **** va_end(args); if (user_opts.jobs <= 1) ! /* throw_error must be true to allow jobs */ ! exec_prog(log_file, opt_log_file, true, "%s", cmd); else { /* parallel */ --- 78,85 ---- va_end(args); if (user_opts.jobs <= 1) ! /* exit_on_error must be true to allow jobs */ ! exec_prog(log_file, opt_log_file, true, true, "%s", cmd); else { /* parallel */ *************** parallel_exec_prog(const char *log_file, *** 122,128 **** child = fork(); if (child == 0) /* use _exit to skip atexit() functions */ ! _exit(!exec_prog(log_file, opt_log_file, true, "%s", cmd)); else if (child < 0) /* fork failed */ pg_fatal("could not create worker process: %s\n", strerror(errno)); --- 122,128 ---- child = fork(); if (child == 0) /* use _exit to skip atexit() functions */ ! _exit(!exec_prog(log_file, opt_log_file, true, true, "%s", cmd)); else if (child < 0) /* fork failed */ pg_fatal("could not create worker process: %s\n", strerror(errno)); *************** win32_exec_prog(exec_thread_arg *args) *** 160,166 **** { int ret; ! ret = !exec_prog(args->log_file, args->opt_log_file, true, "%s", args->cmd); /* terminates thread */ return ret; --- 160,166 ---- { int ret; ! ret = !exec_prog(args->log_file, args->opt_log_file, true, true, "%s", args->cmd); /* terminates thread */ return ret; *************** parallel_transfer_all_new_dbs(DbInfoArr *** 187,193 **** #endif if (user_opts.jobs <= 1) - /* throw_error must be true to allow jobs */ transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, new_pgdata, NULL); else { --- 187,192 ---- diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c new file mode 100644 index 2b7da52..8726214 *** a/src/bin/pg_upgrade/pg_upgrade.c --- b/src/bin/pg_upgrade/pg_upgrade.c *************** main(int argc, char **argv) *** 149,162 **** * because there is no need to have the schema load use new oids. */ prep_status("Setting next OID for new cluster"); ! exec_prog(UTILITY_LOG_FILE, NULL, true, "\"%s/pg_resetwal\" -o %u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtoid, new_cluster.pgdata); check_ok(); prep_status("Sync data directory to disk"); ! exec_prog(UTILITY_LOG_FILE, NULL, true, "\"%s/initdb\" --sync-only \"%s\"", new_cluster.bindir, new_cluster.pgdata); check_ok(); --- 149,162 ---- * because there is no need to have the schema load use new oids. */ prep_status("Setting next OID for new cluster"); ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_resetwal\" -o %u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtoid, new_cluster.pgdata); check_ok(); prep_status("Sync data directory to disk"); ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/initdb\" --sync-only \"%s\"", new_cluster.bindir, new_cluster.pgdata); check_ok(); *************** prepare_new_cluster(void) *** 249,255 **** * --analyze so autovacuum doesn't update statistics later */ prep_status("Analyzing all rows in the new cluster"); ! exec_prog(UTILITY_LOG_FILE, NULL, true, "\"%s/vacuumdb\" %s --all --analyze %s", new_cluster.bindir, cluster_conn_opts(&new_cluster), log_opts.verbose ? "--verbose" : ""); --- 249,255 ---- * --analyze so autovacuum doesn't update statistics later */ prep_status("Analyzing all rows in the new cluster"); ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/vacuumdb\" %s --all --analyze %s", new_cluster.bindir, cluster_conn_opts(&new_cluster), log_opts.verbose ? "--verbose" : ""); *************** prepare_new_cluster(void) *** 262,268 **** * counter later. */ prep_status("Freezing all rows in the new cluster"); ! exec_prog(UTILITY_LOG_FILE, NULL, true, "\"%s/vacuumdb\" %s --all --freeze %s", new_cluster.bindir, cluster_conn_opts(&new_cluster), log_opts.verbose ? "--verbose" : ""); --- 262,268 ---- * counter later. */ prep_status("Freezing all rows in the new cluster"); ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/vacuumdb\" %s --all --freeze %s", new_cluster.bindir, cluster_conn_opts(&new_cluster), log_opts.verbose ? "--verbose" : ""); *************** prepare_new_databases(void) *** 289,295 **** * support functions in template1 but pg_dumpall creates database using * the template0 template. */ ! exec_prog(UTILITY_LOG_FILE, NULL, true, "\"%s/psql\" " EXEC_PSQL_ARGS " %s -f \"%s\"", new_cluster.bindir, cluster_conn_opts(&new_cluster), GLOBALS_DUMP_FILE); --- 289,295 ---- * support functions in template1 but pg_dumpall creates database using * the template0 template. */ ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/psql\" " EXEC_PSQL_ARGS " %s -f \"%s\"", new_cluster.bindir, cluster_conn_opts(&new_cluster), GLOBALS_DUMP_FILE); *************** copy_subdir_files(const char *old_subdir *** 392,398 **** prep_status("Copying old %s to new server", old_subdir); ! exec_prog(UTILITY_LOG_FILE, NULL, true, #ifndef WIN32 "cp -Rf \"%s\" \"%s\"", #else --- 392,398 ---- prep_status("Copying old %s to new server", old_subdir); ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, #ifndef WIN32 "cp -Rf \"%s\" \"%s\"", #else *************** copy_xact_xlog_xid(void) *** 418,433 **** /* set the next transaction id and epoch of the new cluster */ prep_status("Setting next transaction ID and epoch for new cluster"); ! exec_prog(UTILITY_LOG_FILE, NULL, true, "\"%s/pg_resetwal\" -f -x %u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtxid, new_cluster.pgdata); ! exec_prog(UTILITY_LOG_FILE, NULL, true, "\"%s/pg_resetwal\" -f -e %u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtepoch, new_cluster.pgdata); /* must reset commit timestamp limits also */ ! exec_prog(UTILITY_LOG_FILE, NULL, true, "\"%s/pg_resetwal\" -f -c %u,%u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtxid, --- 418,433 ---- /* set the next transaction id and epoch of the new cluster */ prep_status("Setting next transaction ID and epoch for new cluster"); ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_resetwal\" -f -x %u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtxid, new_cluster.pgdata); ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_resetwal\" -f -e %u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtepoch, new_cluster.pgdata); /* must reset commit timestamp limits also */ ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_resetwal\" -f -c %u,%u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtxid, *************** copy_xact_xlog_xid(void) *** 453,459 **** * we preserve all files and contents, so we must preserve both "next" * counters here and the oldest multi present on system. */ ! exec_prog(UTILITY_LOG_FILE, NULL, true, "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtmxoff, --- 453,459 ---- * we preserve all files and contents, so we must preserve both "next" * counters here and the oldest multi present on system. */ ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtmxoff, *************** copy_xact_xlog_xid(void) *** 481,487 **** * might end up wrapped around (i.e. 0) if the old cluster had * next=MaxMultiXactId, but multixact.c can cope with that just fine. */ ! exec_prog(UTILITY_LOG_FILE, NULL, true, "\"%s/pg_resetwal\" -m %u,%u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtmulti + 1, --- 481,487 ---- * might end up wrapped around (i.e. 0) if the old cluster had * next=MaxMultiXactId, but multixact.c can cope with that just fine. */ ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_resetwal\" -m %u,%u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtmulti + 1, *************** copy_xact_xlog_xid(void) *** 492,498 **** /* now reset the wal archives in the new cluster */ prep_status("Resetting WAL archives"); ! exec_prog(UTILITY_LOG_FILE, NULL, true, /* use timeline 1 to match controldata and no WAL history file */ "\"%s/pg_resetwal\" -l 00000001%s \"%s\"", new_cluster.bindir, old_cluster.controldata.nextxlogfile + 8, --- 492,498 ---- /* now reset the wal archives in the new cluster */ prep_status("Resetting WAL archives"); ! exec_prog(UTILITY_LOG_FILE, NULL, true, true, /* use timeline 1 to match controldata and no WAL history file */ "\"%s/pg_resetwal\" -l 00000001%s \"%s\"", new_cluster.bindir, old_cluster.controldata.nextxlogfile + 8, diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h new file mode 100644 index fbe69cb..67f874b *** a/src/bin/pg_upgrade/pg_upgrade.h --- b/src/bin/pg_upgrade/pg_upgrade.h *************** void generate_old_dump(void); *** 360,366 **** #define EXEC_PSQL_ARGS "--echo-queries --set ON_ERROR_STOP=on --no-psqlrc --dbname=template1" bool exec_prog(const char *log_file, const char *opt_log_file, ! bool throw_error, const char *fmt,...) pg_attribute_printf(4, 5); void verify_directories(void); bool pid_lock_file_exists(const char *datadir); --- 360,366 ---- #define EXEC_PSQL_ARGS "--echo-queries --set ON_ERROR_STOP=on --no-psqlrc --dbname=template1" bool exec_prog(const char *log_file, const char *opt_log_file, ! bool report_error, bool exit_on_error, const char *fmt,...) pg_attribute_printf(5, 6); void verify_directories(void); bool pid_lock_file_exists(const char *datadir); *************** PGresult *executeQueryOrDie(PGconn *co *** 416,423 **** char *cluster_conn_opts(ClusterInfo *cluster); ! bool start_postmaster(ClusterInfo *cluster, bool throw_error); ! void stop_postmaster(bool fast); uint32 get_major_server_version(ClusterInfo *cluster); void check_pghost_envvar(void); --- 416,423 ---- char *cluster_conn_opts(ClusterInfo *cluster); ! bool start_postmaster(ClusterInfo *cluster, bool report_and_exit_on_error); ! void stop_postmaster(bool in_atexit); uint32 get_major_server_version(ClusterInfo *cluster); void check_pghost_envvar(void); diff --git a/src/bin/pg_upgrade/server.c b/src/bin/pg_upgrade/server.c new file mode 100644 index 9618123..5f55b58 *** a/src/bin/pg_upgrade/server.c --- b/src/bin/pg_upgrade/server.c *************** stop_postmaster_atexit(void) *** 191,197 **** bool ! start_postmaster(ClusterInfo *cluster, bool throw_error) { char cmd[MAXPGPATH * 4 + 1000]; PGconn *conn; --- 191,197 ---- bool ! start_postmaster(ClusterInfo *cluster, bool report_and_exit_on_error) { char cmd[MAXPGPATH * 4 + 1000]; PGconn *conn; *************** start_postmaster(ClusterInfo *cluster, b *** 257,267 **** (strcmp(SERVER_LOG_FILE, SERVER_START_LOG_FILE) != 0) ? SERVER_LOG_FILE : NULL, ! false, "%s", cmd); /* Did it fail and we are just testing if the server could be started? */ ! if (!pg_ctl_return && !throw_error) return false; /* --- 257,267 ---- (strcmp(SERVER_LOG_FILE, SERVER_START_LOG_FILE) != 0) ? SERVER_LOG_FILE : NULL, ! report_and_exit_on_error, false, "%s", cmd); /* Did it fail and we are just testing if the server could be started? */ ! if (!pg_ctl_return && !report_and_exit_on_error) return false; /* *************** start_postmaster(ClusterInfo *cluster, b *** 305,313 **** PQfinish(conn); /* ! * If pg_ctl failed, and the connection didn't fail, and throw_error is ! * enabled, fail now. This could happen if the server was already ! * running. */ if (!pg_ctl_return) { --- 305,313 ---- PQfinish(conn); /* ! * If pg_ctl failed, and the connection didn't fail, and ! * report_and_exit_on_error is enabled, fail now. This ! * could happen if the server was already running. */ if (!pg_ctl_return) { *************** start_postmaster(ClusterInfo *cluster, b *** 322,328 **** void ! stop_postmaster(bool fast) { ClusterInfo *cluster; --- 322,328 ---- void ! stop_postmaster(bool in_atexit) { ClusterInfo *cluster; *************** stop_postmaster(bool fast) *** 333,343 **** else return; /* no cluster running */ ! exec_prog(SERVER_STOP_LOG_FILE, NULL, !fast, "\"%s/pg_ctl\" -w -D \"%s\" -o \"%s\" %s stop", cluster->bindir, cluster->pgconfig, cluster->pgopts ? cluster->pgopts : "", ! fast ? "-m fast" : "-m smart"); os_info.running_cluster = NULL; } --- 333,343 ---- else return; /* no cluster running */ ! exec_prog(SERVER_STOP_LOG_FILE, NULL, !in_atexit, !in_atexit, "\"%s/pg_ctl\" -w -D \"%s\" -o \"%s\" %s stop", cluster->bindir, cluster->pgconfig, cluster->pgopts ? cluster->pgopts : "", ! in_atexit ? "-m fast" : "-m smart"); os_info.running_cluster = NULL; }