This is an automated email from the ASF dual-hosted git repository.

maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit 07c4d130e172a8ad515d8c21457eb54fcce47bf0
Author: Nihal Jain <[email protected]>
AuthorDate: Tue Jun 6 14:03:43 2023 +0530

gprecoverseg: Add ability to handle interrupts

Currently, gprecoverseg lacks the ability to handle interrupts. When the main gprecoverseg process is terminated, it leaves behind orphaned `gpsegrecovery`, `pg_rewind`, or `pg_basebackup` processes that the user has to terminate manually. This commit adds signal handling to the gprecoverseg code so that the gprecoverseg process can be terminated gracefully, using appropriate signal handlers and performing the necessary cleanup actions, instead of abruptly terminating only the main process.

The overall termination flow using signal handlers is described below:

- Signals are caught by the signal handlers in the `gprecoverseg` code. The signal handler then gets the PIDs of the `gpsegsetuprecovery`/`gpsegrecovery` process running on each segment host and sends a `SIGTERM` signal to them.
- The `gpsegrecovery` process also has a signal handler that handles the `SIGTERM` signal sent by the main process. No signal handler is needed for the `gpsegsetuprecovery` code, as it does not spawn any new processes and can be terminated directly.
- The signal handler in the `gpsegrecovery` code then calls `terminate_proc_tree`, which terminates the entire process tree of the `gpsegrecovery` process, since it is the parent of all the child processes (`pg_rewind`/`pg_basebackup`/`rsync`). The `gpsegrecovery` process itself is not terminated here, since it is needed to perform proper error handling and cleanup actions.
- The above steps are repeated until the `WorkerPool` is done with its work. This makes sure that no unfinished commands are left waiting in the `WorkerPool` queue.
- Termination of the `pg_rewind`/`pg_basebackup`/`rsync` process raises an error, which is propagated back to the user and also triggers the cleanup actions that are already in place whenever a failure occurs.

The behavior of gprecoverseg in the different termination scenarios is as follows:

1. **Using CTRL-C**: This sends a `SIGINT` signal; upon receiving it, the user is prompted to confirm whether they want to proceed with the termination.
```
It is not recommended to terminate a recovery procedure midway. However, if you choose to proceed, you will need to run either gprecoverseg --differential or gprecoverseg -F to start a new recovery process the next time.

Continue terminating gprecoverseg Yy|Nn (default=N):
>
```
2. **Using kill from the terminal**: This sends a `SIGTERM` signal by default; upon receiving it, gprecoverseg terminates directly without any confirmation from the user.
3. **SSH disconnections**: This sends a `SIGHUP` signal, which is currently ignored, as we do not want to terminate the recovery process due to such issues.

The signal handlers are installed just before the recovery process is started, so that they do not affect any other functionality of gprecoverseg (like rebalancing).
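For reference, a minimal sketch (not code from this patch) of the signal dispositions described above: `SIGINT` prompts for confirmation, `SIGTERM` terminates immediately, and `SIGHUP` is ignored. The `shutdown_recovery` and `prompt_user` callables are illustrative placeholders for the coordinator-side cleanup and confirmation logic; the real implementation is in the diff below.

```python
import signal

def make_signal_handler(shutdown_recovery, prompt_user):
    """Build a handler mirroring the dispositions described above.

    shutdown_recovery and prompt_user are hypothetical stand-ins for the
    coordinator-side cleanup and user-confirmation logic.
    """
    def handler(sig, frame):
        name = signal.Signals(sig).name
        # SIGINT (Ctrl-C): ask the user before terminating the recovery.
        if name == "SIGINT" and not prompt_user("Continue terminating gprecoverseg"):
            return
        # SIGTERM (kill): terminate immediately, no confirmation.
        shutdown_recovery()
        # Restore default dispositions so a repeated signal is not re-handled.
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        signal.signal(signal.SIGTERM, signal.SIG_DFL)
    return handler

handler = make_signal_handler(
    lambda: print("terminating recovery"),
    lambda msg: input(msg + " Yy|Nn (default=N): ").strip().lower() == "y")
signal.signal(signal.SIGINT, handler)
signal.signal(signal.SIGTERM, handler)
# SSH disconnections deliver SIGHUP; ignore it so recovery keeps running.
signal.signal(signal.SIGHUP, signal.SIG_IGN)
```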
--- gpMgmt/bin/gppylib/commands/gp.py | 4 +- gpMgmt/bin/gppylib/commands/unix.py | 32 + .../bin/gppylib/operations/buildMirrorSegments.py | 14 +- gpMgmt/bin/gppylib/programs/clsRecoverSegment.py | 55 +- .../test/unit/test_cluster_clsrecoversegment.py | 78 +++ gpMgmt/sbin/gpsegrecovery.py | 15 + gpMgmt/sbin/recovery_base.py | 10 +- gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature | 762 +++++++++++++++++++++ .../behave/mgmt_utils/gprecoverseg_newhost.feature | 140 +++- gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py | 155 ++++- .../mgmt_utils/steps/recoverseg_mgmt_utils.py | 298 +++++++- 11 files changed, 1519 insertions(+), 44 deletions(-) diff --git a/gpMgmt/bin/gppylib/commands/gp.py b/gpMgmt/bin/gppylib/commands/gp.py index 96e16cbd04..129ee13d50 100644 --- a/gpMgmt/bin/gppylib/commands/gp.py +++ b/gpMgmt/bin/gppylib/commands/gp.py @@ -1002,7 +1002,7 @@ class GpSegSetupRecovery(Command): """ def __init__(self, name, confinfo, logdir, batchSize, verbose, remoteHost, forceoverwrite): cmdStr = _get_cmd_for_recovery_wrapper('gpsegsetuprecovery', confinfo, logdir, batchSize, verbose,forceoverwrite) - Command.__init__(self, name, cmdStr, REMOTE, remoteHost) + Command.__init__(self, name, cmdStr, REMOTE, remoteHost, start_new_session=True) class GpSegRecovery(Command): @@ -1011,7 +1011,7 @@ class GpSegRecovery(Command): """ def __init__(self, name, confinfo, logdir, batchSize, verbose, remoteHost, forceoverwrite, era): cmdStr = _get_cmd_for_recovery_wrapper('gpsegrecovery', confinfo, logdir, batchSize, verbose, forceoverwrite, era) - Command.__init__(self, name, cmdStr, REMOTE, remoteHost) + Command.__init__(self, name, cmdStr, REMOTE, remoteHost, start_new_session=True) def _get_cmd_for_recovery_wrapper(wrapper_filename, confinfo, logdir, batchSize, verbose, forceoverwrite, era=None): diff --git a/gpMgmt/bin/gppylib/commands/unix.py b/gpMgmt/bin/gppylib/commands/unix.py index 3d0b0582d1..fc0732ec81 100644 --- a/gpMgmt/bin/gppylib/commands/unix.py +++ b/gpMgmt/bin/gppylib/commands/unix.py @@ -201,6 +201,38 @@ def kill_sequence(pid): # all else failed - try SIGABRT logandkill(pid, signal.SIGABRT) +""" +Terminate a process tree (including grandchildren) with signal 'sig'. +'on_terminate', if specified, is a callback function which is +called as soon as a child terminates. 
+""" +def terminate_proc_tree(pid, sig=signal.SIGTERM, include_parent=True, timeout=None, on_terminate=None): + parent = psutil.Process(pid) + + children = list() + terminated = set() + + if include_parent: + children.append(parent) + + children.extend(parent.children(recursive=True)) + while children: + process = children.pop() + + try: + # Update the list with any new process spawned after the initial list creation + children.extend(process.children(recursive=True)) + process.send_signal(sig) + terminated.add(process) + except psutil.NoSuchProcess: + pass + + _, alive = psutil.wait_procs(terminated, timeout=timeout, callback=on_terminate) + + # Forcefully terminate any remaining processes + for process in alive: + process.kill() + # ---------------Platform Framework-------------------- diff --git a/gpMgmt/bin/gppylib/operations/buildMirrorSegments.py b/gpMgmt/bin/gppylib/operations/buildMirrorSegments.py index 2626d32f91..cfa08280b2 100644 --- a/gpMgmt/bin/gppylib/operations/buildMirrorSegments.py +++ b/gpMgmt/bin/gppylib/operations/buildMirrorSegments.py @@ -274,7 +274,7 @@ class GpMirrorListToBuild: full_recovery_dbids[ri.target_segment_dbid] = True # Disable Ctrl-C, going to save metadata in database and transition segments - signal.signal(signal.SIGINT, signal.SIG_IGN) + old_handler = signal.signal(signal.SIGINT, signal.SIG_IGN) backout_map = None try: self.__logger.info("Updating configuration for mirrors") @@ -289,7 +289,7 @@ class GpMirrorListToBuild: self.__logger.debug("Generating configuration backout scripts") finally: # Re-enable Ctrl-C - signal.signal(signal.SIGINT, signal.default_int_handler) + signal.signal(signal.SIGINT, old_handler) return backout_map def _remove_progress_files(self, recovery_info_by_host, recovery_results): @@ -408,8 +408,14 @@ class GpMirrorListToBuild: combined_progress_file.write("".join(complete_progress_output)) combined_progress_file.flush() - outfile.write("".join(output)) - outfile.flush() + try: + outfile.write("".join(output)) + outfile.flush() + + # During SSH disconnections, writing to stdout might not be possible. So ignore the + # error, but continue writing to the recovery_progress file. 
+ except IOError: + pass written = False combined_progress_filepath = get_recovery_progress_file(gplog) diff --git a/gpMgmt/bin/gppylib/programs/clsRecoverSegment.py b/gpMgmt/bin/gppylib/programs/clsRecoverSegment.py index d42b5af4b3..34764d7d45 100644 --- a/gpMgmt/bin/gppylib/programs/clsRecoverSegment.py +++ b/gpMgmt/bin/gppylib/programs/clsRecoverSegment.py @@ -25,7 +25,7 @@ from contextlib import closing from gppylib import gparray, gplog, userinput, utils from gppylib.util import gp_utils from gppylib.commands import gp, pg, unix -from gppylib.commands.base import Command, WorkerPool +from gppylib.commands.base import Command, WorkerPool, REMOTE from gppylib.db import dbconn from gppylib.gpparseopts import OptParser, OptChecker from gppylib.operations.detect_unreachable_hosts import get_unreachable_segment_hosts, update_unreachable_flag_for_segments @@ -78,6 +78,7 @@ class GpRecoverSegmentProgram: self.__options = options self.__pool = None self.logger = logger + self.termination_requested = False # If user did not specify a value for showProgressInplace and # stdout is a tty then send escape sequences to gprecoverseg @@ -347,10 +348,46 @@ class GpRecoverSegmentProgram: contentsToUpdate = [seg.getLiveSegment().getSegmentContentId() for seg in mirrorBuilder.getMirrorsToBuild()] update_pg_hba_on_segments(gpArray, self.__options.hba_hostnames, self.__options.parallelDegree, contentsToUpdate) + + def signal_handler(sig, frame): + signal_name = signal.Signals(sig).name + logger.warn("Recieved {0} signal, terminating gprecoverseg".format(signal_name)) + + # Confirm with the user if they really want to terminate with CTRL-C. + if signal_name == "SIGINT": + prompt_text = "\nIt is not recommended to terminate a recovery procedure midway. However, if you choose to proceed, you will need " \ + "to run either gprecoverseg --differential or gprecoverseg -F to start a new recovery process the next time." + + if not userinput.ask_yesno(prompt_text, "Continue terminating gprecoverseg", 'N'): + return + + self.termination_requested = True + self.shutdown(current_hosts) + + # Reset the signal handlers + signal.signal(signal.SIGINT, signal.SIG_DFL) + signal.signal(signal.SIGTERM, signal.SIG_DFL) + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + # SSH disconnections send a SIGHUP signal to all the processes running in that session. + # Ignoring this signal so that gprecoverseg does not terminate due to such issues. + signal.signal(signal.SIGHUP, signal.SIG_IGN) + if not mirrorBuilder.recover_mirrors(gpEnv, gpArray): - self.logger.error("gprecoverseg failed. Please check the output for more details.") + if self.termination_requested: + self.logger.error("gprecoverseg process was interrupted by the user.") + if self.__options.differentialResynchronization: + self.logger.error("gprecoverseg differential recovery failed. Please check the gpsegrecovery.py log" + " file and rsync log file for more details.") + else: + self.logger.error("gprecoverseg failed. 
Please check the output for more details.") sys.exit(1) + if self.termination_requested: + self.logger.info("Not able to terminate the recovery process since it has been completed successfully.") + self.logger.info("********************************") self.logger.info("Segments successfully recovered.") self.logger.info("********************************") @@ -391,6 +428,20 @@ class GpRecoverSegmentProgram: self.__pool.joinWorkers() # > all three of these appear necessary self.__pool.join() # / see MPP-12633, CR-2252 as well + def shutdown(self, hosts): + + # Clear out the existing pool to stop any pending recovery process + while not self.__pool.isDone(): + + for host in hosts: + try: + logger.debug("Terminating recovery process on host {0}".format(host)) + cmd = Command(name="terminate recovery process", + cmdStr="ps ux | grep -E 'gpsegsetuprecovery|gpsegrecovery' | grep -vE 'ssh|grep|bash' | awk '{print $ 2}' | xargs -r kill", remoteHost=host, ctxt=REMOTE) + cmd.run(validateAfter=True) + except ExecutionError as e: + logger.error("Not able to terminate recovery process on host {0}: {1}".format(host, e)) + # ------------------------------------------------------------------------- @staticmethod diff --git a/gpMgmt/bin/gppylib/programs/test/unit/test_cluster_clsrecoversegment.py b/gpMgmt/bin/gppylib/programs/test/unit/test_cluster_clsrecoversegment.py new file mode 100644 index 0000000000..877feb51ed --- /dev/null +++ b/gpMgmt/bin/gppylib/programs/test/unit/test_cluster_clsrecoversegment.py @@ -0,0 +1,78 @@ +from mock import Mock, patch, call + +from gppylib.test.unit.gp_unittest import GpTestCase, run_tests +from gppylib.commands.base import CommandResult, ExecutionError +from gppylib.programs.clsRecoverSegment import GpRecoverSegmentProgram + +class RecoverSegmentsTestCase(GpTestCase): + def setUp(self): + mock_logger = Mock(spec=['log', 'warn', 'info', 'debug', 'error', 'warning', 'fatal']) + + self.apply_patches([ + patch('gppylib.programs.clsRecoverSegment.logger', return_value=mock_logger), + ]) + + self.mock_logger = self.get_mock_from_apply_patch('logger') + + # Mock WorkerPool + self.mock_pool = Mock() + self.mock_pool.isDone.side_effect = [False, True] + self.obj = GpRecoverSegmentProgram(Mock()) + self.obj._GpRecoverSegmentProgram__pool = self.mock_pool + + def tearDown(self): + super(RecoverSegmentsTestCase, self).tearDown() + + @patch('gppylib.programs.clsRecoverSegment.Command.run') + def test_shutdown_runs_successfully_single_host(self, mock1): + self.obj.shutdown(['sdw1']) + + self.mock_logger.debug.assert_called_once_with("Terminating recovery process on host sdw1") + self.assertEqual(mock1.call_count, 1) + + @patch('gppylib.programs.clsRecoverSegment.Command.run') + def test_shutdown_runs_successfully_multiple_hosts(self, mock1): + self.obj.shutdown(['sdw1', 'sdw2', 'sdw3']) + + self.mock_logger.debug.assert_any_call("Terminating recovery process on host sdw1") + self.mock_logger.debug.assert_any_call("Terminating recovery process on host sdw2") + self.mock_logger.debug.assert_any_call("Terminating recovery process on host sdw3") + + self.assertEqual(mock1.call_count, 3) + + @patch('gppylib.programs.clsRecoverSegment.ExecutionError.__str__', return_value="Error getting recovery PID") + def test_shutdown_logs_exception_on_single_host(self, mock1): + + def mock_func(*args, **kwargs): + cmd = args[0] + if cmd.remoteHost == "sdw2": + raise ExecutionError("Error getting recovery PID", cmd) + + with patch('gppylib.programs.clsRecoverSegment.Command.run', mock_func): + 
self.obj.shutdown(['sdw1', 'sdw2', 'sdw3']) + + self.mock_logger.debug.assert_has_calls([call("Terminating recovery process on host sdw1"), + call("Terminating recovery process on host sdw2"), + call("Terminating recovery process on host sdw3")]) + self.mock_logger.error.assert_called_once_with("Not able to terminate recovery process on host sdw2: Error getting recovery PID") + + @patch('gppylib.programs.clsRecoverSegment.ExecutionError.__str__', return_value="Error getting recovery PID") + def test_shutdown_logs_exception_on_multiple_host(self, mock1): + + def mock_func(*args, **kwargs): + cmd = args[0] + if cmd.remoteHost in ["sdw1", "sdw3"]: + raise ExecutionError("Error getting recovery PID", cmd) + + with patch('gppylib.programs.clsRecoverSegment.Command.run', mock_func): + self.obj.shutdown(['sdw1', 'sdw2', 'sdw3']) + + self.mock_logger.debug.assert_has_calls([call("Terminating recovery process on host sdw1"), + call("Terminating recovery process on host sdw2"), + call("Terminating recovery process on host sdw3")]) + self.mock_logger.error.assert_has_calls([call("Not able to terminate recovery process on host sdw1: Error getting recovery PID"), + call("Not able to terminate recovery process on host sdw3: Error getting recovery PID")]) + + +if __name__ == '__main__': + run_tests() diff --git a/gpMgmt/sbin/gpsegrecovery.py b/gpMgmt/sbin/gpsegrecovery.py index 9be43ef3df..6b9f31bf3e 100644 --- a/gpMgmt/sbin/gpsegrecovery.py +++ b/gpMgmt/sbin/gpsegrecovery.py @@ -1,11 +1,15 @@ #!/usr/bin/env python3 +import os +import signal + from gppylib.recoveryinfo import RecoveryErrorType from gppylib.commands.pg import PgBaseBackup, PgRewind from recovery_base import RecoveryBase, set_recovery_cmd_results from gppylib.commands.base import Command from gppylib.commands.gp import SegmentStart from gppylib.gparray import Segment +from gppylib.commands.unix import terminate_proc_tree class FullRecovery(Command): @@ -108,6 +112,17 @@ class SegRecovery(object): def main(self): recovery_base = RecoveryBase(__file__) + + def signal_handler(sig, frame): + recovery_base.logger.warning("Recieved termination signal, stopping gpsegrecovery") + + while not recovery_base.pool.isDone(): + + # gpsegrecovery will be the parent for all the child processes (pg_basebackup/pg_rewind/rsync) + terminate_proc_tree(pid=os.getpid(), include_parent=False) + + signal.signal(signal.SIGTERM, signal_handler) + recovery_base.main(self.get_recovery_cmds(recovery_base.seg_recovery_info_list, recovery_base.options.forceoverwrite, recovery_base.logger, recovery_base.options.era)) diff --git a/gpMgmt/sbin/recovery_base.py b/gpMgmt/sbin/recovery_base.py index 4f9eb6e1d7..36bbb0d13b 100644 --- a/gpMgmt/sbin/recovery_base.py +++ b/gpMgmt/sbin/recovery_base.py @@ -19,6 +19,7 @@ class RecoveryBase(object): self.logger = None self.seg_recovery_info_list = None self.options = None + self.pool = None try: self.parseargs() except Exception as e: @@ -63,19 +64,18 @@ class RecoveryBase(object): raise Exception('No segment configuration values found in --confinfo argument') def main(self, cmd_list): - pool = None try: # TODO: should we output the name of the exact file? 
self.logger.info("Starting recovery with args: %s" % ' '.join(sys.argv[1:])) - pool = WorkerPool(numWorkers=min(self.options.batch_size, len(cmd_list))) - self.run_cmd_list(cmd_list, self.logger, self.options, pool) + self.pool = WorkerPool(numWorkers=min(self.options.batch_size, len(cmd_list))) + self.run_cmd_list(cmd_list, self.logger, self.options, self.pool) sys.exit(0) except Exception as e: self._write_to_stderr_and_exit(e) finally: - if pool: - pool.haltWork() + if self.pool: + self.pool.haltWork() def _write_to_stderr_and_exit(self, e): if self.logger: diff --git a/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature b/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature index fc19962e22..b17c37e61f 100644 --- a/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature +++ b/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature @@ -545,6 +545,51 @@ Feature: gprecoverseg tests And verify that lines from recovery_progress.file are present in segment progress files in gpAdminLogs And the cluster is rebalanced + @demo_cluster + @concourse_cluster + Scenario: SIGINT on gprecoverseg differential recovery should delete the progress file + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2 + And user can start transactions + When the user asynchronously runs "gprecoverseg -a --differential" and the process is saved + Then the user waits until recovery_progress.file is created in gpAdminLogs and verifies its format + Then verify if the gprecoverseg.lock directory is present in coordinator_data_directory + When the user asynchronously sets up to end gprecoverseg process with SIGINT + And the user waits until saved async process is completed + Then recovery_progress.file should not exist in gpAdminLogs + Then the gprecoverseg lock directory is removed + And the user waits until mirror on content 0,1,2 is up + And the cluster is rebalanced + + + @demo_cluster + @concourse_cluster + Scenario: SIGKILL on gprecoverseg should not display progress in gpstate -e + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2 + And user can start transactions + And sql "DROP TABLE IF EXISTS test_recoverseg; CREATE TABLE test_recoverseg AS SELECT generate_series(1,100000000) AS a;" is executed in "postgres" db + And the user suspend the walsender on the primary on content 0 + When the user asynchronously runs "gprecoverseg -aF" and the process is saved + Then the user waits until recovery_progress.file is created in gpAdminLogs and verifies its format + Then verify if the gprecoverseg.lock directory is present in coordinator_data_directory + When the user runs "gpstate -e" + Then gpstate should print "Segments in recovery" to stdout + When the user asynchronously sets up to end gprecoverseg process with SIGKILL + And the user waits until saved async process is completed + When the user runs "gpstate -e" + Then gpstate should not print "Segments in recovery" to stdout + Then the user reset the walsender on the primary on content 0 + And the user waits until mirror on content 0,1,2 is up + And the gprecoverseg lock directory is removed + And the cluster is rebalanced + @demo_cluster @concourse_cluster Scenario: gprecoverseg mixed 
recovery segments come up even if one basebackup takes longer @@ -603,6 +648,209 @@ Feature: gprecoverseg tests And the cluster is recovered in full and rebalanced And the row count from table "test_recoverseg" in "postgres" is verified against the saved data + @demo_cluster + Scenario Outline: gprecoverseg differential recovery segments come up even if recovery for one segment fails + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2 + And user can start transactions + And sql "DROP TABLE if exists test_recoverseg; CREATE TABLE test_recoverseg AS SELECT generate_series(1,10000) AS i" is executed in "postgres" db + And the "test_recoverseg" table row count in "postgres" is saved + And a temporary directory with mode '0000' is created under data_dir of primary with content 0 + When the user runs "gprecoverseg -av --differential" + Then gprecoverseg should return a return code of 1 + And user can start transactions + And check if differential recovery failed for mirrors with content 0 for gprecoverseg + And gprecoverseg should print "Failed to recover the following segments. You must run either gprecoverseg --differential or gprecoverseg -F for all differential failures" to stdout + And verify that mirror on content 1,2 is up + And the segments are synchronized for content 1,2 + And gpAdminLogs directory has no "pg_basebackup*" files on all segment hosts + And gpAdminLogs directory has "gpsegsetuprecovery*" files on all segment hosts + And gpAdminLogs directory has "gpsegrecovery*" files on all segment hosts + And the temporary directory is removed + And the cluster is recovered <args> and rebalanced + And the row count from table "test_recoverseg" in "postgres" is verified against the saved data + + Examples: + | scenario | args | + | differential | using differential | + | full | in full | + + @concourse_cluster + Scenario: Propagating env var + Given the database is running + And An entry to send SUSPEND_PG_REWIND env var is added on all hosts of cluster + And An entry to accept SUSPEND_PG_REWIND env var is added on all hosts of cluster + + @concourse_cluster + Scenario: gprecoverseg gives warning if pg_rewind already running for one failed segments + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 2 + And user can start transactions + And the environment variable "SUSPEND_PG_REWIND" is set to "600" + And the user asynchronously runs "gprecoverseg -a" and the process is saved + Then the user just waits until recovery_progress.file is created in gpAdminLogs + And verify that mirror on content 2 is down + And the gprecoverseg lock directory is removed + And user immediately stops all primary processes for content 0,1 + And the user waits until mirror on content 0,1 is down + And an FTS probe is triggered + And user can start transactions + And "SUSPEND_PG_REWIND" environment variable should be restored + When the user runs "gprecoverseg -a" + Then gprecoverseg should return a return code of 0 + And gprecoverseg should print "Found pg_rewind running for segments with contentIds [2], skipping recovery of these segments" to logfile + And verify that mirror on content 2 is down + And verify that mirror 
on content 0,1 is up + And pg_rewind is killed on mirror with content 2 + And the user asynchronously sets up to end gprecoverseg process with SIGKILL + And the gprecoverseg lock directory is removed + And verify that mirror on content 2 is down + And the user runs "gprecoverseg -a" + And gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2 is up + And the cluster is rebalanced + + @concourse_cluster + Scenario: gprecoverseg gives warning if pg_rewind already running for some failed segments + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 2,3 + And user can start transactions + And the environment variable "SUSPEND_PG_REWIND" is set to "600" + And the user asynchronously runs "gprecoverseg -a" and the process is saved + Then the user just waits until recovery_progress.file is created in gpAdminLogs + And verify that mirror on content 2,3 is down + And the gprecoverseg lock directory is removed + And user immediately stops all primary processes for content 0,1 + And the user waits until mirror on content 0,1 is down + And an FTS probe is triggered + And user can start transactions + And "SUSPEND_PG_REWIND" environment variable should be restored + When the user runs "gprecoverseg -a" + Then gprecoverseg should return a return code of 0 + And gprecoverseg should print "Found pg_rewind running for segments with contentIds [2, 3], skipping recovery of these segments" to logfile + And verify that mirror on content 2,3 is down + And verify that mirror on content 0,1 is up + And pg_rewind is killed on mirror with content 2,3 + And the user asynchronously sets up to end gprecoverseg process with SIGKILL + And the gprecoverseg lock directory is removed + And verify that mirror on content 2,3 is down + And the user runs "gprecoverseg -a" + And gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2,3 is up + And the cluster is rebalanced + + @concourse_cluster + Scenario: gprecoverseg gives warning if pg_rewind already running for all failed segments + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2,3 + And user can start transactions + And the environment variable "SUSPEND_PG_REWIND" is set to "600" + And the user asynchronously runs "gprecoverseg -a" and the process is saved + Then the user just waits until recovery_progress.file is created in gpAdminLogs + And verify that mirror on content 0,1,2,3 is down + And the gprecoverseg lock directory is removed + And an FTS probe is triggered + And user can start transactions + When the user runs "gprecoverseg -aF" + Then gprecoverseg should return a return code of 0 + And gprecoverseg should print "Found pg_rewind running for segments with contentIds [0, 1, 2, 3], skipping recovery of these segments" to logfile + And verify that mirror on content 0,1,2,3 is down + And pg_rewind is killed on mirror with content 0,1,2,3 + And the user asynchronously sets up to end gprecoverseg process with SIGKILL + And the gprecoverseg lock directory is removed + And verify that mirror on content 0,1,2,3 is down + And "SUSPEND_PG_REWIND" environment variable should be restored + And the user runs "gprecoverseg 
-a" + And gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2,3 is up + And the cluster is rebalanced + + @concourse_cluster + Scenario: gprecoverseg -i gives warning if pg_rewind already running for some of the failed segments + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1 + And user can start transactions + And the environment variable "SUSPEND_PG_REWIND" is set to "600" + And the user asynchronously runs "gprecoverseg -a" and the process is saved + And the user just waits until recovery_progress.file is created in gpAdminLogs + And verify that mirror on content 0,1 is down + And the gprecoverseg lock directory is removed + And user immediately stops all primary processes for content 2,3 + And the user waits until mirror on content 2,3 is down + And an FTS probe is triggered + And user can start transactions + Then "SUSPEND_PG_REWIND" environment variable should be restored + Given a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + And edit the input file to recover mirror with content 0 full inplace + And edit the input file to recover mirror with content 1 incremental + And edit the input file to recover mirror with content 2 incremental + And edit the input file to recover mirror with content 3 incremental + When the user runs gprecoverseg with input file and additional args "-a" + Then gprecoverseg should return a return code of 0 + And gprecoverseg should print "Found pg_rewind running for segments with contentIds [0, 1], skipping recovery of these segments" to logfile + And verify that mirror on content 2,3 is up + And verify that mirror on content 0,1 is down + And pg_rewind is killed on mirror with content 0,1 + And the user asynchronously sets up to end gprecoverseg process with SIGKILL + And the gprecoverseg lock directory is removed + And verify that mirror on content 0,1 is down + And the user runs "gprecoverseg -a" + And gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2 is up + And the cluster is rebalanced + + @concourse_cluster + Scenario: gprecoverseg -i gives warning if pg_rewind already running for all of the failed segments + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2,3 + And user can start transactions + And the environment variable "SUSPEND_PG_REWIND" is set to "600" + And the user asynchronously runs "gprecoverseg -a" and the process is saved + And the user just waits until recovery_progress.file is created in gpAdminLogs + And verify that mirror on content 0,1,2,3 is down + And the gprecoverseg lock directory is removed + And an FTS probe is triggered + And user can start transactions + Given a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + And edit the input file to recover mirror with content 0 full inplace + And edit the input file to recover mirror with content 1 full inplace + And edit the input file to recover mirror with content 2 incremental + And edit the input file to recover mirror with content 3 incremental + When the user runs gprecoverseg with input file 
and additional args "-a" + Then gprecoverseg should return a return code of 0 + And gprecoverseg should print "Found pg_rewind running for segments with contentIds [0, 1, 2, 3], skipping recovery of these segments" to logfile + And verify that mirror on content 0,1,2,3 is down + And pg_rewind is killed on mirror with content 0,1,2,3 + And the user asynchronously sets up to end gprecoverseg process with SIGKILL + And the gprecoverseg lock directory is removed + And verify that mirror on content 0,1,2,3 is down + Then "SUSPEND_PG_REWIND" environment variable should be restored + And the user runs "gprecoverseg -a" + And gprecoverseg should return a return code of 0 + And verify that mirror on content 0,1,2,3 is up + And the cluster is rebalanced + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + @demo_cluster @concourse_cluster Scenario: gprecoverseg mixed recovery one basebackup fails and one rewind fails while others succeed @@ -1018,3 +1266,517 @@ Feature: gprecoverseg tests And all the segments are running And the segments are synchronized + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg should not terminate on SIGINT when user selects No in the prompt + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0 + And user can start transactions + And the user suspend the walsender on the primary on content 0 + And sql "DROP TABLE IF EXISTS test_recoverseg; CREATE TABLE test_recoverseg AS SELECT generate_series(1,10000) AS a;" is executed in "postgres" db + And the "test_recoverseg" table row count in "postgres" is saved + When the user would run "gprecoverseg -aF" and terminate the process with SIGINT and selects "n" without delay + Then gprecoverseg should return a return code of 0 + And gprecoverseg should print "[WARNING]:-Recieved SIGINT signal, terminating gprecoverseg" escaped to stdout + And gprecoverseg should print "Continue terminating gprecoverseg" to stdout + And gprecoverseg should print "Segments successfully recovered" to stdout + And the user waits until mirror on content 0 is up + And the segments are synchronized + And the cluster is rebalanced + And the row count from table "test_recoverseg" in "postgres" is verified against the saved data + + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg should terminate on SIGINT when user selects Yes in the prompt + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0 + And user can start transactions + And the user suspend the walsender on the primary on content 0 + When the user would run "gprecoverseg -aF" and terminate the process with SIGINT and selects "y" without delay + Then gprecoverseg should return a return code of 1 + And gprecoverseg should print "[WARNING]:-Recieved SIGINT signal, terminating gprecoverseg" escaped to stdout + And gprecoverseg should print "Continue terminating gprecoverseg" to stdout + And gprecoverseg should print "[ERROR]:-gprecoverseg process was interrupted by the user." escaped to stdout + And gprecoverseg should print "[ERROR]:-gprecoverseg failed. Please check the output for more details." 
escaped to stdout + And gprecoverseg should print "full" errors to logfile for content 0 + And the user reset the walsender on the primary on content 0 + And verify that pg_basebackup is not running for content 0 + And verify that mirror on content 0 is down + And recovery_progress.file should not exist in gpAdminLogs + When the user runs "gprecoverseg -aF" + Then gprecoverseg should return a return code of 0 + And user can start transactions + And all the segments are running + And the segments are synchronized + And the cluster is rebalanced + + + @demo_cluster + @concourse_cluster + Scenario Outline: gprecoverseg should terminate gracefully on SIGTERM for <scenario> of the segments + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content <content> + And user can start transactions + And the user suspend the walsender on the primary on content <content> + When the user asynchronously runs "gprecoverseg -aF" and the process is saved + Then verify that pg_basebackup is running for content <content> + Then verify that pg_rewind is not running for content <content> + And the user asynchronously sets up to end gprecoverseg process with SIGTERM + And the user waits until saved async process is completed + Then gprecoverseg should print "[WARNING]:-Recieved SIGTERM signal, terminating gprecoverseg" to logfile + And gprecoverseg should print "[ERROR]:-gprecoverseg process was interrupted by the user." to logfile + And gprecoverseg should print "[ERROR]:-gprecoverseg failed. Please check the output for more details." to logfile + And gprecoverseg should print "full" errors to logfile for content <content> + And the user reset the walsender on the primary on content <content> + And verify that pg_basebackup is not running for content <content> + And verify that mirror on content <content> is down + When the user runs "gprecoverseg -aF" + Then gprecoverseg should return a return code of 0 + And user can start transactions + And all the segments are running + And the segments are synchronized + And the cluster is rebalanced + + Examples: + | scenario | content | + | one | 0 | + | some | 0,1 | + | all | 0,1,2 | + + + @concourse_cluster + Scenario: gprecoverseg should terminate for mixed recovery of mirrors + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1 + And user can start transactions + And the user suspend the walsender on the primary on content 1 + And the environment variable "SUSPEND_PG_REWIND" is set to "600" + And a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + And edit the input file to recover mirror with content 0 incremental + And edit the input file to recover mirror with content 1 full inplace + When the user asynchronously runs gprecoverseg with input file and additional args "-a" and the process is saved + Then verify that pg_basebackup is running for content 1 + And verify that pg_rewind is running for content 0 + And the user asynchronously sets up to end gprecoverseg process with SIGTERM + And the user waits until saved async process is completed + Then gprecoverseg should print "[WARNING]:-Recieved SIGTERM signal, terminating 
gprecoverseg" to logfile + And gprecoverseg should print "[ERROR]:-gprecoverseg process was interrupted by the user." to logfile + And gprecoverseg should print "incremental" errors to logfile for content 0 + And gprecoverseg should print "full" errors to logfile for content 1 + And the user reset the walsender on the primary on content 1 + And "SUSPEND_PG_REWIND" environment variable should be restored + And verify that pg_basebackup is not running for content 1 + And verify that pg_rewind is not running for content 0 + And verify that mirror on content 0,1 is down + When the user runs "gprecoverseg -aF" + Then gprecoverseg should return a return code of 0 + And user can start transactions + And all the segments are running + And the segments are synchronized + And the cluster is rebalanced + + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg should terminate gracefully on SIGTERM when running differential recovery + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0 + And user can start transactions + When the user asynchronously runs "gprecoverseg -a --differential" and the process is saved + Then verify that differential is running for content 0 + And the user asynchronously sets up to end gprecoverseg process with SIGTERM + And the user waits until saved async process is completed + Then gprecoverseg should print "[WARNING]:-Recieved SIGTERM signal, terminating gprecoverseg" to logfile + And gprecoverseg should print "[ERROR]:-gprecoverseg process was interrupted by the user." to logfile + And gprecoverseg should print "[ERROR]:-gprecoverseg differential recovery failed. Please check the gpsegrecovery.py log file and rsync log file for more details." 
to logfile + And gprecoverseg should print "differential" errors to logfile for content 0 + And verify that mirror on content 0 is down + When the user runs "gprecoverseg -aF" + Then gprecoverseg should return a return code of 0 + And user can start transactions + And all the segments are running + And the segments are synchronized + And the cluster is rebalanced + + + @concourse_cluster + Scenario: gprecoverseg should revert catalog changes upon termination + Given the database is running + And all the segments are running + And the segments are synchronized + And the information of contents 0,1,2 is saved + And all files in gpAdminLogs directory are deleted on hosts cdw,sdw1,sdw2 + And the "primary" segment information is saved + + And the primary on content 0 is stopped + And user can start transactions + And the status of the primary on content 0 should be "d" + And user can start transactions + And the user suspend the walsender on the primary on content 0 + + And a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + And edit the input file to recover mirror with content 0 to a new directory on remote host with mode 0700 + When the user asynchronously runs gprecoverseg with input file and additional args "-a" and the process is saved + Then verify that pg_basebackup is running for content 0 + And the user asynchronously sets up to end gprecoverseg process with SIGTERM + And the user waits until saved async process is completed + Then gprecoverseg should print "[WARNING]:-Recieved SIGTERM signal, terminating gprecoverseg" to logfile + And gprecoverseg should print "[ERROR]:-gprecoverseg process was interrupted by the user." to logfile + And gprecoverseg should print "full" errors to logfile for content 0 + + Then the contents 0,1,2 should have their original data directory in the system configuration + And the gp_configuration_history table should contain a backout entry for the primary segment for contents 0 + And the user reset the walsender on the primary on content 0 + And verify that pg_basebackup is not running for content 0 + And verify that mirror on content 0 is down + + When the user runs "gprecoverseg -aF" + Then gprecoverseg should return a return code of 0 + And user can start transactions + And all the segments are running + And the segments are synchronized + And the cluster is rebalanced + + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg should ignore SIGHUP and continue recovery + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0 + And user can start transactions + And the user suspend the walsender on the primary on content 0 + When the user asynchronously runs "gprecoverseg -aF" and the process is saved + Then the user just waits until recovery_progress.file is created in gpAdminLogs + And the user asynchronously sets up to end gprecoverseg process with SIGHUP + And the user reset the walsender on the primary on content 0 + And the user waits until saved async process is completed + And gprecoverseg should print "Segments successfully recovered" to logfile + And verify that mirror on content 0 is up + And the segments are synchronized + And the cluster is rebalanced + + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg should terminate gracefully when parallelism is limited + Given the 
database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2 + And user can start transactions + And the user suspend the walsender on the primary on content 0,1,2 + When the user asynchronously runs "gprecoverseg -aF -b 1 -B 1" and the process is saved + Then verify that pg_basebackup is running for content 0 + And the user asynchronously sets up to end gprecoverseg process with SIGTERM + And the user waits until saved async process is completed + Then gprecoverseg should print "[WARNING]:-Recieved SIGTERM signal, terminating gprecoverseg" to logfile + And gprecoverseg should print "[ERROR]:-gprecoverseg process was interrupted by the user." to logfile + And gprecoverseg should print "[ERROR]:-gprecoverseg failed. Please check the output for more details." to logfile + And gprecoverseg should print "full" errors to logfile for content 0,1,2 + And the user reset the walsender on the primary on content 0,1,2 + And verify that pg_basebackup is not running for content 0,1,2 + And verify that mirror on content 0,1,2 is down + When the user runs "gprecoverseg -aF" + Then gprecoverseg should return a return code of 0 + And user can start transactions + And all the segments are running + And the segments are synchronized + And the cluster is rebalanced + + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg should only terminate the remaining recoveries if some of them are completed + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1,2 + And user can start transactions + And the user suspend the walsender on the primary on content 1,2 + When the user asynchronously runs "gprecoverseg -aF" and the process is saved + And the user waits until mirror on content 0 is up + And the user asynchronously sets up to end gprecoverseg process with SIGTERM + And the user waits until saved async process is completed + Then gprecoverseg should print "[WARNING]:-Recieved SIGTERM signal, terminating gprecoverseg" to logfile + And gprecoverseg should print "[ERROR]:-gprecoverseg process was interrupted by the user." to logfile + And gprecoverseg should print "[ERROR]:-gprecoverseg failed. Please check the output for more details." 
to logfile + And gprecoverseg should print "full" errors to logfile for content 1,2 + And the user reset the walsender on the primary on content 1,2 + And verify that pg_basebackup is not running for content 1,2 + And verify that mirror on content 0 is up + And verify that mirror on content 1,2 is down + When the user runs "gprecoverseg -aF" + Then gprecoverseg should return a return code of 0 + And user can start transactions + And all the segments are running + And the segments are synchronized + And the cluster is rebalanced + + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg should show appropriate log when recovery has been completed before termination happens + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0 + And user can start transactions + When the user would run "gprecoverseg -aF" and terminate the process with SIGINT and selects "y" with delay + Then gprecoverseg should return a return code of 0 + And gprecoverseg should print "[WARNING]:-Recieved SIGINT signal, terminating gprecoverseg" escaped to stdout + And gprecoverseg should print "Continue terminating gprecoverseg" to stdout + And gprecoverseg should print "[INFO]:-Not able to terminate the recovery process since it has been completed successfully" escaped to stdout + And verify that mirror on content 0 is up + And recovery_progress.file should not exist in gpAdminLogs + When the user runs "gprecoverseg -aF" + Then gprecoverseg should return a return code of 0 + And user can start transactions + And all the segments are running + And the segments are synchronized + And the cluster is rebalanced + + + @demo_cluster + @concourse_cluster + Scenario: gprecoverseg should terminate when interrupted during setup recovery process + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0 + And user can start transactions + And the user runs psql with "-c "CREATE EXTENSION IF NOT EXISTS gp_inject_fault;"" against database "postgres" + And the user runs psql with "-c "SELECT gp_inject_fault('checkpoint', 'sleep', '', '', '', 1, -1, 3600, dbid) FROM gp_segment_configuration where content=0 and role='p'"" against database "postgres" + When the user asynchronously runs "gprecoverseg -a" and the process is saved + And the user asynchronously sets up to end gprecoverseg process when "Setting up the required segments for recovery" is printed in the logs + And the user waits until saved async process is completed + Then gprecoverseg should print "[WARNING]:-Recieved SIGTERM signal, terminating gprecoverseg" to logfile + Then gprecoverseg should print "[ERROR]:-Unable to parse recovery error" to logfile + And the user runs psql with "-c "SELECT gp_inject_fault('checkpoint', 'reset', '', '', '', 1, -1, 3600, dbid) FROM gp_segment_configuration where content=0 and role='p'"" against database "postgres" + And verify that mirror on content 0 is down + When the user runs "gprecoverseg -aF" + Then gprecoverseg should return a return code of 0 + And user can start transactions + And all the segments are running + And the segments are synchronized + And the cluster is rebalanced + + + @concourse_cluster + Scenario: gprecoverseg rebalance should be 
able to terminate gracefully when interrupted by the user + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0,1 + And user can start transactions + When the user runs "gprecoverseg -aF" + Then gprecoverseg should return a return code of 0 + And user can start transactions + And all the segments are running + And the segments are synchronized + Given the environment variable "SUSPEND_PG_REWIND" is set to "600" + When the user asynchronously runs "gprecoverseg -ar" and the process is saved + And the user just waits until recovery_progress.file is created in gpAdminLogs + Then verify that pg_rewind is running for content 0 + When the user asynchronously sets up to end gprecoverseg process with SIGTERM + And the user waits until saved async process is completed + Then gprecoverseg should print "[WARNING]:-Recieved SIGTERM signal, terminating gprecoverseg" to logfile + And gprecoverseg should print "[ERROR]:-gprecoverseg process was interrupted by the user." to logfile + And gprecoverseg should print "[ERROR]:-gprecoverseg failed. Please check the output for more details." to logfile + And gprecoverseg should print "[ERROR]:-Failed to start the synchronization step of the segment rebalance" to logfile + And gprecoverseg should print "incremental" errors to logfile for content 0,1 + And "SUSPEND_PG_REWIND" environment variable should be restored + And verify that pg_rewind is not running for content 0,1 + And verify that mirror on content 0,1 is down + When the user runs "gprecoverseg -aF" + Then gprecoverseg should return a return code of 0 + And user can start transactions + And all the segments are running + And the segments are synchronized + And the cluster is rebalanced + + + @concourse_cluster + Scenario: gprecoverseg recovery to new host populates hostname and address from the config file correctly + Given the database is running + And all the segments are running + And the segments are synchronized + And the information of a "primary" segment on a remote host is saved + And the gprecoverseg input file "recover-config.conf" is cleaned up + When user kills a "primary" process with the saved information + And user can start transactions + Then the saved "primary" segment is marked down in config + When a gprecoverseg input file "recover-config.conf" is created with added parameter hostname to recover the failed segment on new host + And the user runs "gprecoverseg -i /tmp/recover-config.conf -a -v" + Then gprecoverseg should return a return code of 0 + When check hostname and address updated on segment configuration with the saved information + And all the segments are running + And the segments are synchronized + + @concourse_cluster + Scenario: gprecoverseg recovery to same host (full inplace) populates hostname and address from the config file correctly + Given the database is running + And all the segments are running + And the segments are synchronized + And the information of a "primary" segment on a remote host is saved + And the gprecoverseg input file "recover-config.conf" is cleaned up + When user kills a "primary" process with the saved information + And user can start transactions + Then the saved "primary" segment is marked down in config + When a gprecoverseg input file "recover-config.conf" is created with hostname parameter to recover the failed segment on same host + And the user 
runs "gprecoverseg -i /tmp/recover-config.conf -a -v" + Then gprecoverseg should return a return code of 0 + When check hostname and address updated on segment configuration with the saved information + And all the segments are running + And the segments are synchronized + + + @concourse_cluster + Scenario: gprecoverseg recovery with invalid format with hostname in config file + Given the database is running + And all the segments are running + And the segments are synchronized + And the information of a "primary" segment on a remote host is saved + And the gprecoverseg input file "recover-config-invalid.conf" is cleaned up + When user kills a "primary" process with the saved information + And user can start transactions + Then the saved "primary" segment is marked down in config + When a gprecoverseg input file "recover-config-invalid.conf" is created with invalid format for inplace full recovery of failed segment + And the user runs "gprecoverseg -i /tmp/recover-config-invalid.conf -a -v" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "line 1 of file /tmp/recover-config-invalid.conf: expected equal parts, either 3 or 4 on both segment group, obtained 4 on group1 and 3 on group2" to stdout + Then the user runs "gprecoverseg -a" + Then gprecoverseg should return a return code of 0 + And the cluster is rebalanced + + + @concourse_cluster + Scenario: gprecoverseg incremental recovery populates hostname and address from the config file correctly + Given the database is running + And all the segments are running + And the segments are synchronized + And the information of a "primary" segment on a remote host is saved + And the gprecoverseg input file "recover-config.conf" is cleaned up + When user kills a "primary" process with the saved information + And user can start transactions + Then the saved "primary" segment is marked down in config + When a gprecoverseg input file "recover-config.conf" is created with hostname parameter matches with segment configuration table for incremental recovery of failed segment + And the user runs "gprecoverseg -i /tmp/recover-config.conf -a -v" + Then gprecoverseg should return a return code of 0 + When check hostname and address updated on segment configuration with the saved information + And all the segments are running + And the segments are synchronized + + @concourse_cluster + Scenario: gprecoverseg recovery with and without hostname parameter in config file + Given the database is running + And all the segments are running + And the segments are synchronized + And user stops all primary processes + And user can start transactions + And the gprecoverseg input file "recover-config.conf" is cleaned up + When a gprecoverseg input file "recover-config.conf" is created with and without parameter hostname to recover all the failed segments + And the user runs "gprecoverseg -i /tmp/recover-config.conf -a -F -v" + Then gprecoverseg should return a return code of 0 + And all the segments are running + And the segments are synchronized + + @concourse_cluster + Scenario: gprecoverseg throws warning and skips recovery if provided hostname and address can not be resolved to same host + Given the database is running + And all the segments are running + And the segments are synchronized + And the information of a "primary" segment on a remote host is saved + And user stops all primary processes + And user can start transactions + And the gprecoverseg input file "recover-config.conf" is cleaned up + When a gprecoverseg input file 
"recover-config.conf" created with invalid failover hostname for full recovery of failed segment + And the user runs "gprecoverseg -i /tmp/recover-config.conf -a -F -v" + Then gprecoverseg should return a return code of 0 + And gprecoverseg should print a "Not able to co-relate hostname:.* with address.*Skipping recovery for segments with contentId" warning + Then the user runs "gprecoverseg -a" + Then gprecoverseg should return a return code of 0 + And all the segments are running + And the segments are synchronized + + @concourse_cluster + Scenario: gprecoverseg incremental recovery fails if config file contains wrong hostname of failed segment + Given the database is running + And all the segments are running + And the segments are synchronized + And the information of a "primary" segment on a remote host is saved + And the gprecoverseg input file "recover-config.conf" is cleaned up + When user kills a "primary" process with the saved information + And user can start transactions + Then the saved "primary" segment is marked down in config + When a gprecoverseg input file "recover-config.conf" is created with invalid hostname parameter that does not matches with the segment configuration table hostname + And the user runs "gprecoverseg -i /tmp/recover-config.conf -a -v" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "A segment to recover was not found in configuration. This segment is described by hostname|address|port|directory .*'" to stdout + Then the user runs "gprecoverseg -a" + And gprecoverseg should return a return code of 0 + And the cluster is rebalanced + + @demo_cluster + Scenario: gprecoverseg recovers segment when config file contains hostname on demo cluster + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0 + And user can start transactions + And a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + And edit the hostsname input file to recover segment with content 0 full inplace + And update /etc/hosts file with address for the localhost + When the user runs gprecoverseg with input file and additional args "-a" + And gprecoverseg should return a return code of 0 + And restore /etc/hosts file and cleanup hostlist file + And the cluster configuration has no segments where "content=0 and status='d'" + Then the cluster is rebalanced + + @demo_cluster + Scenario: gprecoverseg skips recovery when config file contains invalid hostname on demo cluster + Given the database is running + And all the segments are running + And the segments are synchronized + And all files in gpAdminLogs directory are deleted on all hosts in the cluster + And user immediately stops all primary processes for content 0 + And user can start transactions + And a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + And edit the hostsname input file to recover segment with content 0 with invalid hostname + When the user runs gprecoverseg with input file and additional args "-a" + And gprecoverseg should print a "Could not resolve hostname:invalid_host" warning + And gprecoverseg should print a "Not able to co-relate hostname:invalid_host with address:.*Skipping recovery for segments with contentId" warning + And gprecoverseg should print "No segments to recover" to 
stdout + And gprecoverseg should return a return code of 0 + And the user runs "gprecoverseg -a -v" + Then gprecoverseg should return a return code of 0 + And the cluster is rebalanced diff --git a/gpMgmt/test/behave/mgmt_utils/gprecoverseg_newhost.feature b/gpMgmt/test/behave/mgmt_utils/gprecoverseg_newhost.feature index 65177ecb31..9204e701a2 100644 --- a/gpMgmt/test/behave/mgmt_utils/gprecoverseg_newhost.feature +++ b/gpMgmt/test/behave/mgmt_utils/gprecoverseg_newhost.feature @@ -78,7 +78,7 @@ Feature: gprecoverseg tests involving migrating to a new host When the user runs "gprecoverseg -a -p sdw5 --hba-hostnames" Then gprecoverseg should return a return code of 1 # And pg_hba file "/data/gpdata/mirror/gpseg0/pg_hba.conf" on host "sdw2" contains entries for "sdw5" - And check if moving the mirrors from sdw1 to sdw5 failed + And check if moving the mirrors from sdw1 to sdw5 failed without user termination And gprecoverseg should print "Recovery Target instance port = 20000" to stdout And gprecoverseg should print "Recovery Target instance port = 20001" to stdout And gprecoverseg should print "Recovery Target instance port = 20002" to stdout @@ -95,3 +95,141 @@ Feature: gprecoverseg tests involving migrating to a new host And the original cluster state is recreated for "one_host_down" And the cluster configuration is saved for "after_recreation" And the "before" and "after_recreation" cluster configuration matches with the expected for gprecoverseg newhost + + @concourse_cluster + Scenario: failed host is not in reach gprecoverseg recovery works well with all instances recovered + Given the database is running + And all the segments are running + And the segments are synchronized + And database "gptest" exists + And the cluster configuration is saved for "before" + And segment hosts "sdw1" are disconnected from the cluster and from the spare segment hosts "sdw5" + And the user runs psql with "-c 'SELECT gp_request_fts_probe_scan()'" against database "postgres" + And the cluster configuration has no segments where "hostname='sdw1' and status='u'" + And a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + And edit the input file to recover with content id 0 to host sdw5 + And edit the input file to recover with content id 1 to host sdw5 + And edit the input file to recover with content id 6 to host sdw5 + And edit the input file to recover with content id 7 to host sdw5 + When the user runs gprecoverseg with input file and additional args "-av" + Then gprecoverseg should return a return code of 0 + Then the original cluster state is recreated for "one_host_down-1" + And the cluster configuration is saved for "after_recreation" + And the "before" and "after_recreation" cluster configuration matches with the expected for gprecoverseg newhost + + @concourse_cluster + Scenario: failed host is not in reach gprecoverseg recovery works well with partial recovery + Given the database is running + And all the segments are running + And the segments are synchronized + And database "gptest" exists + And the cluster configuration is saved for "before" + And segment hosts "sdw1" are disconnected from the cluster and from the spare segment hosts "sdw5" + And the user runs psql with "-c 'SELECT gp_request_fts_probe_scan()'" against database "postgres" + And the cluster configuration has no segments where "hostname='sdw1' and status='u'" + And a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is 
created + And edit the input file to recover with content id 0 to host sdw5 + And edit the input file to recover with content id 6 to host sdw5 + When the user runs gprecoverseg with input file and additional args "-av" + Then gprecoverseg should return a return code of 0 + Then the original cluster state is recreated for "one_host_down-2" + And the cluster configuration is saved for "after_recreation" + And the "before" and "after_recreation" cluster configuration matches with the expected for gprecoverseg newhost + + @concourse_cluster + Scenario: failed host is not in reach gprecoverseg recovery works well only primaries are recovered + Given the database is running + And all the segments are running + And the segments are synchronized + And database "gptest" exists + And the cluster configuration is saved for "before" + And segment hosts "sdw1" are disconnected from the cluster and from the spare segment hosts "sdw5" + And the user runs psql with "-c 'SELECT gp_request_fts_probe_scan()'" against database "postgres" + And the cluster configuration has no segments where "hostname='sdw1' and status='u'" + And a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + And edit the input file to recover with content id 0 to host sdw5 + And edit the input file to recover with content id 1 to host sdw5 + When the user runs gprecoverseg with input file and additional args "-av" + Then gprecoverseg should return a return code of 0 + Then the original cluster state is recreated for "one_host_down-3" + And the cluster configuration is saved for "after_recreation" + And the "before" and "after_recreation" cluster configuration matches with the expected for gprecoverseg newhost + + + @concourse_cluster + Scenario: gprecoverseg -p should terminate gracefully on user termination + Given the database is running + And all the segments are running + And the segments are synchronized + And database "gptest" exists + And the user runs gpconfig sets guc "wal_sender_timeout" with "15s" + And the user runs "gpstop -air" + And the cluster configuration is saved for "before" + And segment hosts "sdw1" are disconnected from the cluster and from the spare segment hosts "sdw5" + And the cluster configuration has no segments where "hostname='sdw1' and status='u'" + And the cluster configuration is saved for "before_recoverseg" + And datadirs from "before_recoverseg" configuration for "sdw1" are created on "sdw5" with mode 700 + When the user would run "gprecoverseg -a -p sdw5 --hba-hostnames" and terminate the process with SIGTERM + Then gprecoverseg should return a return code of 1 + And check if moving the mirrors from sdw1 to sdw5 failed with user termination + And gprecoverseg should print "[WARNING]:-Recieved SIGTERM signal, terminating gprecoverseg" escaped to stdout + And gprecoverseg should print "[ERROR]:-gprecoverseg process was interrupted by the user." 
escaped to stdout + And the cluster configuration is saved for "after_backout" + And the "before_recoverseg" and "after_backout" cluster configuration matches for gprecoverseg newhost + When the user runs "gprecoverseg -a -p sdw5 --hba-hostnames" + Then gprecoverseg should return a return code of 0 + And check segment conf: postgresql.conf + And the cluster configuration is saved for "one_host_down" + And the "before" and "one_host_down" cluster configuration matches with the expected for gprecoverseg newhost + And the mirrors replicate and fail over and back correctly + And the cluster is rebalanced + And the original cluster state is recreated for "one_host_down" + And the cluster configuration is saved for "after_recreation" + And the "before" and "after_recreation" cluster configuration matches with the expected for gprecoverseg newhost + + + @concourse_cluster + Scenario: gprecoverseg removes the stale replication entries from pg_hba when moving mirrors to new host + Given the database is running + And all the segments are running + And the segments are synchronized + And the cluster configuration is saved for "before" + And saving host IP address of "sdw1" + And segment hosts "sdw1" are disconnected from the cluster and from the spare segment hosts "sdw5" + And the cluster configuration has no segments where "hostname='sdw1' and status='u'" + When the user runs "gprecoverseg -a -p sdw5" + Then gprecoverseg should return a return code of 0 + And all the segments are running + And pg_hba file on primary of mirrors on "sdw5" with "all" contains no replication entries for "sdw1" + And verify that only replication connection primary has is to "sdw5" + And the user runs "gprecoverseg -ar" + And segment hosts "sdw1" are reconnected to the cluster and to the spare segment hosts "none" + And segment hosts "sdw5" are disconnected from the cluster and from the spare segment hosts "none" + And the cluster configuration has no segments where "hostname='sdw5' and status='u'" + # making the cluster back to it's original state + And segment hosts "sdw5" are reconnected to the cluster and to the spare segment hosts "none" + And segment hosts "sdw1" are disconnected from the cluster and from the spare segment hosts "none" + Then the original cluster state is recreated for "one_host_down" + And the cluster configuration is saved for "after_recreation" + And the "before" and "after_recreation" cluster configuration matches with the expected for gprecoverseg newhost + + @concourse_cluster + Scenario: failover host is not in reach gprecoverseg recovery to new host skips + Given the database is running + And all the segments are running + And the segments are synchronized + And database "gptest" exists + And the cluster configuration is saved for "before" + And the primary on content 0 is stopped with the immediate flag + And segment hosts "sdw1,sdw5" are disconnected from the cluster and from the spare segment hosts "sdw6" + And the user runs psql with "-c 'SELECT gp_request_fts_probe_scan()'" against database "postgres" + And the cluster configuration has no segments where "hostname='sdw1' and status='u'" + And a gprecoverseg directory under '/tmp' with mode '0700' is created + And a gprecoverseg input file is created + And edit the input file to recover with content id 0 to host sdw5 + When the user runs gprecoverseg with input file and additional args "-av" + Then gprecoverseg should return a return code of 2 + And gprecoverseg should print "The recovery target segment sdw5 (content 0) is unreachable" 
escaped to stdout diff --git a/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py b/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py index 44cb6617e7..aa8b054497 100644 --- a/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py +++ b/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py @@ -386,10 +386,10 @@ def impl(context, content_ids): cmd.run(validateAfter=True) -@given('the user {action} the walsender on the {segment} on content {content}') -@when('the user {action} the walsender on the {segment} on content {content}') -@then('the user {action} the walsender on the {segment} on content {content}') -def impl(context, action, segment, content): +@given('the user {action} the walsender on the {segment} on content {content_ids}') +@when('the user {action} the walsender on the {segment} on content {content_ids}') +@then('the user {action} the walsender on the {segment} on content {content_ids}') +def impl(context, action, segment, content_ids): if segment == 'mirror': role = "'m'" elif segment == 'primary': @@ -400,7 +400,7 @@ def impl(context, action, segment, content): create_fault_query = "CREATE EXTENSION IF NOT EXISTS gp_inject_fault;" execute_sql('postgres', create_fault_query) - inject_fault_query = "SELECT gp_inject_fault_infinite('wal_sender_loop', '%s', dbid) FROM gp_segment_configuration WHERE content=%s AND role=%s;" % (action, content, role) + inject_fault_query = "SELECT gp_inject_fault_infinite('wal_sender_loop', '%s', dbid) FROM gp_segment_configuration WHERE content IN (%s) AND role=%s;" % (action, content_ids, role) execute_sql('postgres', inject_fault_query) return @@ -652,26 +652,16 @@ def impl(context, kill_process_name, log_msg, logfile_name): "fi; done" % (log_msg, logfile_name, kill_process_name) run_async_command(context, command) -@given('the user asynchronously sets up to end {process_name} process with SIGINT') -@when('the user asynchronously sets up to end {process_name} process with SIGINT') -@then('the user asynchronously sets up to end {process_name} process with SIGINT') -def impl(context, process_name): - command = "ps ux | grep bin/%s | awk '{print $2}' | xargs kill -2" % (process_name) - run_async_command(context, command) - - -@given('the user asynchronously sets up to end {process_name} process with SIGHUP') -@when('the user asynchronously sets up to end {process_name} process with SIGHUP') -@then('the user asynchronously sets up to end {process_name} process with SIGHUP') -def impl(context, process_name): - command = "ps ux | grep bin/%s | awk '{print $2}' | xargs kill -9" % (process_name) - run_async_command(context, command) +@given('the user asynchronously sets up to end {process_name} process with {signal_name}') +@when('the user asynchronously sets up to end {process_name} process with {signal_name}') +@then('the user asynchronously sets up to end {process_name} process with {signal_name}') +def impl(context, process_name, signal_name): + try: + sig = getattr(signal, signal_name) + except: + raise Exception("Unknown signal: {0}".format(signal_name)) -@given('the user asynchronously ends {process_name} process with SIGHUP') -@when('the user asynchronously ends {process_name} process with SIGHUP') -@then('the user asynchronously ends {process_name} process with SIGHUP') -def impl(context, process_name): - command = "ps ux | grep %s | awk '{print $2}' | xargs kill -9" % (process_name) + command = "ps ux | grep bin/{0} | awk '{{print $2}}' | xargs kill -{1}".format(process_name, sig.value) run_async_command(context, command) @when('the user asynchronously 
sets up to end gpcreateseg process when it starts') @@ -2737,6 +2727,21 @@ def impl(context, command, target): if target not in contents: raise Exception("cannot find %s in %s" % (target, filename)) + +@then('{command} should print "{target}" regex to logfile') +def impl(context, command, target): + log_dir = _get_gpAdminLogs_directory() + filename = glob.glob('%s/%s_*.log' % (log_dir, command))[0] + contents = '' + with open(filename) as fr: + for line in fr: + contents += line + + pat = re.compile(target) + if not pat.search(contents): + raise Exception("cannot find %s in %s" % (target, filename)) + + @given('verify that a role "{role_name}" exists in database "{dbname}"') @then('verify that a role "{role_name}" exists in database "{dbname}"') def impl(context, role_name, dbname): @@ -3971,3 +3976,105 @@ def impl(context, contentids): if not no_basebackup: raise Exception("pg_basebackup entry was found for contents %s in gp_stat_replication after %d retries" % (contentids, retries)) + + +@given("create event trigger function") +def impl(context): + dbname = "gptest" + query = "create or replace function notcie_ddl() returns event_trigger as $$ begin raise notice 'command % is executed.', tg_tag; end $$ language plpgsql" + + with closing(dbconn.connect(dbconn.DbURL(dbname=dbname), unsetSearchPath=False)) as conn: + dbconn.execSQL(conn, query) + + +@given('running postgres processes are saved in context') +@when('running postgres processes are saved in context') +@then('running postgres processes are saved in context') +def impl(context): + + # Store the pids in a dictionary where key will be the hostname and the + # value will be the pids of all the postgres processes running on that host + host_to_pid_map = dict() + segs = GpArray.initFromCatalog(dbconn.DbURL()).getDbList() + for seg in segs: + pids = gp.get_postgres_segment_processes(seg.datadir, seg.hostname) + if seg.hostname not in host_to_pid_map: + host_to_pid_map[seg.hostname] = pids + else: + host_to_pid_map[seg.hostname].extend(pids) + + context.host_to_pid_map = host_to_pid_map + + +@given('verify no postgres process is running on all hosts') +@when('verify no postgres process is running on all hosts') +@then('verify no postgres process is running on all hosts') +def impl(context): + host_to_pid_map = context.host_to_pid_map + + for host in host_to_pid_map: + for pid in host_to_pid_map[host]: + if unix.check_pid_on_remotehost(pid, host): + raise Exception("Postgres process {0} not killed on {1}.".format(pid, host)) + + +@then('the database segments are in execute mode') +def impl(context): + # Get all primary segments details + # For mirror segments, there's no way to distinguish if in execute mode or utility mode + with closing(dbconn.connect(dbconn.DbURL(), unsetSearchPath=False)) as conn: + sql = "SELECT dbid, hostname, port FROM gp_segment_configuration WHERE content > -1 and status = 'u' and role = 'p'" + rows = dbconn.query(conn, sql).fetchall() + + if len(rows) <= 0: + raise Exception("Found no entries in gp_segment_configuration table") + # Check for each segment if the process is in + for row in rows: + dbid = row[0] + hostname = row[1].strip() + portnum = row[2] + cmd = "psql -d template1 -p {0} -h {1} -c \";\"".format(portnum, hostname) + run_command(context, cmd) + # If node is in execute mode, psql should return value 2 and the print the following error message: + # For a primary segment: "FATAL: connections to primary segments are not allowed" + # For a mirror segment, always prints: "FATAL: the database system is 
in recovery mode" + if context.ret_code == 2 and \ + "FATAL: connections to primary segments are not allowed" in context.error_message: + continue + else: + raise Exception("segment process not running in execute mode for DBID:{0}".format(dbid)) + + +@when('the user would run "{command}" and terminate the process with SIGINT and selects "{input}" {delay} delay') +def impl(context, command, input, delay): + p = Popen(command.split(), stdout=PIPE, stdin=PIPE, stderr=PIPE) + + context.execute_steps('''Then verify that pg_basebackup is running for content 0''') + p.send_signal(signal.SIGINT) + + if delay == "with": + context.execute_steps('''When the user waits until mirror on content 0 is up''') + + p.stdin.write(input.encode("utf-8")) + p.stdin.flush() + + if input == "n": + context.execute_steps('''Then the user reset the walsender on the primary on content 0''') + + stdout, stderr = p.communicate() + context.ret_code = p.returncode + context.stdout_message = stdout.decode() + context.error_message = stderr.decode() + + +@when('the user would run "{command}" and terminate the process with SIGTERM') +def impl(context, command): + p = Popen(command.split(), stdout=PIPE, stdin=PIPE, stderr=PIPE) + + context.execute_steps('''Then the user just waits until recovery_progress.file is created in gpAdminLogs''') + p.send_signal(signal.SIGTERM) + + stdout, stderr = p.communicate() + context.ret_code = p.returncode + context.stdout_message = stdout.decode() + context.error_message = stderr.decode() diff --git a/gpMgmt/test/behave/mgmt_utils/steps/recoverseg_mgmt_utils.py b/gpMgmt/test/behave/mgmt_utils/steps/recoverseg_mgmt_utils.py index 2c0eee815c..7357b61417 100644 --- a/gpMgmt/test/behave/mgmt_utils/steps/recoverseg_mgmt_utils.py +++ b/gpMgmt/test/behave/mgmt_utils/steps/recoverseg_mgmt_utils.py @@ -7,6 +7,8 @@ from contextlib import closing from gppylib.commands.base import Command, ExecutionError, REMOTE, WorkerPool from gppylib.db import dbconn from gppylib.gparray import GpArray, ROLE_PRIMARY, ROLE_MIRROR +from gppylib.programs.clsRecoverSegment_triples import get_segments_with_running_basebackup, is_pg_rewind_running +from gppylib.operations.get_segments_in_recovery import is_seg_in_backup_mode from test.behave_utils.utils import * import platform, shutil from behave import given, when, then @@ -212,9 +214,9 @@ def impl(context, utility, output, segment_type): expected = r'\(dbid {}\): {}'.format(segment.dbid, output) check_stdout_msg(context, expected) -@then('{utility} should print "{recovery_type}" errors to stdout for content {content_ids}') -@when('{utility} should print "{recovery_type}" errors to stdout for content {content_ids}') -def impl(context, utility, recovery_type, content_ids): +@then('{utility} should print "{recovery_type}" errors to {output} for content {content_ids}') +@when('{utility} should print "{recovery_type}" errors to {output} for content {content_ids}') +def impl(context, utility, recovery_type, output, content_ids): if content_ids == "None": return if recovery_type not in ("incremental", "full", "start"): @@ -233,6 +235,12 @@ def impl(context, utility, recovery_type, content_ids): elif recovery_type == 'start': expected = r'hostname: {}; port: {}; datadir: {}'.format(segment.getSegmentHostName(), segment.getSegmentPort(), segment.getSegmentDataDirectory()) + if output == "logfile": + context.execute_steps(''' + Then {0} should print "{1}" regex to logfile + '''.format(utility, expected)) + return + check_stdout_msg(context, expected) @@ -339,8 +347,8 @@ def 
impl(context, host): content_id_str = ','.join(str(i) for i in content_ids_on_host) recovery_fail_check(context, recovery_type='full', content_ids=content_id_str) -@then('check if moving the mirrors from {original_host} to {new_host} failed') -def impl(context, original_host, new_host): +@then('check if moving the mirrors from {original_host} to {new_host} failed {user_termination} user termination') +def impl(context, original_host, new_host, user_termination): all_segments = GpArray.initFromCatalog(dbconn.DbURL()).getDbList() segments = filter(lambda seg: seg.getSegmentRole() == ROLE_MIRROR and seg.getSegmentHostName() == original_host, all_segments) @@ -351,13 +359,17 @@ def impl(context, original_host, new_host): Then gprecoverseg should return a return code of 1 And user can start transactions And gprecoverseg should print "Initiating segment recovery." to stdout - And gprecoverseg should print "pg_basebackup: error: could not access directory" to stdout for mirrors with content {content_ids} And gprecoverseg should print "Failed to recover the following segments" to stdout And verify that mirror on content {content_ids} is down And gprecoverseg should print "gprecoverseg failed. Please check the output" to stdout And gprecoverseg should not print "Segments successfully recovered" to stdout '''.format(content_ids=content_id_str)) + if user_termination == "without": + context.execute_steps(''' + Then gprecoverseg should print "pg_basebackup: error: could not access directory" to stdout for mirrors with content {content_ids} + '''.format(content_ids=content_id_str)) + #TODO add this step #And gpAdminLogs directory has "pg_basebackup*" files on {new_host} only for content {content_ids} @@ -609,3 +621,277 @@ def impl(context, expected_additional_entries): if actual_backout_entries != expected_total_entries: raise Exception("Expected configuration history table to have {} backout entries, found {}".format( context.original_config_history_backout_count + expected_additional_entries, actual_backout_entries)) + + +@when('a gprecoverseg input file "{filename}" is created with added parameter hostname to recover the failed segment on new host') +def impl(context, filename): + with closing(dbconn.connect(dbconn.DbURL(dbname='template1'), unsetSearchPath=False)) as conn: + failed_port, failed_hostname, failed_datadir, failed_address = context.remote_pair_primary_port, \ + context.remote_pair_primary_host, context.remote_pair_primary_datadir, context.remote_pair_primary_address + result = dbconn.query(conn,"SELECT hostname FROM gp_segment_configuration WHERE preferred_role='p' and status = 'u' and content != -1;").fetchall() + failover_port, failover_hostname, failover_datadir = 23000, result[0][0], failed_datadir + + failover_host_address = get_host_address(failover_hostname) + context.recovery_host_address = failover_host_address + context.recovery_host_name = failover_hostname + + line = "{0}|{1}|{2}|{3} {4}|{5}|{6}|{7}" .format(failed_hostname, failed_address, failed_port, failed_datadir, + failover_hostname, failover_host_address, failover_port, failover_datadir) + + with open("/tmp/%s" % filename, "w") as fd: + fd.write("%s\n" % line) + + +@when('check hostname and address updated on segment configuration with the saved information') +def impl(context): + with closing(dbconn.connect(dbconn.DbURL(dbname='template1'), unsetSearchPath=False)) as conn: + result = dbconn.queryRow(conn, "SELECT content, hostname, address FROM gp_segment_configuration WHERE dbid = {};" 
.format(context.remote_pair_primary_segdbId)) + content, hostname, address = result[0], result[1], result[2] + + if address != context.recovery_host_address or hostname != context.recovery_host_name: + raise Exception( + 'Host name and address could not updated on segment configuration for dbId {0}'.format(context.remote_pair_primary_segdbId)) + + +@when('a gprecoverseg input file "{filename}" is created with hostname parameter to recover the failed segment on same host') +def impl(context, filename): + port, hostname, datadir, address = context.remote_pair_primary_port, context.remote_pair_primary_host,\ + context.remote_pair_primary_datadir, context.remote_pair_primary_address + + host_address = get_host_address(hostname) + context.recovery_host_address = host_address + context.recovery_host_name = hostname + + line = "{0}|{1}|{2}|{3} {4}|{5}|{6}|/tmp/newdir" .format(hostname, address, port, datadir, hostname, + host_address, port) + + with open("/tmp/%s" % filename, "w") as fd: + fd.write("%s\n" % line) + + +@when('a gprecoverseg input file "{filename}" is created with hostname parameter matches with segment configuration table for incremental recovery of failed segment') +def impl(context, filename): + port, hostname, datadir, address = context.remote_pair_primary_port, context.remote_pair_primary_host, \ + context.remote_pair_primary_datadir, context.remote_pair_primary_address + context.recovery_host_address = address + context.recovery_host_name = hostname + + line = "{0}|{1}|{2}|{3}" .format(hostname, address, port, datadir) + + with open("/tmp/%s" % filename, "w") as fd: + fd.write("%s\n" % line) + + +@when('a gprecoverseg input file "{filename}" is created with invalid format for inplace full recovery of failed segment') +def impl(context, filename): + port, hostname, datadir, address = context.remote_pair_primary_port, context.remote_pair_primary_host,\ + context.remote_pair_primary_datadir, context.remote_pair_primary_address + + host_address = get_host_address(hostname) + + line = "{0}|{1}|{2}|{3} {4}|{5}|/tmp/newdir" .format(hostname, address, port, datadir, host_address, port) + + with open("/tmp/%s" % filename, "w") as fd: + fd.write("%s\n" % line) + + +@when('a gprecoverseg input file "{filename}" created with invalid failover hostname for full recovery of failed segment') +def impl(context, filename): + port, hostname, datadir, address = context.remote_pair_primary_port, context.remote_pair_primary_host,\ + context.remote_pair_primary_datadir, context.remote_pair_primary_address + + host_address = get_host_address(hostname) + + line = "{0}|{1}|{2}|{3} {4}_1|{5}|{6}|/tmp/newdir" .format(hostname, address, port, datadir, hostname, host_address, port) + + with open("/tmp/%s" % filename, "w") as fd: + fd.write("%s\n" % line) + + +@when('a gprecoverseg input file "{filename}" is created with and without parameter hostname to recover all the failed segments') +def impl(context, filename): + lines = [] + with closing(dbconn.connect(dbconn.DbURL(dbname='template1'), unsetSearchPath=False)) as conn: + rows = dbconn.query(conn,"SELECT port, hostname, datadir, content, address FROM gp_segment_configuration WHERE status = 'd' and content != -1;").fetchall() + for i, row in enumerate(rows): + output_str = "" + hostname = row[1] + host_address = get_host_address(hostname) + port = row[0] + address = row[4] + datadir = row[2] + content = row[3] + + if content == 0: + output_str += "{0}|{1}|{2}".format(address, port, datadir) + elif content == 1: + output_str += "{0}|{1}|{2} 
{3}|{4}|/tmp/newdir{5}".format(address, port, datadir, address, port, i) + else: + output_str += "{0}|{1}|{2}|{3} {4}|{5}|{6}|/tmp/newdir{7}".format(hostname, address, port, datadir, + hostname, host_address, port, i) + + lines.append(output_str) + writeLinesToFile("/tmp/%s" % filename, lines) + +@when('a gprecoverseg input file "{filename}" is created with invalid hostname parameter that does not matches with the segment configuration table hostname') +def impl(context, filename): + port, hostname, datadir, address = context.remote_pair_primary_port, context.remote_pair_primary_host, \ + context.remote_pair_primary_datadir, context.remote_pair_primary_address + + line = "{0}|{1}|{2}|{3}" .format("invalid_hostname", address, port, datadir) + + with open("/tmp/%s" % filename, "w") as fd: + fd.write("%s\n" % line) + +def get_host_address(hostname): + cmd = Command("get the address of the host", cmdStr="hostname -I", ctxt=REMOTE, remoteHost=hostname) + cmd.run(validateAfter=True) + host_address = cmd.get_stdout().strip().split(' ') + return host_address[0] + + + +@then('pg_hba file on primary of mirrors on "{newhost}" with "{contents}" contains no replication entries for "{oldhost}"') +@when('pg_hba file on primary of mirrors on "{newhost}" with "{contents}" contains no replication entries for "{oldhost}"') +def impl(context, newhost, contents, oldhost): + all_segments = GpArray.initFromCatalog(dbconn.DbURL()).getSegmentList() + + for seg in all_segments: + if newhost != "none" and seg.mirrorDB.getSegmentHostName() != newhost: + continue + if contents != "all": + for content_id in contents.split(','): + if seg.mirrorDB.getSegmentContentId() != int(content_id): + continue + check_entry_present(context, seg, oldhost) + else: + check_entry_present(context, seg, oldhost) + +def check_entry_present(context, seg, oldhost): + for host in oldhost.split(','): + search_ip_addr = context.host_ip_list[host] + dbname = "template1" + ip_address = "','".join(search_ip_addr) + query = "SELECT count(*) FROM pg_hba_file_rules WHERE database='{{replication}}' AND (address='{0}' OR address IN ('{1}'))".format( + host, ip_address) + phost = seg.primaryDB.getSegmentHostName() + port = seg.primaryDB.getSegmentPort() + + with closing(dbconn.connect(dbconn.DbURL(dbname=dbname, port=port, hostname=phost), + utility=True, unsetSearchPath=False)) as conn: + result = dbconn.querySingleton(conn, query) + if result != 0: + raise Exception("{0} replication entry for {1}, {2} still existing in pg_hba.conf of {3}:{4}" + .format(result, host, search_ip_addr,phost, port)) + + +@then('verify that only replication connection primary has is to {new_mirror}') +@when('verify that only replication connection primary has is to {new_mirror}') +@given('verify that only replication connection primary has is to {new_mirror}') +def impl(context, new_mirror): + all_segments = GpArray.initFromCatalog(dbconn.DbURL()).getSegmentList() + + for seg in all_segments: + if seg.mirrorDB.getSegmentHostName() != new_mirror: + continue + + dbname = "template1" + search_ip_addr = context.host_ip_list[new_mirror] + ip_address = "','".join(search_ip_addr) + query = """ + SELECT + CASE + WHEN + (SELECT COUNT(*) FROM gp_stat_replication WHERE client_addr IN ('{1}')) = + (SELECT COUNT(*) FROM gp_stat_replication) + THEN TRUE + ELSE FALSE + END;""".format(ip_address) + + phost = seg.primaryDB.getSegmentHostName() + port = seg.primaryDB.getSegmentPort() + with closing(dbconn.connect(dbconn.DbURL(dbname=dbname, port=port, hostname=phost), + utility=True, 
unsetSearchPath=False)) as conn: + result = dbconn.querySingleton(conn, query) + if result != 't': + raise Exception("{} replication connections are not updated.".format(phost)) + + +@given('saving host IP address of "{host}"') +@then('saving host IP address of "{host}"') +@when('saving host IP address of "{host}"') +def impl(context, host): + context.host_ip_list = {} + for host_name in host.split(','): + if_addrs = gp.IfAddrs.list_addrs(host_name) + context.host_ip_list[host_name] = if_addrs + + + +@then('verify that pg_basebackup {action} running for content {content_ids}') +def impl(context, action, content_ids): + attempt = 0 + num_retries = 600 + content_ids_to_check = [int(c) for c in content_ids.split(',')] + + while attempt < num_retries: + attempt += 1 + content_ids_running_basebackup = get_segments_with_running_basebackup() + + if action == "is not": + if not any(content in content_ids_running_basebackup for content in content_ids_to_check): + return + + if action == "is": + if all(content in content_ids_running_basebackup for content in content_ids_to_check): + return + + time.sleep(0.1) + if attempt == num_retries: + raise Exception('Timed out after {} retries'.format(num_retries)) + + +@then('verify that pg_rewind {action} running for content {content_ids}') +def impl(context, action, content_ids): + qry = "SELECT hostname, port FROM gp_segment_configuration WHERE status='u' AND role='p' AND content IN ({0})".format(content_ids) + rows = getRows('postgres', qry) + + attempt = 0 + num_retries = 600 + while attempt < num_retries: + attempt += 1 + + if action == "is not": + if not any(is_pg_rewind_running(row[0], row[1]) for row in rows): + return + + if action == "is": + if all(is_pg_rewind_running(row[0], row[1]) for row in rows): + return + + time.sleep(0.1) + if attempt == num_retries: + raise Exception('Timed out after {} retries'.format(num_retries)) + + +@then('verify that differential {action} running for content {content_ids}') +def impl(context, action, content_ids): + qry = "SELECT hostname, port FROM gp_segment_configuration WHERE status='u' AND role='p' AND content IN ({0})".format(content_ids) + rows = getRows('postgres', qry) + + attempt = 0 + num_retries = 600 + while attempt < num_retries: + attempt += 1 + + if action == "is not": + if not any(is_seg_in_backup_mode(row[0], row[1]) for row in rows): + return + + if action == "is": + if all(is_seg_in_backup_mode(row[0], row[1]) for row in rows): + return + + time.sleep(0.1) + if attempt == num_retries: + raise Exception('Timed out after {} retries'.format(num_retries)) --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
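
For reference, the SIGTERM scenario above drives gprecoverseg through a pipe and only signals it once recovery is actually underway (the new step waits for recovery_progress.file to appear under gpAdminLogs before sending the signal). Below is a minimal standalone sketch of that flow outside behave; the gpAdminLogs location, the marker-file glob, and the timeout are illustrative assumptions, while the Popen/send_signal/communicate sequence mirrors the step added to mgmt_utils.py.

```python
import glob
import os
import signal
import time
from subprocess import Popen, PIPE

def wait_for_recovery_progress(logdir=os.path.expanduser("~/gpAdminLogs"), timeout=120):
    # Poll until gprecoverseg writes its recovery progress file, mirroring the
    # "waits until recovery_progress.file is created in gpAdminLogs" step.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if glob.glob(os.path.join(logdir, "recovery_progress*")):
            return True
        time.sleep(0.1)
    return False

# Start a full recovery, then interrupt it the way `kill <pid>` would.
proc = Popen(["gprecoverseg", "-a", "-F"], stdout=PIPE, stderr=PIPE)
if wait_for_recovery_progress():
    proc.send_signal(signal.SIGTERM)  # no confirmation prompt, unlike SIGINT
stdout, stderr = proc.communicate()
print(proc.returncode)   # the scenario expects a non-zero code after backout
print(stdout.decode())   # and the SIGTERM warning / "interrupted by the user" error
```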

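The hard-coded SIGINT/SIGHUP kill steps in mgmt_utils.py are collapsed into one parameterized step that resolves the signal by name before shelling out to kill. A short sketch of that resolution and the command string it produces, assuming Python 3 where the signal constants are Signals enum members with a .value:

```python
import signal

def kill_command(process_name, signal_name):
    # getattr(signal, "SIGTERM") -> Signals.SIGTERM, whose .value is 15 on Linux
    sig = getattr(signal, signal_name)  # raises AttributeError for unknown names
    return "ps ux | grep bin/{0} | awk '{{print $2}}' | xargs kill -{1}".format(
        process_name, sig.value)

print(kill_command("gprecoverseg", "SIGTERM"))
# ps ux | grep bin/gprecoverseg | awk '{print $2}' | xargs kill -15
print(kill_command("gprecoverseg", "SIGINT"))
# ps ux | grep bin/gprecoverseg | awk '{print $2}' | xargs kill -2
```

This is the same command string the step builds before handing it to run_async_command, which lets the SIGTERM, SIGINT, and SIGHUP scenarios share a single step definition.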