This is an automated email from the ASF dual-hosted git repository. avamingli pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit e3fc63322fecfc18dacb878809f5ee2c36f6740e Author: Nihal Jain <[email protected]> AuthorDate: Tue Apr 23 17:06:36 2024 +0530 gpssh: Retry with TERM env variable set during failures Currently gpssh always clears out the TERM env variable before performing SSH connections. This will cause problems when users want only to use a specific terminal (like tmux) when performing SSH connections since the empty TERM will cause some terminals to fail. To fix this issue, a retry operation is added when trying to login to a host. The first login attempt is made in the current manner where it clears out the `TERM` env variable. If it fails, then the operation is retried by restoring the `TERM` env variable. `gpssh` will now also print the exception in case of errors so that we get to know the actual reason for SSH failures aiding us in debugging. **Output During Failure** Before: ``` [gpadmin@cdw ~]$ gpssh -h cdw [ERROR] unable to login to cdw Could not acquire connection. ``` After: ``` [gpadmin@cdw ~]$ gpssh -h cdw [ERROR] unable to login to cdw Could not acquire connection. End Of File (EOF). Exception style platform. <gpssh_modules.gppxssh_wrapper.PxsshWrapper object at 0x7ffa144b7860> version: 3.3 command: /usr/bin/ssh args: ['/usr/bin/ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'BatchMode=yes', '-q', '-l', 'gpadmin', 'cdw'] searcher: <pexpect.searcher_re object at 0x7ffa144b7c50> buffer (last 100 chars): b'' before (last 100 chars): b' 9 07:34:06 2024 from 10.0.34.166\r\r\nopen terminal failed: missing or unsuitable terminal: unknown\r\n' after: <class 'pexpect.EOF'> match: None match_index: None exitstatus: None flag_eof: True pid: 52832 child_fd: 3 closed: False timeout: 30 delimiter: <class 'pexpect.EOF'> logfile: None logfile_read: None logfile_send: None maxread: 2000 ignorecase: False searchwindowsize: None delaybeforesend: 0.05 delayafterclose: 0.1 delayafterterminate: 0.1 ``` --- gpMgmt/bin/gppylib/util/ssh_utils.py | 65 +++++++++++++--------- .../util/test/unit/test_cluster_ssh_utils.py | 33 ++++++++++- 2 files changed, 72 insertions(+), 26 deletions(-) diff --git a/gpMgmt/bin/gppylib/util/ssh_utils.py b/gpMgmt/bin/gppylib/util/ssh_utils.py index bab39ef6ef..dd18b982f4 100644 --- a/gpMgmt/bin/gppylib/util/ssh_utils.py +++ b/gpMgmt/bin/gppylib/util/ssh_utils.py @@ -192,35 +192,50 @@ class Session(cmd.Cmd): good_list = [] print_lock = threading.Lock() - def connect_host(hostname, p): - self.hostList.append(hostname) - try: - # The sync_multiplier value is passed onto pexpect.pxssh which is used to determine timeout - # values for prompt verification after an ssh connection is established. - p.login(hostname, self.userName, sync_multiplier=sync_multiplier) - p.x_peer = hostname - p.x_pid = p.pid - good_list.append(p) - if self.verbose: + def connect_host(hostname): + retry_login = True + + while True: + # create a new PxsshWrapper object for each retry to avoid using the + # same object which can cause unexpected behaviours + p = gppxssh_wrapper.PxsshWrapper(delaybeforesend=delaybeforesend, + sync_retries=sync_retries, + options={"StrictHostKeyChecking": "no", + "BatchMode": "yes"}) + + try: + # The sync_multiplier value is passed onto pexpect.pxssh which is used to determine timeout + # values for prompt verification after an ssh connection is established. + p.login(hostname, self.userName, sync_multiplier=sync_multiplier) + p.x_peer = hostname + p.x_pid = p.pid + good_list.append(p) + if self.verbose: + with print_lock: + print('[INFO] login %s' % hostname) + except Exception as e: + # some logins fail due to the clearing of the TERM env variable + # retry by restoring the TERM variable to see if it succeeds or else error out + if origTERM and retry_login: + retry_login = False + os.putenv('TERM', origTERM) + continue + with print_lock: - print('[INFO] login %s' % hostname) - except Exception as e: - with print_lock: - print('[ERROR] unable to login to %s' % hostname) - if type(e) is pxssh.ExceptionPxssh: - print(e) - elif type(e) is pxssh.EOF: - print('Could not acquire connection.') - else: - print('hint: use gpssh-exkeys to setup public-key authentication between hosts') + print('[ERROR] unable to login to %s' % hostname) + if type(e) is pxssh.ExceptionPxssh: + print(e) + elif type(e) is pxssh.EOF: + print('Could not acquire connection.') + print(e) + else: + print('hint: use gpssh-exkeys to setup public-key authentication between hosts') + + break thread_list = [] for host in hostList: - p = gppxssh_wrapper.PxsshWrapper(delaybeforesend=delaybeforesend, - sync_retries=sync_retries, - options={"StrictHostKeyChecking": "no", - "BatchMode": "yes"}) - t = threading.Thread(target=connect_host, args=(host, p)) + t = threading.Thread(target=connect_host, args=[host]) t.start() thread_list.append(t) diff --git a/gpMgmt/bin/gppylib/util/test/unit/test_cluster_ssh_utils.py b/gpMgmt/bin/gppylib/util/test/unit/test_cluster_ssh_utils.py index 0a24eeb5b7..33f56b8a1f 100644 --- a/gpMgmt/bin/gppylib/util/test/unit/test_cluster_ssh_utils.py +++ b/gpMgmt/bin/gppylib/util/test/unit/test_cluster_ssh_utils.py @@ -4,7 +4,7 @@ import mock import sys, os, pwd import unittest from io import StringIO -from mock import patch +from mock import patch, call try: gphome = os.environ.get('GPHOME') @@ -91,7 +91,38 @@ class SshUtilsTestCase(unittest.TestCase): session2 = Session() session2.login(['localhost'], 'gpadmin', 0.05, 1.0) self.assertIn('[ERROR] unable to login to localhost\nhint: use gpssh-exkeys to setup public-key authentication between hosts\n', mock_stdout.getvalue()) + + @patch('os.getenv', return_value="term") + @patch('os.putenv') + @patch('sys.stdout', new_callable=StringIO) + def test05_login_retry_when_term_variable_is_set(self, mock_stdout, mock_putenv, mock_getenv): + ''' + Test pxssh.login() retry when there is an exception and TERM env variable is set + ''' + + with mock.patch.object(pxssh.pxssh, 'login', side_effect=pxssh.EOF('foo')) as mock_login: + session = Session() + session.login(['localhost'], 'gpadmin', 0.05, 1.0) + self.assertIn('[ERROR] unable to login to localhost\nCould not acquire connection.\n', mock_stdout.getvalue()) + mock_stdout.truncate(0) + assert mock_putenv.call_count == 3 + mock_putenv.assert_has_calls([call('TERM', ''), call('TERM', 'term'), call('TERM', 'term')]) + + @patch('os.getenv', return_value=None) + @patch('os.putenv') + @patch('sys.stdout', new_callable=StringIO) + def test06_login_does_not_retry_when_term_variable_is_not_set(self, mock_stdout, mock_putenv, mock_getenv): + ''' + Test pxssh.login() does not retry when there is an exception and TERM env variable is not set + ''' + + with mock.patch.object(pxssh.pxssh, 'login', side_effect=pxssh.EOF('foo')) as mock_login: + session = Session() + session.login(['localhost'], 'gpadmin', 0.05, 1.0) + self.assertIn('[ERROR] unable to login to localhost\nCould not acquire connection.\n', mock_stdout.getvalue()) + self.assertNotIn('Retrying by restoring the TERM env variable.\n', mock_stdout.getvalue()) mock_stdout.truncate(0) + mock_putenv.assert_called_once_with('TERM', '') if __name__ == "__main__": unittest.main() --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
