This is an automated email from the ASF dual-hosted git repository.

avamingli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit e3fc63322fecfc18dacb878809f5ee2c36f6740e
Author: Nihal Jain <[email protected]>
AuthorDate: Tue Apr 23 17:06:36 2024 +0530

    gpssh: Retry with TERM env variable set during failures
    
    Currently gpssh always clears out the TERM env variable before performing 
SSH connections. This will cause problems when users want only to use a 
specific terminal (like tmux) when performing SSH connections since the empty 
TERM will cause some terminals to fail.
    
    To fix this issue, a retry operation is added when trying to login to a 
host. The first login attempt is made in the current manner where it clears out 
the `TERM` env variable. If it fails, then the operation is retried by 
restoring the `TERM` env variable.
    
    `gpssh` will now also print the exception in case of errors so that we get 
to know the actual reason for SSH failures aiding us in debugging.
    
    **Output During Failure**
    Before:
    ```
    [gpadmin@cdw ~]$ gpssh -h cdw
    [ERROR] unable to login to cdw
    Could not acquire connection.
    ```
    
    After:
    ```
    [gpadmin@cdw ~]$ gpssh -h cdw
    [ERROR] unable to login to cdw
    Could not acquire connection.
    End Of File (EOF). Exception style platform.
    <gpssh_modules.gppxssh_wrapper.PxsshWrapper object at 0x7ffa144b7860>
    version: 3.3
    command: /usr/bin/ssh
    args: ['/usr/bin/ssh', '-o', 'StrictHostKeyChecking=no', '-o', 
'BatchMode=yes', '-q', '-l', 'gpadmin', 'cdw']
    searcher: <pexpect.searcher_re object at 0x7ffa144b7c50>
    buffer (last 100 chars): b''
    before (last 100 chars): b'  9 07:34:06 2024 from 10.0.34.166\r\r\nopen 
terminal failed: missing or unsuitable terminal: unknown\r\n'
    after: <class 'pexpect.EOF'>
    match: None
    match_index: None
    exitstatus: None
    flag_eof: True
    pid: 52832
    child_fd: 3
    closed: False
    timeout: 30
    delimiter: <class 'pexpect.EOF'>
    logfile: None
    logfile_read: None
    logfile_send: None
    maxread: 2000
    ignorecase: False
    searchwindowsize: None
    delaybeforesend: 0.05
    delayafterclose: 0.1
    delayafterterminate: 0.1
    ```
---
 gpMgmt/bin/gppylib/util/ssh_utils.py               | 65 +++++++++++++---------
 .../util/test/unit/test_cluster_ssh_utils.py       | 33 ++++++++++-
 2 files changed, 72 insertions(+), 26 deletions(-)

diff --git a/gpMgmt/bin/gppylib/util/ssh_utils.py 
b/gpMgmt/bin/gppylib/util/ssh_utils.py
index bab39ef6ef..dd18b982f4 100644
--- a/gpMgmt/bin/gppylib/util/ssh_utils.py
+++ b/gpMgmt/bin/gppylib/util/ssh_utils.py
@@ -192,35 +192,50 @@ class Session(cmd.Cmd):
         good_list = []
         print_lock = threading.Lock()
 
-        def connect_host(hostname, p):
-            self.hostList.append(hostname)
-            try:
-                # The sync_multiplier value is passed onto pexpect.pxssh which 
is used to determine timeout
-                # values for prompt verification after an ssh connection is 
established.
-                p.login(hostname, self.userName, 
sync_multiplier=sync_multiplier)
-                p.x_peer = hostname
-                p.x_pid = p.pid
-                good_list.append(p)
-                if self.verbose:
+        def connect_host(hostname):
+            retry_login = True
+
+            while True:
+                # create a new PxsshWrapper object for each retry to avoid 
using the
+                # same object which can cause unexpected behaviours
+                p = 
gppxssh_wrapper.PxsshWrapper(delaybeforesend=delaybeforesend,
+                                                 sync_retries=sync_retries,
+                                                 
options={"StrictHostKeyChecking": "no",
+                                                          "BatchMode": "yes"})
+
+                try:
+                    # The sync_multiplier value is passed onto pexpect.pxssh 
which is used to determine timeout
+                    # values for prompt verification after an ssh connection 
is established.
+                    p.login(hostname, self.userName, 
sync_multiplier=sync_multiplier)
+                    p.x_peer = hostname
+                    p.x_pid = p.pid
+                    good_list.append(p)
+                    if self.verbose:
+                        with print_lock:
+                            print('[INFO] login %s' % hostname)
+                except Exception as e:
+                    # some logins fail due to the clearing of the TERM env 
variable
+                    # retry by restoring the TERM variable to see if it 
succeeds or else error out
+                    if origTERM and retry_login:
+                        retry_login = False
+                        os.putenv('TERM', origTERM)
+                        continue
+
                     with print_lock:
-                        print('[INFO] login %s' % hostname)
-            except Exception as e:
-                with print_lock:
-                    print('[ERROR] unable to login to %s' % hostname)
-                    if type(e) is pxssh.ExceptionPxssh:
-                        print(e)
-                    elif type(e) is pxssh.EOF:
-                        print('Could not acquire connection.')
-                    else:
-                        print('hint: use gpssh-exkeys to setup public-key 
authentication between hosts')
+                        print('[ERROR] unable to login to %s' % hostname)
+                        if type(e) is pxssh.ExceptionPxssh:
+                            print(e)
+                        elif type(e) is pxssh.EOF:
+                            print('Could not acquire connection.')
+                            print(e)
+                        else:
+                            print('hint: use gpssh-exkeys to setup public-key 
authentication between hosts')
+
+                break
 
         thread_list = []
         for host in hostList:
-            p = gppxssh_wrapper.PxsshWrapper(delaybeforesend=delaybeforesend,
-                                             sync_retries=sync_retries,
-                                             options={"StrictHostKeyChecking": 
"no",
-                                                      "BatchMode": "yes"})
-            t = threading.Thread(target=connect_host, args=(host, p))
+            t = threading.Thread(target=connect_host, args=[host])
             t.start()
             thread_list.append(t)
 
diff --git a/gpMgmt/bin/gppylib/util/test/unit/test_cluster_ssh_utils.py 
b/gpMgmt/bin/gppylib/util/test/unit/test_cluster_ssh_utils.py
index 0a24eeb5b7..33f56b8a1f 100644
--- a/gpMgmt/bin/gppylib/util/test/unit/test_cluster_ssh_utils.py
+++ b/gpMgmt/bin/gppylib/util/test/unit/test_cluster_ssh_utils.py
@@ -4,7 +4,7 @@ import mock
 import sys, os, pwd
 import unittest
 from io import StringIO
-from mock import patch
+from mock import patch, call
 
 try:
     gphome = os.environ.get('GPHOME')
@@ -91,7 +91,38 @@ class SshUtilsTestCase(unittest.TestCase):
             session2 = Session()
             session2.login(['localhost'], 'gpadmin', 0.05, 1.0)
             self.assertIn('[ERROR] unable to login to localhost\nhint: use 
gpssh-exkeys to setup public-key authentication between hosts\n', 
mock_stdout.getvalue())
+
+    @patch('os.getenv', return_value="term")
+    @patch('os.putenv')
+    @patch('sys.stdout', new_callable=StringIO)
+    def test05_login_retry_when_term_variable_is_set(self, mock_stdout, 
mock_putenv, mock_getenv):
+        '''
+        Test pxssh.login() retry when there is an exception and TERM env 
variable is set
+        '''
+
+        with mock.patch.object(pxssh.pxssh, 'login', 
side_effect=pxssh.EOF('foo')) as mock_login:
+            session = Session()
+            session.login(['localhost'], 'gpadmin', 0.05, 1.0)
+            self.assertIn('[ERROR] unable to login to localhost\nCould not 
acquire connection.\n', mock_stdout.getvalue())
+            mock_stdout.truncate(0)
+            assert mock_putenv.call_count == 3
+            mock_putenv.assert_has_calls([call('TERM', ''), call('TERM', 
'term'), call('TERM', 'term')])
+
+    @patch('os.getenv', return_value=None)
+    @patch('os.putenv')
+    @patch('sys.stdout', new_callable=StringIO)
+    def test06_login_does_not_retry_when_term_variable_is_not_set(self, 
mock_stdout, mock_putenv, mock_getenv):
+        '''
+        Test pxssh.login() does not retry when there is an exception and TERM 
env variable is not set
+        '''
+
+        with mock.patch.object(pxssh.pxssh, 'login', 
side_effect=pxssh.EOF('foo')) as mock_login:
+            session = Session()
+            session.login(['localhost'], 'gpadmin', 0.05, 1.0)
+            self.assertIn('[ERROR] unable to login to localhost\nCould not 
acquire connection.\n', mock_stdout.getvalue())
+            self.assertNotIn('Retrying by restoring the TERM env variable.\n', 
mock_stdout.getvalue())
             mock_stdout.truncate(0)
+            mock_putenv.assert_called_once_with('TERM', '')
 
 if __name__ == "__main__":
     unittest.main()


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to