This is an automated email from the ASF dual-hosted git repository.

mykolabodnar pushed a commit to branch DATALAB-2398
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git

commit 9fc649a2be054869883d30646f0c014629163ee0
Author: bodnarmykola <[email protected]>
AuthorDate: Wed May 26 19:06:43 2021 +0300

    [DATALAB-2398] - Jupyter PySpark kernel from python venv implemented
---
 .../src/general/conf/datalab.ini                   |  2 ++
 .../src/general/lib/os/fab.py                      | 37 ++++++++++------------
 .../templates/os/py3spark_local_template.json      | 10 +++---
 .../src/jupyter/scripts/configure_jupyter_node.py  |  9 ++++--
 4 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/infrastructure-provisioning/src/general/conf/datalab.ini 
b/infrastructure-provisioning/src/general/conf/datalab.ini
index 2679b22..2040dc8 100644
--- a/infrastructure-provisioning/src/general/conf/datalab.ini
+++ b/infrastructure-provisioning/src/general/conf/datalab.ini
@@ -248,6 +248,8 @@ spark_version = 3.0.1
 hadoop_version = 2.7
 ### Version of Jupyter to be installed on notebook
 jupyter_version = 6.1.6
+### Version of Python to be installed as virtualenv on notebook
+python_venv_version = 3.7.9
 ### Version of TensorFlow to be installed on notebook
 tensorflow_version = 2.3.2
 ### Version of Zeppelin to be installed on notebook
diff --git a/infrastructure-provisioning/src/general/lib/os/fab.py 
b/infrastructure-provisioning/src/general/lib/os/fab.py
index a9daee1..8b54a2e 100644
--- a/infrastructure-provisioning/src/general/lib/os/fab.py
+++ b/infrastructure-provisioning/src/general/lib/os/fab.py
@@ -38,26 +38,20 @@ from fabric import *
 from patchwork.files import exists
 from patchwork import files
 
-def ensure_python_venv(python_version):
+def ensure_python_venv(python_venv_version):
     try:
-        if not exist(conn, '/opt/python/python{}'.format(python_version)):
-            conn.sudo('wget 
https://www.python.org/ftp/python/{0}/Python-{0}.tgz -O 
/tmp/Python-{0}.tgz'.format(python_version))
-            conn.sudo('tar zxvf /tmp/Python-{}.tgz -C 
/tmp/'.format(python_version))
-            conn.sudo('cd /tmp/Python-{0} && ./configure 
--prefix=/opt/python/python{0} --with-zlib-dir=/usr/local/lib/ 
--with-ensurepip=install'.format(
-                    python_version)
-            conn.sudo('cd /tmp/Python-{0} make 
altinstall'.format(python_version))
-            conn.sudo('cd /tmp && rm -rf Python-{}'.format(python_version))
-            conn.sudo('virtualenv /opt/python/python{}'.format(python_version))
-            venv_command = 'source 
/opt/python/python{}/bin/activate'.format(python_version)
-            pip_command = 
'/opt/python/python{0}/bin/pip{1}'.format(python_version, python_version[:3])
-            conn.sudo('{0} && install -U pip=={}'.format(venv_command, 
pip_command, os.environ['pip_version']))
-            subprocess.run({0} && {1} install 
pyzmq==17.0.0'.format(venv_command, pip_command))
-            subprocess.run(
-                '{0} && sudo -i {1} install ipython ipykernel 
--no-cache-dir'.format(venv_command, pip_command),
-                shell=True, check=True)
-            subprocess.run(
-                '{0} && sudo -i {1} install boto boto3 NumPy=={2} SciPy 
Matplotlib pandas Sympy Pillow sklearn --no-cache-dir'
-                .format(venv_command, pip_command, numpy_version), shell=True, 
check=True)
+        if not exists(conn, 
'/opt/python/python{}'.format(python_venv_version)):
+            conn.sudo('wget 
https://www.python.org/ftp/python/{0}/Python-{0}.tgz -O 
/tmp/Python-{0}.tgz'.format(python_venv_version))
+            conn.sudo('tar zxvf /tmp/Python-{}.tgz -C 
/tmp/'.format(python_venv_version))
+            conn.sudo('''bash -l -c 'cd /tmp/Python-{0} && ./configure 
--prefix=/opt/python/python{0} --with-zlib-dir=/usr/local/lib/ 
--with-ensurepip=install' '''.format(python_venv_version))
+            conn.sudo('''bash -l -c 'cd /tmp/Python-{0} && make altinstall' 
'''.format(python_venv_version))
+            conn.sudo('''bash -l -c 'cd /tmp && rm -rf Python-{}' 
'''.format(python_venv_version))
+            conn.sudo('virtualenv 
/opt/python/python{}'.format(python_venv_version))
+            venv_command = 'source 
/opt/python/python{}/bin/activate'.format(python_venv_version)
+            pip_command = 
'/opt/python/python{0}/bin/pip{1}'.format(python_venv_version, 
python_venv_version[:3])
+            conn.sudo('''bash -l -c '{0} && {1} install -U pip=={2}' 
'''.format(venv_command, pip_command, os.environ['conf_pip_version']))
+            conn.sudo('''bash -l -c '{0} && {1} install ipython ipykernel 
--no-cache-dir' '''.format(venv_command, pip_command))
+            conn.sudo('''bash -l -c '{0} && {1} install NumPy=={2} SciPy 
Matplotlib pandas Sympy Pillow sklearn --no-cache-dir' '''.format(venv_command, 
pip_command, os.environ['notebook_numpy_version']))
 
     except Exception as err:
         print('Error:', str(err))
@@ -404,7 +398,7 @@ def ensure_pyspark_local_kernel(os_user, 
pyspark_local_path_dir, templates_dir,
             sys.exit(1)
 
 
-def ensure_py3spark_local_kernel(os_user, py3spark_local_path_dir, 
templates_dir, spark_version):
+def ensure_py3spark_local_kernel(os_user, py3spark_local_path_dir, 
templates_dir, spark_version, python_venv_path, python_venv_version):
     if not exists(conn,'/home/' + os_user + 
'/.ensure_dir/py3spark_local_kernel_ensured'):
         try:
             conn.sudo('mkdir -p ' + py3spark_local_path_dir)
@@ -412,6 +406,9 @@ def ensure_py3spark_local_kernel(os_user, 
py3spark_local_path_dir, templates_dir
             conn.put(templates_dir + 'py3spark_local_template.json', 
'/tmp/py3spark_local_template.json')
             conn.sudo(
                 '''bash -l -c "PYJ=`find /opt/spark/ -name '*py4j*.zip' | tr 
'\\n' ':' | sed 's|:$||g'`; sed -i 's|PY4J|'$PYJ'|g' 
/tmp/py3spark_local_template.json" ''')
+            conn.sudo('sed -i "s|PYTHON_VENV_PATH|' + python_venv_path + '|g" 
/tmp/py3spark_local_template.json')
+            conn.sudo('sed -i "s|PYTHON_VENV_VERSION|' + python_venv_version + 
'|g" /tmp/py3spark_local_template.json')
+            conn.sudo('sed -i "s|PYTHON_VENV_SHORT_VERSION|' + 
python_venv_version[:3] + '|g" /tmp/py3spark_local_template.json')
             conn.sudo('sed -i "s|SP_VER|' + spark_version + '|g" 
/tmp/py3spark_local_template.json')
             conn.sudo('sed -i 
\'/PYTHONPATH\"\:/s|\(.*\)"|\\1/home/{0}/caffe/python:/home/{0}/pytorch/build:"|\'
 /tmp/py3spark_local_template.json'.format(os_user))
             conn.sudo('\cp /tmp/py3spark_local_template.json ' + 
py3spark_local_path_dir + 'kernel.json')
diff --git 
a/infrastructure-provisioning/src/general/templates/os/py3spark_local_template.json
 
b/infrastructure-provisioning/src/general/templates/os/py3spark_local_template.json
index 45c1213..428524d 100644
--- 
a/infrastructure-provisioning/src/general/templates/os/py3spark_local_template.json
+++ 
b/infrastructure-provisioning/src/general/templates/os/py3spark_local_template.json
@@ -1,18 +1,20 @@
 {
  "argv": [
-  "/usr/bin/python3",
+  "PYTHON_VENV_PATH",
   "-m",
   "ipykernel",
   "-f",
   "{connection_file}"
  ],
  "language": "python",
- "display_name": "Local PySpark (Python-3.8 / Spark-SP_VER )",
+ "display_name": "Local PySpark (Python-PYTHON_VENV_VERSION / Spark-SP_VER )",
  "env": {
-  "PYSPARK_PYTHON": "python3.8",
+  "PYSPARK_PYTHON": "pythonPYTHON_VENV_SHORT_VERSION",
   "SPARK_HOME": "/opt/spark/",
   "PYTHONPATH": "PY4J:/opt/spark/python/:",
   "PYTHONSTARTUP": "/opt/spark/python/pyspark/shell.py",
   "PYSPARK_SUBMIT_ARGS": "--name LocalPySpark pyspark-shell"
  }
-}
\ No newline at end of file
+}
+
+
diff --git 
a/infrastructure-provisioning/src/jupyter/scripts/configure_jupyter_node.py 
b/infrastructure-provisioning/src/jupyter/scripts/configure_jupyter_node.py
index 2791b78..76c5fbf 100644
--- a/infrastructure-provisioning/src/jupyter/scripts/configure_jupyter_node.py
+++ b/infrastructure-provisioning/src/jupyter/scripts/configure_jupyter_node.py
@@ -45,6 +45,7 @@ args = parser.parse_args()
 spark_version = args.spark_version
 hadoop_version = args.hadoop_version
 jupyter_version = os.environ['notebook_jupyter_version']
+python_venv_version = os.environ['notebook_python_venv_version']
 scala_link = "https://www.scala-lang.org/files/archive/"
 if args.region == 'cn-north-1':
     spark_link = "http://mirrors.hust.edu.cn/apache/spark/spark-" + 
spark_version + "/spark-" + spark_version + \
@@ -52,7 +53,7 @@ if args.region == 'cn-north-1':
 else:
     spark_link = "https://archive.apache.org/dist/spark/spark-" + 
spark_version + "/spark-" + spark_version + \
                  "-bin-hadoop" + hadoop_version + ".tgz"
-
+python_venv_path = 
'/opt/python/python{0}/bin/python{1}'.format(python_venv_version, 
python_venv_version[:3])
 pyspark_local_path_dir = '/home/' + args.os_user + 
'/.local/share/jupyter/kernels/pyspark_local/'
 py3spark_local_path_dir = '/home/' + args.os_user + 
'/.local/share/jupyter/kernels/py3spark_local/'
 jupyter_conf_file = '/home/' + args.os_user + 
'/.local/share/jupyter/jupyter_notebook_config.py'
@@ -96,6 +97,10 @@ if __name__ == "__main__":
     print("Install Python 3 modules")
     ensure_python3_libraries(args.os_user)
 
+    # INSTALL PYTHON IN VIRTUALENV
+    print("Configure Python Virtualenv")
+    ensure_python_venv(python_venv_version)
+
     # INSTALL JUPYTER NOTEBOOK
     print("Install Jupyter")
     configure_jupyter(args.os_user, jupyter_conf_file, templates_dir, 
jupyter_version, args.exploratory_name)
@@ -115,7 +120,7 @@ if __name__ == "__main__":
     #print("Install pyspark local kernel for Jupyter")
     #ensure_pyspark_local_kernel(args.os_user, pyspark_local_path_dir, 
templates_dir, spark_version)
     print("Install py3spark local kernel for Jupyter")
-    ensure_py3spark_local_kernel(args.os_user, py3spark_local_path_dir, 
templates_dir, spark_version)
+    ensure_py3spark_local_kernel(args.os_user, py3spark_local_path_dir, 
templates_dir, spark_version, python_venv_path, python_venv_version)
     print("Install Toree-Scala kernel for Jupyter")
     ensure_toree_local_kernel(args.os_user, toree_link, scala_kernel_path, 
files_dir, local_spark_scala_version, spark_version)
     if os.environ['notebook_r_enabled'] == 'true':

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to