This is an automated email from the ASF dual-hosted git repository. mykolabodnar pushed a commit to branch DATALAB-2398 in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git
commit 9fc649a2be054869883d30646f0c014629163ee0 Author: bodnarmykola <[email protected]> AuthorDate: Wed May 26 19:06:43 2021 +0300 [DATALAB-2398] - Jupyter PySpark kernel from python venv implemented --- .../src/general/conf/datalab.ini | 2 ++ .../src/general/lib/os/fab.py | 37 ++++++++++------------ .../templates/os/py3spark_local_template.json | 10 +++--- .../src/jupyter/scripts/configure_jupyter_node.py | 9 ++++-- 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/infrastructure-provisioning/src/general/conf/datalab.ini b/infrastructure-provisioning/src/general/conf/datalab.ini index 2679b22..2040dc8 100644 --- a/infrastructure-provisioning/src/general/conf/datalab.ini +++ b/infrastructure-provisioning/src/general/conf/datalab.ini @@ -248,6 +248,8 @@ spark_version = 3.0.1 hadoop_version = 2.7 ### Version of Jupyter to be installed on notebook jupyter_version = 6.1.6 +### Version of Python to be installed as virtualenv on notebook +python_venv_version = 3.7.9 ### Version of TensorFlow to be installed on notebook tensorflow_version = 2.3.2 ### Version of Zeppelin to be installed on notebook diff --git a/infrastructure-provisioning/src/general/lib/os/fab.py b/infrastructure-provisioning/src/general/lib/os/fab.py index a9daee1..8b54a2e 100644 --- a/infrastructure-provisioning/src/general/lib/os/fab.py +++ b/infrastructure-provisioning/src/general/lib/os/fab.py @@ -38,26 +38,20 @@ from fabric import * from patchwork.files import exists from patchwork import files -def ensure_python_venv(python_version): +def ensure_python_venv(python_venv_version): try: - if not exist(conn, '/opt/python/python{}'.format(python_version)): - conn.sudo('wget https://www.python.org/ftp/python/{0}/Python-{0}.tgz -O /tmp/Python-{0}.tgz'.format(python_version)) - conn.sudo('tar zxvf /tmp/Python-{}.tgz -C /tmp/'.format(python_version)) - conn.sudo('cd /tmp/Python-{0} && ./configure --prefix=/opt/python/python{0} --with-zlib-dir=/usr/local/lib/ --with-ensurepip=install'.format( 
python_version) - conn.sudo('cd /tmp/Python-{0} make altinstall'.format(python_version)) - conn.sudo('cd /tmp && rm -rf Python-{}'.format(python_version)) - conn.sudo('virtualenv /opt/python/python{}'.format(python_version)) - venv_command = 'source /opt/python/python{}/bin/activate'.format(python_version) - pip_command = '/opt/python/python{0}/bin/pip{1}'.format(python_version, python_version[:3]) - conn.sudo('{0} && install -U pip=={}'.format(venv_command, pip_command, os.environ['pip_version'])) - subprocess.run({0} && {1} install pyzmq==17.0.0'.format(venv_command, pip_command)) - subprocess.run( - '{0} && sudo -i {1} install ipython ipykernel --no-cache-dir'.format(venv_command, pip_command), - shell=True, check=True) - subprocess.run( - '{0} && sudo -i {1} install boto boto3 NumPy=={2} SciPy Matplotlib pandas Sympy Pillow sklearn --no-cache-dir' - .format(venv_command, pip_command, numpy_version), shell=True, check=True) + if not exists(conn, '/opt/python/python{}'.format(python_venv_version)): + conn.sudo('wget https://www.python.org/ftp/python/{0}/Python-{0}.tgz -O /tmp/Python-{0}.tgz'.format(python_venv_version)) + conn.sudo('tar zxvf /tmp/Python-{}.tgz -C /tmp/'.format(python_venv_version)) + conn.sudo('''bash -l -c 'cd /tmp/Python-{0} && ./configure --prefix=/opt/python/python{0} --with-zlib-dir=/usr/local/lib/ --with-ensurepip=install' '''.format(python_venv_version)) + conn.sudo('''bash -l -c 'cd /tmp/Python-{0} && make altinstall' '''.format(python_venv_version)) + conn.sudo('''bash -l -c 'cd /tmp && rm -rf Python-{}' '''.format(python_venv_version)) + conn.sudo('virtualenv /opt/python/python{}'.format(python_venv_version)) + venv_command = 'source /opt/python/python{}/bin/activate'.format(python_venv_version) + pip_command = '/opt/python/python{0}/bin/pip{1}'.format(python_venv_version, python_venv_version[:3]) + conn.sudo('''bash -l -c '{0} && {1} install -U pip=={2}' '''.format(venv_command, pip_command, os.environ['conf_pip_version'])) + 
conn.sudo('''bash -l -c '{0} && {1} install ipython ipykernel --no-cache-dir' '''.format(venv_command, pip_command)) + conn.sudo('''bash -l -c '{0} && {1} install NumPy=={2} SciPy Matplotlib pandas Sympy Pillow sklearn --no-cache-dir' '''.format(venv_command, pip_command, os.environ['notebook_numpy_version'])) except Exception as err: print('Error:', str(err)) @@ -404,7 +398,7 @@ def ensure_pyspark_local_kernel(os_user, pyspark_local_path_dir, templates_dir, sys.exit(1) -def ensure_py3spark_local_kernel(os_user, py3spark_local_path_dir, templates_dir, spark_version): +def ensure_py3spark_local_kernel(os_user, py3spark_local_path_dir, templates_dir, spark_version, python_venv_path, python_venv_version): if not exists(conn,'/home/' + os_user + '/.ensure_dir/py3spark_local_kernel_ensured'): try: conn.sudo('mkdir -p ' + py3spark_local_path_dir) @@ -412,6 +406,9 @@ def ensure_py3spark_local_kernel(os_user, py3spark_local_path_dir, templates_dir conn.put(templates_dir + 'py3spark_local_template.json', '/tmp/py3spark_local_template.json') conn.sudo( '''bash -l -c "PYJ=`find /opt/spark/ -name '*py4j*.zip' | tr '\\n' ':' | sed 's|:$||g'`; sed -i 's|PY4J|'$PYJ'|g' /tmp/py3spark_local_template.json" ''') + conn.sudo('sed -i "s|PYTHON_VENV_PATH|' + python_venv_path + '|g" /tmp/py3spark_local_template.json') + conn.sudo('sed -i "s|PYTHON_VENV_VERSION|' + python_venv_version + '|g" /tmp/py3spark_local_template.json') + conn.sudo('sed -i "s|PYTHON_VENV_SHORT_VERSION|' + python_venv_version[:3] + '|g" /tmp/py3spark_local_template.json') conn.sudo('sed -i "s|SP_VER|' + spark_version + '|g" /tmp/py3spark_local_template.json') conn.sudo('sed -i \'/PYTHONPATH\"\:/s|\(.*\)"|\\1/home/{0}/caffe/python:/home/{0}/pytorch/build:"|\' /tmp/py3spark_local_template.json'.format(os_user)) conn.sudo('\cp /tmp/py3spark_local_template.json ' + py3spark_local_path_dir + 'kernel.json') diff --git a/infrastructure-provisioning/src/general/templates/os/py3spark_local_template.json 
b/infrastructure-provisioning/src/general/templates/os/py3spark_local_template.json index 45c1213..428524d 100644 --- a/infrastructure-provisioning/src/general/templates/os/py3spark_local_template.json +++ b/infrastructure-provisioning/src/general/templates/os/py3spark_local_template.json @@ -1,18 +1,20 @@ { "argv": [ - "/usr/bin/python3", + "PYTHON_VENV_PATH", "-m", "ipykernel", "-f", "{connection_file}" ], "language": "python", - "display_name": "Local PySpark (Python-3.8 / Spark-SP_VER )", + "display_name": "Local PySpark (Python-PYTHON_VENV_VERSION / Spark-SP_VER )", "env": { - "PYSPARK_PYTHON": "python3.8", + "PYSPARK_PYTHON": "pythonPYTHON_VENV_SHORT_VERSION", "SPARK_HOME": "/opt/spark/", "PYTHONPATH": "PY4J:/opt/spark/python/:", "PYTHONSTARTUP": "/opt/spark/python/pyspark/shell.py", "PYSPARK_SUBMIT_ARGS": "--name LocalPySpark pyspark-shell" } -} \ No newline at end of file +} + + diff --git a/infrastructure-provisioning/src/jupyter/scripts/configure_jupyter_node.py b/infrastructure-provisioning/src/jupyter/scripts/configure_jupyter_node.py index 2791b78..76c5fbf 100644 --- a/infrastructure-provisioning/src/jupyter/scripts/configure_jupyter_node.py +++ b/infrastructure-provisioning/src/jupyter/scripts/configure_jupyter_node.py @@ -45,6 +45,7 @@ args = parser.parse_args() spark_version = args.spark_version hadoop_version = args.hadoop_version jupyter_version = os.environ['notebook_jupyter_version'] +python_venv_version = os.environ['notebook_python_venv_version'] scala_link = "https://www.scala-lang.org/files/archive/" if args.region == 'cn-north-1': spark_link = "http://mirrors.hust.edu.cn/apache/spark/spark-" + spark_version + "/spark-" + spark_version + \ @@ -52,7 +53,7 @@ if args.region == 'cn-north-1': else: spark_link = "https://archive.apache.org/dist/spark/spark-" + spark_version + "/spark-" + spark_version + \ "-bin-hadoop" + hadoop_version + ".tgz" - +python_venv_path = '/opt/python/python{0}/bin/python{1}'.format(python_venv_version, 
python_venv_version[:3]) pyspark_local_path_dir = '/home/' + args.os_user + '/.local/share/jupyter/kernels/pyspark_local/' py3spark_local_path_dir = '/home/' + args.os_user + '/.local/share/jupyter/kernels/py3spark_local/' jupyter_conf_file = '/home/' + args.os_user + '/.local/share/jupyter/jupyter_notebook_config.py' @@ -96,6 +97,10 @@ if __name__ == "__main__": print("Install Python 3 modules") ensure_python3_libraries(args.os_user) + # INSTALL PYTHON IN VIRTUALENV + print("Configure Python Virtualenv") + ensure_python_venv(python_venv_version) + # INSTALL JUPYTER NOTEBOOK print("Install Jupyter") configure_jupyter(args.os_user, jupyter_conf_file, templates_dir, jupyter_version, args.exploratory_name) @@ -115,7 +120,7 @@ if __name__ == "__main__": #print("Install pyspark local kernel for Jupyter") #ensure_pyspark_local_kernel(args.os_user, pyspark_local_path_dir, templates_dir, spark_version) print("Install py3spark local kernel for Jupyter") - ensure_py3spark_local_kernel(args.os_user, py3spark_local_path_dir, templates_dir, spark_version) + ensure_py3spark_local_kernel(args.os_user, py3spark_local_path_dir, templates_dir, spark_version, python_venv_path, python_venv_version) print("Install Toree-Scala kernel for Jupyter") ensure_toree_local_kernel(args.os_user, toree_link, scala_kernel_path, files_dir, local_spark_scala_version, spark_version) if os.environ['notebook_r_enabled'] == 'true': --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
