This is an automated email from the ASF dual-hosted git repository. lfrolov pushed a commit to branch DATALAB-2698 in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git
commit 42e7adc25f7c82b552a3da466c60f0c596f6ca26 Author: leonidfrolov <[email protected]> AuthorDate: Mon Feb 14 18:11:19 2022 +0200 [DATALAB-2698]: fixed nvidia drivers for aws tensor --- .../src/general/lib/os/debian/notebook_lib.py | 26 +++++++++++++++++++--- .../general/scripts/aws/common_prepare_notebook.py | 10 +++++++-- .../src/general/scripts/aws/tensor_configure.py | 16 +++++++++++++ .../scripts/{gcp => os}/common_install_gpu.py | 0 .../src/tensor/scripts/configure_tensor_node.py | 3 +++ 5 files changed, 50 insertions(+), 5 deletions(-) diff --git a/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py b/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py index 0fcea27..768e6aa 100644 --- a/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py +++ b/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py @@ -248,6 +248,7 @@ def ensure_additional_python_libs(os_user): datalab.fab.conn.sudo('pip3 install NumPy=={} SciPy pandas Sympy Pillow sklearn --no-cache-dir'.format(os.environ['notebook_numpy_version'])) if os.environ['application'] in ('tensor', 'deeplearning'): datalab.fab.conn.sudo('pip3 install opencv-python h5py --no-cache-dir') + #datalab.fab.conn.sudo('pip3 install python3-opencv scikit-learn --no-cache-dir') datalab.fab.conn.sudo('touch /home/' + os_user + '/.ensure_dir/additional_python_libs_ensured') except: sys.exit(1) @@ -293,14 +294,22 @@ def ensure_python3_libraries(os_user): def install_nvidia_drivers(os_user): if not exists(datalab.fab.conn,'/home/{}/.ensure_dir/nvidia_ensured'.format(os_user)): try: + if os.environ['conf_cloud_provider'] == 'aws': + cuda_version = '11.3.0' + cuda_file_name = "cuda-repo-ubuntu2004-11-3-local_11.3.0-465.19.01-1_amd64.deb" + cuda_key = '/var/cuda-repo-ubuntu2004-11-3-local/7fa2af80.pub' + else: + cuda_version = '11.4.0' + cuda_file_name = 'cuda-repo-ubuntu2004-11-4-local_11.4.0-470.42.01-1_amd64.deb' + cuda_key = '/var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub' # install nvidia drivers datalab.fab.conn.sudo( 'wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin') datalab.fab.conn.sudo('mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600') datalab.fab.conn.sudo( - 'wget https://developer.download.nvidia.com/compute/cuda/11.4.0/local_installers/cuda-repo-ubuntu2004-11-4-local_11.4.0-470.42.01-1_amd64.deb') - datalab.fab.conn.sudo('dpkg -i cuda-repo-ubuntu2004-11-4-local_11.4.0-470.42.01-1_amd64.deb') - datalab.fab.conn.sudo('apt-key add /var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub') + 'wget https://developer.download.nvidia.com/compute/cuda/{}/local_installers/{}'.format(cuda_version, cuda_file_name)) + datalab.fab.conn.sudo('dpkg -i {}'.format(cuda_file_name)) + datalab.fab.conn.sudo('apt-key add {}'.format(cuda_key)) manage_pkg('update', 'remote', '') manage_pkg('-y install', 'remote', 'cuda') #clean space on disk @@ -386,6 +395,17 @@ def install_tensor(os_user, cuda_version, cuda_file_name, sys.exit(1) +def ensure_pytorch(os_user, gpu=True): + if not exists(datalab.fab.conn, '/home/' + os_user + '/.ensure_dir/pytorch_ensured'): + if gpu: + install_venv_pip_pkg('torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113' + ' -f https://download.pytorch.org/whl/cu113/torch_stable.html') + else: + datalab.fab.conn.sudo('pip3 install torch==1.10.2+cpu torchvision==0.11.3+cpu torchaudio==0.10.2+cpu -f ' + 'https://download.pytorch.org/whl/cpu/torch_stable.html --no-cache-dir') + datalab.fab.conn.sudo('touch /home/' + os_user + '/.ensure_dir/pytorch_ensured') + + def install_maven(os_user): if not exists(datalab.fab.conn,'/home/' + os_user + '/.ensure_dir/maven_ensured'): manage_pkg('-y install', 'remote', 'maven') diff --git a/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py b/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py index 6d40ffe..a11cc32 100644 --- a/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py +++ b/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py @@ -79,8 +79,14 @@ if __name__ == "__main__": notebook_config['project_name'], notebook_config['endpoint_name'], notebook_config['exploratory_name'], args.uuid) - notebook_config['primary_disk_size'] = (lambda x: '100' if x == 'deeplearning' else '16')( - os.environ['application']) + #notebook_config['primary_disk_size'] = (lambda x: '100' if x == 'deeplearning' else '16')( + # os.environ['application']) + if os.environ['application'] == 'deeplearning': + notebook_config['primary_disk_size'] = '100' + elif os.environ['application'] == 'tensor': + notebook_config['primary_disk_size'] = '32' + else: + notebook_config['primary_disk_size'] = '16' notebook_config['role_profile_name'] = '{}-{}-{}-nb-de-profile'.format( notebook_config['service_base_name'], notebook_config['project_name'], notebook_config['endpoint_name']) notebook_config['security_group_name'] = '{}-{}-{}-nb-sg'.format(notebook_config['service_base_name'], diff --git a/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py b/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py index 2a0d115..d3b7d8e 100644 --- a/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py +++ b/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py @@ -154,6 +154,22 @@ if __name__ == "__main__": datalab.actions_lib.remove_ec2(notebook_config['tag_name'], notebook_config['instance_name']) sys.exit(1) + #Installing GPU drivers + try: + logging.info('[INSTALLING GPU DRIVERS]') + params = "--hostname {} --keyfile {} --os_user {}".format( + instance_hostname, keyfile_name, notebook_config['datalab_ssh_user']) + try: + subprocess.run("~/scripts/{}.py {}".format('common_install_gpu', params), shell=True, check=True) + except: + datalab.fab.append_result("Failed installing users key") + raise Exception + + except Exception as err: + datalab.fab.append_result("Failed to install GPU drivers.", str(err)) + GCPActions.remove_instance(notebook_config['instance_name'], notebook_config['zone']) + sys.exit(1) + # installing and configuring TensorFlow and all dependencies try: logging.info('[CONFIGURE TENSORFLOW NOTEBOOK INSTANCE]') diff --git a/infrastructure-provisioning/src/general/scripts/gcp/common_install_gpu.py b/infrastructure-provisioning/src/general/scripts/os/common_install_gpu.py similarity index 100% rename from infrastructure-provisioning/src/general/scripts/gcp/common_install_gpu.py rename to infrastructure-provisioning/src/general/scripts/os/common_install_gpu.py diff --git a/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py b/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py index c9b5e3f..3566518 100644 --- a/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py +++ b/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py @@ -142,6 +142,9 @@ if __name__ == "__main__": # INSTALL OPTIONAL PACKAGES print("Installing additional Python packages") ensure_additional_python_libs(args.os_user) + if os.environ['conf_cloud_provider'] == 'aws': + print('Installing Pytorch') + ensure_pytorch(args.os_user) print("Install Matplotlib") ensure_matplot(args.os_user) --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
