This is an automated email from the ASF dual-hosted git repository.
lfrolov pushed a commit to branch 2.5.1-tcpc-deployment
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git
The following commit(s) were added to refs/heads/2.5.1-tcpc-deployment by this push:
new 69836c6 [DATALAB-2698]: fixed nvidia drivers for aws tensor
69836c6 is described below
commit 69836c64715c01fc13580d7394f835e25e0fe4e5
Author: leonidfrolov <[email protected]>
AuthorDate: Mon Feb 14 18:11:19 2022 +0200
[DATALAB-2698]: fixed nvidia drivers for aws tensor
---
.../src/general/lib/os/debian/notebook_lib.py | 26 +++++++++++++++++++---
.../general/scripts/aws/common_prepare_notebook.py | 10 +++++++--
.../src/general/scripts/aws/tensor_configure.py | 16 +++++++++++++
.../scripts/{gcp => os}/common_install_gpu.py | 0
.../src/tensor/scripts/configure_tensor_node.py | 3 +++
5 files changed, 50 insertions(+), 5 deletions(-)
diff --git a/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py b/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py
index 3f6ad1e..a6c9fe4 100644
--- a/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py
+++ b/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py
@@ -248,6 +248,7 @@ def ensure_additional_python_libs(os_user):
datalab.fab.conn.sudo('pip3 install NumPy=={} SciPy pandas
Sympy Pillow sklearn
--no-cache-dir'.format(os.environ['notebook_numpy_version']))
if os.environ['application'] in ('tensor', 'deeplearning'):
datalab.fab.conn.sudo('pip3 install opencv-python h5py
--no-cache-dir')
+ #datalab.fab.conn.sudo('pip3 install python3-opencv
scikit-learn --no-cache-dir')
datalab.fab.conn.sudo('touch /home/' + os_user +
'/.ensure_dir/additional_python_libs_ensured')
except:
sys.exit(1)
@@ -293,14 +294,22 @@ def ensure_python3_libraries(os_user):
def install_nvidia_drivers(os_user):
if not
exists(datalab.fab.conn,'/home/{}/.ensure_dir/nvidia_ensured'.format(os_user)):
try:
+ if os.environ['conf_cloud_provider'] == 'aws':
+ cuda_version = '11.3.0'
+ cuda_file_name =
"cuda-repo-ubuntu2004-11-3-local_11.3.0-465.19.01-1_amd64.deb"
+ cuda_key = '/var/cuda-repo-ubuntu2004-11-3-local/7fa2af80.pub'
+ else:
+ cuda_version = '11.4.0'
+ cuda_file_name =
'cuda-repo-ubuntu2004-11-4-local_11.4.0-470.42.01-1_amd64.deb'
+ cuda_key = '/var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub'
# install nvidia drivers
datalab.fab.conn.sudo(
'wget
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin')
datalab.fab.conn.sudo('mv cuda-ubuntu2004.pin
/etc/apt/preferences.d/cuda-repository-pin-600')
datalab.fab.conn.sudo(
- 'wget
https://developer.download.nvidia.com/compute/cuda/11.4.0/local_installers/cuda-repo-ubuntu2004-11-4-local_11.4.0-470.42.01-1_amd64.deb')
- datalab.fab.conn.sudo('dpkg -i
cuda-repo-ubuntu2004-11-4-local_11.4.0-470.42.01-1_amd64.deb')
- datalab.fab.conn.sudo('apt-key add
/var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub')
+ 'wget
https://developer.download.nvidia.com/compute/cuda/{}/local_installers/{}'.format(cuda_version,
cuda_file_name))
+ datalab.fab.conn.sudo('dpkg -i {}'.format(cuda_file_name))
+ datalab.fab.conn.sudo('apt-key add {}'.format(cuda_key))
manage_pkg('update', 'remote', '')
manage_pkg('-y install', 'remote', 'cuda')
#clean space on disk
@@ -386,6 +395,17 @@ def install_tensor(os_user, cuda_version, cuda_file_name,
sys.exit(1)
+def ensure_pytorch(os_user, gpu=True):
+ if not exists(datalab.fab.conn, '/home/' + os_user +
'/.ensure_dir/pytorch_ensured'):
+ if gpu:
+ install_venv_pip_pkg('torch==1.10.2+cu113
torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113'
+ ' -f
https://download.pytorch.org/whl/cu113/torch_stable.html')
+ else:
+ datalab.fab.conn.sudo('pip3 install torch==1.10.2+cpu
torchvision==0.11.3+cpu torchaudio==0.10.2+cpu -f '
+
'https://download.pytorch.org/whl/cpu/torch_stable.html --no-cache-dir')
+ datalab.fab.conn.sudo('touch /home/' + os_user +
'/.ensure_dir/pytorch_ensured')
+
+
def install_maven(os_user):
if not exists(datalab.fab.conn,'/home/' + os_user +
'/.ensure_dir/maven_ensured'):
manage_pkg('-y install', 'remote', 'maven')
diff --git a/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py b/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py
index 6d40ffe..a11cc32 100644
--- a/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py
+++ b/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py
@@ -79,8 +79,14 @@ if __name__ == "__main__":
notebook_config['project_name'],
notebook_config['endpoint_name'],
notebook_config['exploratory_name'], args.uuid)
- notebook_config['primary_disk_size'] = (lambda x: '100' if x ==
'deeplearning' else '16')(
- os.environ['application'])
+ #notebook_config['primary_disk_size'] = (lambda x: '100' if x ==
'deeplearning' else '16')(
+ # os.environ['application'])
+ if os.environ['application'] == 'deeplearning':
+ notebook_config['primary_disk_size'] = '100'
+ elif os.environ['application'] == 'tensor':
+ notebook_config['primary_disk_size'] = '32'
+ else:
+ notebook_config['primary_disk_size'] = '16'
notebook_config['role_profile_name'] = '{}-{}-{}-nb-de-profile'.format(
notebook_config['service_base_name'],
notebook_config['project_name'], notebook_config['endpoint_name'])
notebook_config['security_group_name'] =
'{}-{}-{}-nb-sg'.format(notebook_config['service_base_name'],
diff --git a/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py b/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py
index 2a0d115..d3b7d8e 100644
--- a/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py
+++ b/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py
@@ -154,6 +154,22 @@ if __name__ == "__main__":
datalab.actions_lib.remove_ec2(notebook_config['tag_name'],
notebook_config['instance_name'])
sys.exit(1)
+ #Installing GPU drivers
+ try:
+ logging.info('[INSTALLING GPU DRIVERS]')
+ params = "--hostname {} --keyfile {} --os_user {}".format(
+ instance_hostname, keyfile_name,
notebook_config['datalab_ssh_user'])
+ try:
+ subprocess.run("~/scripts/{}.py {}".format('common_install_gpu',
params), shell=True, check=True)
+ except:
+ datalab.fab.append_result("Failed installing users key")
+ raise Exception
+
+ except Exception as err:
+ datalab.fab.append_result("Failed to install GPU drivers.", str(err))
+ GCPActions.remove_instance(notebook_config['instance_name'],
notebook_config['zone'])
+ sys.exit(1)
+
# installing and configuring TensorFlow and all dependencies
try:
logging.info('[CONFIGURE TENSORFLOW NOTEBOOK INSTANCE]')
diff --git a/infrastructure-provisioning/src/general/scripts/gcp/common_install_gpu.py b/infrastructure-provisioning/src/general/scripts/os/common_install_gpu.py
similarity index 100%
rename from infrastructure-provisioning/src/general/scripts/gcp/common_install_gpu.py
rename to infrastructure-provisioning/src/general/scripts/os/common_install_gpu.py
diff --git a/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py b/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py
index c9b5e3f..3566518 100644
--- a/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py
+++ b/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py
@@ -142,6 +142,9 @@ if __name__ == "__main__":
# INSTALL OPTIONAL PACKAGES
print("Installing additional Python packages")
ensure_additional_python_libs(args.os_user)
+ if os.environ['conf_cloud_provider'] == 'aws':
+ print('Installing Pytorch')
+ ensure_pytorch(args.os_user)
print("Install Matplotlib")
ensure_matplot(args.os_user)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]