This is an automated email from the ASF dual-hosted git repository.

lfrolov pushed a commit to branch 2.5.1-tcpc-deployment
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git


The following commit(s) were added to refs/heads/2.5.1-tcpc-deployment by this push:
     new 69836c6  [DATALAB-2698]: fixed nvidia drivers for aws tensor
69836c6 is described below

commit 69836c64715c01fc13580d7394f835e25e0fe4e5
Author: leonidfrolov <[email protected]>
AuthorDate: Mon Feb 14 18:11:19 2022 +0200

    [DATALAB-2698]: fixed nvidia drivers for aws tensor
---
 .../src/general/lib/os/debian/notebook_lib.py      | 26 +++++++++++++++++++---
 .../general/scripts/aws/common_prepare_notebook.py | 10 +++++++--
 .../src/general/scripts/aws/tensor_configure.py    | 16 +++++++++++++
 .../scripts/{gcp => os}/common_install_gpu.py      |  0
 .../src/tensor/scripts/configure_tensor_node.py    |  3 +++
 5 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py b/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py
index 3f6ad1e..a6c9fe4 100644
--- a/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py
+++ b/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py
@@ -248,6 +248,7 @@ def ensure_additional_python_libs(os_user):
                 datalab.fab.conn.sudo('pip3 install NumPy=={} SciPy pandas 
Sympy Pillow sklearn 
--no-cache-dir'.format(os.environ['notebook_numpy_version']))
             if os.environ['application'] in ('tensor', 'deeplearning'):
                 datalab.fab.conn.sudo('pip3 install opencv-python h5py 
--no-cache-dir')
+                #datalab.fab.conn.sudo('pip3 install python3-opencv 
scikit-learn --no-cache-dir')
             datalab.fab.conn.sudo('touch /home/' + os_user + 
'/.ensure_dir/additional_python_libs_ensured')
         except:
             sys.exit(1)
@@ -293,14 +294,22 @@ def ensure_python3_libraries(os_user):
 def install_nvidia_drivers(os_user):
     if not 
exists(datalab.fab.conn,'/home/{}/.ensure_dir/nvidia_ensured'.format(os_user)):
         try:
+            if os.environ['conf_cloud_provider'] == 'aws':
+                cuda_version = '11.3.0'
+                cuda_file_name = 
"cuda-repo-ubuntu2004-11-3-local_11.3.0-465.19.01-1_amd64.deb"
+                cuda_key = '/var/cuda-repo-ubuntu2004-11-3-local/7fa2af80.pub'
+            else:
+                cuda_version = '11.4.0'
+                cuda_file_name = 
'cuda-repo-ubuntu2004-11-4-local_11.4.0-470.42.01-1_amd64.deb'
+                cuda_key = '/var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub'
             # install nvidia drivers
             datalab.fab.conn.sudo(
                 'wget 
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin')
             datalab.fab.conn.sudo('mv cuda-ubuntu2004.pin 
/etc/apt/preferences.d/cuda-repository-pin-600')
             datalab.fab.conn.sudo(
-                'wget 
https://developer.download.nvidia.com/compute/cuda/11.4.0/local_installers/cuda-repo-ubuntu2004-11-4-local_11.4.0-470.42.01-1_amd64.deb')
-            datalab.fab.conn.sudo('dpkg -i 
cuda-repo-ubuntu2004-11-4-local_11.4.0-470.42.01-1_amd64.deb')
-            datalab.fab.conn.sudo('apt-key add 
/var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub')
+                'wget 
https://developer.download.nvidia.com/compute/cuda/{}/local_installers/{}'.format(cuda_version,
 cuda_file_name))
+            datalab.fab.conn.sudo('dpkg -i {}'.format(cuda_file_name))
+            datalab.fab.conn.sudo('apt-key add {}'.format(cuda_key))
             manage_pkg('update', 'remote', '')
             manage_pkg('-y install', 'remote', 'cuda')
             #clean space on disk
@@ -386,6 +395,17 @@ def install_tensor(os_user, cuda_version, cuda_file_name,
             sys.exit(1)
 
 
+def ensure_pytorch(os_user, gpu=True):
+    if not exists(datalab.fab.conn, '/home/' + os_user + 
'/.ensure_dir/pytorch_ensured'):
+        if gpu:
+            install_venv_pip_pkg('torch==1.10.2+cu113 
torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113'
+                                 ' -f 
https://download.pytorch.org/whl/cu113/torch_stable.html')
+        else:
+            datalab.fab.conn.sudo('pip3 install torch==1.10.2+cpu 
torchvision==0.11.3+cpu torchaudio==0.10.2+cpu -f '
+                                  
'https://download.pytorch.org/whl/cpu/torch_stable.html --no-cache-dir')
+        datalab.fab.conn.sudo('touch /home/' + os_user + 
'/.ensure_dir/pytorch_ensured')
+
+
 def install_maven(os_user):
     if not exists(datalab.fab.conn,'/home/' + os_user + 
'/.ensure_dir/maven_ensured'):
         manage_pkg('-y install', 'remote', 'maven')
diff --git a/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py b/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py
index 6d40ffe..a11cc32 100644
--- a/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py
+++ b/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py
@@ -79,8 +79,14 @@ if __name__ == "__main__":
                                                                       
notebook_config['project_name'],
                                                                       
notebook_config['endpoint_name'],
                                                                       
notebook_config['exploratory_name'], args.uuid)
-        notebook_config['primary_disk_size'] = (lambda x: '100' if x == 
'deeplearning' else '16')(
-            os.environ['application'])
+        #notebook_config['primary_disk_size'] = (lambda x: '100' if x == 
'deeplearning' else '16')(
+        #    os.environ['application'])
+        if os.environ['application'] == 'deeplearning':
+            notebook_config['primary_disk_size'] = '100'
+        elif os.environ['application'] == 'tensor':
+            notebook_config['primary_disk_size'] = '32'
+        else:
+            notebook_config['primary_disk_size'] = '16'
         notebook_config['role_profile_name'] = '{}-{}-{}-nb-de-profile'.format(
             notebook_config['service_base_name'], 
notebook_config['project_name'], notebook_config['endpoint_name'])
         notebook_config['security_group_name'] = 
'{}-{}-{}-nb-sg'.format(notebook_config['service_base_name'],
diff --git a/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py b/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py
index 2a0d115..d3b7d8e 100644
--- a/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py
+++ b/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py
@@ -154,6 +154,22 @@ if __name__ == "__main__":
         datalab.actions_lib.remove_ec2(notebook_config['tag_name'], 
notebook_config['instance_name'])
         sys.exit(1)
 
+    #Installing GPU drivers
+    try:
+        logging.info('[INSTALLING GPU DRIVERS]')
+        params = "--hostname {} --keyfile {} --os_user {}".format(
+            instance_hostname, keyfile_name, 
notebook_config['datalab_ssh_user'])
+        try:
+            subprocess.run("~/scripts/{}.py {}".format('common_install_gpu', 
params), shell=True, check=True)
+        except:
+            datalab.fab.append_result("Failed installing users key")
+            raise Exception
+
+    except Exception as err:
+        datalab.fab.append_result("Failed to install GPU drivers.", str(err))
+        GCPActions.remove_instance(notebook_config['instance_name'], 
notebook_config['zone'])
+        sys.exit(1)
+
     # installing and configuring TensorFlow and all dependencies
     try:
         logging.info('[CONFIGURE TENSORFLOW NOTEBOOK INSTANCE]')
diff --git a/infrastructure-provisioning/src/general/scripts/gcp/common_install_gpu.py b/infrastructure-provisioning/src/general/scripts/os/common_install_gpu.py
similarity index 100%
rename from infrastructure-provisioning/src/general/scripts/gcp/common_install_gpu.py
rename to infrastructure-provisioning/src/general/scripts/os/common_install_gpu.py
diff --git a/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py b/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py
index c9b5e3f..3566518 100644
--- a/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py
+++ b/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py
@@ -142,6 +142,9 @@ if __name__ == "__main__":
     # INSTALL OPTIONAL PACKAGES
     print("Installing additional Python packages")
     ensure_additional_python_libs(args.os_user)
+    if os.environ['conf_cloud_provider'] == 'aws':
+        print('Installing Pytorch')
+        ensure_pytorch(args.os_user)
     print("Install Matplotlib")
     ensure_matplot(args.os_user)
     

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to