This is an automated email from the ASF dual-hosted git repository. nkak pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push: new 638be5d DL: Remove with clause while running tf operations 638be5d is described below commit 638be5d1f21081773a09f3978140698b232d629c Author: Nikhil Kak <n...@vmware.com> AuthorDate: Thu Feb 11 19:33:12 2021 -0800 DL: Remove with clause while running tf operations JIRA: MADLIB-1467 We noticed that we run tf operations like `model.fit` on the GPU device using `with tf.device(/gpu:0):`. It turns out that some of the operations (like passing class_weight as a fit param) may not be fully supported on GPUs. Since we already set the `CUDA_VISIBLE_DEVICES` env variable, we don't explicitly need to use the with clause to force the operation to be run on GPUs. So to fix this issue, this commit removes all the with clauses so that operations are allowed to run on the CPU as well. Co-authored-by: Ekta Khanna <ekha...@vmware.com> --- .../modules/deep_learning/madlib_keras.py_in | 34 +++++++++------------- .../deep_learning/madlib_keras_predict.py_in | 3 +- .../deep_learning/madlib_keras_wrapper.py_in | 5 ++-- 3 files changed, 17 insertions(+), 25 deletions(-) diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in index 67f2a56..50aa651 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in @@ -80,15 +80,13 @@ def get_init_model_and_sess(GD, device_name, gpu_count, segments_per_host, if GD_STORE.SEGMENT_MODEL not in GD: plpy.error("Session and model should exist in GD after the first row" "of the first iteration") - with tf.device(device_name): - sess = GD[GD_STORE.SESS] - segment_model = GD[GD_STORE.SEGMENT_MODEL] - K.set_session(sess) + sess = GD[GD_STORE.SESS] + segment_model = GD[GD_STORE.SEGMENT_MODEL] + K.set_session(sess) else: - with tf.device(device_name): - sess = get_keras_session(device_name, gpu_count, segments_per_host) - K.set_session(sess) - 
segment_model = init_model(model_architecture, compile_params, custom_function_map) + sess = get_keras_session(device_name, gpu_count, segments_per_host) + K.set_session(sess) + segment_model = init_model(model_architecture, compile_params, custom_function_map) GD_STORE.init(GD, sess, segment_model) return segment_model, sess @@ -622,8 +620,7 @@ def fit_transition(state, dependent_var, independent_var, dependent_var_shape, else: agg_image_count = 0 GD[GD_STORE.AGG_IMAGE_COUNT] = agg_image_count - with tf.device(device_name): - set_model_weights(segment_model, prev_serialized_weights) + set_model_weights(segment_model, prev_serialized_weights) x_train = [] y_train = [] @@ -637,8 +634,7 @@ def fit_transition(state, dependent_var, independent_var, dependent_var_shape, # Fit segment model on data #TODO consider not doing this every time fit_params = parse_and_validate_fit_params(fit_params) - with tf.device(device_name): - segment_model.fit(x_train, y_train, **fit_params) + segment_model.fit(x_train, y_train, **fit_params) # Aggregating number of images, loss and accuracy @@ -735,13 +731,12 @@ def fit_multiple_transition_caching(dependent_var, independent_var, dependent_va model_architecture, compile_params, custom_function_map) - with tf.device(device_name): - set_model_weights(segment_model, serialized_weights) - fit_params = parse_and_validate_fit_params(fit_params) + set_model_weights(segment_model, serialized_weights) + fit_params = parse_and_validate_fit_params(fit_params) - for i in range(len(GD['x_train'])): - # Fit segment model on data - segment_model.fit(GD['x_train'][i], GD['y_train'][i], **fit_params) + for i in range(len(GD['x_train'])): + # Fit segment model on data + segment_model.fit(GD['x_train'][i], GD['y_train'][i], **fit_params) return_state = get_state_to_return(segment_model, is_last_row, True, agg_image_count) @@ -1077,8 +1072,7 @@ def internal_keras_eval_transition(state, dependent_var, independent_var, image_count = len(y_val[0]) 
agg_image_count += image_count - with tf.device(device_name): - res = segment_model.evaluate(x_val, y_val) + res = segment_model.evaluate(x_val, y_val) # if metric is None, model.evaluate will only return loss as a scalar # Otherwise, it will return a list which has loss and metric diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in index 3c7b28e..1b8f226 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in @@ -381,8 +381,7 @@ def internal_keras_predict(independent_var, model_architecture, model_weights, for i in independent_var: if i is not None: independent_var_filtered.append(expand_input_dims(i)/normalizing_const) - with tf.device(device_name): - probs = model.predict(independent_var_filtered) + probs = model.predict(independent_var_filtered) # probs is a list containing a list of probability values, of all # class levels. Since we are assuming each input is a single image, # and not mini-batched, this list contains exactly one list in it, diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in index 7b1b3de..cbd882a 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in @@ -86,9 +86,8 @@ def get_device_name_and_set_cuda_env(gpu_count, seg): return device_name def set_keras_session(device_name, gpu_count, segments_per_host): - with tf.device(device_name): - session = get_keras_session(device_name, gpu_count, segments_per_host) - K.set_session(session) + session = get_keras_session(device_name, gpu_count, segments_per_host) + K.set_session(session) def get_keras_session(device_name, gpu_count, segments_per_host): config = tf.ConfigProto()