This is an automated email from the ASF dual-hosted git repository. nkak pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push: new 638be5d DL: Remove with clause while running tf operations 638be5d is described below commit 638be5d1f21081773a09f3978140698b232d629c Author: Nikhil Kak <n...@vmware.com> AuthorDate: Thu Feb 11 19:33:12 2021 -0800 DL: Remove with clause while running tf operations JIRA: MADLIB-1467 We noticed that we run tf operations like `model.fit` on the GPU device using `with tf.device(/gpu:0):`. It turns out that some of the operations (like passing class_weight as a fit param) may not be fully supported on GPUs. Since we already set the `CUDA_VISIBLE_DEVICES` env variable, we don't explicitly need to use the with clause to force the operation to be run on GPUs. So to fix this issue, this commit removes all the with clauses so that operations are allowed to run on the CPU as well. Co-authored-by: Ekta Khanna <ekha...@vmware.com> --- .../modules/deep_learning/madlib_keras.py_in | 34 +++++++++------------- .../deep_learning/madlib_keras_predict.py_in | 3 +- .../deep_learning/madlib_keras_wrapper.py_in | 5 ++-- 3 files changed, 17 insertions(+), 25 deletions(-) diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in index 67f2a56..50aa651 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in @@ -80,15 +80,13 @@ def get_init_model_and_sess(GD, device_name, gpu_count, segments_per_host, if GD_STORE.SEGMENT_MODEL not in GD: plpy.error("Session and model should exist in GD after the first row" "of the first iteration") - with tf.device(device_name): - sess = GD[GD_STORE.SESS] - segment_model = GD[GD_STORE.SEGMENT_MODEL] - K.set_session(sess) + sess = GD[GD_STORE.SESS] + segment_model = GD[GD_STORE.SEGMENT_MODEL] + K.set_session(sess) else: - with tf.device(device_name): - sess = get_keras_session(device_name, gpu_count, segments_per_host) - K.set_session(sess) - 
segment_model = init_model(model_architecture, compile_params, custom_function_map) + sess = get_keras_session(device_name, gpu_count, segments_per_host) + K.set_session(sess) + segment_model = init_model(model_architecture, compile_params, custom_function_map) GD_STORE.init(GD, sess, segment_model) return segment_model, sess @@ -622,8 +620,7 @@ def fit_transition(state, dependent_var, independent_var, dependent_var_shape, else: agg_image_count = 0 GD[GD_STORE.AGG_IMAGE_COUNT] = agg_image_count - with tf.device(device_name): - set_model_weights(segment_model, prev_serialized_weights) + set_model_weights(segment_model, prev_serialized_weights) x_train = [] y_train = [] @@ -637,8 +634,7 @@ def fit_transition(state, dependent_var, independent_var, dependent_var_shape, # Fit segment model on data #TODO consider not doing this every time fit_params = parse_and_validate_fit_params(fit_params) - with tf.device(device_name): - segment_model.fit(x_train, y_train, **fit_params) + segment_model.fit(x_train, y_train, **fit_params) # Aggregating number of images, loss and accuracy @@ -735,13 +731,12 @@ def fit_multiple_transition_caching(dependent_var, independent_var, dependent_va model_architecture, compile_params, custom_function_map) - with tf.device(device_name): - set_model_weights(segment_model, serialized_weights) - fit_params = parse_and_validate_fit_params(fit_params) + set_model_weights(segment_model, serialized_weights) + fit_params = parse_and_validate_fit_params(fit_params) - for i in range(len(GD['x_train'])): - # Fit segment model on data - segment_model.fit(GD['x_train'][i], GD['y_train'][i], **fit_params) + for i in range(len(GD['x_train'])): + # Fit segment model on data + segment_model.fit(GD['x_train'][i], GD['y_train'][i], **fit_params) return_state = get_state_to_return(segment_model, is_last_row, True, agg_image_count) @@ -1077,8 +1072,7 @@ def internal_keras_eval_transition(state, dependent_var, independent_var, image_count = len(y_val[0]) 
agg_image_count += image_count - with tf.device(device_name): - res = segment_model.evaluate(x_val, y_val) + res = segment_model.evaluate(x_val, y_val) # if metric is None, model.evaluate will only return loss as a scalar # Otherwise, it will return a list which has loss and metric diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in index 3c7b28e..1b8f226 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in @@ -381,8 +381,7 @@ def internal_keras_predict(independent_var, model_architecture, model_weights, for i in independent_var: if i is not None: independent_var_filtered.append(expand_input_dims(i)/normalizing_const) - with tf.device(device_name): - probs = model.predict(independent_var_filtered) + probs = model.predict(independent_var_filtered) # probs is a list containing a list of probability values, of all # class levels. Since we are assuming each input is a single image, # and not mini-batched, this list contains exactly one list in it, diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in index 7b1b3de..cbd882a 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in @@ -86,9 +86,8 @@ def get_device_name_and_set_cuda_env(gpu_count, seg): return device_name def set_keras_session(device_name, gpu_count, segments_per_host): - with tf.device(device_name): - session = get_keras_session(device_name, gpu_count, segments_per_host) - K.set_session(session) + session = get_keras_session(device_name, gpu_count, segments_per_host) + K.set_session(session) def get_keras_session(device_name, gpu_count, segments_per_host): config = tf.ConfigProto()