makemebitter commented on a change in pull request #425: DL: Add training for multiple models
URL: https://github.com/apache/madlib/pull/425#discussion_r311237076
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras.py_in
##########
@@ -403,49 +408,84 @@ def should_compute_metrics_this_iter(curr_iter, metrics_compute_frequency,
     return (curr_iter)%metrics_compute_frequency == 0 or \
            curr_iter == num_iterations
+
 def fit_transition(state, dependent_var, independent_var, model_architecture,
                    compile_params, fit_params, current_seg_id, seg_ids,
                    images_per_seg, gpus_per_host, segments_per_host,
-                   prev_serialized_weights, **kwargs):
+                   prev_serialized_weights, is_final=True,
+                   is_cerebro=False, **kwargs):
+
     if not independent_var or not dependent_var:
         return state
     start_transition = time.time()
     SD = kwargs['SD']
     device_name = get_device_name_and_set_cuda_env(gpus_per_host,
                                                    current_seg_id)
-    # Set up system if this is the first buffer on segment'
-    if not state:
-        set_keras_session(device_name, gpus_per_host, segments_per_host)
-        segment_model = model_from_json(model_architecture)
-        compile_and_set_weights(segment_model, compile_params, device_name,
-                                prev_serialized_weights)
-        SD['segment_model'] = segment_model
-        agg_image_count = 0
+    # Cerebro state has the weights passed from the previous segment
+    if is_cerebro:
+        prev_serialized_weights = madlib_keras_serializer.\
+            get_serialized_1d_weights_from_state(prev_serialized_weights)
+
+    # If a live session is present, re-use it. Otherwise, recreate it.
+    if 'sess' in SD:
+        sess = SD['sess']
     else:
-        segment_model = SD['segment_model']
-        agg_image_count = madlib_keras_serializer.get_image_count_from_state(state)
+        sess = get_keras_session(device_name, gpus_per_host, segments_per_host)
+        SD['sess'] = sess
+    K.set_session(sess)
+
+    # Set up system if this is the first buffer on segment.
+
+    # On cerebro, clean the session on the first row.
+    if is_cerebro:
+        if not state:
+            K.clear_session()
+            segment_model = model_from_json(model_architecture)
+            compile_and_set_weights(segment_model, compile_params, device_name,
+                                    prev_serialized_weights)
+            SD['segment_model'] = segment_model
+            agg_image_count = 0
+        else:
+            all_ops_len = len([n.name for n in
+                               tf.get_default_graph().as_graph_def().node])
+            segment_model = SD['segment_model']
+            agg_image_count = madlib_keras_serializer.get_image_count_from_state(state)
+
+    # On single model fit, reuse the model if it exists
+    else:
+        if not state:
+            agg_image_count = 0
+        else:
+            agg_image_count = madlib_keras_serializer.get_image_count_from_state(state)
+        if 'segment_model' in SD:
+            segment_model = SD['segment_model']
+            if not state:
+                model_shapes = get_model_shapes(segment_model)
+                model_weights = madlib_keras_serializer.deserialize_as_nd_weights(
+                    prev_serialized_weights, model_shapes)
+                segment_model.set_weights(model_weights)
+        else:
+            segment_model = model_from_json(model_architecture)
+            compile_and_set_weights(segment_model, compile_params, device_name,
+                                    prev_serialized_weights)
+            SD['segment_model'] = segment_model
     # Prepare the data
     x_train = np_array_float32(independent_var)
     y_train = np_array_int16(dependent_var)
     # Fit segment model on data
     start_fit = time.time()
-    with K.tf.device(device_name):
-        #TODO consider not doing this every time
-        fit_params = parse_and_validate_fit_params(fit_params)
-        history = segment_model.fit(x_train, y_train, **fit_params)
+    #TODO consider not doing this every time
+    fit_params = parse_and_validate_fit_params(fit_params)
+    history = segment_model.fit(x_train, y_train, **fit_params)
Review comment:
Some of the tf.device context managers have no effect, for example the ones
surrounding K.set_session.
For the rest: because of the tweaks to the CUDA_VISIBLE_DEVICES_KEY
environment variable, there are only two possible values for device_name,
'/gpu:0' and '/cpu:0'.
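
To make that concrete, here is a rough, hypothetical sketch of the selection
logic (the real helper is get_device_name_and_set_cuda_env; pick_device and
its arguments are invented for illustration):

import os

def pick_device(gpus_per_host, segment_id):
    # Hypothetical stand-in for get_device_name_and_set_cuda_env.
    if gpus_per_host > 0:
        # Expose exactly one GPU to this process; it becomes '/gpu:0'.
        os.environ['CUDA_VISIBLE_DEVICES'] = str(segment_id % gpus_per_host)
        return '/gpu:0'
    # -1 hides all GPUs, so every op falls back to the CPU.
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    return '/cpu:0'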
When it is '/cpu:0', CUDA_VISIBLE_DEVICES_KEY is already set to -1, meaning
no GPU is visible; every op will be assigned to the CPU anyway, so there is
no need for the context manager.
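
A minimal standalone sketch of the CPU case (not MADlib code; assumes TF 1.x,
where log_device_placement prints where each op runs):

import os
# Must be set before TensorFlow initializes CUDA.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import tensorflow as tf

a = tf.constant([1.0, 2.0])
b = tf.constant([3.0, 4.0])
c = a + b  # no tf.device('/cpu:0') needed: CPU is the only visible device

with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    print(sess.run(c))  # placement log shows every op on .../cpu:0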
When it is '/gpu:0', CUDA_VISIBLE_DEVICES_KEY is set to the single GPU
assigned to this segment, and '/gpu:0' maps to that GPU. In this case,
TensorFlow will by default try to assign every op to the GPU, as long as a
CUDA kernel exists for the op (ref: https://www.tensorflow.org/guide/using_gpu).
This means that if the ops inside the context manager can be placed on the
GPU, they will be placed there by default; if they can't, wrapping them in
tf.device won't help either.
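
And the GPU case, under the same assumptions (TF 1.x; the GPU index '3' is
just an example of the one GPU assigned to a segment):

import os
# Expose a single GPU; inside the process it is addressed as '/gpu:0'.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

import tensorflow as tf

a = tf.random_normal([1000, 1000])
b = tf.random_normal([1000, 1000])
c = tf.matmul(a, b)  # matmul has a CUDA kernel, so it lands on GPU:0 automatically

with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    sess.run(c)  # placement log shows device GPU:0 with no tf.device context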
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services