orhankislal commented on a change in pull request #392: DL: Improve performance
for predict
URL: https://github.com/apache/madlib/pull/392#discussion_r285779326
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
##########
@@ -78,49 +79,133 @@ def predict(schema_madlib, model_table, test_table,
id_col,
class_values, intermediate_col, pred_col_name,
pred_col_type, is_response, MODULE_NAME)
- segment_id = -1 if is_platform_pg() else '{0}.gp_segment_id'.format(
- test_table)
+ gp_segment_id_col, seg_ids_test, \
+ images_per_seg_test =
get_images_per_seg_for_non_minibatched_data(test_table)
+ segments_per_host = get_segments_per_host()
- plpy.execute("""
+ if is_platform_pg():
+ set_keras_session(gpus_per_host, segments_per_host)
+ else:
+ # we want to disable gpu on gpdb's master node because GPUs will only
be used
+ # for segment nodes.
+ set_cuda_env('-1')
+
+ predict_query = plpy.prepare("""
CREATE TABLE {output_table} AS
SELECT {id_col}, {prediction_select_clause}
FROM (
SELECT {test_table}.{id_col},
({schema_madlib}.internal_keras_predict
({independent_varname},
- $MAD${model_arch}$MAD$,
- {0},
- ARRAY{input_shape},
+ $1,
+ $2,
{is_response},
{normalizing_const},
+ {gp_segment_id_col},
+ ARRAY{seg_ids_test},
+ ARRAY{images_per_seg_test},
{gpus_per_host},
- {segment_id})
+ {segments_per_host})
) AS {intermediate_col}
- FROM {test_table}, {model_table}
+ FROM {test_table}
) q
- """.format(MODEL_DATA_COLNAME, **locals()))
-
-def internal_keras_predict(x_test, model_arch, model_data, input_shape,
- is_response, normalizing_const, gpus_per_host, seg):
- model = model_from_json(model_arch)
- device_name = get_device_name_and_set_cuda_env(gpus_per_host, seg)
- model_shapes = madlib_keras_serializer.get_model_shapes(model)
- set_model_weights(model, device_name, model_data, model_shapes)
- # Since the test data isn't mini-batched,
- # we have to make sure that the test data np array has the same
- # number of dimensions as input_shape. So we add a dimension to x.
- x_test = expand_input_dims(x_test, target_type='float32')
- x_test /= normalizing_const
- if is_response:
- proba_argmax = model.predict_classes(x_test)
- # proba_argmax is a list with exactly one element in it. That element
- # refers to the index containing the largest probability value in the
- # output of Keras' predict function.
- return proba_argmax
+ """.format(**locals()), ["text", "bytea"])
+ plpy.execute(predict_query, [model_arch, model_data])
+
+ if is_platform_pg():
+ clear_keras_session()
+
+def get_images_per_seg_for_non_minibatched_data(table_name):
+ """
+ This function queries the given table and returns the total rows per
segment.
+ Since we cannot pass a dictionary to the keras fit step function we create
arrays
+ out of the segment numbers and the rows per segment values.
+ This function assumes that the table is not empty.
+ :param table_name:
+ :return: Returns two arrays
Review comment:
The docstring says this function returns two arrays, but we return the
gp_segment_id_col as well. Also, the automatic numbering doesn't work for
comments.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services