This is an automated email from the ASF dual-hosted git repository. nkak pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
commit 65ce6020075893af96ae8b18e8412e5517ce0e1c Author: Orhan Kislal <okis...@pivotal.io> AuthorDate: Wed Apr 24 14:14:14 2019 -0700 DL: Replace use_gpu flag with gpus_per_host integer JIRA: MADLIB-1308 Previously, gpus_per_host was hard-coded to 4. This commit removes the hard-coded value and takes it from the user instead. We also tried to use the tensorflow function `list_local_devices` to get the count of gpus per host. This did give us the count but would hang forever on some segments, so we decided not to use this function. We now cache the CUDA_VISIBLE_DEVICES env variable (which is set to -1 for master) and then reset it at the end of the fit function. Finally, we dynamically calculate the gpu memory fraction to support the case when the number of gpus is less than the number of segments. Co-authored-by: Nikhil Kak <n...@pivotal.io> --- .../modules/deep_learning/madlib_keras.py_in | 73 ++++++++++------ .../modules/deep_learning/madlib_keras.sql_in | 35 ++++---- .../deep_learning/madlib_keras_predict.py_in | 10 +-- .../deep_learning/madlib_keras_wrapper.py_in | 48 +++++++++-- .../modules/deep_learning/test/madlib_keras.sql_in | 59 +++++++------ .../test/unit_tests/test_madlib_keras.py_in | 97 ++++++++++++++++++---- .../postgres/modules/utilities/utilities.py_in | 18 ++++ 7 files changed, 237 insertions(+), 103 deletions(-) diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in index 82d1069..8b2a747 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in @@ -47,13 +47,14 @@ from keras_model_arch_table import Format from utilities.model_arch_info import get_input_shape from utilities.model_arch_info import get_num_classes from utilities.utilities import is_platform_pg +from utilities.utilities import get_segments_per_host from utilities.utilities import madlib_version from utilities.validate_args import
get_col_value_and_type from utilities.validate_args import quote_ident def fit(schema_madlib, source_table, model, dependent_varname, independent_varname, model_arch_table, model_arch_id, compile_params, - fit_params, num_iterations, use_gpu = True, + fit_params, num_iterations, gpus_per_host = 0, validation_table=None, name="", description="", **kwargs): source_table = quote_ident(source_table) @@ -66,7 +67,20 @@ def fit(schema_madlib, source_table, model, dependent_varname, dependent_varname, independent_varname, num_iterations) start_training_time = datetime.datetime.now() - use_gpu = bool(use_gpu) + + gpus_per_host = 0 if gpus_per_host is None else gpus_per_host + segments_per_host = get_segments_per_host() + + if 0 < gpus_per_host < segments_per_host: + plpy.warning('The number of gpus per host is less than the number of ' + 'segments per host. The support for this case is ' + 'experimental and it may fail.') + + #TODO add a unit test for this in a future PR + # save the original value of the env variable so that we can reset it later. + original_cuda_env = None + if CUDA_VISIBLE_DEVICES_KEY in os.environ: + original_cuda_env = os.environ[CUDA_VISIBLE_DEVICES_KEY] # Get the serialized master model start_deserialization = time.time() @@ -88,10 +102,11 @@ def fit(schema_madlib, source_table, model, dependent_varname, #TODO: Refactor the pg related logic in a future PR when we think # about making the fit function easier to read and maintain. if is_platform_pg(): - set_keras_session(use_gpu) + set_keras_session(gpus_per_host, segments_per_host) else: - # Disable GPU on master for gpdb - os.environ['CUDA_VISIBLE_DEVICES'] = '-1' + # we want to disable gpu on gpdb's master node because GPUs will only be used + # for segment nodes. 
+ set_cuda_env('-1') # Compute total images on each segment gp_segment_id_col,\ @@ -137,7 +152,8 @@ def fit(schema_madlib, source_table, model, dependent_varname, $MAD${model_arch}$MAD$::TEXT, {compile_params_to_pass}::TEXT, {fit_params_to_pass}::TEXT, - {use_gpu}, + {gpus_per_host}, + {segments_per_host}, $1 ) AS iteration_result FROM {source_table} @@ -176,7 +192,9 @@ def fit(schema_madlib, source_table, model, dependent_varname, independent_varname, compile_params_to_pass, model_arch, model_state, - use_gpu, seg_ids_val, + gpus_per_host, + segments_per_host, + seg_ids_val, rows_per_seg_val, gp_segment_id_col) end_val = time.time() @@ -286,6 +304,9 @@ def fit(schema_madlib, source_table, model, dependent_varname, if is_platform_pg(): clear_keras_session() + #TODO add a unit test for this in a future PR + reset_cuda_env(original_cuda_env) + def get_images_per_seg(source_table, dependent_varname): """ Compute total images in each segment, by querying source_table. For @@ -316,7 +337,7 @@ def get_images_per_seg(source_table, dependent_varname): for each_segment in total_images_per_seg] gp_segment_id_col = 'gp_segment_id' return gp_segment_id_col, seg_ids_train, total_images_per_seg - + def get_rows_per_seg_from_db(table_name): """ This function queries the given table and returns the total rows per segment. 
@@ -350,8 +371,8 @@ def get_rows_per_seg_from_db(table_name): def fit_transition(state, ind_var, dep_var, current_seg_id, num_classes, all_seg_ids, total_images_per_seg, architecture, - compile_params, fit_params, use_gpu, previous_state, - **kwargs): + compile_params, fit_params, gpus_per_host, segments_per_host, + previous_state, **kwargs): """ @@ -365,7 +386,7 @@ def fit_transition(state, ind_var, dep_var, current_seg_id, num_classes, :param architecture: :param compile_params: :param fit_params: - :param use_gpu: + :param gpus_per_host: :param previous_state: :param kwargs: :return: @@ -375,16 +396,15 @@ def fit_transition(state, ind_var, dep_var, current_seg_id, num_classes, start_transition = time.time() SD = kwargs['SD'] - # Configure GPUs/CPUs - device_name = get_device_name_and_set_cuda_env(use_gpu, current_seg_id) - + device_name = get_device_name_and_set_cuda_env(gpus_per_host, + current_seg_id) # Set up system if this is the first buffer on segment' - if not state: if not is_platform_pg(): - set_keras_session(use_gpu) + set_keras_session(gpus_per_host, segments_per_host) segment_model = model_from_json(architecture) SD['model_shapes'] = madlib_keras_serializer.get_model_shapes(segment_model) + # Configure GPUs/CPUs compile_and_set_weights(segment_model, compile_params, device_name, previous_state, SD['model_shapes']) SD['segment_model'] = segment_model @@ -524,7 +544,8 @@ def evaluate1(schema_madlib, model_table, test_table, id_col, model_arch_table, loss_acc = get_loss_acc_from_keras_eval(schema_madlib, test_table, dependent_varname, independent_varname, compile_params, model_arch, - model_data, False) + model_data, False, None) + #TODO remove these infos after adding create table command plpy.info('len of evaluate result is {}'.format(len(loss_acc))) plpy.info('evaluate result loss is {}'.format(loss_acc[0])) @@ -532,7 +553,8 @@ def evaluate1(schema_madlib, model_table, test_table, id_col, model_arch_table, def 
get_loss_acc_from_keras_eval(schema_madlib, table, dependent_varname, independent_varname, compile_params, model_arch, - model_data, use_gpu, seg_ids_val, + model_data, gpus_per_host, segments_per_host, + seg_ids_val, rows_per_seg_val, gp_segment_id_col): """ This function will call the internal keras evaluate function to get the loss @@ -545,10 +567,11 @@ def get_loss_acc_from_keras_eval(schema_madlib, table, dependent_varname, {independent_varname}, $MAD${model_arch}$MAD$, $1, {compile_params}, - {use_gpu}, - ARRAY{seg_ids_val}, + {gpus_per_host}, + {segments_per_host}, + ARRAY{seg_ids_val}, ARRAY{rows_per_seg_val}, - {gp_segment_id_col})) as loss_acc + {gp_segment_id_col})) as loss_acc from {table} ) q""".format(**locals()), ["bytea"]) res = plpy.execute(evaluate_query, [model_data]) @@ -557,14 +580,16 @@ def get_loss_acc_from_keras_eval(schema_madlib, table, dependent_varname, def internal_keras_evaluate(dependent_var, independent_var, model_architecture, - model_data, compile_params, use_gpu, seg_ids_val, + model_data, compile_params, gpus_per_host, + segments_per_host, seg_ids_val, rows_per_seg_val, current_seg, **kwargs): SD = kwargs['SD'] - device_name = get_device_name_and_set_cuda_env(use_gpu, current_seg) + device_name = get_device_name_and_set_cuda_env(gpus_per_host, + current_seg) if 'segment_model' not in SD: if not is_platform_pg(): - set_keras_session(use_gpu) + set_keras_session(gpus_per_host, segments_per_host) model = model_from_json(model_architecture) model_shapes = madlib_keras_serializer.get_model_shapes(model) _, _, _, model_weights = madlib_keras_serializer.deserialize_weights( diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in index 3c44205..a492d14 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in @@ -38,7 +38,7 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.madlib_keras_fit( compile_params VARCHAR, fit_params VARCHAR, num_iterations INTEGER, - use_gpu BOOLEAN, + gpus_per_host INTEGER, validation_table VARCHAR, name VARCHAR, description VARCHAR @@ -60,10 +60,11 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit( compile_params VARCHAR, fit_params VARCHAR, num_iterations INTEGER, - use_gpu BOOLEAN, + gpus_per_host INTEGER, validation_table VARCHAR ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, NULL, NULL); + SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, + $10, $11, NULL, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); @@ -77,7 +78,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit( compile_params VARCHAR, fit_params VARCHAR, num_iterations INTEGER, - use_gpu BOOLEAN + gpus_per_host INTEGER ) RETURNS VOID AS $$ SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, NULL, NULL, NULL); $$ LANGUAGE sql VOLATILE @@ -94,7 +95,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit( fit_params VARCHAR, num_iterations INTEGER ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, TRUE, NULL, NULL, NULL); + SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, 0, NULL, NULL, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); @@ -109,7 +110,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.fit_transition( architecture TEXT, compile_params TEXT, fit_params TEXT, - use_gpu BOOLEAN, + gpus_per_host INTEGER, + segments_per_host INTEGER, previous_state BYTEA ) RETURNS BYTEA AS $$ PythonFunctionBodyOnlyNoSchema(`deep_learning', `madlib_keras') @@ -146,7 +148,8 @@ DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.fit_step( TEXT, TEXT, TEXT, - BOOLEAN, + INTEGER, + INTEGER, BYTEA); CREATE AGGREGATE MADLIB_SCHEMA.fit_step( /* ind_var */ REAL[], @@ 
-158,7 +161,8 @@ CREATE AGGREGATE MADLIB_SCHEMA.fit_step( /* architecture */ TEXT, /* compile_params */ TEXT, /* fit_params */ TEXT, - /* use_gpu */ BOOLEAN, + /* gpus_per_host */ INTEGER, + /* segments_per_host */ INTEGER, /* previous_state */ BYTEA )( STYPE=BYTEA, @@ -174,7 +178,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict( independent_varname VARCHAR, output_table VARCHAR, pred_type VARCHAR, - use_gpu BOOLEAN + gpus_per_host INTEGER ) RETURNS VOID AS $$ PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict') with AOControl(False): @@ -185,7 +189,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict( independent_varname, output_table, pred_type, - use_gpu) + gpus_per_host) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -197,7 +201,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict( output_table VARCHAR, pred_type VARCHAR ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.madlib_keras_predict($1, $2, $3, $4, $5, $6, TRUE); + SELECT MADLIB_SCHEMA.madlib_keras_predict($1, $2, $3, $4, $5, $6, 0); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); @@ -208,7 +212,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict( independent_varname VARCHAR, output_table VARCHAR ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.madlib_keras_predict($1, $2, $3, $4, $5, NULL, TRUE); + SELECT MADLIB_SCHEMA.madlib_keras_predict($1, $2, $3, $4, $5, NULL, 0); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); @@ -219,7 +223,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_keras_predict( input_shape INTEGER[], is_response BOOLEAN, normalizing_const DOUBLE PRECISION, - use_gpu BOOLEAN, + gpus_per_host INTEGER, seg INTEGER ) RETURNS DOUBLE PRECISION[] AS $$ PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict') @@ -231,7 +235,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_keras_predict( 
input_shape, is_response, normalizing_const, - use_gpu, + gpus_per_host, seg) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -268,7 +272,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_keras_evaluate( model_architecture TEXT, model_data bytea, compile_params TEXT, - use_gpu BOOLEAN, + gpus_per_host INTEGER, + segments_per_host INTEGER, seg_ids_val INTEGER[], rows_per_seg_val INTEGER[], current_seg INTEGER diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in index 4e2a206..3ad3bbf 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in @@ -38,8 +38,6 @@ from utilities.utilities import add_postfix from utilities.utilities import create_cols_from_array_sql_string from utilities.utilities import is_platform_pg from utilities.utilities import unique_string -from utilities.validate_args import input_tbl_valid -from utilities.validate_args import output_tbl_valid import madlib_keras_serializer @@ -81,7 +79,7 @@ def _strip_trailing_nulls_from_class_values(class_values): return class_values def predict(schema_madlib, model_table, test_table, id_col, - independent_varname, output_table, pred_type, use_gpu, **kwargs): + independent_varname, output_table, pred_type, gpus_per_host, **kwargs): if not pred_type: pred_type = 'response' input_validator = PredictInputValidator( @@ -129,7 +127,7 @@ def predict(schema_madlib, model_table, test_table, id_col, ARRAY{input_shape}, {is_response}, {normalizing_const}, - {use_gpu}, + {gpus_per_host}, {segment_id}) ) AS {intermediate_col} FROM {test_table}, {model_table} @@ -137,9 +135,9 @@ def predict(schema_madlib, model_table, test_table, id_col, """.format(MODEL_DATA_COLNAME, **locals())) def internal_keras_predict(x_test, model_arch, model_data, input_shape, - is_response, 
normalizing_const, use_gpu, seg): + is_response, normalizing_const, gpus_per_host, seg): model = model_from_json(model_arch) - device_name = get_device_name_and_set_cuda_env(use_gpu, seg) + device_name = get_device_name_and_set_cuda_env(gpus_per_host, seg) model_shapes = madlib_keras_serializer.get_model_shapes(model) set_model_weights(model, device_name, model_data, model_shapes) # Since the test data isn't mini-batched, diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in index 6149411..4a11d18 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in @@ -20,6 +20,7 @@ import ast import os import plpy +from math import ceil # Do not remove `import keras` although it's not directly used in this file. # See madlib_keras.py_in for more details @@ -35,32 +36,61 @@ import madlib_keras_serializer from utilities.utilities import _assert from utilities.utilities import is_platform_pg +CUDA_VISIBLE_DEVICES_KEY = 'CUDA_VISIBLE_DEVICES' ####################################################################### ########### Keras specific functions ##### ####################################################################### -def get_device_name_and_set_cuda_env(use_gpu, seg): - gpus_per_host = 4 - if use_gpu: + +def set_cuda_env(value): + """ + :param value: -1 to disable gpu + :return: + """ + os.environ[CUDA_VISIBLE_DEVICES_KEY] = value + +def reset_cuda_env(value): + """ + This function will reset the cuda env variable. This should only be called + if set_cuda_env was called previously. 
+ :param value: + """ + if value: + set_cuda_env(value) + else: + del os.environ[CUDA_VISIBLE_DEVICES_KEY] + +def get_device_name_and_set_cuda_env(gpus_per_host, seg): + if gpus_per_host > 0: device_name = '/gpu:0' if is_platform_pg(): cuda_visible_dev = ','.join([i for i in range(gpus_per_host)]) else: cuda_visible_dev = str(seg % gpus_per_host) - os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_dev + set_cuda_env(cuda_visible_dev) else: # cpu only device_name = '/cpu:0' - os.environ["CUDA_VISIBLE_DEVICES"] = '-1' - + set_cuda_env('-1') return device_name -def set_keras_session(use_gpu): +def set_keras_session(gpus_per_host, segments_per_host): config = K.tf.ConfigProto() - if use_gpu: + if gpus_per_host > 0: + memory_fraction = get_gpu_memory_fraction(gpus_per_host, segments_per_host) config.gpu_options.allow_growth = False - config.gpu_options.per_process_gpu_memory_fraction = 0.9 + config.gpu_options.per_process_gpu_memory_fraction = memory_fraction session = K.tf.Session(config=config) K.set_session(session) +def get_gpu_memory_fraction(gpus_per_host, segments_per_host): + """ + We cap the gpu memory usage to 90% of the total available gpu memory. + This 90% is evenly distributed among the segments per gpu. 
+ :param gpus_per_host: + :param segments_per_host: + :return: + """ + return 0.9 / ceil(1.0 * segments_per_host / gpus_per_host) + def clear_keras_session(): sess = K.get_session() K.clear_session() diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in index 527d6e8..2421f5f 100644 --- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in +++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in @@ -112,7 +112,7 @@ SELECT madlib_keras_fit( $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy']$$::text, $$ batch_size=2, epochs=1, verbose=0 $$::text, 3, - FALSE, + NULL, 'cifar_10_sample_val'); SELECT assert( @@ -151,7 +151,7 @@ FROM (SELECT * FROM keras_saved_out_summary) summary; SELECT assert(model_data IS NOT NULL , 'Keras model output validation failed') FROM (SELECT * FROM keras_saved_out) k; --- Fit with use_gpu set to TRUE must error out on machines +-- Fit with gpus_per_host set to 2 must error out on machines -- that don't have GPUs. Since Jenkins builds are run on docker containers -- that don't have GPUs, these queries must error out. DROP TABLE IF EXISTS keras_saved_out_gpu, keras_saved_out_gpu_summary; @@ -165,18 +165,18 @@ SELECT assert(trap_error($TRAP$madlib_keras_fit( $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy']$$::text, $$ batch_size=2, epochs=1, verbose=0 $$::text, 3, - TRUE, + 2, 'cifar_10_sample_val');$TRAP$) = 1, - 'Fit with use_gpu=True must error out.'); + 'Fit with gpus_per_host=2 must error out.'); --- Prediction with use_gpu set to TRUE must error out on machines +-- Prediction with gpus_per_host set to 2 must error out on machines -- that don't have GPUs. Since Jenkins builds are run on docker containers -- that don't have GPUs, these queries must error out. 
-- IMPRORTANT: The following test must be run when we have a valid -- keras_saved_out model table. Otherwise, it will fail because of a -- non-existent model table, while we want to trap failure due to --- use_gpu=TRUE +-- gpus_per_host=2 DROP TABLE IF EXISTS cifar10_predict_gpu; SELECT assert(trap_error($TRAP$madlib_keras_predict( 'keras_saved_out', @@ -185,8 +185,8 @@ SELECT assert(trap_error($TRAP$madlib_keras_predict( 'x', 'cifar10_predict_gpu', NULL, - TRUE);$TRAP$) = 1, - 'Prediction with use_gpu=TRUE must error out.'); + 2);$TRAP$) = 1, + 'Prediction with gpus_per_host=2 must error out.'); -- Test for -- Non null name and description columns @@ -202,7 +202,7 @@ SELECT madlib_keras_fit( $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy']$$::text, $$ batch_size=2, epochs=1, verbose=0 $$::text, 2, - FALSE, + NULL, NULL, 'model name', 'model desc'); SELECT assert( @@ -248,7 +248,7 @@ SELECT madlib_keras_predict( 'x', 'cifar10_predict', NULL, - FALSE); + 0); -- Validate that prediction output table exists and has correct schema SELECT assert(UPPER(atttypid::regtype::TEXT) = 'INTEGER', 'id column should be INTEGER type') @@ -276,7 +276,7 @@ SELECT assert(trap_error($TRAP$madlib_keras_predict( 'x', 'cifar10_predict', NULL, - FALSE);$TRAP$) = 1, + 0);$TRAP$) = 1, 'Passing batched image table to predict should error out.'); -- Compile and fit parameter tests @@ -291,7 +291,7 @@ SELECT madlib_keras_fit( $$ optimizer='SGD', loss=losses.categorical_crossentropy, metrics=['accuracy']$$::text, $$ batch_size=2, epochs=1, verbose=0 $$::text, 1, - FALSE, + NULL, NULL, 'model name', 'model desc'); @@ -306,7 +306,7 @@ SELECT madlib_keras_fit( $$ optimizer='Adam()', loss=losses.categorical_crossentropy, metrics=['accuracy']$$::text, $$ batch_size=2, epochs=1, verbose=0 $$::text, 1, - FALSE, + NULL, NULL, 'model name', 'model desc'); @@ -322,7 +322,7 @@ SELECT madlib_keras_fit( $$ batch_size=2, epochs=1, verbose=0 $$::text, 
1, - FALSE, + 0, NULL, 'model name', 'model desc'); @@ -337,7 +337,7 @@ SELECT madlib_keras_fit( $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), metrics=['accuracy'], loss_weights=[2], sample_weight_mode=None, loss='categorical_crossentropy' $$::text, $$ epochs=10, verbose=0, shuffle=True, initial_epoch=1, steps_per_epoch=2 $$::text, 1, - FALSE, + NULL, NULL, 'model name', 'model desc'); @@ -357,7 +357,7 @@ select assert(trap_error($TRAP$madlib_keras_fit( $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy']$$::text, $$ batch_size=2, epochs=1, verbose=0 $$::text, 2, - FALSE, + NULL, 'cifar_10_sample_val_failure');$TRAP$) = 1, 'Passing y of type non numeric array to fit should error out.'); @@ -370,7 +370,7 @@ SELECT madlib_keras_predict( 'x', 'cifar10_predict', 'prob', - FALSE); + 0); SELECT assert(UPPER(atttypid::regtype::TEXT) = 'DOUBLE PRECISION', 'column prob_0 should be double precision type') @@ -437,8 +437,7 @@ SELECT madlib_keras_fit( 1, $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy']$$::text, $$ batch_size=2, epochs=1, verbose=0 $$::text, - 3, - FALSE); + 3); -- Assert fit has correct class_values SELECT assert( dependent_vartype = 'text' AND @@ -455,7 +454,7 @@ SELECT madlib_keras_predict( 'x', 'cifar10_predict', 'prob', - FALSE); + 0); -- Validate the output datatype of newly created prediction columns -- for prediction type = 'prob' and class_values 'TEXT' with NULL as a valid @@ -489,7 +488,7 @@ SELECT madlib_keras_predict( 'x', 'cifar10_predict', 'response', - FALSE); + 0); -- Validate the output datatype of newly created prediction columns -- for prediction type = 'response' and class_values 'TEXT' with NULL @@ -513,7 +512,7 @@ SELECT madlib_keras_predict( 'x', 'cifar10_predict', 'prob', - FALSE); + 0); -- Validate the output datatype of newly created prediction column -- for prediction type = 'response' and class_value = NULL @@ 
-532,7 +531,7 @@ SELECT madlib_keras_predict( 'x', 'cifar10_predict', 'response', - FALSE); + 0); -- Validate the output datatype of newly created prediction column -- for prediction type = 'response' and class_value = NULL @@ -570,8 +569,7 @@ SELECT madlib_keras_fit( 1, $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy']$$::text, $$ batch_size=2, epochs=1, verbose=0 $$::text, - 3, - FALSE); + 3); -- Assert fit has correct class_values SELECT assert( @@ -589,7 +587,7 @@ SELECT madlib_keras_predict( 'x', 'cifar10_predict', 'prob', - FALSE); + 0); -- Validate the output datatype of newly created prediction column -- for prediction type = 'prob' and class_values 'INT' with NULL @@ -613,7 +611,7 @@ SELECT madlib_keras_predict( 'x', 'cifar10_predict', 'response', - FALSE); + 0); -- Validate the output datatype of newly created prediction column -- for prediction type = 'response' and class_values 'TEXT' with NULL @@ -669,8 +667,7 @@ SELECT madlib_keras_fit( 1, $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy']$$::text, $$ batch_size=2, epochs=1, verbose=0 $$::text, - 3, - FALSE); + 3); -- Predict with correctly shaped data, must go thru. DROP TABLE IF EXISTS cifar10_predict; @@ -681,7 +678,7 @@ SELECT madlib_keras_predict( 'x', 'cifar10_predict', 'prob', - FALSE); + 0); -- Prediction with incorrectly shaped data must error out. DROP TABLE IF EXISTS cifar10_predict; @@ -692,5 +689,5 @@ SELECT assert(trap_error($TRAP$madlib_keras_predict( 'x', 'cifar10_predict', 'prob', - FALSE);$TRAP$) = 1, + 0);$TRAP$) = 1, 'Input shape is (32, 32, 3) but model was trained with (3, 32, 32). 
Should have failed.'); diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in index 912f266..e315a31 100644 --- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in +++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in @@ -19,6 +19,7 @@ import sys import numpy as np +import os from os import path # Add convex module to the pythonpath. sys.path.append(path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__)))))) @@ -95,7 +96,7 @@ class MadlibKerasFitTestCase(unittest.TestCase): new_model_state = self.subject.fit_transition( None, self.independent_var , self.dependent_var, 0, 2, self.all_seg_ids, self.total_images_per_seg, - self.model.to_json(), self.compile_params, self.fit_params, False, + self.model.to_json(), self.compile_params, self.fit_params, 0, 4, previous_state.tostring(), **k) state = np.fromstring(new_model_state, dtype=np.float32) image_count = state[2] @@ -128,7 +129,7 @@ class MadlibKerasFitTestCase(unittest.TestCase): new_model_state = self.subject.fit_transition( None, self.independent_var , self.dependent_var, 0, 2, self.all_seg_ids, self.total_images_per_seg, - self.model.to_json(), self.compile_params, self.fit_params, False, + self.model.to_json(), self.compile_params, self.fit_params, 0, 4, previous_state.tostring(), **k) state = np.fromstring(new_model_state, dtype=np.float32) image_count = state[2] @@ -163,7 +164,7 @@ class MadlibKerasFitTestCase(unittest.TestCase): k['SD']['segment_model'] = self.model new_model_state = self.subject.fit_transition( state.tostring(), self.independent_var, self.dependent_var, 0, 2, self.all_seg_ids, self.total_images_per_seg, - self.model.to_json(), None, self.fit_params, False, 'dummy_previous_state', **k) + self.model.to_json(), None, self.fit_params, 0, 4, 'dummy_previous_state', **k) state = 
np.fromstring(new_model_state, dtype=np.float32) image_count = state[2] @@ -199,7 +200,7 @@ class MadlibKerasFitTestCase(unittest.TestCase): k['SD']['segment_model'] = self.model new_model_state = self.subject.fit_transition( state.tostring(), self.independent_var , self.dependent_var, 0, 2, self.all_seg_ids, self.total_images_per_seg, - self.model.to_json(), None, self.fit_params, False, 'dummy_previous_state', **k) + self.model.to_json(), None, self.fit_params, 0, 4, 'dummy_previous_state', **k) state = np.fromstring(new_model_state, dtype=np.float32) image_count = state[2] @@ -236,7 +237,7 @@ class MadlibKerasFitTestCase(unittest.TestCase): k['SD']['segment_model'] = self.model new_model_state = self.subject.fit_transition( state.tostring(), self.independent_var , self.dependent_var, 0, 2, self.all_seg_ids, self.total_images_per_seg, - self.model.to_json(), None, self.fit_params, False, 'dummy_previous_state', **k) + self.model.to_json(), None, self.fit_params, 0, 4, 'dummy_previous_state', **k) state = np.fromstring(new_model_state, dtype=np.float32) image_count = state[2] @@ -262,11 +263,13 @@ class MadlibKerasFitTestCase(unittest.TestCase): total_images_per_seg = [0,1,1] - with self.assertRaises(plpy.PLPYException): + with self.assertRaises(plpy.PLPYException) as error: new_model_state = self.subject.fit_transition( - None, self.independent_var , self.dependent_var, 0, 2, self.all_seg_ids, total_images_per_seg, - self.model.to_json(), self.compile_params, self.fit_params, False, - previous_state.tostring(), **k) + None, self.independent_var , self.dependent_var, 0, 2, + self.all_seg_ids, total_images_per_seg, + self.model.to_json(), self.compile_params, self.fit_params, + 0, 4, previous_state.tostring(), **k) + self.assertIn('0 rows', str(error.exception)) def test_fit_transition_too_many_images(self): self.subject.K.set_session = Mock() @@ -280,12 +283,28 @@ class MadlibKerasFitTestCase(unittest.TestCase): total_images_per_seg = [1,1,1] - with 
self.assertRaises(plpy.PLPYException): + with self.assertRaises(plpy.PLPYException) as error: new_model_state = self.subject.fit_transition( None, self.independent_var , self.dependent_var, 0, 2, self.all_seg_ids, total_images_per_seg, - self.model.to_json(), self.compile_params, self.fit_params, False, + self.model.to_json(), self.compile_params, self.fit_params, 0, 4, previous_state.tostring(), **k) + self.assertIn('only 1', str(error.exception)) + + def test_fit_transition_first_tuple_none_ind_var_dep_var(self): + k = {} + self.assertEqual('dummy_state', + self.subject.fit_transition('dummy_state', None , [0], 1, 2, + [0,1,2], [3,3,3], 'dummy_model_json', "foo", "bar", 0, 4, + 'dummy_prev_state', **k)) + self.assertEqual('dummy_state', + self.subject.fit_transition('dummy_state', [[0.5]], None, 1, 2, + [0,1,2], [3,3,3], 'dummy_model_json', "foo", "bar", 0, 4, + 'dummy_prev_state', **k)) + self.assertEqual('dummy_state', + self.subject.fit_transition('dummy_state', None, None, 1, 2, + [0,1,2], [3,3,3], 'dummy_model_json', "foo", "bar", 0, 4, + 'dummy_prev_state', **k)) def test_fit_merge(self): image_count = self.total_images_per_seg[0] @@ -367,14 +386,33 @@ class MadlibKerasFitTestCase(unittest.TestCase): result = self.subject.fit_final(None) self.assertEqual(result, None) - def test_get_device_name_and_set_cuda_env(self): - import os + def test_get_device_name_and_set_cuda_env_postgres(self): + self.subject.is_platform_pg = Mock(return_value = True) + + seg_id = -1 + gpus_per_host = 3 self.assertEqual('/gpu:0', self.subject.get_device_name_and_set_cuda_env( - True, 1)) - self.assertEqual('1', os.environ["CUDA_VISIBLE_DEVICES"]) + gpus_per_host, seg_id )) + self.assertEqual('0,1,2', os.environ['CUDA_VISIBLE_DEVICES']) + + gpus_per_host = 0 self.assertEqual('/cpu:0', self.subject.get_device_name_and_set_cuda_env( - False, 1)) - self.assertEqual('-1', os.environ["CUDA_VISIBLE_DEVICES"]) + gpus_per_host, seg_id )) + self.assertEqual('-1', 
os.environ['CUDA_VISIBLE_DEVICES']) + + def test_get_device_name_and_set_cuda_env_gpdb(self): + self.subject.is_platform_pg = Mock(return_value = False) + + seg_id=3 + gpus_per_host=2 + self.assertEqual('/gpu:0', self.subject.get_device_name_and_set_cuda_env( + gpus_per_host, seg_id)) + self.assertEqual('1', os.environ['CUDA_VISIBLE_DEVICES']) + + gpus_per_host=0 + self.assertEqual('/cpu:0', self.subject.get_device_name_and_set_cuda_env( + gpus_per_host, seg_id)) + self.assertEqual('-1', os.environ['CUDA_VISIBLE_DEVICES']) def test_fit_transition_first_tuple_none_ind_var_dep_var(self): k = {} @@ -443,7 +481,30 @@ class MadlibKerasFitTestCase(unittest.TestCase): accepted_fit_params) self.assertDictEqual(result_params, target_dict) -## Negative Tests + def test_get_gpu_memory_fraction(self): + + gpus_per_host = 4 + segments_per_host = 4 + result = self.subject.get_gpu_memory_fraction(gpus_per_host, segments_per_host) + self.assertEqual(result, 0.9) + + gpus_per_host = 10 + segments_per_host = 4 + result = self.subject.get_gpu_memory_fraction(gpus_per_host, segments_per_host) + self.assertEqual(result, 0.9) + + gpus_per_host = 2 + segments_per_host = 6 + result = self.subject.get_gpu_memory_fraction(gpus_per_host, segments_per_host) + self.assertEqual(result, 0.3) + + gpus_per_host = 1 + segments_per_host = 4 + result = self.subject.get_gpu_memory_fraction(gpus_per_host, segments_per_host) + self.assertEqual(result, 0.225) + + + ## Negative Tests def test_parse_and_validate_compile_params_dict_metrics_fail(self): test_str = "optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics={'0':'accuracy'}" diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in index e57f407..e6e31c5 100644 --- a/src/ports/postgres/modules/utilities/utilities.py_in +++ b/src/ports/postgres/modules/utilities/utilities.py_in @@ -47,6 +47,24 @@ def get_seg_number(): return max(1, count) # 
------------------------------------------------------------------------------ +def get_segments_per_host(): + """ Find out how many primary segments(not include master segment) exist + per host. We assume every host has the same number of segments and + we only return the first one. + """ + if is_platform_pg(): + return 1 + else: + count = plpy.execute(""" + SELECT count(*) from gp_segment_configuration + WHERE role = 'p' and content != -1 + GROUP BY hostname + LIMIT 1 + """)[0]['count'] + # in case some weird gpdb configuration happens, always returns + # primary segment number >= 1 + return max(1, count) +# ------------------------------------------------------------------------------ def is_orca(): if has_function_properties():