[madlib] 01/02: DL: Replace use_gpu flag with gpus_per_host integer

nkak Mon, 06 May 2019 10:56:44 -0700

This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


commit 65ce6020075893af96ae8b18e8412e5517ce0e1c
Author: Orhan Kislal <okis...@pivotal.io>
AuthorDate: Wed Apr 24 14:14:14 2019 -0700

    DL: Replace use_gpu flag with gpus_per_host integer
    
    JIRA: MADLIB-1308
    
    Previously, gpus_per_host was hard-coded to 4. This commit removes the
    hard-coded value and takes it from the user instead.
    
    We also tried using the TensorFlow function `list_local_devices` to get
    the number of GPUs per host. It did return the count, but it would hang
    forever on some segments, so we decided not to use this function.
    
    We now save the original value of the CUDA_VISIBLE_DEVICES env variable
    (which is set to -1 for master) and restore it at the end of the fit
    function.
    
    Finally, we dynamically calculate the GPU memory fraction to support
    the case where the number of GPUs is less than the number of segments.
    
    Co-authored-by: Nikhil Kak <n...@pivotal.io>
---
 .../modules/deep_learning/madlib_keras.py_in       | 73 ++++++++++------
 .../modules/deep_learning/madlib_keras.sql_in      | 35 ++++----
 .../deep_learning/madlib_keras_predict.py_in       | 10 +--
 .../deep_learning/madlib_keras_wrapper.py_in       | 48 +++++++++--
 .../modules/deep_learning/test/madlib_keras.sql_in | 59 +++++++------
 .../test/unit_tests/test_madlib_keras.py_in        | 97 ++++++++++++++++++----
 .../postgres/modules/utilities/utilities.py_in     | 18 ++++
 7 files changed, 237 insertions(+), 103 deletions(-)

diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index 82d1069..8b2a747 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -47,13 +47,14 @@ from keras_model_arch_table import Format
 from utilities.model_arch_info import get_input_shape
 from utilities.model_arch_info import get_num_classes
 from utilities.utilities import is_platform_pg
+from utilities.utilities import get_segments_per_host
 from utilities.utilities import madlib_version
 from utilities.validate_args import get_col_value_and_type
 from utilities.validate_args import quote_ident
 
 def fit(schema_madlib, source_table, model, dependent_varname,
         independent_varname, model_arch_table, model_arch_id, compile_params,
-        fit_params, num_iterations, use_gpu = True,
+        fit_params, num_iterations, gpus_per_host = 0,
         validation_table=None, name="", description="", **kwargs):
 
     source_table = quote_ident(source_table)
@@ -66,7 +67,20 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
         dependent_varname, independent_varname, num_iterations)
 
     start_training_time = datetime.datetime.now()
-    use_gpu = bool(use_gpu)
+
+    gpus_per_host = 0 if gpus_per_host is None else gpus_per_host
+    segments_per_host = get_segments_per_host()
+
+    if 0 < gpus_per_host < segments_per_host:
+        plpy.warning('The number of gpus per host is less than the number of '
+                     'segments per host. The support for this case is '
+                     'experimental and it may fail.')
+
+    #TODO add a unit test for this in a future PR
+    # save the original value of the env variable so that we can reset it 
later.
+    original_cuda_env = None
+    if CUDA_VISIBLE_DEVICES_KEY in os.environ:
+        original_cuda_env = os.environ[CUDA_VISIBLE_DEVICES_KEY]
 
     # Get the serialized master model
     start_deserialization = time.time()
@@ -88,10 +102,11 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
     #TODO: Refactor the pg related logic in a future PR when we think
     # about making the fit function easier to read and maintain.
     if is_platform_pg():
-        set_keras_session(use_gpu)
+        set_keras_session(gpus_per_host, segments_per_host)
     else:
-        # Disable GPU on master for gpdb
-        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
+        # we want to disable gpu on gpdb's master node because GPUs will only 
be used
+        # for segment nodes.
+        set_cuda_env('-1')
 
     # Compute total images on each segment
     gp_segment_id_col,\
@@ -137,7 +152,8 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
             $MAD${model_arch}$MAD$::TEXT,
             {compile_params_to_pass}::TEXT,
             {fit_params_to_pass}::TEXT,
-            {use_gpu},
+            {gpus_per_host},
+            {segments_per_host},
             $1
         ) AS iteration_result
         FROM {source_table}
@@ -176,7 +192,9 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
                                                            independent_varname,
                                                            
compile_params_to_pass,
                                                            model_arch, 
model_state,
-                                                           use_gpu, 
seg_ids_val,
+                                                           gpus_per_host,
+                                                           segments_per_host,
+                                                           seg_ids_val,
                                                            rows_per_seg_val,
                                                            gp_segment_id_col)
             end_val = time.time()
@@ -286,6 +304,9 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
     if is_platform_pg():
         clear_keras_session()
 
+    #TODO add a unit test for this in a future PR
+    reset_cuda_env(original_cuda_env)
+
 def get_images_per_seg(source_table, dependent_varname):
     """
     Compute total images in each segment, by querying source_table.  For
@@ -316,7 +337,7 @@ def get_images_per_seg(source_table, dependent_varname):
                    for each_segment in total_images_per_seg]
         gp_segment_id_col = 'gp_segment_id'
     return gp_segment_id_col, seg_ids_train, total_images_per_seg
- 
+
 def get_rows_per_seg_from_db(table_name):
     """
     This function queries the given table and returns the total rows per 
segment.
@@ -350,8 +371,8 @@ def get_rows_per_seg_from_db(table_name):
 
 def fit_transition(state, ind_var, dep_var, current_seg_id, num_classes,
                    all_seg_ids, total_images_per_seg, architecture,
-                   compile_params, fit_params, use_gpu, previous_state,
-                   **kwargs):
+                   compile_params, fit_params, gpus_per_host, 
segments_per_host,
+                   previous_state, **kwargs):
 
     """
 
@@ -365,7 +386,7 @@ def fit_transition(state, ind_var, dep_var, current_seg_id, 
num_classes,
     :param architecture:
     :param compile_params:
     :param fit_params:
-    :param use_gpu:
+    :param gpus_per_host:
     :param previous_state:
     :param kwargs:
     :return:
@@ -375,16 +396,15 @@ def fit_transition(state, ind_var, dep_var, 
current_seg_id, num_classes,
 
     start_transition = time.time()
     SD = kwargs['SD']
-    # Configure GPUs/CPUs
-    device_name = get_device_name_and_set_cuda_env(use_gpu, current_seg_id)
-
+    device_name = get_device_name_and_set_cuda_env(gpus_per_host,
+                                                   current_seg_id)
     # Set up system if this is the first buffer on segment'
-
     if not state:
         if not is_platform_pg():
-            set_keras_session(use_gpu)
+            set_keras_session(gpus_per_host, segments_per_host)
         segment_model = model_from_json(architecture)
         SD['model_shapes'] = 
madlib_keras_serializer.get_model_shapes(segment_model)
+        # Configure GPUs/CPUs
         compile_and_set_weights(segment_model, compile_params, device_name,
                                 previous_state, SD['model_shapes'])
         SD['segment_model'] = segment_model
@@ -524,7 +544,8 @@ def evaluate1(schema_madlib, model_table, test_table, 
id_col, model_arch_table,
 
     loss_acc = get_loss_acc_from_keras_eval(schema_madlib, test_table, 
dependent_varname,
                                             independent_varname, 
compile_params, model_arch,
-                                            model_data, False)
+                                            model_data, False, None)
+
     #TODO remove these infos after adding create table command
     plpy.info('len of evaluate result is {}'.format(len(loss_acc)))
     plpy.info('evaluate result loss is {}'.format(loss_acc[0]))
@@ -532,7 +553,8 @@ def evaluate1(schema_madlib, model_table, test_table, 
id_col, model_arch_table,
 
 def get_loss_acc_from_keras_eval(schema_madlib, table, dependent_varname,
                                  independent_varname, compile_params, 
model_arch,
-                                 model_data, use_gpu, seg_ids_val,
+                                 model_data, gpus_per_host, segments_per_host,
+                                 seg_ids_val,
                                  rows_per_seg_val, gp_segment_id_col):
     """
     This function will call the internal keras evaluate function to get the 
loss
@@ -545,10 +567,11 @@ def get_loss_acc_from_keras_eval(schema_madlib, table, 
dependent_varname,
                                             {independent_varname},
                                             $MAD${model_arch}$MAD$,
                                             $1, {compile_params},
-                                            {use_gpu}, 
-                                            ARRAY{seg_ids_val}, 
+                                            {gpus_per_host},
+                                            {segments_per_host},
+                                            ARRAY{seg_ids_val},
                                             ARRAY{rows_per_seg_val},
-                                            {gp_segment_id_col})) as loss_acc 
+                                            {gp_segment_id_col})) as loss_acc
         from {table}
     ) q""".format(**locals()), ["bytea"])
     res = plpy.execute(evaluate_query, [model_data])
@@ -557,14 +580,16 @@ def get_loss_acc_from_keras_eval(schema_madlib, table, 
dependent_varname,
 
 
 def internal_keras_evaluate(dependent_var, independent_var, model_architecture,
-                            model_data, compile_params, use_gpu, seg_ids_val,
+                            model_data, compile_params, gpus_per_host,
+                            segments_per_host, seg_ids_val,
                             rows_per_seg_val, current_seg, **kwargs):
     SD = kwargs['SD']
-    device_name = get_device_name_and_set_cuda_env(use_gpu, current_seg)
+    device_name = get_device_name_and_set_cuda_env(gpus_per_host,
+                                                   current_seg)
 
     if 'segment_model' not in SD:
         if not is_platform_pg():
-            set_keras_session(use_gpu)
+            set_keras_session(gpus_per_host, segments_per_host)
         model = model_from_json(model_architecture)
         model_shapes = madlib_keras_serializer.get_model_shapes(model)
         _, _, _, model_weights = madlib_keras_serializer.deserialize_weights(
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
index 3c44205..a492d14 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
@@ -38,7 +38,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     compile_params          VARCHAR,
     fit_params              VARCHAR,
     num_iterations          INTEGER,
-    use_gpu                 BOOLEAN,
+    gpus_per_host           INTEGER,
     validation_table        VARCHAR,
     name                    VARCHAR,
     description             VARCHAR
@@ -60,10 +60,11 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     compile_params          VARCHAR,
     fit_params              VARCHAR,
     num_iterations          INTEGER,
-    use_gpu                 BOOLEAN,
+    gpus_per_host           INTEGER,
     validation_table        VARCHAR
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, 
$10, $11, NULL, NULL);
+    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9,
+                                          $10, $11, NULL, NULL);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
 
@@ -77,7 +78,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     compile_params          VARCHAR,
     fit_params              VARCHAR,
     num_iterations          INTEGER,
-    use_gpu                 BOOLEAN
+    gpus_per_host           INTEGER
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, 
$10, NULL, NULL, NULL);
 $$ LANGUAGE sql VOLATILE
@@ -94,7 +95,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     fit_params              VARCHAR,
     num_iterations          INTEGER
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, 
TRUE, NULL, NULL, NULL);
+    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, 
0, NULL, NULL, NULL);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
 
@@ -109,7 +110,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.fit_transition(
     architecture               TEXT,
     compile_params             TEXT,
     fit_params                 TEXT,
-    use_gpu                    BOOLEAN,
+    gpus_per_host              INTEGER,
+    segments_per_host          INTEGER,
     previous_state             BYTEA
 ) RETURNS BYTEA AS $$
 PythonFunctionBodyOnlyNoSchema(`deep_learning', `madlib_keras')
@@ -146,7 +148,8 @@ DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.fit_step(
   TEXT,
   TEXT,
   TEXT,
-  BOOLEAN,
+  INTEGER,
+  INTEGER,
   BYTEA);
 CREATE AGGREGATE MADLIB_SCHEMA.fit_step(
     /* ind_var */                REAL[],
@@ -158,7 +161,8 @@ CREATE AGGREGATE MADLIB_SCHEMA.fit_step(
     /* architecture */           TEXT,
     /* compile_params */         TEXT,
     /* fit_params */             TEXT,
-    /* use_gpu */                BOOLEAN,
+    /* gpus_per_host  */         INTEGER,
+    /* segments_per_host  */     INTEGER,
     /* previous_state */         BYTEA
 )(
     STYPE=BYTEA,
@@ -174,7 +178,7 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.madlib_keras_predict(
     independent_varname     VARCHAR,
     output_table            VARCHAR,
     pred_type               VARCHAR,
-    use_gpu                 BOOLEAN
+    gpus_per_host           INTEGER
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
     with AOControl(False):
@@ -185,7 +189,7 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.madlib_keras_predict(
                independent_varname,
                output_table,
                pred_type,
-               use_gpu)
+               gpus_per_host)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -197,7 +201,7 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.madlib_keras_predict(
     output_table            VARCHAR,
     pred_type               VARCHAR
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_predict($1, $2, $3, $4, $5, $6, TRUE);
+    SELECT MADLIB_SCHEMA.madlib_keras_predict($1, $2, $3, $4, $5, $6, 0);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
 
@@ -208,7 +212,7 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.madlib_keras_predict(
     independent_varname     VARCHAR,
     output_table            VARCHAR
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_predict($1, $2, $3, $4, $5, NULL, TRUE);
+    SELECT MADLIB_SCHEMA.madlib_keras_predict($1, $2, $3, $4, $5, NULL, 0);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
 
@@ -219,7 +223,7 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.internal_keras_predict(
    input_shape        INTEGER[],
    is_response        BOOLEAN,
    normalizing_const  DOUBLE PRECISION,
-   use_gpu            BOOLEAN,
+   gpus_per_host      INTEGER,
    seg                INTEGER
 ) RETURNS DOUBLE PRECISION[] AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
@@ -231,7 +235,7 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.internal_keras_predict(
                input_shape,
                is_response,
                normalizing_const,
-               use_gpu,
+               gpus_per_host,
                seg)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
@@ -268,7 +272,8 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.internal_keras_evaluate(
    model_architecture TEXT,
    model_data bytea,
    compile_params TEXT,
-   use_gpu BOOLEAN,
+   gpus_per_host INTEGER,
+   segments_per_host INTEGER,
    seg_ids_val INTEGER[],
    rows_per_seg_val INTEGER[],
    current_seg INTEGER
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
index 4e2a206..3ad3bbf 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
@@ -38,8 +38,6 @@ from utilities.utilities import add_postfix
 from utilities.utilities import create_cols_from_array_sql_string
 from utilities.utilities import is_platform_pg
 from utilities.utilities import unique_string
-from utilities.validate_args import input_tbl_valid
-from utilities.validate_args import output_tbl_valid
 
 import madlib_keras_serializer
 
@@ -81,7 +79,7 @@ def _strip_trailing_nulls_from_class_values(class_values):
     return class_values
 
 def predict(schema_madlib, model_table, test_table, id_col,
-            independent_varname, output_table, pred_type, use_gpu, **kwargs):
+            independent_varname, output_table, pred_type, gpus_per_host, 
**kwargs):
     if not pred_type:
         pred_type = 'response'
     input_validator = PredictInputValidator(
@@ -129,7 +127,7 @@ def predict(schema_madlib, model_table, test_table, id_col,
                         ARRAY{input_shape},
                         {is_response},
                         {normalizing_const},
-                        {use_gpu},
+                        {gpus_per_host},
                         {segment_id})
                    ) AS {intermediate_col}
         FROM {test_table}, {model_table}
@@ -137,9 +135,9 @@ def predict(schema_madlib, model_table, test_table, id_col,
         """.format(MODEL_DATA_COLNAME, **locals()))
 
 def internal_keras_predict(x_test, model_arch, model_data, input_shape,
-                           is_response, normalizing_const, use_gpu, seg):
+                           is_response, normalizing_const, gpus_per_host, seg):
     model = model_from_json(model_arch)
-    device_name = get_device_name_and_set_cuda_env(use_gpu, seg)
+    device_name = get_device_name_and_set_cuda_env(gpus_per_host, seg)
     model_shapes = madlib_keras_serializer.get_model_shapes(model)
     set_model_weights(model, device_name, model_data, model_shapes)
     # Since the test data isn't mini-batched,
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in
index 6149411..4a11d18 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in
@@ -20,6 +20,7 @@
 import ast
 import os
 import plpy
+from math import ceil
 
 # Do not remove `import keras` although it's not directly used in this file.
 # See madlib_keras.py_in for more details
@@ -35,32 +36,61 @@ import madlib_keras_serializer
 from utilities.utilities import _assert
 from utilities.utilities import is_platform_pg
 
+CUDA_VISIBLE_DEVICES_KEY = 'CUDA_VISIBLE_DEVICES'
 #######################################################################
 ########### Keras specific functions #####
 #######################################################################
-def get_device_name_and_set_cuda_env(use_gpu, seg):
-    gpus_per_host = 4
-    if use_gpu:
+
+def set_cuda_env(value):
+    """
+    :param value: -1 to disable gpu
+    :return:
+    """
+    os.environ[CUDA_VISIBLE_DEVICES_KEY] = value
+
+def reset_cuda_env(value):
+    """
+    This function will reset the cuda env variable. This should only be called
+    if set_cuda_env was called previously.
+    :param value:
+    """
+    if value:
+        set_cuda_env(value)
+    else:
+        del os.environ[CUDA_VISIBLE_DEVICES_KEY]
+
+def get_device_name_and_set_cuda_env(gpus_per_host, seg):
+    if gpus_per_host > 0:
         device_name = '/gpu:0'
         if is_platform_pg():
             cuda_visible_dev = ','.join([i for i in range(gpus_per_host)])
         else:
             cuda_visible_dev = str(seg % gpus_per_host)
-        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_dev
+        set_cuda_env(cuda_visible_dev)
     else: # cpu only
         device_name = '/cpu:0'
-        os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
-
+        set_cuda_env('-1')
     return device_name
 
-def set_keras_session(use_gpu):
+def set_keras_session(gpus_per_host, segments_per_host):
     config = K.tf.ConfigProto()
-    if use_gpu:
+    if gpus_per_host > 0:
+        memory_fraction = get_gpu_memory_fraction(gpus_per_host, 
segments_per_host)
         config.gpu_options.allow_growth = False
-        config.gpu_options.per_process_gpu_memory_fraction = 0.9
+        config.gpu_options.per_process_gpu_memory_fraction = memory_fraction
     session = K.tf.Session(config=config)
     K.set_session(session)
 
+def get_gpu_memory_fraction(gpus_per_host, segments_per_host):
+    """
+    We cap the gpu memory usage to 90% of the total available gpu memory.
+    This 90% is evenly distributed among the segments per gpu.
+    :param gpus_per_host:
+    :param segments_per_host:
+    :return:
+    """
+    return 0.9 / ceil(1.0 * segments_per_host / gpus_per_host)
+
 def clear_keras_session():
     sess = K.get_session()
     K.clear_session()
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
index 527d6e8..2421f5f 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
@@ -112,7 +112,7 @@ SELECT madlib_keras_fit(
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
     $$ batch_size=2, epochs=1, verbose=0 $$::text,
     3,
-    FALSE,
+    NULL,
     'cifar_10_sample_val');
 
 SELECT assert(
@@ -151,7 +151,7 @@ FROM (SELECT * FROM keras_saved_out_summary) summary;
 
 SELECT assert(model_data IS NOT NULL , 'Keras model output validation failed') 
FROM (SELECT * FROM keras_saved_out) k;
 
--- Fit with use_gpu set to TRUE must error out on machines
+-- Fit with gpus_per_host set to 2 must error out on machines
 -- that don't have GPUs. Since Jenkins builds are run on docker containers
 -- that don't have GPUs, these queries must error out.
 DROP TABLE IF EXISTS keras_saved_out_gpu, keras_saved_out_gpu_summary;
@@ -165,18 +165,18 @@ SELECT assert(trap_error($TRAP$madlib_keras_fit(
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
     $$ batch_size=2, epochs=1, verbose=0 $$::text,
     3,
-    TRUE,
+    2,
     'cifar_10_sample_val');$TRAP$) = 1,
-       'Fit with use_gpu=True must error out.');
+       'Fit with gpus_per_host=2 must error out.');
 
--- Prediction with use_gpu set to TRUE must error out on machines
+-- Prediction with gpus_per_host set to 2 must error out on machines
 -- that don't have GPUs. Since Jenkins builds are run on docker containers
 -- that don't have GPUs, these queries must error out.
 
 -- IMPRORTANT: The following test must be run when we have a valid
 -- keras_saved_out model table. Otherwise, it will fail because of a
 -- non-existent model table, while we want to trap failure due to
--- use_gpu=TRUE
+-- gpus_per_host=2
 DROP TABLE IF EXISTS cifar10_predict_gpu;
 SELECT assert(trap_error($TRAP$madlib_keras_predict(
     'keras_saved_out',
@@ -185,8 +185,8 @@ SELECT assert(trap_error($TRAP$madlib_keras_predict(
     'x',
     'cifar10_predict_gpu',
     NULL,
-    TRUE);$TRAP$) = 1,
-    'Prediction with use_gpu=TRUE must error out.');
+    2);$TRAP$) = 1,
+    'Prediction with gpus_per_host=2 must error out.');
 
 -- Test for
   -- Non null name and description columns
@@ -202,7 +202,7 @@ SELECT madlib_keras_fit(
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
     $$ batch_size=2, epochs=1, verbose=0 $$::text,
     2,
-    FALSE,
+    NULL,
     NULL,
     'model name', 'model desc');
 SELECT assert(
@@ -248,7 +248,7 @@ SELECT madlib_keras_predict(
     'x',
     'cifar10_predict',
     NULL,
-    FALSE);
+    0);
 
 -- Validate that prediction output table exists and has correct schema
 SELECT assert(UPPER(atttypid::regtype::TEXT) = 'INTEGER', 'id column should be 
INTEGER type')
@@ -276,7 +276,7 @@ SELECT assert(trap_error($TRAP$madlib_keras_predict(
     'x',
     'cifar10_predict',
     NULL,
-    FALSE);$TRAP$) = 1,
+    0);$TRAP$) = 1,
     'Passing batched image table to predict should error out.');
 
 -- Compile and fit parameter tests
@@ -291,7 +291,7 @@ SELECT madlib_keras_fit(
     $$ optimizer='SGD', loss=losses.categorical_crossentropy, 
metrics=['accuracy']$$::text,
     $$ batch_size=2, epochs=1, verbose=0 $$::text,
     1,
-    FALSE,
+    NULL,
     NULL,
     'model name', 'model desc');
 
@@ -306,7 +306,7 @@ SELECT madlib_keras_fit(
     $$ optimizer='Adam()', loss=losses.categorical_crossentropy, 
metrics=['accuracy']$$::text,
     $$ batch_size=2, epochs=1, verbose=0 $$::text,
     1,
-    FALSE,
+    NULL,
     NULL,
     'model name', 'model desc');
 
@@ -322,7 +322,7 @@ SELECT madlib_keras_fit(
     $$ batch_size=2, epochs=1, verbose=0 $$::text,
 
     1,
-    FALSE,
+    0,
     NULL,
     'model name', 'model desc');
 
@@ -337,7 +337,7 @@ SELECT madlib_keras_fit(
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
metrics=['accuracy'], loss_weights=[2], sample_weight_mode=None, 
loss='categorical_crossentropy' $$::text,
     $$ epochs=10, verbose=0, shuffle=True, initial_epoch=1, steps_per_epoch=2 
$$::text,
     1,
-    FALSE,
+    NULL,
     NULL,
     'model name', 'model desc');
 
@@ -357,7 +357,7 @@ select assert(trap_error($TRAP$madlib_keras_fit(
            $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
            $$ batch_size=2, epochs=1, verbose=0 $$::text,
            2,
-           FALSE,
+           NULL,
           'cifar_10_sample_val_failure');$TRAP$) = 1,
        'Passing y of type non numeric array to fit should error out.');
 
@@ -370,7 +370,7 @@ SELECT madlib_keras_predict(
     'x',
     'cifar10_predict',
     'prob',
-    FALSE);
+    0);
 
 SELECT assert(UPPER(atttypid::regtype::TEXT) =
     'DOUBLE PRECISION', 'column prob_0 should be double precision type')
@@ -437,8 +437,7 @@ SELECT madlib_keras_fit(
     1,
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
     $$ batch_size=2, epochs=1, verbose=0 $$::text,
-    3,
-    FALSE);
+    3);
 -- Assert fit has correct class_values
 SELECT assert(
     dependent_vartype = 'text' AND
@@ -455,7 +454,7 @@ SELECT madlib_keras_predict(
     'x',
     'cifar10_predict',
     'prob',
-    FALSE);
+    0);
 
 -- Validate the output datatype of newly created prediction columns
 -- for prediction type = 'prob' and class_values 'TEXT' with NULL as a valid
@@ -489,7 +488,7 @@ SELECT madlib_keras_predict(
     'x',
     'cifar10_predict',
     'response',
-    FALSE);
+    0);
 
 -- Validate the output datatype of newly created prediction columns
 -- for prediction type = 'response' and class_values 'TEXT' with NULL
@@ -513,7 +512,7 @@ SELECT madlib_keras_predict(
     'x',
     'cifar10_predict',
     'prob',
-    FALSE);
+    0);
 
 -- Validate the output datatype of newly created prediction column
 -- for prediction type = 'response' and class_value = NULL
@@ -532,7 +531,7 @@ SELECT madlib_keras_predict(
     'x',
     'cifar10_predict',
     'response',
-    FALSE);
+    0);
 
 -- Validate the output datatype of newly created prediction column
 -- for prediction type = 'response' and class_value = NULL
@@ -570,8 +569,7 @@ SELECT madlib_keras_fit(
     1,
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
     $$ batch_size=2, epochs=1, verbose=0 $$::text,
-    3,
-    FALSE);
+    3);
 
 -- Assert fit has correct class_values
 SELECT assert(
@@ -589,7 +587,7 @@ SELECT madlib_keras_predict(
     'x',
     'cifar10_predict',
     'prob',
-    FALSE);
+    0);
 
 -- Validate the output datatype of newly created prediction column
 -- for prediction type = 'prob' and class_values 'INT' with NULL
@@ -613,7 +611,7 @@ SELECT madlib_keras_predict(
     'x',
     'cifar10_predict',
     'response',
-    FALSE);
+    0);
 
 -- Validate the output datatype of newly created prediction column
 -- for prediction type = 'response' and class_values 'TEXT' with NULL
@@ -669,8 +667,7 @@ SELECT madlib_keras_fit(
     1,
     $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
     $$ batch_size=2, epochs=1, verbose=0 $$::text,
-    3,
-    FALSE);
+    3);
 
 -- Predict with correctly shaped data, must go thru.
 DROP TABLE IF EXISTS cifar10_predict;
@@ -681,7 +678,7 @@ SELECT madlib_keras_predict(
     'x',
     'cifar10_predict',
     'prob',
-    FALSE);
+    0);
 
 -- Prediction with incorrectly shaped data must error out.
 DROP TABLE IF EXISTS cifar10_predict;
@@ -692,5 +689,5 @@ SELECT assert(trap_error($TRAP$madlib_keras_predict(
         'x',
         'cifar10_predict',
         'prob',
-        FALSE);$TRAP$) = 1,
+        0);$TRAP$) = 1,
     'Input shape is (32, 32, 3) but model was trained with (3, 32, 32). Should 
have failed.');
diff --git 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index 912f266..e315a31 100644
--- 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++ 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -19,6 +19,7 @@
 
 import sys
 import numpy as np
+import os
 from os import path
 # Add convex module to the pythonpath.
 
sys.path.append(path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))))
@@ -95,7 +96,7 @@ class MadlibKerasFitTestCase(unittest.TestCase):
 
         new_model_state = self.subject.fit_transition(
             None, self.independent_var , self.dependent_var, 0, 2, 
self.all_seg_ids, self.total_images_per_seg,
-            self.model.to_json(), self.compile_params, self.fit_params, False,
+            self.model.to_json(), self.compile_params, self.fit_params, 0, 4,
             previous_state.tostring(), **k)
         state = np.fromstring(new_model_state, dtype=np.float32)
         image_count = state[2]
@@ -128,7 +129,7 @@ class MadlibKerasFitTestCase(unittest.TestCase):
 
         new_model_state = self.subject.fit_transition(
             None, self.independent_var , self.dependent_var, 0, 2, self.all_seg_ids, self.total_images_per_seg,
-            self.model.to_json(), self.compile_params, self.fit_params, False,
+            self.model.to_json(), self.compile_params, self.fit_params, 0, 4,
             previous_state.tostring(), **k)
         state = np.fromstring(new_model_state, dtype=np.float32)
         image_count = state[2]
@@ -163,7 +164,7 @@ class MadlibKerasFitTestCase(unittest.TestCase):
         k['SD']['segment_model'] = self.model
         new_model_state = self.subject.fit_transition(
             state.tostring(), self.independent_var, self.dependent_var, 0, 2, self.all_seg_ids, self.total_images_per_seg,
-            self.model.to_json(), None, self.fit_params, False, 'dummy_previous_state', **k)
+            self.model.to_json(), None, self.fit_params, 0, 4, 'dummy_previous_state', **k)
 
         state = np.fromstring(new_model_state, dtype=np.float32)
         image_count = state[2]
@@ -199,7 +200,7 @@ class MadlibKerasFitTestCase(unittest.TestCase):
         k['SD']['segment_model'] = self.model
         new_model_state = self.subject.fit_transition(
             state.tostring(), self.independent_var , self.dependent_var, 0, 2, self.all_seg_ids, self.total_images_per_seg,
-            self.model.to_json(), None, self.fit_params, False, 'dummy_previous_state', **k)
+            self.model.to_json(), None, self.fit_params, 0, 4, 'dummy_previous_state', **k)
 
         state = np.fromstring(new_model_state, dtype=np.float32)
         image_count = state[2]
@@ -236,7 +237,7 @@ class MadlibKerasFitTestCase(unittest.TestCase):
         k['SD']['segment_model'] = self.model
         new_model_state = self.subject.fit_transition(
             state.tostring(), self.independent_var , self.dependent_var, 0, 2, self.all_seg_ids, self.total_images_per_seg,
-            self.model.to_json(), None, self.fit_params, False, 'dummy_previous_state', **k)
+            self.model.to_json(), None, self.fit_params, 0, 4, 'dummy_previous_state', **k)
 
         state = np.fromstring(new_model_state, dtype=np.float32)
         image_count = state[2]
@@ -262,11 +263,13 @@ class MadlibKerasFitTestCase(unittest.TestCase):
 
         total_images_per_seg = [0,1,1]
 
-        with self.assertRaises(plpy.PLPYException):
+        with self.assertRaises(plpy.PLPYException) as error:
             new_model_state = self.subject.fit_transition(
-            None, self.independent_var , self.dependent_var, 0, 2, self.all_seg_ids, total_images_per_seg,
-            self.model.to_json(), self.compile_params, self.fit_params, False,
-            previous_state.tostring(), **k)
+                None, self.independent_var , self.dependent_var, 0, 2,
+                self.all_seg_ids, total_images_per_seg,
+                self.model.to_json(), self.compile_params, self.fit_params,
+                0, 4, previous_state.tostring(), **k)
+        self.assertIn('0 rows', str(error.exception))
 
     def test_fit_transition_too_many_images(self):
         self.subject.K.set_session = Mock()
@@ -280,12 +283,28 @@ class MadlibKerasFitTestCase(unittest.TestCase):
 
         total_images_per_seg = [1,1,1]
 
-        with self.assertRaises(plpy.PLPYException):
+        with self.assertRaises(plpy.PLPYException) as error:
             new_model_state = self.subject.fit_transition(
             None, self.independent_var , self.dependent_var, 0, 2, self.all_seg_ids, total_images_per_seg,
-            self.model.to_json(), self.compile_params, self.fit_params, False,
+            self.model.to_json(), self.compile_params, self.fit_params, 0, 4,
             previous_state.tostring(), **k)
 
+        self.assertIn('only 1', str(error.exception))
+
+    def test_fit_transition_first_tuple_none_ind_var_dep_var(self):
+        k = {}
+        self.assertEqual('dummy_state',
+                         self.subject.fit_transition('dummy_state', None , [0], 1, 2,
+                                                     [0,1,2], [3,3,3], 'dummy_model_json', "foo", "bar", 0, 4,
+                                                     'dummy_prev_state', **k))
+        self.assertEqual('dummy_state',
+                         self.subject.fit_transition('dummy_state', [[0.5]], None, 1, 2,
+                                                     [0,1,2], [3,3,3], 'dummy_model_json', "foo", "bar", 0, 4,
+                                                     'dummy_prev_state', **k))
+        self.assertEqual('dummy_state',
+                         self.subject.fit_transition('dummy_state', None, None, 1, 2,
+                                                     [0,1,2], [3,3,3], 'dummy_model_json', "foo", "bar", 0, 4,
+                                                     'dummy_prev_state', **k))
 
     def test_fit_merge(self):
         image_count = self.total_images_per_seg[0]
@@ -367,14 +386,33 @@ class MadlibKerasFitTestCase(unittest.TestCase):
         result = self.subject.fit_final(None)
         self.assertEqual(result, None)
 
-    def test_get_device_name_and_set_cuda_env(self):
-        import os
+    def test_get_device_name_and_set_cuda_env_postgres(self):
+        self.subject.is_platform_pg = Mock(return_value = True)
+
+        seg_id = -1
+        gpus_per_host = 3
         self.assertEqual('/gpu:0', self.subject.get_device_name_and_set_cuda_env(
-            True, 1))
-        self.assertEqual('1', os.environ["CUDA_VISIBLE_DEVICES"])
+            gpus_per_host, seg_id ))
+        self.assertEqual('0,1,2', os.environ['CUDA_VISIBLE_DEVICES'])
+
+        gpus_per_host = 0
         self.assertEqual('/cpu:0', self.subject.get_device_name_and_set_cuda_env(
-            False, 1))
-        self.assertEqual('-1', os.environ["CUDA_VISIBLE_DEVICES"])
+            gpus_per_host, seg_id ))
+        self.assertEqual('-1', os.environ['CUDA_VISIBLE_DEVICES'])
+
+    def test_get_device_name_and_set_cuda_env_gpdb(self):
+        self.subject.is_platform_pg = Mock(return_value = False)
+
+        seg_id=3
+        gpus_per_host=2
+        self.assertEqual('/gpu:0', self.subject.get_device_name_and_set_cuda_env(
+            gpus_per_host, seg_id))
+        self.assertEqual('1', os.environ['CUDA_VISIBLE_DEVICES'])
+
+        gpus_per_host=0
+        self.assertEqual('/cpu:0', self.subject.get_device_name_and_set_cuda_env(
+            gpus_per_host, seg_id))
+        self.assertEqual('-1', os.environ['CUDA_VISIBLE_DEVICES'])
 
     def test_fit_transition_first_tuple_none_ind_var_dep_var(self):
         k = {}
@@ -443,7 +481,30 @@ class MadlibKerasFitTestCase(unittest.TestCase):
                             accepted_fit_params)
         self.assertDictEqual(result_params, target_dict)
 
-## Negative Tests
+    def test_get_gpu_memory_fraction(self):
+
+        gpus_per_host = 4
+        segments_per_host = 4
+        result = self.subject.get_gpu_memory_fraction(gpus_per_host, segments_per_host)
+        self.assertEqual(result, 0.9)
+
+        gpus_per_host = 10
+        segments_per_host = 4
+        result = self.subject.get_gpu_memory_fraction(gpus_per_host, segments_per_host)
+        self.assertEqual(result, 0.9)
+
+        gpus_per_host = 2
+        segments_per_host = 6
+        result = self.subject.get_gpu_memory_fraction(gpus_per_host, segments_per_host)
+        self.assertEqual(result, 0.3)
+
+        gpus_per_host = 1
+        segments_per_host = 4
+        result = self.subject.get_gpu_memory_fraction(gpus_per_host, segments_per_host)
+        self.assertEqual(result, 0.225)
+
+
+    ## Negative Tests
     def test_parse_and_validate_compile_params_dict_metrics_fail(self):
         test_str = "optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics={'0':'accuracy'}"
 
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index e57f407..e6e31c5 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -47,6 +47,24 @@ def get_seg_number():
         return max(1, count)
 # ------------------------------------------------------------------------------
 
+def get_segments_per_host():
+    """ Find out how many primary segments (not including the master segment)
+        exist per host. We assume every host has the same number of segments,
+        so we only return the count for the first host the query returns.
+    """
+    if is_platform_pg():
+        return 1
+    else:
+        count = plpy.execute("""
+            SELECT count(*) from gp_segment_configuration
+            WHERE role = 'p' and content != -1
+            GROUP BY hostname
+            LIMIT 1
+            """)[0]['count']
+        # Guard against an unusual gpdb configuration: always return a
+        # primary segment count >= 1.
+        return max(1, count)
+# ------------------------------------------------------------------------------
 
 def is_orca():
     if has_function_properties():

[madlib] 01/02: DL: Replace use_gpu flag with gpus_per_host integer

Reply via email to