[GitHub] [madlib] reductionista commented on a change in pull request #532: DL: Add multiple variable support

GitBox Fri, 22 Jan 2021 14:14:26 -0800


reductionista commented on a change in pull request #532:
URL: https://github.com/apache/madlib/pull/532#discussion_r562896522




##########
File path: 
src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
##########
@@ -940,66 +910,13 @@ $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.training_preprocessor_dl(
-    source_table            VARCHAR,
-    output_table            VARCHAR,
-    dependent_varname       VARCHAR,
-    independent_varname     VARCHAR,
-    buffer_size             INTEGER,
-    normalizing_const       REAL,
-    num_classes             INTEGER
-) RETURNS VOID AS $$
-  SELECT MADLIB_SCHEMA.training_preprocessor_dl($1, $2, $3, $4, $5, $6, $7, 
NULL);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.training_preprocessor_dl(
-    source_table            VARCHAR,
-    output_table            VARCHAR,
-    dependent_varname       VARCHAR,
-    independent_varname     VARCHAR,
-    buffer_size             INTEGER,
-    normalizing_const       REAL
-) RETURNS VOID AS $$
-  SELECT MADLIB_SCHEMA.training_preprocessor_dl($1, $2, $3, $4, $5, $6, NULL, 
NULL);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.training_preprocessor_dl(
-    source_table            VARCHAR,
-    output_table            VARCHAR,
-    dependent_varname       VARCHAR,
-    independent_varname     VARCHAR,
-    buffer_size             INTEGER
-) RETURNS VOID AS $$
-  SELECT MADLIB_SCHEMA.training_preprocessor_dl($1, $2, $3, $4, $5, 1.0, NULL, 
NULL);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.training_preprocessor_dl(
-    source_table            VARCHAR,
-    output_table            VARCHAR,
-    dependent_varname       VARCHAR,
-    independent_varname     VARCHAR
-) RETURNS VOID AS $$
-  SELECT MADLIB_SCHEMA.training_preprocessor_dl($1, $2, $3, $4, NULL, 1.0, 
NULL, NULL);
-$$ LANGUAGE sql VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.training_preprocessor_dl(
-    message VARCHAR

Review comment:
       Thanks for cleaning up all these extra function definitions, so much 
easier to read!

##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
##########
@@ -86,18 +87,34 @@ class BasePredict():
         if self.pred_type == 1:
             rank_create_sql = ""
 
-        self.pred_vartype = self.dependent_vartype.strip('[]')
-        unnest_sql = ''
-        if self.pred_vartype in ['text', 'character varying', 'varchar']:
+        self.pred_vartype = [i.strip('[]') for i in self.dependent_vartype]
+        unnest_sql = []
+        full_class_name_list = []
+        full_class_value_list = []
+
+        for i in range(self.dependent_var_count):
+
+            if self.pred_vartype[i] in ['text', 'character varying', 
'varchar']:
+
+                unnest_sql.append("unnest(ARRAY{0}) AS {1}".format(
+                    ['NULL' if j is None else j for j in class_values[i]],
+                    self.dependent_varname[i]))

Review comment:
       I think this should be:
      quote_ident(self.dependent_varname[i])
   instead of:
      self.dependent_varname[i]

##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
##########
@@ -1811,6 +1697,48 @@ PythonFunctionBodyOnlyNoSchema(`deep_learning', 
`madlib_keras')
 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.fit_transition_wide(
+    state                       BYTEA,
+    dependent_var1              BYTEA,
+    dependent_var2              BYTEA,
+    dependent_var3              BYTEA,
+    dependent_var4              BYTEA,
+    dependent_var5              BYTEA,
+    independent_var1            BYTEA,
+    independent_var2            BYTEA,
+    independent_var3            BYTEA,
+    independent_var4            BYTEA,
+    independent_var5            BYTEA,

Review comment:
       Have you tried using the VARIADIC keyword here to get past this 5 param 
limit?  I recently discovered it works in both gpdb5 and gpdb6.  Example:
   ```
   CREATE FUNCTION fit_transition_wide(dependent_var VARIADIC BYTEA[]) RETURNS 
INTEGER[] AS $$ SELECT ARRAY[LENGTH($1[1]), LENGTH($1[2]), LENGTH($1[3])]; $$ 
LANGUAGE SQL;
   
   SELECT fit_transition_wide('abcd'::BYTEA, 'def'::BYTEA, 'hijkml'::BYTEA);
    fit_transition_wide
   ---------------------
    {4,3,6}
   (1 row)
   
   or
   
   select fit_transition_wide(VARIADIC ARRAY['abcd'::BYTEA, 'def'::BYTEA, 
'hijkml'::BYTEA]);
    fit_transition_wide
   ---------------------
    {4,3,6}
   ```
   It basically works the same as *args in python.  It has to be the last 
parameter of the function, after all the others--the only catch is, there can 
only be one of them for each function.  But since they're both the same type, 
each with an equal number of params, it seems like we might be able to pass 
then both through the same VARIADIC param and have python split it in half 
after receiving it.
   
   Not sure if that might re-introduce the 1GB limit though.  If passing them 
as 10 separate params gives us a 10GB limit but passing them all through 
VARIADIC limits us to only 1GB then it wouldn't be worth it.  I'd assume most 
people won't care about having more than 5 anyway, but not sure.

##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
##########
@@ -116,63 +133,82 @@ class BasePredict():
 
         # Calling CREATE TABLE instead of CTAS, to ensure that the 
plan_cache_mode
         # guc codepath is called when passing in the weights
-        plpy.execute("""
+        sql = """
             CREATE TABLE {self.output_table}
                 ({self.id_col} {self.id_col_type},
-                 {self.dependent_varname} {self.pred_vartype},
+                 class_name TEXT,
+                 class_value TEXT,
                  {pred_col_name} {pred_col_type},
                  rank INTEGER)
-            """.format(**locals()))
+            """.format(**locals())
+        plpy.execute(sql)
+
+        independent_varname_sql = ["{0}::REAL[]".format(i) for i in 
self.independent_varname]

Review comment:
       quote_ident(self.independent_varname)




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [madlib] reductionista commented on a change in pull request #532: DL: Add multiple variable support

Reply via email to