[1/2] incubator-madlib git commit: Summary: Add param to determine num of cols per run

riyer Thu, 08 Jun 2017 13:32:24 -0700

Repository: incubator-madlib
Updated Branches:
  refs/heads/master 186aefa6d -> edc9b9f35



Summary: Add param to determine num of cols per run

JIRA: MADLIB-1117

Summary used a hard-coded parameter of a maximum of 15 columns per run.
This was put in place to avoid out-of-memory errors in most cases.
This, however, limits the run time since higher number of columns can be
summarized in a single run for a simpler data set (one which leads to
smaller sketch data structures).

This commit adds a new parameter allowing users to set this limit,
while retaining the old default of 15 columns.

Closes #138


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/e05efaaa
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/e05efaaa
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/e05efaaa

Branch: refs/heads/master
Commit: e05efaaa626328535b3ed3bbc76d8f954ef7f7d2
Parents: 186aefa
Author: Rahul Iyer <ri...@apache.org>
Authored: Mon Jun 5 16:36:50 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu Jun 8 13:30:16 2017 -0700

----------------------------------------------------------------------
 .../postgres/modules/summary/Summarizer.py_in   |  24 +--
 .../postgres/modules/summary/summary.py_in      | 148 +++++++++----------
 .../postgres/modules/summary/summary.sql_in     |  92 +++++++++---
 .../modules/summary/test/summary.sql_in         |   3 +
 4 files changed, 155 insertions(+), 112 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e05efaaa/src/ports/postgres/modules/summary/Summarizer.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/Summarizer.py_in 
b/src/ports/postgres/modules/summary/Summarizer.py_in
index 11ab42a..3625123 100644
--- a/src/ports/postgres/modules/summary/Summarizer.py_in
+++ b/src/ports/postgres/modules/summary/Summarizer.py_in
@@ -11,7 +11,7 @@ class Summarizer:
     def __init__(self, schema_madlib, source_table, output_table,
                  target_cols, grouping_cols, distinctify, get_quartiles,
                  xtileify='Exact', ntile_array=None, how_many_mfv=10,
-                 get_mfv_quick=False):
+                 get_mfv_quick=False, n_cols_per_run=15):
         self._schema_madlib = schema_madlib
         self._source_table = source_table
         self._output_table = output_table
@@ -21,8 +21,9 @@ class Summarizer:
         self._get_quartiles = get_quartiles
         self._xtileify = xtileify
         self._ntile_array = ntile_array
-        self._how_many_mfv = how_many_mfv
+        self._how_many_mfv = int(how_many_mfv)
         self._get_mfv_quick = get_mfv_quick
+        self._n_cols_per_run = int(n_cols_per_run)
         self._columns = None
         self._column_names = None
         self._delimiter = '_.*.&.!.!.&.*_'
@@ -32,7 +33,7 @@ class Summarizer:
             lower_target_cols = [each_col.lower() for each_col in 
self._target_cols
                                  if '"' not in each_col]
             target_selection = "AND attname = ANY({0})".format(
-                    py_list_to_sql_string(lower_target_cols, 
array_type="text"))
+                py_list_to_sql_string(lower_target_cols, array_type="text"))
         else:
             target_selection = ""
         self._columns = plpy.execute("""
@@ -103,7 +104,9 @@ class Summarizer:
             plpy.error("""
                 Summary -- Invalid parameter: Number of most frequent values
                 required should be positive""")
-
+        if self._n_cols_per_run is None or self._n_cols_per_run <= 0:
+            plpy.error(" Summary -- Invalid parameter: Number of columns per 
run"
+                       "should be positive")
 
 # ----- End of argument validation functions -----------------------------
 
@@ -367,15 +370,14 @@ class Summarizer:
         #  issues when a lot of columns are computed concurrently
         #  We repeat the query multiple times till computation is complete for
         #  all columns.
-        actual_nCols = len(self._columns)
-        max_nCols = 15
+        actual_n_cols = len(self._columns)
         # ensuring an even spread of columns in each repeated attempt. For eg.
-        #  if max_nCols = 15, to simulate 31 cols we break it down as [11, 11, 
9]
+        #  if self._n_cols_per_run = 15, to simulate 31 cols we break it down 
as [11, 11, 9]
         #  instead of [15, 15, 1]. This ensures low memory usage in each 
subquery
-        nSplits = math.ceil(float(actual_nCols) / max_nCols)
-        subset_nCols = int(math.ceil(actual_nCols / nSplits))
-        subset_columns = [self._columns[pos: pos + subset_nCols]
-                          for pos in range(0, actual_nCols, subset_nCols)]
+        n_splits = math.ceil(float(actual_n_cols) / self._n_cols_per_run)
+        subset_n_cols = int(math.ceil(actual_n_cols / n_splits))
+        subset_columns = [self._columns[pos: pos + subset_n_cols]
+                          for pos in range(0, actual_n_cols, subset_n_cols)]
         for cols in subset_columns:
             for group_val in self._grouping_cols:
                 # summary treats the comma-separated list of grouping_cols as

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e05efaaa/src/ports/postgres/modules/summary/summary.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/summary.py_in 
b/src/ports/postgres/modules/summary/summary.py_in
index 8d5f1fa..ff813f3 100644
--- a/src/ports/postgres/modules/summary/summary.py_in
+++ b/src/ports/postgres/modules/summary/summary.py_in
@@ -7,84 +7,74 @@
 """
 import plpy
 from time import time
-from utilities.utilities import __mad_version
+
+from utilities.control import MinWarning
 from Summarizer import Summarizer
-version_wrapper = __mad_version()
-_get_vector = version_wrapper.select_vecfunc()
+
 
 def summary(schema_madlib, source_table, output_table, target_cols, 
grouping_cols,
-    get_distinct, get_quartiles, ntile_array, how_many_mfv, get_estimates):
+            get_distinct, get_quartiles, ntile_array, how_many_mfv,
+            get_estimates, n_cols_per_run):
     """
-        Main summary function that is called by SQL to execute summary
+        Main summary function that is called by SQL to compute summary
         statistics on a table.
 
-        @param schema_madlib        Madlib Schema namespace
-        @param source_table         Name of input table
-        @param output_table         Name of output table
-        @param target_cols          Names of specific columns for which to get 
summary
-        @param grouping_cols        Names of columns on which to group-by
-                                        (no summary is provided for these 
columns)
-        @param get_distinct         Should summary include distinct count
-        @param get_quartiles        Should summary include quartile information
-        @param ntile_array          Array for quantiles to include in summary
-                                        (each element should be in [0, 1])
-        @param how_many_mfv         How many frequent values to output?
-        @param get_estimates        Should the summmary information be 
estimated or exact?
+        @param schema_madlib   Madlib Schema namespace
+        @param source_table    Name of input table
+        @param output_table    Name of output table
+        @param target_cols     Names of specific columns for which to get 
summary
+        @param grouping_cols   Names of columns on which to group-by
+                                   (no summary is provided for these columns)
+        @param get_distinct    Should summary include distinct count
+        @param get_quartiles   Should summary include quartile information
+        @param ntile_array     Array for quantiles to include in summary
+                                   (each element should be in [0, 1])
+        @param how_many_mfv    How many frequent values to output?
+        @param get_estimates   Should the summmary information be estimated or 
exact?
+        @param n_cols_per_run  Number of columns to compute statistics in a 
single run
 
     """
 
-    old_msg_level = plpy.execute(
-        """select setting from pg_settings where
-        name='client_min_messages'""")[0]['setting']
-    plpy.execute("set client_min_messages to error")
-
-    # 'Estimated', 'Exact', None
-    distinctify = 'Estimated'
-    xtileify = 'Exact'          # 'Estimated' not supported at present
-    get_mfv_quick = True
-
-    if not get_estimates:
-        distinctify = 'Exact'
-        #xtileify = 'Exact'
-        get_mfv_quick = False
-
-    if not get_distinct:
-        distinctify = 'Skip'
-
-    if (version_wrapper.is_pg_version_less_than('9.4') or
-            version_wrapper.is_gp_version_less_than('4.2.2')):
-        # PERCENTILE_CONT not available in PostgreSQL < 9.4 or GPDB < 4.2.2.
-        # The function is available in HAWQ 1.2.0 (even though HAWQ 1.2.0
-        #   is based on GPDB 4.2.0)
-        xtileify = 'Skip'
-
-    # GPDB < 4.2 and PG < 9.0 passes vector as a string.
-    # Ensure that ntile_array is a list of floats not a string
-    if ntile_array:
-        try:
-            ntile_array = [float(tile) for tile in _get_vector(ntile_array)]
-        except ValueError:
-            plpy.error("""Summary -- Invalid paramter:
-                        Percentile array has non-float data""")
-
-    # convert comma delimited string input to a list of column names
-    (target_cols, grouping_cols) = _analyze_str_inputs(target_cols, 
grouping_cols)
-
-    start = time()
-    summarizer = Summarizer(
-        schema_madlib, source_table, output_table, target_cols, grouping_cols,
-        distinctify, get_quartiles, xtileify, ntile_array, how_many_mfv,
-        get_mfv_quick)
-    summarizer.run()
-    end = time()
-
-    row_count = plpy.execute(
-        "SELECT count(*) FROM {output_table}".format(
-            output_table = output_table))[0]['count']
-
-    plpy.execute("set client_min_messages to " + old_msg_level)
+    with MinWarning('error'):
+        # 'Estimated', 'Exact', None
+        distinctify = 'Estimated'
+        xtileify = 'Exact'          # 'Estimated' not supported at present
+        get_mfv_quick = True
+
+        if not get_estimates:
+            distinctify = 'Exact'
+            # xtileify = 'Exact'
+            get_mfv_quick = False
+
+        if not get_distinct:
+            distinctify = 'Skip'
+
+        # Ensure that ntile_array is a list of floats not a string
+        if ntile_array:
+            try:
+                ntile_array = [float(tile) for tile in ntile_array]
+            except ValueError:
+                plpy.error("""Summary -- Invalid paramter:
+                            Percentile array has non-float data""")
+
+        # convert comma delimited string input to a list of column names
+        (target_cols, grouping_cols) = _analyze_str_inputs(target_cols, 
grouping_cols)
+
+        start = time()
+        summarizer = Summarizer(
+            schema_madlib, source_table, output_table, target_cols, 
grouping_cols,
+            distinctify, get_quartiles, xtileify, ntile_array, how_many_mfv,
+            get_mfv_quick, n_cols_per_run=n_cols_per_run)
+        summarizer.run()
+        end = time()
+
+        row_count = plpy.execute(
+            "SELECT count(*) FROM {output_table}".format(
+                output_table=output_table))[0]['count']
+
     return (output_table, row_count, end - start)
 
+
 # -----------------------------------------------------------------------
 # Input parameter checks and edits
 # -----------------------------------------------------------------------
@@ -102,9 +92,10 @@ def _analyze_str_inputs(target_cols, grouping_cols):
         grouping_cols = [None]
     else:
         grouping_cols = grouping_cols.replace(' ', '').split(',')
-        grouping_cols.append(None) # to return summary on complete table
+        grouping_cols.append(None)  # to return summary on complete table
     return (target_cols, grouping_cols)
 
+
 # -----------------------------------------------------------------------
 # Help messages
 # -----------------------------------------------------------------------
@@ -117,7 +108,7 @@ def summary_help_message(schema_madlib, message, **kwargs):
         return """
         Usage:
         -----------------------------------------------------------------------
-        SELECT {schema_madlib}.summary
+        SELECT {madlib}.summary
         (
             source_table            TEXT,       -- Source table name (Required)
             output_table            TEXT,       -- Output table name (Required)
@@ -133,9 +124,12 @@ def summary_help_message(schema_madlib, message, **kwargs):
                                                 --      (Default: NULL - 
Quantile array is not included in result)
             how_many_mfv            INTEGER,    -- How many 
most-frequent-values (MFVs) to compute?
                                                 --      (Default: 10)
-            get_estimates           BOOLEAN     -- Should we produce an 
estimated
+            get_estimates           BOOLEAN,    -- Should we produce an 
estimated
                                                 -- (as opposed to an exact but 
slow) value for distincts and MFVs?
-        )                                       --      (Default: True)
+                                                --      (Default: True)
+            n_cols_per_run          INTEGER     -- Number of columns to 
collect summary statistics in
+                                                -- one pass of the data
+        )
         -----------------------------------------------------------------------
         Output table will be in following format
         (presence of some columns depends on the arguments):
@@ -160,7 +154,7 @@ def summary_help_message(schema_madlib, message, **kwargs):
             - quantile_array        : Percentile values corresponding to 
ntile_array
             - most_frequent_values  : Most frequent values
             - mfv_frequencies       : Frequency of the most frequent values
-        """.format(schema_madlib = schema_madlib)
+        """.format(madlib=schema_madlib)
     elif message is not None and message.lower() in ('example', 'examples'):
         return """
             DROP TABLE IF EXISTS example_data;
@@ -196,17 +190,17 @@ def summary_help_message(schema_madlib, message, 
**kwargs):
             SELECT madlib.summary('example_data', 'example_data_output', NULL, 
NULL, True, True, array[0.1, 0.2, 0.3]);
             SELECT madlib.summary('example_data', 'example_data_output', NULL, 
NULL, True, True, array[0.1, 0.2, 0.3], 2);
             SELECT madlib.summary('example_data', 'example_data_output', NULL, 
NULL, True, True, array[0.1, 0.2, 0.3], 2, False);
+            SELECT madlib.summary('example_data', 'example_data_output', NULL, 
NULL, True, True, array[0.1, 0.2, 0.3], 2, False, 2);
          """
     else:
-       return """
+        return """
             'summary' is a generic function used to produce summary statistics
             of any data table.  The function invokes particular 'methods' from
             the MADlib library to provide an overview of the data.
             -------
             For an overview on usage, run:
-            SELECT {schema_madlib}.summary('usage');
+            SELECT {madlib}.summary('usage');
             -------
             For an example, run:
-            SELECT {schema_madlib}.summary('example')
-            """.format(schema_madlib = schema_madlib)
-
+            SELECT {madlib}.summary('example')
+            """.format(madlib=schema_madlib)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e05efaaa/src/ports/postgres/modules/summary/summary.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/summary.sql_in 
b/src/ports/postgres/modules/summary/summary.sql_in
index 9f56b4d..bdde214 100644
--- a/src/ports/postgres/modules/summary/summary.sql_in
+++ b/src/ports/postgres/modules/summary/summary.sql_in
@@ -42,7 +42,8 @@ summary ( source_table,
           get_quartiles,
           ntile_array,
           how_many_mfv,
-          get_estimates
+          get_estimates,
+          n_cols_per_run
         )
 </pre>
 The \b summary() function returns a composite type containing three fields:
@@ -162,31 +163,52 @@ following columns:
         <td>Array containing the frequency count for each of the most frequent 
values. </td>
     </tr>
 </table></dd>
+
 <dt>target_columns (optional)</dt>
 <dd>TEXT, default NULL. A comma-separated list of columns to summarize. If 
NULL, summaries are produced for all columns.</dd>
+
 <dt>grouping_cols (optional)</dt>
-<dd>TEXT, default: null. A comma-separated list of columns on which to 
+<dd>TEXT, default: null. A comma-separated list of columns on which to
 group results. If NULL, summaries are produced on the complete table.</dd>
-@note Please note that summary statistics are calculated for each grouping 
+@note Please note that summary statistics are calculated for each grouping
 column independently.  That is, grouping columns are not combined together
-as in the regular PostgreSQL style GROUP BY directive.  (This was done 
+as in the regular PostgreSQL style GROUP BY directive.  (This was done
 to reduce long run time and huge output table size which would otherwise
-result in the case of large input tables with a lot of grouping_cols and 
+result in the case of large input tables with a lot of grouping_cols and
 target_cols specified.)
+
 <dt>get_distinct (optional)</dt>
 <dd>BOOLEAN, default TRUE. If true, distinct values are counted.</dd>
+
 <dt>get_quartiles (optional)</dt>
 <dd>BOOLEAN, default TRUE. If TRUE, quartiles are computed.</dd>
+
 <dt>ntile_array (optional)</dt>
 <dd>FLOAT8[], default NULL. An array of quantile values to compute. If NULL, 
quantile values are not computed.</dd>
-@note Quartile and quantile functions are not available for PostgreSQL 9.3 or 
-lower.  If you are using PostgreSQL 9.3 or lower, the output table will not 
-contain these values, even if you set 'get_quartiles' = TRUE or 
+@note Quartile and quantile functions are not available for PostgreSQL 9.3 or
+lower.  If you are using PostgreSQL 9.3 or lower, the output table will not
+contain these values, even if you set 'get_quartiles' = TRUE or
 provide an array of quantile values for the parameter 'ntile_array'.
+
 <dt>how_many_mfv (optional)</dt>
 <dd>INTEGER, default: 10. The number of most-frequent-values to compute.</dd>
+
 <dt>get_estimates (optional)</dt>
 <dd>BOOLEAN, default TRUE. If TRUE, estimated values are produced for distinct 
values and most frequent values. If FALSE, exact values are calculated (may 
take longer to run depending on data size).</dd>
+
+<dt>n_cols_per_run (optional)</dt>
+<dd>INTEGER, default: 15. The number of columns to collect summary statistics 
in
+one pass of the data.
+This parameter determines the number of passes through the data. For e.g.,
+with a total of 40 columns to summarize and 'n_cols_per_run = 15', there will 
be
+3 passes through the data, with each pass summarizing a maximum of 15 columns.
+@note This parameter should be used with caution. Increasing this parameter 
could
+decrease the total run time (if number of passes decreases), but will increase
+the memory consumption during each run. Since Postgresql limits the memory 
available
+for a single aggregate run, this increased memory consumption could result in
+out-of-memory termination error.
+
+</dd>
 </DL>
 
 
@@ -294,15 +316,15 @@ string should be double-quoted; in this case the input 
would be '"MyTable"').
     in a slow but exact method. The most frequent values are computed using a
     faithful implementation that preserves the approximation guarantees of
     the Cormode/Muthukrishnan method (more information in \ref grp_mfvsketch).
-- Summary statistics are calculated for each grouping 
+- Summary statistics are calculated for each grouping
 column independently.  That is, grouping columns are not combined together
-as in the regular PostgreSQL style GROUP BY directive.  (This was done 
+as in the regular PostgreSQL style GROUP BY directive.  (This was done
 to reduce long run time and huge output table size which would otherwise
-result in the case of large input tables with a lot of grouping_cols and 
+result in the case of large input tables with a lot of grouping_cols and
 target_cols specified.)
-- Quartile and quantile functions are not available for PostgreSQL 9.3 or 
-lower.  If you are using PostgreSQL 9.3 or lower, the output table will not 
-contain these values, even if you set 'get_quartiles' = TRUE or 
+- Quartile and quantile functions are not available for PostgreSQL 9.3 or
+lower.  If you are using PostgreSQL 9.3 or lower, the output table will not
+contain these values, even if you set 'get_quartiles' = TRUE or
 provide an array of quantile values for the parameter 'ntile_array'.
 
 
@@ -338,6 +360,7 @@ CREATE TYPE MADLIB_SCHEMA.summary_result AS
  * @param ntile_array       Array of percentiles to compute
  * @param how_many_mfv      How many most frequent values to compute?
  * @param get_estimates     Should distinct counts be an estimated (faster) or 
exact count?
+ * @param n_cols_per_run    Number of columns to use per run of summary
  *
  * @usage
  *
@@ -346,7 +369,7 @@ CREATE TYPE MADLIB_SCHEMA.summary_result AS
  *       '<em>target_cols</em>', '<em>grouping_cols</em>',
  *       '<em>get_distinct</em>', '<em>get_quartiles</em>',
  *       '<em>ntile_array</em>', '<em>how_many_mfv</em>',
- *       '<em>get_estimates</em>'
+ *       '<em>get_estimates</em>', '<em>n_cols_per_run</em>'
  *   );
  *   SELECT * FROM '<em>output_table</em>'
  *  </pre>
@@ -362,14 +385,16 @@ MADLIB_SCHEMA.summary
     get_quartiles           BOOLEAN,    -- Are quartiles required
     ntile_array             FLOAT8[],   -- Array of quantiles to compute
     how_many_mfv            INTEGER,    -- How many most frequent values to 
compute?
-    get_estimates           BOOLEAN     -- Should we produce exact or estimated
+    get_estimates           BOOLEAN,    -- Should we produce exact or estimated
                                         --      values for distinct computation
+    n_cols_per_run          INTEGER     -- Number of columns to use per run of 
summary
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     PythonFunctionBodyOnly(`summary', `summary')
     return summary.summary(
         schema_madlib, source_table, output_table, target_cols, grouping_cols,
-        get_distinct, get_quartiles, ntile_array, how_many_mfv, get_estimates)
+        get_distinct, get_quartiles, ntile_array, how_many_mfv,
+        get_estimates, n_cols_per_run)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -386,11 +411,30 @@ MADLIB_SCHEMA.summary
     get_distinct            BOOLEAN,
     get_quartiles           BOOLEAN,
     ntile_array             FLOAT8[],
+    how_many_mfv            INTEGER,
+    get_estimates           BOOLEAN
+)
+RETURNS MADLIB_SCHEMA.summary_result AS $$
+    SELECT MADLIB_SCHEMA.summary(
+        $1, $2, $3, $4, $5, $6, $7, $8, $9, 15)
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION
+MADLIB_SCHEMA.summary
+(
+    source_table            TEXT,
+    output_table            TEXT,
+    target_cols             TEXT,
+    grouping_cols           TEXT,
+    get_distinct            BOOLEAN,
+    get_quartiles           BOOLEAN,
+    ntile_array             FLOAT8[],
     how_many_mfv            INTEGER
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
-        $1, $2, $3, $4, $5, $6, $7, $8, True)
+        $1, $2, $3, $4, $5, $6, $7, $8, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -407,7 +451,7 @@ MADLIB_SCHEMA.summary
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
-        $1, $2, $3, $4, $5, $6, $7, 10, True)
+        $1, $2, $3, $4, $5, $6, $7, 10, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -423,7 +467,7 @@ MADLIB_SCHEMA.summary
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
-        $1, $2, $3, $4, $5, $6, NULL, 10, True)
+        $1, $2, $3, $4, $5, $6, NULL, 10, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -438,7 +482,7 @@ MADLIB_SCHEMA.summary
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
-        $1, $2, $3, $4, $5, True, NULL, 10, True)
+        $1, $2, $3, $4, $5, True, NULL, 10, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -452,7 +496,7 @@ MADLIB_SCHEMA.summary
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
-        $1, $2, $3, $4, True, True, NULL, 10, True)
+        $1, $2, $3, $4, True, True, NULL, 10, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -465,7 +509,7 @@ MADLIB_SCHEMA.summary
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
-        $1, $2, $3, NULL, True, True, NULL, 10, True)
+        $1, $2, $3, NULL, True, True, NULL, 10, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -477,7 +521,7 @@ MADLIB_SCHEMA.summary
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
-        $1, $2, NULL, NULL, True, True, NULL, 10, True)
+        $1, $2, NULL, NULL, True, True, NULL, 10, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e05efaaa/src/ports/postgres/modules/summary/test/summary.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/summary/test/summary.sql_in 
b/src/ports/postgres/modules/summary/test/summary.sql_in
index e4f3f65..fc4b787 100644
--- a/src/ports/postgres/modules/summary/test/summary.sql_in
+++ b/src/ports/postgres/modules/summary/test/summary.sql_in
@@ -61,3 +61,6 @@ SELECT * from example_data_summary;
 DROP TABLE IF EXISTS example_data_summary;
 SELECT summary('example_data', 'example_data_summary', NULL, NULL, True, True, 
array[0.1, 0.2, 0.3], 10, False);
 SELECT * from example_data_summary;
+DROP TABLE IF EXISTS example_data_summary;
+SELECT summary('example_data', 'example_data_summary', NULL, NULL, True, True, 
array[0.1, 0.2, 0.3], 10, False, 1);
+SELECT * from example_data_summary;

[1/2] incubator-madlib git commit: Summary: Add param to determine num of cols per run

Reply via email to