Repository: incubator-madlib Updated Branches: refs/heads/master 186aefa6d -> edc9b9f35
Summary: Add param to determine num of cols per run JIRA: MADLIB-1117 Summary used a hard-coded parameter of a maximum of 15 columns per run. This was put in place to avoid out-of-memory errors in most cases. This, however, limits the run time since higher number of columns can be summarized in a single run for a simpler data set (one which leads to smaller sketch data structures). This commit adds a new parameter allowing users to set this limit, while retaining the old default of 15 columns. Closes #138 Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/e05efaaa Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/e05efaaa Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/e05efaaa Branch: refs/heads/master Commit: e05efaaa626328535b3ed3bbc76d8f954ef7f7d2 Parents: 186aefa Author: Rahul Iyer <ri...@apache.org> Authored: Mon Jun 5 16:36:50 2017 -0700 Committer: Rahul Iyer <ri...@apache.org> Committed: Thu Jun 8 13:30:16 2017 -0700 ---------------------------------------------------------------------- .../postgres/modules/summary/Summarizer.py_in | 24 +-- .../postgres/modules/summary/summary.py_in | 148 +++++++++---------- .../postgres/modules/summary/summary.sql_in | 92 +++++++++--- .../modules/summary/test/summary.sql_in | 3 + 4 files changed, 155 insertions(+), 112 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e05efaaa/src/ports/postgres/modules/summary/Summarizer.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/summary/Summarizer.py_in b/src/ports/postgres/modules/summary/Summarizer.py_in index 11ab42a..3625123 100644 --- a/src/ports/postgres/modules/summary/Summarizer.py_in +++ b/src/ports/postgres/modules/summary/Summarizer.py_in @@ -11,7 +11,7 @@ class Summarizer: def __init__(self, schema_madlib, source_table, output_table, target_cols, grouping_cols, distinctify, get_quartiles, xtileify='Exact', ntile_array=None, how_many_mfv=10, - get_mfv_quick=False): + get_mfv_quick=False, n_cols_per_run=15): self._schema_madlib = schema_madlib self._source_table = source_table self._output_table = output_table @@ -21,8 +21,9 @@ class Summarizer: self._get_quartiles = get_quartiles self._xtileify = xtileify self._ntile_array = ntile_array - self._how_many_mfv = how_many_mfv + self._how_many_mfv = int(how_many_mfv) self._get_mfv_quick = get_mfv_quick + self._n_cols_per_run = int(n_cols_per_run) self._columns = None self._column_names = None self._delimiter = '_.*.&.!.!.&.*_' @@ -32,7 +33,7 @@ class Summarizer: lower_target_cols = [each_col.lower() for each_col in self._target_cols if '"' not in each_col] target_selection = "AND attname = ANY({0})".format( - py_list_to_sql_string(lower_target_cols, array_type="text")) + py_list_to_sql_string(lower_target_cols, array_type="text")) else: target_selection = "" self._columns = plpy.execute(""" @@ -103,7 +104,9 @@ class Summarizer: plpy.error(""" Summary -- Invalid parameter: Number of most frequent values required should be positive""") - + if self._n_cols_per_run is None or self._n_cols_per_run <= 0: + plpy.error(" Summary -- Invalid parameter: Number of columns per run" + "should be positive") # ----- End of argument validation functions ----------------------------- @@ -367,15 +370,14 @@ class Summarizer: # issues when a lot of columns are computed concurrently # We repeat the query multiple times till computation is complete for # all columns. - actual_nCols = len(self._columns) - max_nCols = 15 + actual_n_cols = len(self._columns) # ensuring an even spread of columns in each repeated attempt. For eg. - # if max_nCols = 15, to simulate 31 cols we break it down as [11, 11, 9] + # if self._n_cols_per_run = 15, to simulate 31 cols we break it down as [11, 11, 9] # instead of [15, 15, 1]. This ensures low memory usage in each subquery - nSplits = math.ceil(float(actual_nCols) / max_nCols) - subset_nCols = int(math.ceil(actual_nCols / nSplits)) - subset_columns = [self._columns[pos: pos + subset_nCols] - for pos in range(0, actual_nCols, subset_nCols)] + n_splits = math.ceil(float(actual_n_cols) / self._n_cols_per_run) + subset_n_cols = int(math.ceil(actual_n_cols / n_splits)) + subset_columns = [self._columns[pos: pos + subset_n_cols] + for pos in range(0, actual_n_cols, subset_n_cols)] for cols in subset_columns: for group_val in self._grouping_cols: # summary treats the comma-separated list of grouping_cols as http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e05efaaa/src/ports/postgres/modules/summary/summary.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/summary/summary.py_in b/src/ports/postgres/modules/summary/summary.py_in index 8d5f1fa..ff813f3 100644 --- a/src/ports/postgres/modules/summary/summary.py_in +++ b/src/ports/postgres/modules/summary/summary.py_in @@ -7,84 +7,74 @@ """ import plpy from time import time -from utilities.utilities import __mad_version + +from utilities.control import MinWarning from Summarizer import Summarizer -version_wrapper = __mad_version() -_get_vector = version_wrapper.select_vecfunc() + def summary(schema_madlib, source_table, output_table, target_cols, grouping_cols, - get_distinct, get_quartiles, ntile_array, how_many_mfv, get_estimates): + get_distinct, get_quartiles, ntile_array, how_many_mfv, + get_estimates, n_cols_per_run): """ - Main summary function that is called by SQL to execute summary + Main summary function that is called by SQL to compute summary statistics on a table. - @param schema_madlib Madlib Schema namespace - @param source_table Name of input table - @param output_table Name of output table - @param target_cols Names of specific columns for which to get summary - @param grouping_cols Names of columns on which to group-by - (no summary is provided for these columns) - @param get_distinct Should summary include distinct count - @param get_quartiles Should summary include quartile information - @param ntile_array Array for quantiles to include in summary - (each element should be in [0, 1]) - @param how_many_mfv How many frequent values to output? - @param get_estimates Should the summmary information be estimated or exact? + @param schema_madlib Madlib Schema namespace + @param source_table Name of input table + @param output_table Name of output table + @param target_cols Names of specific columns for which to get summary + @param grouping_cols Names of columns on which to group-by + (no summary is provided for these columns) + @param get_distinct Should summary include distinct count + @param get_quartiles Should summary include quartile information + @param ntile_array Array for quantiles to include in summary + (each element should be in [0, 1]) + @param how_many_mfv How many frequent values to output? + @param get_estimates Should the summmary information be estimated or exact? + @param n_cols_per_run Number of columns to compute statistics in a single run """ - old_msg_level = plpy.execute( - """select setting from pg_settings where - name='client_min_messages'""")[0]['setting'] - plpy.execute("set client_min_messages to error") - - # 'Estimated', 'Exact', None - distinctify = 'Estimated' - xtileify = 'Exact' # 'Estimated' not supported at present - get_mfv_quick = True - - if not get_estimates: - distinctify = 'Exact' - #xtileify = 'Exact' - get_mfv_quick = False - - if not get_distinct: - distinctify = 'Skip' - - if (version_wrapper.is_pg_version_less_than('9.4') or - version_wrapper.is_gp_version_less_than('4.2.2')): - # PERCENTILE_CONT not available in PostgreSQL < 9.4 or GPDB < 4.2.2. - # The function is available in HAWQ 1.2.0 (even though HAWQ 1.2.0 - # is based on GPDB 4.2.0) - xtileify = 'Skip' - - # GPDB < 4.2 and PG < 9.0 passes vector as a string. - # Ensure that ntile_array is a list of floats not a string - if ntile_array: - try: - ntile_array = [float(tile) for tile in _get_vector(ntile_array)] - except ValueError: - plpy.error("""Summary -- Invalid paramter: - Percentile array has non-float data""") - - # convert comma delimited string input to a list of column names - (target_cols, grouping_cols) = _analyze_str_inputs(target_cols, grouping_cols) - - start = time() - summarizer = Summarizer( - schema_madlib, source_table, output_table, target_cols, grouping_cols, - distinctify, get_quartiles, xtileify, ntile_array, how_many_mfv, - get_mfv_quick) - summarizer.run() - end = time() - - row_count = plpy.execute( - "SELECT count(*) FROM {output_table}".format( - output_table = output_table))[0]['count'] - - plpy.execute("set client_min_messages to " + old_msg_level) + with MinWarning('error'): + # 'Estimated', 'Exact', None + distinctify = 'Estimated' + xtileify = 'Exact' # 'Estimated' not supported at present + get_mfv_quick = True + + if not get_estimates: + distinctify = 'Exact' + # xtileify = 'Exact' + get_mfv_quick = False + + if not get_distinct: + distinctify = 'Skip' + + # Ensure that ntile_array is a list of floats not a string + if ntile_array: + try: + ntile_array = [float(tile) for tile in ntile_array] + except ValueError: + plpy.error("""Summary -- Invalid paramter: + Percentile array has non-float data""") + + # convert comma delimited string input to a list of column names + (target_cols, grouping_cols) = _analyze_str_inputs(target_cols, grouping_cols) + + start = time() + summarizer = Summarizer( + schema_madlib, source_table, output_table, target_cols, grouping_cols, + distinctify, get_quartiles, xtileify, ntile_array, how_many_mfv, + get_mfv_quick, n_cols_per_run=n_cols_per_run) + summarizer.run() + end = time() + + row_count = plpy.execute( + "SELECT count(*) FROM {output_table}".format( + output_table=output_table))[0]['count'] + return (output_table, row_count, end - start) + # ----------------------------------------------------------------------- # Input parameter checks and edits # ----------------------------------------------------------------------- @@ -102,9 +92,10 @@ def _analyze_str_inputs(target_cols, grouping_cols): grouping_cols = [None] else: grouping_cols = grouping_cols.replace(' ', '').split(',') - grouping_cols.append(None) # to return summary on complete table + grouping_cols.append(None) # to return summary on complete table return (target_cols, grouping_cols) + # ----------------------------------------------------------------------- # Help messages # ----------------------------------------------------------------------- @@ -117,7 +108,7 @@ def summary_help_message(schema_madlib, message, **kwargs): return """ Usage: ----------------------------------------------------------------------- - SELECT {schema_madlib}.summary + SELECT {madlib}.summary ( source_table TEXT, -- Source table name (Required) output_table TEXT, -- Output table name (Required) @@ -133,9 +124,12 @@ def summary_help_message(schema_madlib, message, **kwargs): -- (Default: NULL - Quantile array is not included in result) how_many_mfv INTEGER, -- How many most-frequent-values (MFVs) to compute? -- (Default: 10) - get_estimates BOOLEAN -- Should we produce an estimated + get_estimates BOOLEAN, -- Should we produce an estimated -- (as opposed to an exact but slow) value for distincts and MFVs? - ) -- (Default: True) + -- (Default: True) + n_cols_per_run INTEGER -- Number of columns to collect summary statistics in + -- one pass of the data + ) ----------------------------------------------------------------------- Output table will be in following format (presence of some columns depends on the arguments): @@ -160,7 +154,7 @@ def summary_help_message(schema_madlib, message, **kwargs): - quantile_array : Percentile values corresponding to ntile_array - most_frequent_values : Most frequent values - mfv_frequencies : Frequency of the most frequent values - """.format(schema_madlib = schema_madlib) + """.format(madlib=schema_madlib) elif message is not None and message.lower() in ('example', 'examples'): return """ DROP TABLE IF EXISTS example_data; @@ -196,17 +190,17 @@ def summary_help_message(schema_madlib, message, **kwargs): SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3]); SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 2); SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 2, False); + SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 2, False, 2); """ else: - return """ + return """ 'summary' is a generic function used to produce summary statistics of any data table. The function invokes particular 'methods' from the MADlib library to provide an overview of the data. ------- For an overview on usage, run: - SELECT {schema_madlib}.summary('usage'); + SELECT {madlib}.summary('usage'); ------- For an example, run: - SELECT {schema_madlib}.summary('example') - """.format(schema_madlib = schema_madlib) - + SELECT {madlib}.summary('example') + """.format(madlib=schema_madlib) http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e05efaaa/src/ports/postgres/modules/summary/summary.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/summary/summary.sql_in b/src/ports/postgres/modules/summary/summary.sql_in index 9f56b4d..bdde214 100644 --- a/src/ports/postgres/modules/summary/summary.sql_in +++ b/src/ports/postgres/modules/summary/summary.sql_in @@ -42,7 +42,8 @@ summary ( source_table, get_quartiles, ntile_array, how_many_mfv, - get_estimates + get_estimates, + n_cols_per_run ) </pre> The \b summary() function returns a composite type containing three fields: @@ -162,31 +163,52 @@ following columns: <td>Array containing the frequency count for each of the most frequent values. </td> </tr> </table></dd> + <dt>target_columns (optional)</dt> <dd>TEXT, default NULL. A comma-separated list of columns to summarize. If NULL, summaries are produced for all columns.</dd> + <dt>grouping_cols (optional)</dt> -<dd>TEXT, default: null. A comma-separated list of columns on which to +<dd>TEXT, default: null. A comma-separated list of columns on which to group results. If NULL, summaries are produced on the complete table.</dd> -@note Please note that summary statistics are calculated for each grouping +@note Please note that summary statistics are calculated for each grouping column independently. That is, grouping columns are not combined together -as in the regular PostgreSQL style GROUP BY directive. (This was done +as in the regular PostgreSQL style GROUP BY directive. (This was done to reduce long run time and huge output table size which would otherwise -result in the case of large input tables with a lot of grouping_cols and +result in the case of large input tables with a lot of grouping_cols and target_cols specified.) + <dt>get_distinct (optional)</dt> <dd>BOOLEAN, default TRUE. If true, distinct values are counted.</dd> + <dt>get_quartiles (optional)</dt> <dd>BOOLEAN, default TRUE. If TRUE, quartiles are computed.</dd> + <dt>ntile_array (optional)</dt> <dd>FLOAT8[], default NULL. An array of quantile values to compute. If NULL, quantile values are not computed.</dd> -@note Quartile and quantile functions are not available for PostgreSQL 9.3 or -lower. If you are using PostgreSQL 9.3 or lower, the output table will not -contain these values, even if you set 'get_quartiles' = TRUE or +@note Quartile and quantile functions are not available for PostgreSQL 9.3 or +lower. If you are using PostgreSQL 9.3 or lower, the output table will not +contain these values, even if you set 'get_quartiles' = TRUE or provide an array of quantile values for the parameter 'ntile_array'. + <dt>how_many_mfv (optional)</dt> <dd>INTEGER, default: 10. The number of most-frequent-values to compute.</dd> + <dt>get_estimates (optional)</dt> <dd>BOOLEAN, default TRUE. If TRUE, estimated values are produced for distinct values and most frequent values. If FALSE, exact values are calculated (may take longer to run depending on data size).</dd> + +<dt>n_cols_per_run (optional)</dt> +<dd>INTEGER, default: 15. The number of columns to collect summary statistics in +one pass of the data. +This parameter determines the number of passes through the data. For e.g., +with a total of 40 columns to summarize and 'n_cols_per_run = 15', there will be +3 passes through the data, with each pass summarizing a maximum of 15 columns. +@note This parameter should be used with caution. Increasing this parameter could +decrease the total run time (if number of passes decreases), but will increase +the memory consumption during each run. Since Postgresql limits the memory available +for a single aggregate run, this increased memory consumption could result in +out-of-memory termination error. + +</dd> </DL> @@ -294,15 +316,15 @@ string should be double-quoted; in this case the input would be '"MyTable"'). in a slow but exact method. The most frequent values are computed using a faithful implementation that preserves the approximation guarantees of the Cormode/Muthukrishnan method (more information in \ref grp_mfvsketch). -- Summary statistics are calculated for each grouping +- Summary statistics are calculated for each grouping column independently. That is, grouping columns are not combined together -as in the regular PostgreSQL style GROUP BY directive. (This was done +as in the regular PostgreSQL style GROUP BY directive. (This was done to reduce long run time and huge output table size which would otherwise -result in the case of large input tables with a lot of grouping_cols and +result in the case of large input tables with a lot of grouping_cols and target_cols specified.) -- Quartile and quantile functions are not available for PostgreSQL 9.3 or -lower. If you are using PostgreSQL 9.3 or lower, the output table will not -contain these values, even if you set 'get_quartiles' = TRUE or +- Quartile and quantile functions are not available for PostgreSQL 9.3 or +lower. If you are using PostgreSQL 9.3 or lower, the output table will not +contain these values, even if you set 'get_quartiles' = TRUE or provide an array of quantile values for the parameter 'ntile_array'. @@ -338,6 +360,7 @@ CREATE TYPE MADLIB_SCHEMA.summary_result AS * @param ntile_array Array of percentiles to compute * @param how_many_mfv How many most frequent values to compute? * @param get_estimates Should distinct counts be an estimated (faster) or exact count? + * @param n_cols_per_run Number of columns to use per run of summary * * @usage * @@ -346,7 +369,7 @@ CREATE TYPE MADLIB_SCHEMA.summary_result AS * '<em>target_cols</em>', '<em>grouping_cols</em>', * '<em>get_distinct</em>', '<em>get_quartiles</em>', * '<em>ntile_array</em>', '<em>how_many_mfv</em>', - * '<em>get_estimates</em>' + * '<em>get_estimates</em>', '<em>n_cols_per_run</em>' * ); * SELECT * FROM '<em>output_table</em>' * </pre> @@ -362,14 +385,16 @@ MADLIB_SCHEMA.summary get_quartiles BOOLEAN, -- Are quartiles required ntile_array FLOAT8[], -- Array of quantiles to compute how_many_mfv INTEGER, -- How many most frequent values to compute? - get_estimates BOOLEAN -- Should we produce exact or estimated + get_estimates BOOLEAN, -- Should we produce exact or estimated -- values for distinct computation + n_cols_per_run INTEGER -- Number of columns to use per run of summary ) RETURNS MADLIB_SCHEMA.summary_result AS $$ PythonFunctionBodyOnly(`summary', `summary') return summary.summary( schema_madlib, source_table, output_table, target_cols, grouping_cols, - get_distinct, get_quartiles, ntile_array, how_many_mfv, get_estimates) + get_distinct, get_quartiles, ntile_array, how_many_mfv, + get_estimates, n_cols_per_run) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -386,11 +411,30 @@ MADLIB_SCHEMA.summary get_distinct BOOLEAN, get_quartiles BOOLEAN, ntile_array FLOAT8[], + how_many_mfv INTEGER, + get_estimates BOOLEAN +) +RETURNS MADLIB_SCHEMA.summary_result AS $$ + SELECT MADLIB_SCHEMA.summary( + $1, $2, $3, $4, $5, $6, $7, $8, $9, 15) +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION +MADLIB_SCHEMA.summary +( + source_table TEXT, + output_table TEXT, + target_cols TEXT, + grouping_cols TEXT, + get_distinct BOOLEAN, + get_quartiles BOOLEAN, + ntile_array FLOAT8[], how_many_mfv INTEGER ) RETURNS MADLIB_SCHEMA.summary_result AS $$ SELECT MADLIB_SCHEMA.summary( - $1, $2, $3, $4, $5, $6, $7, $8, True) + $1, $2, $3, $4, $5, $6, $7, $8, True, 15) $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -407,7 +451,7 @@ MADLIB_SCHEMA.summary ) RETURNS MADLIB_SCHEMA.summary_result AS $$ SELECT MADLIB_SCHEMA.summary( - $1, $2, $3, $4, $5, $6, $7, 10, True) + $1, $2, $3, $4, $5, $6, $7, 10, True, 15) $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -423,7 +467,7 @@ MADLIB_SCHEMA.summary ) RETURNS MADLIB_SCHEMA.summary_result AS $$ SELECT MADLIB_SCHEMA.summary( - $1, $2, $3, $4, $5, $6, NULL, 10, True) + $1, $2, $3, $4, $5, $6, NULL, 10, True, 15) $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -438,7 +482,7 @@ MADLIB_SCHEMA.summary ) RETURNS MADLIB_SCHEMA.summary_result AS $$ SELECT MADLIB_SCHEMA.summary( - $1, $2, $3, $4, $5, True, NULL, 10, True) + $1, $2, $3, $4, $5, True, NULL, 10, True, 15) $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -452,7 +496,7 @@ MADLIB_SCHEMA.summary ) RETURNS MADLIB_SCHEMA.summary_result AS $$ SELECT MADLIB_SCHEMA.summary( - $1, $2, $3, $4, True, True, NULL, 10, True) + $1, $2, $3, $4, True, True, NULL, 10, True, 15) $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -465,7 +509,7 @@ MADLIB_SCHEMA.summary ) RETURNS MADLIB_SCHEMA.summary_result AS $$ SELECT MADLIB_SCHEMA.summary( - $1, $2, $3, NULL, True, True, NULL, 10, True) + $1, $2, $3, NULL, True, True, NULL, 10, True, 15) $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -477,7 +521,7 @@ MADLIB_SCHEMA.summary ) RETURNS MADLIB_SCHEMA.summary_result AS $$ SELECT MADLIB_SCHEMA.summary( - $1, $2, NULL, NULL, True, True, NULL, 10, True) + $1, $2, NULL, NULL, True, True, NULL, 10, True, 15) $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e05efaaa/src/ports/postgres/modules/summary/test/summary.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/summary/test/summary.sql_in b/src/ports/postgres/modules/summary/test/summary.sql_in index e4f3f65..fc4b787 100644 --- a/src/ports/postgres/modules/summary/test/summary.sql_in +++ b/src/ports/postgres/modules/summary/test/summary.sql_in @@ -61,3 +61,6 @@ SELECT * from example_data_summary; DROP TABLE IF EXISTS example_data_summary; SELECT summary('example_data', 'example_data_summary', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 10, False); SELECT * from example_data_summary; +DROP TABLE IF EXISTS example_data_summary; +SELECT summary('example_data', 'example_data_summary', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 10, False, 1); +SELECT * from example_data_summary;