Repository: incubator-madlib Updated Branches: refs/heads/master a17d63c9b -> 0b5000ab0
Sample: Add stratified sampling JIRA: MADLIB-986 Add stratified sampling with the following options. - With or without grouping - With or without replacement - A specific set of target columns or all of them Closes #143 Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/0b5000ab Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/0b5000ab Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/0b5000ab Branch: refs/heads/master Commit: 0b5000ab042e8346545cf1e9984b005869865ebf Parents: a17d63c Author: Orhan Kislal <okis...@pivotal.io> Authored: Wed Jun 21 16:07:08 2017 -0700 Committer: Orhan Kislal <okis...@pivotal.io> Committed: Tue Jun 27 18:14:34 2017 -0700 ---------------------------------------------------------------------- doc/mainpage.dox.in | 3 + src/config/Modules.yml | 1 + .../postgres/modules/sample/__init__.py_in | 0 .../modules/sample/stratified_sample.py_in | 306 +++++++++++++++++++ .../modules/sample/stratified_sample.sql_in | 248 +++++++++++++++ .../sample/test/stratified_sample.sql_in | 75 +++++ 6 files changed, 633 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b5000ab/doc/mainpage.dox.in ---------------------------------------------------------------------- diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index 407946e..274426a 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -248,6 +248,9 @@ complete matrix stored as a distributed table. @defgroup grp_pmml PMML Export @ingroup grp_utility_functions + @defgroup grp_strs Stratified Sampling + @ingroup grp_utility_functions + @defgroup grp_sessionize Sessionize @ingroup grp_utility_functions http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b5000ab/src/config/Modules.yml ---------------------------------------------------------------------- diff --git a/src/config/Modules.yml b/src/config/Modules.yml index d5c336e..b70c8bd 100644 --- a/src/config/Modules.yml +++ b/src/config/Modules.yml @@ -15,6 +15,7 @@ modules: - name: glm depends: ['utilities'] - name: graph + depends: ['utilities'] - name: kmeans depends: ['array_ops', 'svec_util', 'sample'] - name: knn http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b5000ab/src/ports/postgres/modules/sample/__init__.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/sample/__init__.py_in b/src/ports/postgres/modules/sample/__init__.py_in new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b5000ab/src/ports/postgres/modules/sample/stratified_sample.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/sample/stratified_sample.py_in b/src/ports/postgres/modules/sample/stratified_sample.py_in new file mode 100644 index 0000000..e7762ef --- /dev/null +++ b/src/ports/postgres/modules/sample/stratified_sample.py_in @@ -0,0 +1,306 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.utilities import split_quoted_delimited_str +from utilities.validate_args import table_exists +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import table_is_empty +from utilities.validate_args import get_expr_type +from utilities.validate_args import get_cols +from graph.graph_utils import _check_groups +from graph.graph_utils import _grp_from_table + +m4_changequote(`<!', `!>') + +def stratified_sample(schema_madlib, source_table, output_table, proportion, + grouping_cols, target_cols, with_replacement, **kwargs): + + """ + Stratified sampling function + Args: + @param source_table Input table name. + @param output_table Output table name. + @param proportion The ratio of sample size to the number of + records. + @param grouping_cols (Default: NULL) The columns to distinguish + each strata. + @param target_cols (Default: NULL) The columns to include in + the output. + @param with_replacement (Default: FALSE) The sampling method. + + """ + with MinWarning("warning"): + label = unique_string(desp='label') + perc = unique_string(desp='perc') + + checkg_lp = "" + window = "" + grp_by = "" + grp_from_perc = "" + grp_comma = "" + glist = None + if grouping_cols is not None: + glist = split_quoted_delimited_str(grouping_cols) + checkg_lp = " AND " + _check_groups(label,perc,glist) + window = "PARTITION BY {0}".format(grouping_cols) + grp_by = "GROUP BY {0}".format(grouping_cols) + grp_from_perc = _grp_from_table(perc,glist) + " , " + grp_comma = grouping_cols + " , " + + validate_strs(source_table, output_table, proportion, glist, target_cols) + + if target_cols is None or target_cols is '*': + cols = get_cols(source_table) + if grouping_cols is not None: + cols = [item for item in cols if item not in glist] + target_cols = " , ".join(cols) + + plpy.execute("DROP TABLE IF EXISTS {0},{1}".format(label,perc)) + if not with_replacement : + if grouping_cols: + + # Create a random label for each record + sql1 = """ CREATE TEMP TABLE {label} AS ( + SELECT {target_cols},{grouping_cols},random() AS __samp_out_label + FROM {source_table})""".format(**locals()) + plpy.execute(sql1) + + # Find the cut-off label for the given proportion + sql2 = """ CREATE TEMP TABLE {perc} AS ( + SELECT {grouping_cols}, percentile_disc({proportion}) + WITHIN GROUP (ORDER BY __samp_out_label) AS __samp_out_label + FROM {label} GROUP BY {grouping_cols})""".format(**locals()) + plpy.execute(sql2) + + # Select every record that has a label under the threshold + sql3 = """ CREATE TABLE {output_table} AS ( + SELECT {grp_from_perc} {target_cols} + FROM {label} INNER JOIN {perc} ON ( + {label}.__samp_out_label <= {perc}.__samp_out_label + {checkg_lp}) )""".format(**locals()) + plpy.execute(sql3) + else: + + # Find the number of records to select + count = plpy.execute("SELECT count(*) AS count FROM {0}". + format(source_table))[0]['count'] + count = count * proportion + + # Order randomly and select the required number of records + sql1 = """ CREATE TABLE {output_table} AS ( + SELECT {target_cols} + FROM {source_table} + ORDER BY random() + LIMIT {count})""".format(**locals()) + plpy.execute(sql1) + else: + + # Set the row number as the label for each record + # OVER clause ensures that different groups have independent + # row_numbers + sql1 = """ CREATE TEMP TABLE {label} AS ( + SELECT {grp_comma} {target_cols}, + row_number() OVER ({window}) AS __samp_out_label + FROM {source_table})""".format(**locals()) + plpy.execute(sql1) + + # Generate a series of random values for each group based on their + # individual row counts. + # These random values are independent from each other and may have + # the same value. + + sql2 = """ CREATE TEMP TABLE {perc} AS ( + SELECT {grp_comma} + GENERATE_SERIES(0,(count*{proportion}-1)::int) AS __i, + ((random()*(count-1)+1)::int) AS __samp_out_label + FROM ( + SELECT {grp_comma} count(*) AS count + FROM {source_table} {grp_by}) AS sub) + """.format(**locals()) + plpy.execute(sql2) + + # Join the two tables to get the selected samples. + # If a random value is generated twice, the join will ensure that + # the record is selected twice + sql3 = """ CREATE TABLE {output_table} AS ( + SELECT {grp_from_perc} {target_cols} + FROM {label} INNER JOIN {perc} ON ( + {label}.__samp_out_label = {perc}.__samp_out_label + {checkg_lp}) )""".format(**locals()) + plpy.execute(sql3) + plpy.execute("DROP TABLE IF EXISTS {0},{1}".format(label,perc)) + + return + +def validate_strs (source_table, output_table, proportion, glist, target_cols): + + _assert(output_table and output_table.strip().lower() not in ('null', ''), + "Sample: Invalid output table name {output_table}!".format(**locals())) + _assert(not table_exists(output_table), + "Sample: Output table already exists!".format(**locals())) + + _assert(source_table and source_table.strip().lower() not in ('null', ''), + "Sample: Invalid Source table name!".format(**locals())) + _assert(table_exists(source_table), + "Sample: Source table ({source_table}) is missing!".format(**locals())) + _assert(not table_is_empty(source_table), + "Sample: Source table ({source_table}) is empty!".format(**locals())) + + _assert(proportion > 0 and proportion < 1, + "Sample: Proportion isn't in the range (0,1)!") + + if glist is not None: + _assert(columns_exist_in_table(source_table, glist), + ("""Sample: Not all columns from {glist} are present in source"""+ + """ table ({source_table}).""").format(**locals())) + + if not (target_cols is None or target_cols is '*'): + tlist = split_quoted_delimited_str(target_cols) + _assert(columns_exist_in_table(source_table, tlist), + ("""Sample: Not all columns from {target_cols} are present in"""+ + """ edge table ({source_table})""").format(**locals())) + return + + +def stratified_sample_help(schema_madlib, message, **kwargs): + """ + Help function for stratified_sample + + Args: + @param schema_madlib + @param message: string, Help message string + @param kwargs + + Returns: + String. Help/usage information + """ + if not message: + help_string = """ +----------------------------------------------------------------------- + SUMMARY +----------------------------------------------------------------------- + +Given a table, stratified sampling returns a proportion of records for +each group (strata). It is possible to use with or without replacement +sampling methods, specify a set of target columns, and assume the +whole table is a single strata. + +For more details on function usage: + SELECT {schema_madlib}.stratified_sample('usage'); + SELECT {schema_madlib}.stratified_sample('example'); + """ + elif message.lower() in ['usage', 'help', '?']: + help_string = """ + +Given a table, stratified sampling returns a proportion of records for +each group (strata). It is possible to use with or without replacement +sampling methods, specify a set of target columns, and assume the +whole table is a single strata. + +---------------------------------------------------------------------------- + USAGE +---------------------------------------------------------------------------- + + SELECT {schema_madlib}.stratified_sample( + source_table TEXT, -- Input table name. + output_table TEXT, -- Output table name. + proportion FLOAT8, -- The ratio of sample size to the number of + -- records. + grouping_cols TEXT -- (Default: NULL) The columns to distinguish + -- each strata. + target_cols TEXT, -- (Default: NULL) The columns to include in + -- the output. + with_replacement BOOLEAN -- (Default: FALSE) The sampling method. + +If grouping_cols is NULL, the whole table is treated as a single group and +sampled accordingly. + +If target_cols is NULL or '*', all of the columns will be included in the +output table. + +If with_replacement is TRUE, each sample is independent (the same row may +be selected in the sample set more than once). Else (if with_replacement +is FALSE), a row can be selected at most once. +); +""" + elif message.lower() in ("example", "examples"): + help_string = """ +---------------------------------------------------------------------------- + EXAMPLES +---------------------------------------------------------------------------- + +-- Create an input table +DROP TABLE IF EXISTS test; + +CREATE TABLE test( + id1 INTEGER, + id2 INTEGER, + gr1 INTEGER, + gr2 INTEGER +); + +INSERT INTO test VALUES +(1,0,1,1), +(2,0,1,1), +(3,0,1,1), +(4,0,1,1), +(5,0,1,1), +(6,0,1,1), +(7,0,1,1), +(8,0,1,1), +(9,0,1,1), +(9,0,1,1), +(9,0,1,1), +(9,0,1,1), +(0,1,1,2), +(0,2,1,2), +(0,3,1,2), +(0,4,1,2), +(0,5,1,2), +(0,6,1,2), +(10,10,2,2), +(20,20,2,2), +(30,30,2,2), +(40,40,2,2), +(50,50,2,2), +(60,60,2,2), +(70,70,2,2) +; + +-- Sample without replacement +DROP TABLE IF EXISTS out; +SELECT madlib.stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2', + FALSE); +SELECT * FROM out; + +-- Sample with replacement +DROP TABLE IF EXISTS out; +SELECT madlib.stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2', + TRUE); +SELECT * FROM out; +""" + else: + help_string = "No such option. Use {schema_madlib}.graph_sssp()" + + return help_string.format(schema_madlib=schema_madlib) http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b5000ab/src/ports/postgres/modules/sample/stratified_sample.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/sample/stratified_sample.sql_in b/src/ports/postgres/modules/sample/stratified_sample.sql_in new file mode 100644 index 0000000..7327c26 --- /dev/null +++ b/src/ports/postgres/modules/sample/stratified_sample.sql_in @@ -0,0 +1,248 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * + * @file stratified_sample.sql_in + * + * @brief SQL functions for stratified sampling. + * @date 07/19/2017 + * + * @sa Given a table, stratified sampling returns a proportion of records + * for each group (strata). + * + *//* ----------------------------------------------------------------------- */ + +m4_include(`SQLCommon.m4') + + +/** +@addtogroup grp_strs + +<div class="toc"><b>Contents</b> +<ul> +<li><a href="#strs">Stratified Sampling</a></li> +<li><a href="#notes">Notes</a></li> +<li><a href="#examples">Examples</a></li> +</ul> +</div> + +@brief Provides stratified sampling function. + +Given a table, stratified sampling returns a proportion of records for +each group (strata). It is possible to use with or without replacement +sampling methods, specify a set of target columns, and assume the +whole table is a single strata. + +@anchor strs +@par Stratified Sampling + +<pre class="syntax"> +stratified_sample( source_table, + output_table, + proportion, + grouping_cols, + target_cols, + with_replacement + ) +</pre> + +\b Arguments +<dl class="arglist"> +<dt>source_table</dt> +<dd>TEXT. Name of the table containing the input data. Must contain the +columns specified in the 'groupinhg_cols' and target_cols' parameters +below.</dd> +<dt>output_table</dt> +<dd>TEXT. Name of the table to store the sampled records.</dd> +<dt>proportion</dt> +<dd>FLOAT8. The ratio of sample size to the number of records.</dd> +<dt>grouping_cols</dt> +<dd>TEXT. (Default: NULL) The columns to distinguish each strata.</dd> +<dt>target_cols</dt> +<dd>TEXT. (Default: NULL) The columns to include in the output.</dd> +<dt>with_replacement</dt> +<dd>BOOLEAN. (Default: FALSE) The sampling method.</dd> +</dl> +@anchor notes +@par Notes + +If grouping_cols is NULL, the whole table is treated as a single group and +sampled accordingly. + +If target_cols is NULL or '*', all of the columns will be included in the +output table. + +If with_replacement is TRUE, each sample is independent (the same row may +be selected in the sample set more than once). Else (if with_replacement +is FALSE), a row can be selected at most once. +); + +@anchor examples +@par Examples + +Please note that, due to the random nature of sampling, the output records +might differ. + +-# Create an input table: +<pre class="syntax"> +DROP TABLE IF EXISTS test; +CREATE TABLE test( + id1 INTEGER, + id2 INTEGER, + gr1 INTEGER, + gr2 INTEGER +); +INSERT INTO test VALUES +(1,0,1,1), +(2,0,1,1), +(3,0,1,1), +(4,0,1,1), +(5,0,1,1), +(6,0,1,1), +(7,0,1,1), +(8,0,1,1), +(9,0,1,1), +(9,0,1,1), +(9,0,1,1), +(9,0,1,1), +(0,1,1,2), +(0,2,1,2), +(0,3,1,2), +(0,4,1,2), +(0,5,1,2), +(0,6,1,2), +(10,10,2,2), +(20,20,2,2), +(30,30,2,2), +(40,40,2,2), +(50,50,2,2), +(60,60,2,2), +(70,70,2,2); +</pre> + +-# Sample without replacement +<pre class="syntax"> +DROP TABLE IF EXISTS out; +SELECT madlib.stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2', FALSE); +SELECT * FROM out ORDER BY gr1,gr2,id1,id2; +</pre> +<pre class="result"> + gr1 | gr2 | id1 | id2 +-----+-----+-----+----- + 1 | 1 | 2 | 0 + 1 | 1 | 4 | 0 + 1 | 1 | 7 | 0 + 1 | 1 | 8 | 0 + 1 | 1 | 9 | 0 + 1 | 1 | 9 | 0 + 1 | 2 | 0 | 2 + 1 | 2 | 0 | 3 + 1 | 2 | 0 | 4 + 2 | 2 | 20 | 20 + 2 | 2 | 30 | 30 + 2 | 2 | 40 | 40 + 2 | 2 | 60 | 60 +(13 rows) +</pre> + +-# Sample with replacement +<pre class="syntax"> +DROP TABLE IF EXISTS out; +SELECT madlib.stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2', TRUE); +SELECT * FROM out ORDER BY gr1,gr2,id1,id2; +</pre> +<pre class="result"> + gr1 | gr2 | id1 | id2 +-------+-------+-------+------- + 1 | 1 | 3 | 0 + 1 | 1 | 6 | 0 + 1 | 1 | 6 | 0 + 1 | 1 | 7 | 0 + 1 | 1 | 7 | 0 + 1 | 1 | 9 | 0 + 1 | 2 | 0 | 1 + 1 | 2 | 0 | 2 + 1 | 2 | 0 | 6 + 2 | 2 | 20 | 20 + 2 | 2 | 30 | 30 + 2 | 2 | 50 | 50 + 2 | 2 | 50 | 50 +</pre> +*/ + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( + source_table TEXT, + output_table TEXT, + proportion FLOAT8, + grouping_cols TEXT, + target_cols TEXT, + with_replacement BOOLEAN +) RETURNS VOID AS $$ + PythonFunction(sample, stratified_sample, stratified_sample) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); + +------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( + source_table TEXT, + output_table TEXT, + proportion FLOAT8, + grouping_cols TEXT, + target_cols TEXT +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, $4, $5, FALSE); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( + source_table TEXT, + output_table TEXT, + proportion FLOAT8, + grouping_cols TEXT +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, $4, NULL, FALSE); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample( + source_table TEXT, + output_table TEXT, + proportion FLOAT8 +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.stratified_sample($1, $2, $3, NULL, NULL, FALSE); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); + +------------------------------------------------------------------------------- + +-- Online help +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample_help( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunction(sample, stratified_sample, stratified_sample_help) +$$ LANGUAGE plpythonu IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); + +------------------------------------------------------------------------------- + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.stratified_sample_help() +RETURNS VARCHAR AS $$ + SELECT MADLIB_SCHEMA.stratified_sample_help(''); +$$ LANGUAGE sql IMMUTABLE +m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); +------------------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/0b5000ab/src/ports/postgres/modules/sample/test/stratified_sample.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/sample/test/stratified_sample.sql_in b/src/ports/postgres/modules/sample/test/stratified_sample.sql_in new file mode 100644 index 0000000..b889260 --- /dev/null +++ b/src/ports/postgres/modules/sample/test/stratified_sample.sql_in @@ -0,0 +1,75 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + *//* ----------------------------------------------------------------------- */ + +DROP TABLE IF EXISTS test; + +CREATE TABLE test( + id1 INTEGER, + id2 INTEGER, + gr1 INTEGER, + gr2 INTEGER +); + +INSERT INTO test VALUES +(1,0,1,1), +(2,0,1,1), +(3,0,1,1), +(4,0,1,1), +(5,0,1,1), +(6,0,1,1), +(7,0,1,1), +(8,0,1,1), +(9,0,1,1), +(9,0,1,1), +(9,0,1,1), +(9,0,1,1), +(0,1,1,2), +(0,2,1,2), +(0,3,1,2), +(0,4,1,2), +(0,5,1,2), +(0,6,1,2), +(10,10,2,2), +(20,20,2,2), +(30,30,2,2), +(40,40,2,2), +(50,50,2,2), +(60,60,2,2), +(70,70,2,2) +; + +DROP TABLE IF EXISTS out; +SELECT stratified_sample('test', 'out', 0.5, NULL, 'id1,id2,gr1,gr2', FALSE); +SELECT assert(count(*) = 13, 'Wrong number of samples') FROM out; + +DROP TABLE IF EXISTS out; +SELECT stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2', FALSE); +SELECT assert(count(DISTINCT (id1,id2)) = 3, 'Wrong number of samples') +FROM out WHERE gr1 = 1 AND gr2 = 2; + +DROP TABLE IF EXISTS out; +SELECT stratified_sample('test', 'out', 0.5, NULL, 'id1,id2,gr1,gr2', TRUE); +SELECT assert(count(*) = 13, 'Wrong number of samples') FROM out; + +DROP TABLE IF EXISTS out; +SELECT stratified_sample('test', 'out', 0.5, 'gr1,gr2', 'id1,id2', TRUE); +SELECT assert(count(*) = 6, 'Wrong number of samples') +FROM out WHERE gr1 = 1 AND gr2 = 1;