Github user orhankislal commented on a diff in the pull request: https://github.com/apache/madlib/pull/223#discussion_r161296957 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,994 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import math +import plpy +import re +from collections import defaultdict +from fractions import Fraction +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import unique_string +from utilities.validate_args import table_exists +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import table_is_empty +from utilities.validate_args import get_cols +from utilities.utilities import py_list_to_sql_string + + +m4_changequote(`<!', `!>') + +def balance_sample(schema_madlib, source_table, output_table, class_col, + class_sizes, output_table_size, grouping_cols, with_replacement, **kwargs): + + """ + Balance sampling function + Args: + @param source_table Input table name. + @param output_table Output table name. + @param class_col Name of the column containing the class to be + balanced. + @param class_size Parameter to define the size of the different + class values. 
+ @param output_table_size Desired size of the output data set. + @param grouping_cols The columns columns that defines the grouping. + @param with_replacement The sampling method. + + """ + with MinWarning("warning"): + + class_counts = unique_string(desp='class_counts') + desired_sample_per_class = unique_string(desp='desired_sample_per_class') + desired_counts = unique_string(desp='desired_counts') + + if not class_sizes or class_sizes.strip().lower() in ('null', ''): + class_sizes = 'uniform' + + _validate_strs(source_table, output_table, class_col, class_sizes, + output_table_size, grouping_cols, with_replacement) + + source_table_columns = ','.join(get_cols(source_table)) + grp_by = "GROUP BY {0}".format(class_col) + + _create_frequency_distribution(class_counts, source_table, class_col) + temp_views = [class_counts] + + if class_sizes.lower() == 'undersample' and not with_replacement: + """ + Random undersample without replacement. + Randomly order the rows and give a unique (per class) + identifier to each one. + Select rows that have identifiers under the target limit. + """ + _undersampling_with_no_replacement(source_table, output_table, class_col, + class_sizes, output_table_size, grouping_cols, with_replacement, + class_counts, source_table_columns) + + _delete_temp_views(temp_views) + return + + """ + Create views for true and desired sample sizes of classes + """ + """ + include_unsampled_classes tracks is unsampled classes are desired or not. + include_unsampled_classes is always true in output_table_size Null cases but changes given values of desired sample class sizes in comma-delimited classsize paramter. --- End diff -- Possible typo in the first sentence of this comment: should "tracks is unsampled classes" read "tracks if unsampled classes" (is -> if)?
---