Github user kaknikhil commented on a diff in the pull request:
https://github.com/apache/madlib/pull/218#discussion_r157799149
--- Diff: src/ports/postgres/modules/sample/test/balance_sample.sql_in ---
@@ -0,0 +1,103 @@
+/* -----------------------------------------------------------------------
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//*
----------------------------------------------------------------------- */
+
+DROP TABLE IF EXISTS "TEST_s" cascade;
+
+CREATE TABLE "TEST_s"(
+ id1 INTEGER,
+ "ID2" INTEGER,
+ gr1 INTEGER,
+ gr2 INTEGER
+);
+
+INSERT INTO "TEST_s" VALUES
+(1,0,1,1),
+(2,0,1,1),
+(3,0,1,1),
+(4,0,1,1),
+(5,0,1,1),
+(6,0,1,1),
+(7,0,1,1),
+(8,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(9,0,1,1),
+(0,1,1,2),
+(0,2,1,2),
+(0,3,1,2),
+(0,4,1,2),
+(0,5,1,2),
+(0,6,1,2),
+(10,10,2,2),
+(20,20,2,2),
+(30,30,2,2),
+(40,40,2,2),
+(50,50,2,2),
+(60,60,2,2),
+(70,70,2,2),
+(10,10,5,5),
+(50,50,5,5),
+(88,88,5,5),
+(40,40,5,6),
+(50,50,5,6),
+(60,60,5,6),
+(70,70,5,6),
+(10,10,6,6),
+(60,60,6,6),
+(30,30,6,6),
+(40,40,6,6),
+(50,50,6,6),
+(60,60,6,6),
+(70,70,6,6),
+(50,50,4,2),
+(60,60,4,2),
+(70,70,4,2),
+(50,50,3,2),
+(60,60,3,2),
+(70,70,3,2)
+;
+
+--- Test for random undersampling without replacement
+DROP TABLE IF EXISTS out_s;
+SELECT balance_sample('"TEST_s"', 'out_s', 'gr1', 'undersample', NULL,
NULL, FALSE);
+SELECT assert(count(*) = 18, 'Wrong number of samples') FROM out_s;
+
+DROP TABLE IF EXISTS out_s1;
+SELECT balance_sample('"TEST_s"', 'out_s1', 'gr2', 'undersample', NULL,
NULL, FALSE);
+SELECT assert(count(*) = 12, 'Wrong number of samples') FROM out_s1;
+
+--- Test for random undersampling with replacement
+DROP TABLE IF EXISTS out_sr2;
+SELECT balance_sample('"TEST_s"', 'out_sr2', 'gr1', 'undersample', NULL,
NULL, TRUE);
+SELECT assert(sum(c) <= 18, 'Wrong number of samples') FROM
--- End diff --
Instead of checking the sum, it would be better to check that every class
has exactly 3 tuples. You can change the query to something like:
```
select count(*) from (select gr1, count(*) as c from out_sr2 group by
gr1) as foo where foo.c != 3;
```
This query should return a count of 0, i.e. no class has a number of tuples
other than 3. This applies to all the tests, both with and without replacement.
---