Repository: spark
Updated Branches:
  refs/heads/master 299262384 -> dda6d9f40
[SPARK-7438] [SPARK CORE] Fixed validation of relativeSD in countApproxDistinct Author: Vinod K C <vinod...@huawei.com> Closes #5974 from vinodkc/fix_countApproxDistinct_Validation and squashes the following commits: 3a3d59c [Vinod K C] Reverted removal of validation relativeSD<0.000017 799976e [Vinod K C] Removed testcase to assert IAE when relativeSD>3.7 8ddbfae [Vinod K C] Remove blank line b1b00a3 [Vinod K C] Removed relativeSD validation from python API,RDD.scala will do validation 122d378 [Vinod K C] Fixed validation of relativeSD in countApproxDistinct Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dda6d9f4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dda6d9f4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dda6d9f4 Branch: refs/heads/master Commit: dda6d9f4045fa2d1265abffa9d7dbdc967448417 Parents: 2992623 Author: Vinod K C <vinod...@huawei.com> Authored: Sat May 9 10:03:15 2015 +0100 Committer: Sean Owen <so...@cloudera.com> Committed: Sat May 9 10:03:15 2015 +0100 ---------------------------------------------------------------------- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 7 ++++--- core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala | 2 ++ python/pyspark/rdd.py | 2 -- python/pyspark/tests.py | 1 - 4 files changed, 6 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/dda6d9f4/core/src/main/scala/org/apache/spark/rdd/RDD.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 8baf199..7dad30e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1161,8 +1161,8 @@ abstract class RDD[T: ClassTag]( */ @Experimental def countApproxDistinct(p: 
Int, sp: Int): Long = withScope { - require(p >= 4, s"p ($p) must be at least 4") - require(sp <= 32, s"sp ($sp) cannot be greater than 32") + require(p >= 4, s"p ($p) must be >= 4") + require(sp <= 32, s"sp ($sp) must be <= 32") require(sp == 0 || p <= sp, s"p ($p) cannot be greater than sp ($sp)") val zeroCounter = new HyperLogLogPlus(p, sp) aggregate(zeroCounter)( @@ -1187,8 +1187,9 @@ abstract class RDD[T: ClassTag]( * It must be greater than 0.000017. */ def countApproxDistinct(relativeSD: Double = 0.05): Long = withScope { + require(relativeSD > 0.000017, s"accuracy ($relativeSD) must be greater than 0.000017") val p = math.ceil(2.0 * math.log(1.054 / relativeSD) / math.log(2)).toInt - countApproxDistinct(p, 0) + countApproxDistinct(if (p < 4) 4 else p, 0) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/dda6d9f4/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala ---------------------------------------------------------------------- diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index ef8c36a..afc11bd 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -89,6 +89,8 @@ class RDDSuite extends FunSuite with SharedSparkContext { val simpleRdd = sc.makeRDD(uniformDistro, 10) assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2) assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1) + assert(error(simpleRdd.countApproxDistinct(0.02), size) < 0.1) + assert(error(simpleRdd.countApproxDistinct(0.5), size) < 0.22) } test("SparkContext.union") { http://git-wip-us.apache.org/repos/asf/spark/blob/dda6d9f4/python/pyspark/rdd.py ---------------------------------------------------------------------- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index d254deb..545c5ad 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -2239,8 +2239,6 @@ class RDD(object): """ 
if relativeSD < 0.000017: raise ValueError("relativeSD should be greater than 0.000017") - if relativeSD > 0.37: - raise ValueError("relativeSD should be smaller than 0.37") # the hash space in Java is 2^32 hashRDD = self.map(lambda x: portable_hash(x) & 0xFFFFFFFF) return hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD) http://git-wip-us.apache.org/repos/asf/spark/blob/dda6d9f4/python/pyspark/tests.py ---------------------------------------------------------------------- diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index ea63a39..09de4d1 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -644,7 +644,6 @@ class RDDTests(ReusedPySparkTestCase): self.assertTrue(18 < rdd.map(lambda x: (x, -x)).countApproxDistinct() < 22) self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.00000001)) - self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.5)) def test_histogram(self): # empty --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org