imatiach-msft commented on a change in pull request #21632: [SPARK-19591][ML][MLlib] Add sample weights to decision trees URL: https://github.com/apache/spark/pull/21632#discussion_r250458351
########## File path: mllib/src/test/scala/org/apache/spark/mllib/tree/ImpuritySuite.scala ########## @@ -18,23 +18,62 @@ package org.apache.spark.mllib.tree import org.apache.spark.SparkFunSuite -import org.apache.spark.mllib.tree.impurity.{EntropyAggregator, GiniAggregator} +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.tree.impurity._ /** * Test suites for `GiniAggregator` and `EntropyAggregator`. */ class ImpuritySuite extends SparkFunSuite { + + private val seed = 42 + test("Gini impurity does not support negative labels") { val gini = new GiniAggregator(2) intercept[IllegalArgumentException] { - gini.update(Array(0.0, 1.0, 2.0), 0, -1, 0.0) + gini.update(Array(0.0, 1.0, 2.0), 0, -1, 3, 0.0) } } test("Entropy does not support negative labels") { val entropy = new EntropyAggregator(2) intercept[IllegalArgumentException] { - entropy.update(Array(0.0, 1.0, 2.0), 0, -1, 0.0) + entropy.update(Array(0.0, 1.0, 2.0), 0, -1, 3, 0.0) + } + } + + test("Classification impurities are insensitive to scaling") { + val rng = new scala.util.Random(seed) + val weightedCounts = Array.fill(5)(rng.nextDouble()) + val smallWeightedCounts = weightedCounts.map(_ * 0.0001) + val largeWeightedCounts = weightedCounts.map(_ * 10000) + Seq(Gini, Entropy).foreach { impurity => + val impurity1 = impurity.calculate(weightedCounts, weightedCounts.sum) + assert(impurity.calculate(smallWeightedCounts, smallWeightedCounts.sum) + ~== impurity1 relTol 0.005) + assert(impurity.calculate(largeWeightedCounts, largeWeightedCounts.sum) + ~== impurity1 relTol 0.005) } } + test("Regression impurities are insensitive to scaling") { Review comment: done, added newline ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org