Repository: spark Updated Branches: refs/heads/master 4c0ff5f58 -> ae4b91d1f
[SPARK-20039][ML] rename ChiSquare to ChiSquareTest ## What changes were proposed in this pull request? I realized that since ChiSquare is in the package stat, it's pretty unclear if it's the hypothesis test, distribution, or what. This PR renames it to ChiSquareTest to clarify this. ## How was this patch tested? Existing unit tests Author: Joseph K. Bradley <jos...@databricks.com> Closes #17368 from jkbradley/SPARK-20039. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ae4b91d1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ae4b91d1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ae4b91d1 Branch: refs/heads/master Commit: ae4b91d1f5734b9d66f3b851b71b3c179f3cdd76 Parents: 4c0ff5f Author: Joseph K. Bradley <jos...@databricks.com> Authored: Tue Mar 21 11:01:25 2017 -0700 Committer: Joseph K. Bradley <jos...@databricks.com> Committed: Tue Mar 21 11:01:25 2017 -0700 ---------------------------------------------------------------------- .../org/apache/spark/ml/stat/ChiSquare.scala | 81 ---------------- .../apache/spark/ml/stat/ChiSquareTest.scala | 81 ++++++++++++++++ .../apache/spark/ml/stat/ChiSquareSuite.scala | 98 -------------------- .../spark/ml/stat/ChiSquareTestSuite.scala | 98 ++++++++++++++++++++ 4 files changed, 179 insertions(+), 179 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/ae4b91d1/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala deleted file mode 100644 index c3865ce..0000000 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.stat - -import org.apache.spark.annotation.{Experimental, Since} -import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} -import org.apache.spark.ml.util.SchemaUtils -import org.apache.spark.mllib.linalg.{Vectors => OldVectors} -import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} -import org.apache.spark.mllib.stat.{Statistics => OldStatistics} -import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.col - - -/** - * :: Experimental :: - * - * Chi-square hypothesis testing for categorical data. - * - * See <a href="http://en.wikipedia.org/wiki/Chi-squared_test">Wikipedia</a> for more information - * on the Chi-squared test. - */ -@Experimental -@Since("2.2.0") -object ChiSquare { - - /** Used to construct output schema of tests */ - private case class ChiSquareResult( - pValues: Vector, - degreesOfFreedom: Array[Int], - statistics: Vector) - - /** - * Conduct Pearson's independence test for every feature against the label across the input RDD. - * For each feature, the (feature, label) pairs are converted into a contingency matrix for which - * the Chi-squared statistic is computed. All label and feature values must be categorical. - * - * The null hypothesis is that the occurrence of the outcomes is statistically independent. - * - * @param dataset DataFrame of categorical labels and categorical features. - * Real-valued features will be treated as categorical for each distinct value. - * @param featuresCol Name of features column in dataset, of type `Vector` (`VectorUDT`) - * @param labelCol Name of label column in dataset, of any numerical type - * @return DataFrame containing the test result for every feature against the label. - * This DataFrame will contain a single Row with the following fields: - * - `pValues: Vector` - * - `degreesOfFreedom: Array[Int]` - * - `statistics: Vector` - * Each of these fields has one value per feature. - */ - @Since("2.2.0") - def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = { - val spark = dataset.sparkSession - import spark.implicits._ - - SchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT) - SchemaUtils.checkNumericType(dataset.schema, labelCol) - val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol)).as[(Double, Vector)] - .rdd.map { case (label, features) => OldLabeledPoint(label, OldVectors.fromML(features)) } - val testResults = OldStatistics.chiSqTest(rdd) - val pValues: Vector = Vectors.dense(testResults.map(_.pValue)) - val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom) - val statistics: Vector = Vectors.dense(testResults.map(_.statistic)) - spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics))) - } -} http://git-wip-us.apache.org/repos/asf/spark/blob/ae4b91d1/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquareTest.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquareTest.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquareTest.scala new file mode 100644 index 0000000..21eba9a --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquareTest.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.stat + +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} +import org.apache.spark.ml.util.SchemaUtils +import org.apache.spark.mllib.linalg.{Vectors => OldVectors} +import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} +import org.apache.spark.mllib.stat.{Statistics => OldStatistics} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.col + + +/** + * :: Experimental :: + * + * Chi-square hypothesis testing for categorical data. + * + * See <a href="http://en.wikipedia.org/wiki/Chi-squared_test">Wikipedia</a> for more information + * on the Chi-squared test. + */ +@Experimental +@Since("2.2.0") +object ChiSquareTest { + + /** Used to construct output schema of tests */ + private case class ChiSquareResult( + pValues: Vector, + degreesOfFreedom: Array[Int], + statistics: Vector) + + /** + * Conduct Pearson's independence test for every feature against the label across the input RDD. + * For each feature, the (feature, label) pairs are converted into a contingency matrix for which + * the Chi-squared statistic is computed. All label and feature values must be categorical. + * + * The null hypothesis is that the occurrence of the outcomes is statistically independent. + * + * @param dataset DataFrame of categorical labels and categorical features. + * Real-valued features will be treated as categorical for each distinct value. + * @param featuresCol Name of features column in dataset, of type `Vector` (`VectorUDT`) + * @param labelCol Name of label column in dataset, of any numerical type + * @return DataFrame containing the test result for every feature against the label. + * This DataFrame will contain a single Row with the following fields: + * - `pValues: Vector` + * - `degreesOfFreedom: Array[Int]` + * - `statistics: Vector` + * Each of these fields has one value per feature. + */ + @Since("2.2.0") + def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = { + val spark = dataset.sparkSession + import spark.implicits._ + + SchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT) + SchemaUtils.checkNumericType(dataset.schema, labelCol) + val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol)).as[(Double, Vector)] + .rdd.map { case (label, features) => OldLabeledPoint(label, OldVectors.fromML(features)) } + val testResults = OldStatistics.chiSqTest(rdd) + val pValues: Vector = Vectors.dense(testResults.map(_.pValue)) + val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom) + val statistics: Vector = Vectors.dense(testResults.map(_.statistic)) + spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics))) + } +} http://git-wip-us.apache.org/repos/asf/spark/blob/ae4b91d1/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareSuite.scala deleted file mode 100644 index b4bed82..0000000 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareSuite.scala +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.stat - -import java.util.Random - -import org.apache.spark.{SparkException, SparkFunSuite} -import org.apache.spark.ml.feature.LabeledPoint -import org.apache.spark.ml.linalg.{Vector, Vectors} -import org.apache.spark.ml.util.DefaultReadWriteTest -import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.stat.test.ChiSqTest -import org.apache.spark.mllib.util.MLlibTestSparkContext - -class ChiSquareSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { - - import testImplicits._ - - test("test DataFrame of labeled points") { - // labels: 1.0 (2 / 6), 0.0 (4 / 6) - // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6) - // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6) - val data = Seq( - LabeledPoint(0.0, Vectors.dense(0.5, 10.0)), - LabeledPoint(0.0, Vectors.dense(1.5, 20.0)), - LabeledPoint(1.0, Vectors.dense(1.5, 30.0)), - LabeledPoint(0.0, Vectors.dense(3.5, 30.0)), - LabeledPoint(0.0, Vectors.dense(3.5, 40.0)), - LabeledPoint(1.0, Vectors.dense(3.5, 40.0))) - for (numParts <- List(2, 4, 6, 8)) { - val df = spark.createDataFrame(sc.parallelize(data, numParts)) - val chi = ChiSquare.test(df, "features", "label") - val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = - chi.select("pValues", "degreesOfFreedom", "statistics") - .as[(Vector, Array[Int], Vector)].head() - assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4) - assert(degreesOfFreedom === Array(2, 3)) - assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4) - } - } - - test("large number of features (SPARK-3087)") { - // Test that the right number of results is returned - val numCols = 1001 - val sparseData = Array( - LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))), - LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0))))) - val df = spark.createDataFrame(sparseData) - val chi = ChiSquare.test(df, "features", "label") - val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = - chi.select("pValues", "degreesOfFreedom", "statistics") - .as[(Vector, Array[Int], Vector)].head() - assert(pValues.size === numCols) - assert(degreesOfFreedom.length === numCols) - assert(statistics.size === numCols) - assert(pValues(1000) !== null) // SPARK-3087 - } - - test("fail on continuous features or labels") { - val tooManyCategories: Int = 100000 - assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " + - "tooManyCategories be large enough to cause ChiSqTest to throw an exception.") - - val random = new Random(11L) - val continuousLabel = Seq.fill(tooManyCategories)( - LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2)))) - withClue("ChiSquare should throw an exception when given a continuous-valued label") { - intercept[SparkException] { - val df = spark.createDataFrame(continuousLabel) - ChiSquare.test(df, "features", "label") - } - } - val continuousFeature = Seq.fill(tooManyCategories)( - LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble()))) - withClue("ChiSquare should throw an exception when given continuous-valued features") { - intercept[SparkException] { - val df = spark.createDataFrame(continuousFeature) - ChiSquare.test(df, "features", "label") - } - } - } -} http://git-wip-us.apache.org/repos/asf/spark/blob/ae4b91d1/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareTestSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareTestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareTestSuite.scala new file mode 100644 index 0000000..2d6aad0 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareTestSuite.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.stat + +import java.util.Random + +import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.ml.feature.LabeledPoint +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.stat.test.ChiSqTest +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class ChiSquareTestSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + import testImplicits._ + + test("test DataFrame of labeled points") { + // labels: 1.0 (2 / 6), 0.0 (4 / 6) + // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6) + // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6) + val data = Seq( + LabeledPoint(0.0, Vectors.dense(0.5, 10.0)), + LabeledPoint(0.0, Vectors.dense(1.5, 20.0)), + LabeledPoint(1.0, Vectors.dense(1.5, 30.0)), + LabeledPoint(0.0, Vectors.dense(3.5, 30.0)), + LabeledPoint(0.0, Vectors.dense(3.5, 40.0)), + LabeledPoint(1.0, Vectors.dense(3.5, 40.0))) + for (numParts <- List(2, 4, 6, 8)) { + val df = spark.createDataFrame(sc.parallelize(data, numParts)) + val chi = ChiSquareTest.test(df, "features", "label") + val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = + chi.select("pValues", "degreesOfFreedom", "statistics") + .as[(Vector, Array[Int], Vector)].head() + assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4) + assert(degreesOfFreedom === Array(2, 3)) + assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4) + } + } + + test("large number of features (SPARK-3087)") { + // Test that the right number of results is returned + val numCols = 1001 + val sparseData = Array( + LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))), + LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0))))) + val df = spark.createDataFrame(sparseData) + val chi = ChiSquareTest.test(df, "features", "label") + val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = + chi.select("pValues", "degreesOfFreedom", "statistics") + .as[(Vector, Array[Int], Vector)].head() + assert(pValues.size === numCols) + assert(degreesOfFreedom.length === numCols) + assert(statistics.size === numCols) + assert(pValues(1000) !== null) // SPARK-3087 + } + + test("fail on continuous features or labels") { + val tooManyCategories: Int = 100000 + assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " + + "tooManyCategories be large enough to cause ChiSqTest to throw an exception.") + + val random = new Random(11L) + val continuousLabel = Seq.fill(tooManyCategories)( + LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2)))) + withClue("ChiSquare should throw an exception when given a continuous-valued label") { + intercept[SparkException] { + val df = spark.createDataFrame(continuousLabel) + ChiSquareTest.test(df, "features", "label") + } + } + val continuousFeature = Seq.fill(tooManyCategories)( + LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble()))) + withClue("ChiSquare should throw an exception when given continuous-valued features") { + intercept[SparkException] { + val df = spark.createDataFrame(continuousFeature) + ChiSquareTest.test(df, "features", "label") + } + } + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org