Repository: spark
Updated Branches:
  refs/heads/master 4c0ff5f58 -> ae4b91d1f


[SPARK-20039][ML] rename ChiSquare to ChiSquareTest

## What changes were proposed in this pull request?

I realized that since ChiSquare is in the package stat, it's pretty unclear if 
it's the hypothesis test, distribution, or what. This PR renames it to 
ChiSquareTest to clarify this.

## How was this patch tested?

Existing unit tests

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #17368 from jkbradley/SPARK-20039.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ae4b91d1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ae4b91d1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ae4b91d1

Branch: refs/heads/master
Commit: ae4b91d1f5734b9d66f3b851b71b3c179f3cdd76
Parents: 4c0ff5f
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Tue Mar 21 11:01:25 2017 -0700
Committer: Joseph K. Bradley <jos...@databricks.com>
Committed: Tue Mar 21 11:01:25 2017 -0700

----------------------------------------------------------------------
 .../org/apache/spark/ml/stat/ChiSquare.scala    | 81 ----------------
 .../apache/spark/ml/stat/ChiSquareTest.scala    | 81 ++++++++++++++++
 .../apache/spark/ml/stat/ChiSquareSuite.scala   | 98 --------------------
 .../spark/ml/stat/ChiSquareTestSuite.scala      | 98 ++++++++++++++++++++
 4 files changed, 179 insertions(+), 179 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/ae4b91d1/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala
deleted file mode 100644
index c3865ce..0000000
--- a/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.ml.stat
-
-import org.apache.spark.annotation.{Experimental, Since}
-import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
-import org.apache.spark.ml.util.SchemaUtils
-import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
-import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
-import org.apache.spark.mllib.stat.{Statistics => OldStatistics}
-import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.functions.col
-
-
-/**
- * :: Experimental ::
- *
- * Chi-square hypothesis testing for categorical data.
- *
- * See <a href="http://en.wikipedia.org/wiki/Chi-squared_test">Wikipedia</a> for more information
- * on the Chi-squared test.
- */
-@Experimental
-@Since("2.2.0")
-object ChiSquare {
-
-  /** Used to construct output schema of tests */
-  private case class ChiSquareResult(
-      pValues: Vector,
-      degreesOfFreedom: Array[Int],
-      statistics: Vector)
-
-  /**
-   * Conduct Pearson's independence test for every feature against the label across the input RDD.
-   * For each feature, the (feature, label) pairs are converted into a contingency matrix for which
-   * the Chi-squared statistic is computed. All label and feature values must be categorical.
-   *
-   * The null hypothesis is that the occurrence of the outcomes is statistically independent.
-   *
-   * @param dataset  DataFrame of categorical labels and categorical features.
-   *                 Real-valued features will be treated as categorical for each distinct value.
-   * @param featuresCol  Name of features column in dataset, of type `Vector` (`VectorUDT`)
-   * @param labelCol  Name of label column in dataset, of any numerical type
-   * @return DataFrame containing the test result for every feature against the label.
-   *         This DataFrame will contain a single Row with the following fields:
-   *          - `pValues: Vector`
-   *          - `degreesOfFreedom: Array[Int]`
-   *          - `statistics: Vector`
-   *         Each of these fields has one value per feature.
-   */
-  @Since("2.2.0")
-  def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = {
-    val spark = dataset.sparkSession
-    import spark.implicits._
-
-    SchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT)
-    SchemaUtils.checkNumericType(dataset.schema, labelCol)
-    val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol)).as[(Double, Vector)]
-      .rdd.map { case (label, features) => OldLabeledPoint(label, OldVectors.fromML(features)) }
-    val testResults = OldStatistics.chiSqTest(rdd)
-    val pValues: Vector = Vectors.dense(testResults.map(_.pValue))
-    val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom)
-    val statistics: Vector = Vectors.dense(testResults.map(_.statistic))
-    spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics)))
-  }
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/ae4b91d1/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquareTest.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquareTest.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquareTest.scala
new file mode 100644
index 0000000..21eba9a
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquareTest.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.stat
+
+import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
+import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
+import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
+import org.apache.spark.mllib.stat.{Statistics => OldStatistics}
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions.col
+
+
+/**
+ * :: Experimental ::
+ *
+ * Chi-square hypothesis testing for categorical data.
+ *
+ * See <a href="http://en.wikipedia.org/wiki/Chi-squared_test">Wikipedia</a> for more information
+ * on the Chi-squared test.
+ */
+@Experimental
+@Since("2.2.0")
+object ChiSquareTest {
+
+  /** Used to construct output schema of tests */
+  private case class ChiSquareResult(
+      pValues: Vector,
+      degreesOfFreedom: Array[Int],
+      statistics: Vector)
+
+  /**
+   * Conduct Pearson's independence test for every feature against the label across the input RDD.
+   * For each feature, the (feature, label) pairs are converted into a contingency matrix for which
+   * the Chi-squared statistic is computed. All label and feature values must be categorical.
+   *
+   * The null hypothesis is that the occurrence of the outcomes is statistically independent.
+   *
+   * @param dataset  DataFrame of categorical labels and categorical features.
+   *                 Real-valued features will be treated as categorical for each distinct value.
+   * @param featuresCol  Name of features column in dataset, of type `Vector` (`VectorUDT`)
+   * @param labelCol  Name of label column in dataset, of any numerical type
+   * @return DataFrame containing the test result for every feature against the label.
+   *         This DataFrame will contain a single Row with the following fields:
+   *          - `pValues: Vector`
+   *          - `degreesOfFreedom: Array[Int]`
+   *          - `statistics: Vector`
+   *         Each of these fields has one value per feature.
+   */
+  @Since("2.2.0")
+  def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = {
+    val spark = dataset.sparkSession
+    import spark.implicits._
+
+    SchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT)
+    SchemaUtils.checkNumericType(dataset.schema, labelCol)
+    val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol)).as[(Double, Vector)]
+      .rdd.map { case (label, features) => OldLabeledPoint(label, OldVectors.fromML(features)) }
+    val testResults = OldStatistics.chiSqTest(rdd)
+    val pValues: Vector = Vectors.dense(testResults.map(_.pValue))
+    val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom)
+    val statistics: Vector = Vectors.dense(testResults.map(_.statistic))
+    spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics)))
+  }
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/ae4b91d1/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareSuite.scala
deleted file mode 100644
index b4bed82..0000000
--- a/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareSuite.scala
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.ml.stat
-
-import java.util.Random
-
-import org.apache.spark.{SparkException, SparkFunSuite}
-import org.apache.spark.ml.feature.LabeledPoint
-import org.apache.spark.ml.linalg.{Vector, Vectors}
-import org.apache.spark.ml.util.DefaultReadWriteTest
-import org.apache.spark.ml.util.TestingUtils._
-import org.apache.spark.mllib.stat.test.ChiSqTest
-import org.apache.spark.mllib.util.MLlibTestSparkContext
-
-class ChiSquareSuite
-  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
-
-  import testImplicits._
-
-  test("test DataFrame of labeled points") {
-    // labels: 1.0 (2 / 6), 0.0 (4 / 6)
-    // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6)
-    // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6)
-    val data = Seq(
-      LabeledPoint(0.0, Vectors.dense(0.5, 10.0)),
-      LabeledPoint(0.0, Vectors.dense(1.5, 20.0)),
-      LabeledPoint(1.0, Vectors.dense(1.5, 30.0)),
-      LabeledPoint(0.0, Vectors.dense(3.5, 30.0)),
-      LabeledPoint(0.0, Vectors.dense(3.5, 40.0)),
-      LabeledPoint(1.0, Vectors.dense(3.5, 40.0)))
-    for (numParts <- List(2, 4, 6, 8)) {
-      val df = spark.createDataFrame(sc.parallelize(data, numParts))
-      val chi = ChiSquare.test(df, "features", "label")
-      val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) =
-        chi.select("pValues", "degreesOfFreedom", "statistics")
-          .as[(Vector, Array[Int], Vector)].head()
-      assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4)
-      assert(degreesOfFreedom === Array(2, 3))
-      assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4)
-    }
-  }
-
-  test("large number of features (SPARK-3087)") {
-    // Test that the right number of results is returned
-    val numCols = 1001
-    val sparseData = Array(
-      LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))),
-      LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0)))))
-    val df = spark.createDataFrame(sparseData)
-    val chi = ChiSquare.test(df, "features", "label")
-    val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) =
-      chi.select("pValues", "degreesOfFreedom", "statistics")
-        .as[(Vector, Array[Int], Vector)].head()
-    assert(pValues.size === numCols)
-    assert(degreesOfFreedom.length === numCols)
-    assert(statistics.size === numCols)
-    assert(pValues(1000) !== null)  // SPARK-3087
-  }
-
-  test("fail on continuous features or labels") {
-    val tooManyCategories: Int = 100000
-    assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " +
-      "tooManyCategories be large enough to cause ChiSqTest to throw an exception.")
-
-    val random = new Random(11L)
-    val continuousLabel = Seq.fill(tooManyCategories)(
-      LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2))))
-    withClue("ChiSquare should throw an exception when given a continuous-valued label") {
-      intercept[SparkException] {
-        val df = spark.createDataFrame(continuousLabel)
-        ChiSquare.test(df, "features", "label")
-      }
-    }
-    val continuousFeature = Seq.fill(tooManyCategories)(
-      LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble())))
-    withClue("ChiSquare should throw an exception when given continuous-valued features") {
-      intercept[SparkException] {
-        val df = spark.createDataFrame(continuousFeature)
-        ChiSquare.test(df, "features", "label")
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/ae4b91d1/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareTestSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareTestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareTestSuite.scala
new file mode 100644
index 0000000..2d6aad0
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/ml/stat/ChiSquareTestSuite.scala
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.stat
+
+import java.util.Random
+
+import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.ml.feature.LabeledPoint
+import org.apache.spark.ml.linalg.{Vector, Vectors}
+import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.ml.util.TestingUtils._
+import org.apache.spark.mllib.stat.test.ChiSqTest
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+
+class ChiSquareTestSuite
+  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
+
+  import testImplicits._
+
+  test("test DataFrame of labeled points") {
+    // labels: 1.0 (2 / 6), 0.0 (4 / 6)
+    // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6)
+    // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6)
+    val data = Seq(
+      LabeledPoint(0.0, Vectors.dense(0.5, 10.0)),
+      LabeledPoint(0.0, Vectors.dense(1.5, 20.0)),
+      LabeledPoint(1.0, Vectors.dense(1.5, 30.0)),
+      LabeledPoint(0.0, Vectors.dense(3.5, 30.0)),
+      LabeledPoint(0.0, Vectors.dense(3.5, 40.0)),
+      LabeledPoint(1.0, Vectors.dense(3.5, 40.0)))
+    for (numParts <- List(2, 4, 6, 8)) {
+      val df = spark.createDataFrame(sc.parallelize(data, numParts))
+      val chi = ChiSquareTest.test(df, "features", "label")
+      val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) =
+        chi.select("pValues", "degreesOfFreedom", "statistics")
+          .as[(Vector, Array[Int], Vector)].head()
+      assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4)
+      assert(degreesOfFreedom === Array(2, 3))
+      assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4)
+    }
+  }
+
+  test("large number of features (SPARK-3087)") {
+    // Test that the right number of results is returned
+    val numCols = 1001
+    val sparseData = Array(
+      LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))),
+      LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0)))))
+    val df = spark.createDataFrame(sparseData)
+    val chi = ChiSquareTest.test(df, "features", "label")
+    val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) =
+      chi.select("pValues", "degreesOfFreedom", "statistics")
+        .as[(Vector, Array[Int], Vector)].head()
+    assert(pValues.size === numCols)
+    assert(degreesOfFreedom.length === numCols)
+    assert(statistics.size === numCols)
+    assert(pValues(1000) !== null)  // SPARK-3087
+  }
+
+  test("fail on continuous features or labels") {
+    val tooManyCategories: Int = 100000
+    assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " +
+      "tooManyCategories be large enough to cause ChiSqTest to throw an exception.")
+
+    val random = new Random(11L)
+    val continuousLabel = Seq.fill(tooManyCategories)(
+      LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2))))
+    withClue("ChiSquare should throw an exception when given a continuous-valued label") {
+      intercept[SparkException] {
+        val df = spark.createDataFrame(continuousLabel)
+        ChiSquareTest.test(df, "features", "label")
+      }
+    }
+    val continuousFeature = Seq.fill(tooManyCategories)(
+      LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble())))
+    withClue("ChiSquare should throw an exception when given continuous-valued features") {
+      intercept[SparkException] {
+        val df = spark.createDataFrame(continuousFeature)
+        ChiSquareTest.test(df, "features", "label")
+      }
+    }
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to