[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557464 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -23,29 +23,43 @@ import org.apache.spark.sql.types.{DoubleType, NumericType} private[sql] object StatFunctions { + /** Calculate the Pearson Correlation Coefficient for the given columns */ + private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = { +val counts = collectStatisticalData(df, cols) +counts.Ck / math.sqrt(counts.MkX * counts.MkY) + } + /** Helper class to simplify tracking and merging counts. */ private class CovarianceCounter extends Serializable { -var xAvg = 0.0 -var yAvg = 0.0 -var Ck = 0.0 -var count = 0L +var xAvg = 0.0 // the mean of all examples seen so far in col1 +var yAvg = 0.0 // the mean of all examples seen so far in col2 +var Ck = 0.0 // the co-moment after k examples +var MkX = 0.0 // sum of squares of differences from the (current) mean for col1 +var MkY = 0.0 // sum of squares of differences from the (current) mean for col1 +var count = 0L // count of observed examples // add an example to the calculation def add(x: Double, y: Double): this.type = { - val oldX = xAvg + val deltaX = x - xAvg + val deltaY = y - yAvg count += 1 - xAvg += (x - xAvg) / count - yAvg += (y - yAvg) / count - Ck += (y - yAvg) * (x - oldX) + xAvg += deltaX / count + yAvg += deltaY / count + Ck += deltaX * (y - yAvg) + MkX += deltaX * (x - xAvg) + MkY += deltaY * (y - yAvg) --- End diff -- `deltaY * deltaY` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. 
--- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557462 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -23,29 +23,43 @@ import org.apache.spark.sql.types.{DoubleType, NumericType} private[sql] object StatFunctions { + /** Calculate the Pearson Correlation Coefficient for the given columns */ + private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = { +val counts = collectStatisticalData(df, cols) +counts.Ck / math.sqrt(counts.MkX * counts.MkY) + } + /** Helper class to simplify tracking and merging counts. */ private class CovarianceCounter extends Serializable { -var xAvg = 0.0 -var yAvg = 0.0 -var Ck = 0.0 -var count = 0L +var xAvg = 0.0 // the mean of all examples seen so far in col1 +var yAvg = 0.0 // the mean of all examples seen so far in col2 +var Ck = 0.0 // the co-moment after k examples +var MkX = 0.0 // sum of squares of differences from the (current) mean for col1 +var MkY = 0.0 // sum of squares of differences from the (current) mean for col1 +var count = 0L // count of observed examples // add an example to the calculation def add(x: Double, y: Double): this.type = { - val oldX = xAvg + val deltaX = x - xAvg + val deltaY = y - yAvg count += 1 - xAvg += (x - xAvg) / count - yAvg += (y - yAvg) / count - Ck += (y - yAvg) * (x - oldX) + xAvg += deltaX / count + yAvg += deltaY / count + Ck += deltaX * (y - yAvg) --- End diff -- `Ck += deltaX * deltaY` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557466 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -68,13 +76,23 @@ private[sql] object StatFunctions { s"with dataType ${data.get.dataType} not supported.") } val columns = cols.map(n => Column(Cast(Column(n).expr, DoubleType))) -val counts = df.select(columns:_*).rdd.aggregate(new CovarianceCounter)( +df.select(columns:_*).rdd.aggregate(new CovarianceCounter)( --- End diff -- Please add a space after `:` (i.e. `columns: _*`). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557468 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala --- @@ -43,12 +43,18 @@ class DataFrameStatSuite extends FunSuite { val singleColResults = df.stat.freqItems(Array("negDoubles"), 0.1) val items2 = singleColResults.collect().head items2.getSeq[Double](0) should contain (-1.0) + } + test("pearson correlation") { +val df = Seq.tabulate(10)(i => (i, 2 * i, i * -1.0)).toDF("a", "b", "c") +val corr1 = df.stat.corr("a", "b", "pearson") +assert(math.abs(corr1 - 1.0) < 1e-6) --- End diff -- Same here, `1e-12`. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557463 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -23,29 +23,43 @@ import org.apache.spark.sql.types.{DoubleType, NumericType} private[sql] object StatFunctions { + /** Calculate the Pearson Correlation Coefficient for the given columns */ + private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = { +val counts = collectStatisticalData(df, cols) +counts.Ck / math.sqrt(counts.MkX * counts.MkY) + } + /** Helper class to simplify tracking and merging counts. */ private class CovarianceCounter extends Serializable { -var xAvg = 0.0 -var yAvg = 0.0 -var Ck = 0.0 -var count = 0L +var xAvg = 0.0 // the mean of all examples seen so far in col1 +var yAvg = 0.0 // the mean of all examples seen so far in col2 +var Ck = 0.0 // the co-moment after k examples +var MkX = 0.0 // sum of squares of differences from the (current) mean for col1 +var MkY = 0.0 // sum of squares of differences from the (current) mean for col1 +var count = 0L // count of observed examples // add an example to the calculation def add(x: Double, y: Double): this.type = { - val oldX = xAvg + val deltaX = x - xAvg + val deltaY = y - yAvg count += 1 - xAvg += (x - xAvg) / count - yAvg += (y - yAvg) / count - Ck += (y - yAvg) * (x - oldX) + xAvg += deltaX / count + yAvg += deltaY / count + Ck += deltaX * (y - yAvg) + MkX += deltaX * (x - xAvg) --- End diff -- `deltaX * deltaX` --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557460 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -23,29 +23,43 @@ import org.apache.spark.sql.types.{DoubleType, NumericType} private[sql] object StatFunctions { + /** Calculate the Pearson Correlation Coefficient for the given columns */ + private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = { +val counts = collectStatisticalData(df, cols) +counts.Ck / math.sqrt(counts.MkX * counts.MkY) --- End diff -- Oh, they cancel each other out. Could you add a non-trivial test in Scala? Currently it only covers `0.0`, `1.0`, and `-1.0`. In your test, please provide Python/R commands in comments to reproduce the result. This makes it easier for others to verify. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557454 --- Diff: python/pyspark/sql/dataframe.py --- @@ -875,6 +875,27 @@ def fillna(self, value, subset=None): return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx) +def corr(self, col1, col2, method=None): + +Calculates the correlation of two columns of a DataFrame as a double value. Currently only +supports the Pearson Correlation Coefficient. +:func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases. + +:param col1: The name of the first column +:param col2: The name of the second column +:param method: The correlation method. Currently only supports pearson + +if not isinstance(col1, str): --- End diff -- Accept `Column` as well? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557467 --- Diff: sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java --- @@ -188,6 +188,13 @@ public void testFrequentItems() { } @Test + public void testCorrelation() { +DataFrame df = context.table("testData2"); +Double pearsonCorr = df.stat().corr("a", "b", "pearson"); +Assert.assertTrue(Math.abs(pearsonCorr) < 1e-6); --- End diff -- The numerical error should be close to machine precision. So let's change `1e-6` to `1e-12`. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557457 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala --- @@ -28,6 +28,32 @@ import org.apache.spark.sql.execution.stat._ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** + * Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson + * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in + * MLlib's Statistics. + * + * @param col1 the name of the column + * @param col2 the name of the column to calculate the correlation against + * @return The Pearson Correlation Coefficient as a Double. + */ + def corr(col1: String, col2: String, method: String): Double = { --- End diff -- Ditto. Accept `Column` as input? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user brkyvz commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557544 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -23,29 +23,43 @@ import org.apache.spark.sql.types.{DoubleType, NumericType} private[sql] object StatFunctions { + /** Calculate the Pearson Correlation Coefficient for the given columns */ + private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = { +val counts = collectStatisticalData(df, cols) +counts.Ck / math.sqrt(counts.MkX * counts.MkY) + } + /** Helper class to simplify tracking and merging counts. */ private class CovarianceCounter extends Serializable { -var xAvg = 0.0 -var yAvg = 0.0 -var Ck = 0.0 -var count = 0L +var xAvg = 0.0 // the mean of all examples seen so far in col1 +var yAvg = 0.0 // the mean of all examples seen so far in col2 +var Ck = 0.0 // the co-moment after k examples +var MkX = 0.0 // sum of squares of differences from the (current) mean for col1 +var MkY = 0.0 // sum of squares of differences from the (current) mean for col1 +var count = 0L // count of observed examples // add an example to the calculation def add(x: Double, y: Double): this.type = { - val oldX = xAvg + val deltaX = x - xAvg + val deltaY = y - yAvg count += 1 - xAvg += (x - xAvg) / count - yAvg += (y - yAvg) / count - Ck += (y - yAvg) * (x - oldX) + xAvg += deltaX / count + yAvg += deltaY / count + Ck += deltaX * (y - yAvg) + MkX += deltaX * (x - xAvg) --- End diff -- Same here --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user brkyvz commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557541 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -23,29 +23,43 @@ import org.apache.spark.sql.types.{DoubleType, NumericType} private[sql] object StatFunctions { + /** Calculate the Pearson Correlation Coefficient for the given columns */ + private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = { +val counts = collectStatisticalData(df, cols) +counts.Ck / math.sqrt(counts.MkX * counts.MkY) + } + /** Helper class to simplify tracking and merging counts. */ private class CovarianceCounter extends Serializable { -var xAvg = 0.0 -var yAvg = 0.0 -var Ck = 0.0 -var count = 0L +var xAvg = 0.0 // the mean of all examples seen so far in col1 +var yAvg = 0.0 // the mean of all examples seen so far in col2 +var Ck = 0.0 // the co-moment after k examples +var MkX = 0.0 // sum of squares of differences from the (current) mean for col1 +var MkY = 0.0 // sum of squares of differences from the (current) mean for col1 +var count = 0L // count of observed examples // add an example to the calculation def add(x: Double, y: Double): this.type = { - val oldX = xAvg + val deltaX = x - xAvg + val deltaY = y - yAvg count += 1 - xAvg += (x - xAvg) / count - yAvg += (y - yAvg) / count - Ck += (y - yAvg) * (x - oldX) + xAvg += deltaX / count + yAvg += deltaY / count + Ck += deltaX * (y - yAvg) --- End diff -- Umm, we need to use the updated `yAvg`, not the old one. `deltaX = x_n - \bar{x}_{n-1}`, `deltaY = y_n - \bar{y}_{n-1}`. `C_n = C_{n-1} + (y_n - \bar{y}_n) * (x_n - \bar{x}_{n-1})`. Notice how the `y` terms don't match: `deltaY` uses the previous mean `\bar{y}_{n-1}`, while the co-moment update needs the new mean `\bar{y}_n`. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. 
If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user brkyvz commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557546 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -23,29 +23,43 @@ import org.apache.spark.sql.types.{DoubleType, NumericType} private[sql] object StatFunctions { + /** Calculate the Pearson Correlation Coefficient for the given columns */ + private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = { +val counts = collectStatisticalData(df, cols) +counts.Ck / math.sqrt(counts.MkX * counts.MkY) + } + /** Helper class to simplify tracking and merging counts. */ private class CovarianceCounter extends Serializable { -var xAvg = 0.0 -var yAvg = 0.0 -var Ck = 0.0 -var count = 0L +var xAvg = 0.0 // the mean of all examples seen so far in col1 +var yAvg = 0.0 // the mean of all examples seen so far in col2 +var Ck = 0.0 // the co-moment after k examples +var MkX = 0.0 // sum of squares of differences from the (current) mean for col1 +var MkY = 0.0 // sum of squares of differences from the (current) mean for col1 +var count = 0L // count of observed examples // add an example to the calculation def add(x: Double, y: Double): this.type = { - val oldX = xAvg + val deltaX = x - xAvg + val deltaY = y - yAvg count += 1 - xAvg += (x - xAvg) / count - yAvg += (y - yAvg) / count - Ck += (y - yAvg) * (x - oldX) + xAvg += deltaX / count + yAvg += deltaY / count + Ck += deltaX * (y - yAvg) + MkX += deltaX * (x - xAvg) + MkY += deltaY * (y - yAvg) --- End diff -- Same here --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. 
--- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user rxin commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98587876 Thanks. Merging in master. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/5858 --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98587654 LGTM. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29565877 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -23,29 +23,43 @@ import org.apache.spark.sql.types.{DoubleType, NumericType} private[sql] object StatFunctions { + /** Calculate the Pearson Correlation Coefficient for the given columns */ + private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = { +val counts = collectStatisticalData(df, cols) +counts.Ck / math.sqrt(counts.MkX * counts.MkY) + } + /** Helper class to simplify tracking and merging counts. */ private class CovarianceCounter extends Serializable { -var xAvg = 0.0 -var yAvg = 0.0 -var Ck = 0.0 -var count = 0L +var xAvg = 0.0 // the mean of all examples seen so far in col1 +var yAvg = 0.0 // the mean of all examples seen so far in col2 +var Ck = 0.0 // the co-moment after k examples +var MkX = 0.0 // sum of squares of differences from the (current) mean for col1 +var MkY = 0.0 // sum of squares of differences from the (current) mean for col1 +var count = 0L // count of observed examples // add an example to the calculation def add(x: Double, y: Double): this.type = { - val oldX = xAvg + val deltaX = x - xAvg + val deltaY = y - yAvg count += 1 - xAvg += (x - xAvg) / count - yAvg += (y - yAvg) / count - Ck += (y - yAvg) * (x - oldX) + xAvg += deltaX / count + yAvg += deltaY / count + Ck += deltaX * (y - yAvg) --- End diff -- Sorry, I didn't see `yAvg` is changed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29565880 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala --- @@ -43,19 +43,40 @@ class DataFrameStatSuite extends FunSuite { val singleColResults = df.stat.freqItems(Array("negDoubles"), 0.1) val items2 = singleColResults.collect().head items2.getSeq[Double](0) should contain (-1.0) + } + test("pearson correlation") { +val df = Seq.tabulate(10)(i => (i, 2 * i, i * -1.0)).toDF("a", "b", "c") +val corr1 = df.stat.corr("a", "b", "pearson") +assert(math.abs(corr1 - 1.0) < 1e-12) +val corr2 = df.stat.corr("a", "c", "pearson") +assert(math.abs(corr2 + 1.0) < 1e-12) +// non-trivial example. To reproduce in python, use: +// from scipy.stats import pearsonr +// import numpy as np +// a = np.array(range(20)) +// b = np.array([x * x - 2 * x + 3.5 for x in range(20)]) +// pearsonr(a, b) +// (0.95723391394758572, 3.8902121417802199e-11) +// In R, use: +// a <- 0:19 +// b <- mapply(function(x) x * x - 2 * x + 3.5, a) +// cor(a, b) +// [1] 0.957233913947585835 --- End diff -- Thanks for adding this test! --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user brkyvz commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557714 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -23,29 +23,43 @@ import org.apache.spark.sql.types.{DoubleType, NumericType} private[sql] object StatFunctions { + /** Calculate the Pearson Correlation Coefficient for the given columns */ + private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = { +val counts = collectStatisticalData(df, cols) +counts.Ck / math.sqrt(counts.MkX * counts.MkY) + } + /** Helper class to simplify tracking and merging counts. */ private class CovarianceCounter extends Serializable { -var xAvg = 0.0 -var yAvg = 0.0 -var Ck = 0.0 -var count = 0L +var xAvg = 0.0 // the mean of all examples seen so far in col1 +var yAvg = 0.0 // the mean of all examples seen so far in col2 +var Ck = 0.0 // the co-moment after k examples +var MkX = 0.0 // sum of squares of differences from the (current) mean for col1 +var MkY = 0.0 // sum of squares of differences from the (current) mean for col1 +var count = 0L // count of observed examples // add an example to the calculation def add(x: Double, y: Double): this.type = { - val oldX = xAvg + val deltaX = x - xAvg + val deltaY = y - yAvg count += 1 - xAvg += (x - xAvg) / count - yAvg += (y - yAvg) / count - Ck += (y - yAvg) * (x - oldX) + xAvg += deltaX / count + yAvg += deltaY / count + Ck += deltaX * (y - yAvg) + MkX += deltaX * (x - xAvg) --- End diff -- `xAvg` changes two lines above. This is using the new `xAvg`. `deltaX` is using the previous `xAvg`. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. 
--- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98528217 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/31698/ Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98528214 Merged build finished. Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98528200 [Test build #31698 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/31698/consoleFull) for PR 5858 at commit [`285b838`](https://github.com/apache/spark/commit/285b8384e3e2654d23bfd5dfcbecb0caefb1aec2). * This patch **passes all tests**. * This patch merges cleanly. * This patch adds no public classes. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user brkyvz commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29557960 --- Diff: python/pyspark/sql/dataframe.py --- @@ -875,6 +875,27 @@ def fillna(self, value, subset=None): return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx) +def corr(self, col1, col2, method=None): + +Calculates the correlation of two columns of a DataFrame as a double value. Currently only +supports the Pearson Correlation Coefficient. +:func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases. + +:param col1: The name of the first column +:param col2: The name of the second column +:param method: The correlation method. Currently only supports pearson + +if not isinstance(col1, str): --- End diff -- If @rxin is okay with it, I can add those in a follow up PR for all the methods that we added. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98509008 [Test build #31698 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/31698/consoleFull) for PR 5858 at commit [`285b838`](https://github.com/apache/spark/commit/285b8384e3e2654d23bfd5dfcbecb0caefb1aec2). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98508502 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98508551 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98323820 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/31652/ Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98323796 [Test build #31652 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/31652/consoleFull) for PR 5858 at commit [`4fe693b`](https://github.com/apache/spark/commit/4fe693b6d1bdbe31af58e18de1c4d575b559db28). * This patch **passes all tests**. * This patch merges cleanly. * This patch adds no public classes. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98323817 Merged build finished. Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user brkyvz commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29546427 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala --- @@ -43,7 +43,15 @@ class DataFrameStatSuite extends FunSuite { val singleColResults = df.stat.freqItems(Array(negDoubles), 0.1) val items2 = singleColResults.collect().head items2.getSeq[Double](0) should contain (-1.0) + } + test(pearson correlation) { +val df = sqlCtx.sparkContext.parallelize( + Array.tabulate(10)(i = (i, 2 * i, i * -1.0))).toDF(a, b, c) --- End diff -- this is amazing :) --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98338441 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98338438 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98349734 [Test build #31662 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/31662/consoleFull) for PR 5858 at commit [`d10babb`](https://github.com/apache/spark/commit/d10babb3c52a10bac0aacace97cc06cba3c2a501). * This patch **passes all tests**. * This patch merges cleanly. * This patch adds no public classes. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98349740 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/31662/ Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98349739 Merged build finished. Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user brkyvz commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29549773 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -23,29 +23,43 @@ import org.apache.spark.sql.types.{DoubleType, NumericType} private[sql] object StatFunctions { + /** Calculate the Pearson Correlation Coefficient for the given columns */ + private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = { +val counts = collectStatisticalData(df, cols) +counts.Ck / math.sqrt(counts.MkX * counts.MkY) --- End diff -- It is, isn't it? The n - 1's cancel. I tested with sciPy Pearsonr method. That's why I have the non trivial (i, sqrt(i)) test. On May 2, 2015 10:32 AM, Xiangrui Meng notificati...@github.com wrote: In sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala https://github.com/apache/spark/pull/5858#discussion_r29549089: @@ -23,29 +23,43 @@ import org.apache.spark.sql.types.{DoubleType, NumericType} private[sql] object StatFunctions { + /** Calculate the Pearson Correlation Coefficient for the given columns */ + private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = { +val counts = collectStatisticalData(df, cols) +counts.Ck / math.sqrt(counts.MkX * counts.MkY) Should be the sample correlation as well. In the unit tests, please provide R commands that compute the correlation and the result, and verify that we output the same value. — Reply to this email directly or view it on GitHub https://github.com/apache/spark/pull/5858/files#r29549089. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. 
--- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29549089 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -23,29 +23,43 @@ import org.apache.spark.sql.types.{DoubleType, NumericType} private[sql] object StatFunctions { + /** Calculate the Pearson Correlation Coefficient for the given columns */ + private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = { +val counts = collectStatisticalData(df, cols) +counts.Ck / math.sqrt(counts.MkX * counts.MkY) --- End diff -- Should be the sample correlation as well. In the unit tests, please provide R commands that compute the correlation and the result, and verify that we output the same value. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98338769 [Test build #31662 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/31662/consoleFull) for PR 5858 at commit [`d10babb`](https://github.com/apache/spark/commit/d10babb3c52a10bac0aacace97cc06cba3c2a501). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
GitHub user brkyvz opened a pull request: https://github.com/apache/spark/pull/5858 [SPARK-7241] Pearson correlation for DataFrames submitting this PR from a phone, excuse the brevity. adds Pearson correlation to Dataframes, reusing the covariance calculation code cc @mengxr @rxin You can merge this pull request into a Git repository by running: $ git pull https://github.com/brkyvz/spark df-corr Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/5858.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #5858 commit a682d06a99fc78eabc98b88a7f52f838a5b0811b Author: Burak Yavuz brk...@gmail.com Date: 2015-05-02T02:14:46Z ready for PR --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98293750 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98293759 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98293798 [Test build #31646 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/31646/consoleFull) for PR 5858 at commit [`a682d06`](https://github.com/apache/spark/commit/a682d06a99fc78eabc98b88a7f52f838a5b0811b). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29544400 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala --- @@ -28,6 +28,32 @@ import org.apache.spark.sql.execution.stat._ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** + * Calculate the correlation of two columns of a DataFrame. Currently only supports the Pearson --- End diff -- Calculates --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29544396 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala --- @@ -28,6 +28,32 @@ import org.apache.spark.sql.execution.stat._ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** + * Calculate the correlation of two columns of a DataFrame. Currently only supports the Pearson + * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in + * MLlib's Statistics. + * + * @param col1 the name of the column + * @param col2 the name of the column to calculate the correlation against + * @return The Pearson Correlation Coefficient as a Double. + */ + def corr(col1: String, col2: String, method: String): Double = { +assert(method == pearson, Currently only the calculation of the Pearson Correlation + + coefficient is supported.) +StatFunctions.pearsonCorrelation(df, Seq(col1, col2)) + } + + /** + * Java Friendly implementation to calculate the Pearson correlation coefficient of two columns. --- End diff -- what's not java friendly? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98302115 Test PASSed. Refer to this link for build results (access rights to CI server needed): https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/31646/ Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user rxin commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98304079 I will let @mengxr comment on the math part. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29545209 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala --- @@ -43,7 +43,15 @@ class DataFrameStatSuite extends FunSuite { val singleColResults = df.stat.freqItems(Array(negDoubles), 0.1) val items2 = singleColResults.collect().head items2.getSeq[Double](0) should contain (-1.0) + } + test(pearson correlation) { +val df = sqlCtx.sparkContext.parallelize( + Array.tabulate(10)(i = (i, 2 * i, i * -1.0))).toDF(a, b, c) --- End diff -- fyi we have implicits to add toDF on Seq[Tuples], so you can just replace Array with Seq, and then remove all the sparkContext.parallelize stuff. Maybe do it for the frequent items above also. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98302114 Merged build finished. Test PASSed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98302103 [Test build #31646 has finished](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/31646/consoleFull) for PR 5858 at commit [`a682d06`](https://github.com/apache/spark/commit/a682d06a99fc78eabc98b88a7f52f838a5b0811b). * This patch **passes all tests**. * This patch merges cleanly. * This patch adds no public classes. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29545205 --- Diff: python/pyspark/sql/dataframe.py --- @@ -875,6 +875,25 @@ def fillna(self, value, subset=None): return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx) +def corr(self, col1, col2, method=pearson): + +Calculate the correlation of two columns of a DataFrame as a double value. Currently only --- End diff -- Calculates --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29545194 --- Diff: python/pyspark/sql/dataframe.py --- @@ -875,6 +875,25 @@ def fillna(self, value, subset=None): return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx) +def corr(self, col1, col2, method=pearson): --- End diff -- similar to the other PR, might be better to make method=None here. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98302293 Merged build triggered. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user AmplabJenkins commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98302295 Merged build started. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user SparkQA commented on the pull request: https://github.com/apache/spark/pull/5858#issuecomment-98302301 [Test build #31652 has started](https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/31652/consoleFull) for PR 5858 at commit [`4fe693b`](https://github.com/apache/spark/commit/4fe693b6d1bdbe31af58e18de1c4d575b559db28). --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request: [SPARK-7241] Pearson correlation for DataFrame...
Github user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/5858#discussion_r29545190 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala --- @@ -28,6 +28,32 @@ import org.apache.spark.sql.execution.stat._ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** + * Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson + * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in + * MLlib's Statistics. + * + * @param col1 the name of the column + * @param col2 the name of the column to calculate the correlation against + * @return The Pearson Correlation Coefficient as a Double. + */ + def corr(col1: String, col2: String, method: String): Double = { +assert(method == pearson, Currently only the calculation of the Pearson Correlation + --- End diff -- require. assert can be turned off ... and assert should only be used to check internal invariants. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org