Repository: spark
Updated Branches:
  refs/heads/master 4374a46bf -> 5eef1e6c6


[SPARK-15660][CORE] Update RDD `variance/stdev` description and add popVariance/popStdev

## What changes were proposed in this pull request?

In SPARK-11490, `variance/stdev` were redefined as the **sample** `variance/stdev` instead of the population ones. This PR updates the remaining outdated documentation to prevent users from misunderstanding which statistic each method computes. It will update the following Scala/Java API docs:

- http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.api.java.JavaDoubleRDD
- http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.rdd.DoubleRDDFunctions
- http://spark.apache.org/docs/2.0.0-preview/api/scala/index.html#org.apache.spark.util.StatCounter
- http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/api/java/JavaDoubleRDD.html
- http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/rdd/DoubleRDDFunctions.html
- http://spark.apache.org/docs/2.0.0-preview/api/java/org/apache/spark/util/StatCounter.html

Also, this PR explicitly adds the `popVariance` and `popStdev` functions so that the population semantics are clear.
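
A minimal usage sketch (not part of this patch) of the intended semantics, assuming an existing `SparkContext` named `sc`; the values are illustrative only:

```scala
val rdd = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0))  // n = 4, mean = 2.5

rdd.variance()        // population variance: sum((x - mean)^2) / n = 1.25
rdd.popVariance()     // same value under the new explicit name (since 2.1.0)
rdd.sampleVariance()  // sample variance: sum((x - mean)^2) / (n - 1) ≈ 1.667

rdd.stdev()           // population standard deviation: sqrt(1.25) ≈ 1.118
rdd.popStdev()        // same value under the new explicit name (since 2.1.0)
```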

## How was this patch tested?

Passed the updated Jenkins tests.

Author: Dongjoon Hyun <dongj...@apache.org>

Closes #13403 from dongjoon-hyun/SPARK-15660.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5eef1e6c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5eef1e6c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5eef1e6c

Branch: refs/heads/master
Commit: 5eef1e6c6a8b6202fc6db4a90c4caab5169e86c6
Parents: 4374a46
Author: Dongjoon Hyun <dongj...@apache.org>
Authored: Thu Jun 23 11:07:34 2016 +0100
Committer: Sean Owen <so...@cloudera.com>
Committed: Thu Jun 23 11:07:34 2016 +0100

----------------------------------------------------------------------
 .../apache/spark/api/java/JavaDoubleRDD.scala   | 17 +++++++++++++--
 .../apache/spark/rdd/DoubleRDDFunctions.scala   | 21 +++++++++++++++++--
 .../org/apache/spark/util/StatCounter.scala     | 22 ++++++++++++++++----
 .../java/org/apache/spark/JavaAPISuite.java     |  2 ++
 .../org/apache/spark/PartitioningSuite.scala    |  4 ++++
 5 files changed, 58 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/5eef1e6c/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
index 0d3a523..0026fc9 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala
@@ -22,6 +22,7 @@ import java.lang.{Double => JDouble}
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
 
+import org.apache.spark.annotation.Since
 import org.apache.spark.Partitioner
 import org.apache.spark.api.java.function.{Function => JFunction}
 import org.apache.spark.partial.{BoundedDouble, PartialResult}
@@ -184,10 +185,10 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double])
   /** Compute the mean of this RDD's elements. */
   def mean(): JDouble = srdd.mean()
 
-  /** Compute the variance of this RDD's elements. */
+  /** Compute the population variance of this RDD's elements. */
   def variance(): JDouble = srdd.variance()
 
-  /** Compute the standard deviation of this RDD's elements. */
+  /** Compute the population standard deviation of this RDD's elements. */
   def stdev(): JDouble = srdd.stdev()
 
   /**
@@ -202,6 +203,18 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double])
    */
   def sampleVariance(): JDouble = srdd.sampleVariance()
 
+  /**
+   * Compute the population standard deviation of this RDD's elements.
+   */
+  @Since("2.1.0")
+  def popStdev(): JDouble = srdd.popStdev()
+
+  /**
+   * Compute the population variance of this RDD's elements.
+   */
+  @Since("2.1.0")
+  def popVariance(): JDouble = srdd.popVariance()
+
   /** Return the approximate mean of the elements in this RDD. */
   def meanApprox(timeout: Long, confidence: JDouble): PartialResult[BoundedDouble] =
     srdd.meanApprox(timeout, confidence)

http://git-wip-us.apache.org/repos/asf/spark/blob/5eef1e6c/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
index 368916a..a05a770 100644
--- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.rdd
 
+import org.apache.spark.annotation.Since
 import org.apache.spark.TaskContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.partial.BoundedDouble
@@ -47,12 +48,12 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable {
     stats().mean
   }
 
-  /** Compute the variance of this RDD's elements. */
+  /** Compute the population variance of this RDD's elements. */
   def variance(): Double = self.withScope {
     stats().variance
   }
 
-  /** Compute the standard deviation of this RDD's elements. */
+  /** Compute the population standard deviation of this RDD's elements. */
   def stdev(): Double = self.withScope {
     stats().stdev
   }
@@ -74,6 +75,22 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable {
   }
 
   /**
+   * Compute the population standard deviation of this RDD's elements.
+   */
+  @Since("2.1.0")
+  def popStdev(): Double = self.withScope {
+    stats().popStdev
+  }
+
+  /**
+   * Compute the population variance of this RDD's elements.
+   */
+  @Since("2.1.0")
+  def popVariance(): Double = self.withScope {
+    stats().popVariance
+  }
+
+  /**
    * Approximate operation to return the mean within a timeout.
    */
   def meanApprox(

http://git-wip-us.apache.org/repos/asf/spark/blob/5eef1e6c/core/src/main/scala/org/apache/spark/util/StatCounter.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/util/StatCounter.scala b/core/src/main/scala/org/apache/spark/util/StatCounter.scala
index 8586da1..4538136 100644
--- a/core/src/main/scala/org/apache/spark/util/StatCounter.scala
+++ b/core/src/main/scala/org/apache/spark/util/StatCounter.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.util
 
+import org.apache.spark.annotation.Since
+
 /**
  * A class for tracking the statistics of a set of numbers (count, mean and variance) in a
  * numerically robust way. Includes support for merging two StatCounters. Based on Welford
@@ -104,8 +106,14 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
 
   def min: Double = minValue
 
-  /** Return the variance of the values. */
-  def variance: Double = {
+  /** Return the population variance of the values. */
+  def variance: Double = popVariance
+
+  /**
+   * Return the population variance of the values.
+   */
+  @Since("2.1.0")
+  def popVariance: Double = {
     if (n == 0) {
       Double.NaN
     } else {
@@ -125,8 +133,14 @@ class StatCounter(values: TraversableOnce[Double]) extends Serializable {
     }
   }
 
-  /** Return the standard deviation of the values. */
-  def stdev: Double = math.sqrt(variance)
+  /** Return the population standard deviation of the values. */
+  def stdev: Double = popStdev
+
+  /**
+   * Return the population standard deviation of the values.
+   */
+  @Since("2.1.0")
+  def popStdev: Double = math.sqrt(popVariance)
 
   /**
    * Return the sample standard deviation of the values, which corrects for bias in estimating the
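
For reference, a small sketch (not part of the patch) of how the `StatCounter` methods relate after this change; the numbers are illustrative only:

```scala
import org.apache.spark.util.StatCounter

val stats = StatCounter(Seq(2.0, 3.0, 4.0))   // n = 3, mean = 3.0

// Population variance divides by n; sample variance divides by (n - 1).
assert(math.abs(stats.popVariance - 2.0 / 3) < 1e-12)
assert(math.abs(stats.sampleVariance - 1.0) < 1e-12)

// `variance`/`stdev` are now explicit aliases of the population variants.
assert(stats.variance == stats.popVariance)
assert(stats.stdev == stats.popStdev)
assert(stats.popStdev == math.sqrt(stats.popVariance))
```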

http://git-wip-us.apache.org/repos/asf/spark/blob/5eef1e6c/core/src/test/java/org/apache/spark/JavaAPISuite.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java
index 7bac068..533025b 100644
--- a/core/src/test/java/org/apache/spark/JavaAPISuite.java
+++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java
@@ -733,8 +733,10 @@ public class JavaAPISuite implements Serializable {
     assertEquals(20/6.0, rdd.mean(), 0.01);
     assertEquals(20/6.0, rdd.mean(), 0.01);
     assertEquals(6.22222, rdd.variance(), 0.01);
+    assertEquals(rdd.variance(), rdd.popVariance(), 1e-14);
     assertEquals(7.46667, rdd.sampleVariance(), 0.01);
     assertEquals(2.49444, rdd.stdev(), 0.01);
+    assertEquals(rdd.stdev(), rdd.popStdev(), 1e-14);
     assertEquals(2.73252, rdd.sampleStdev(), 0.01);
 
     rdd.first();

http://git-wip-us.apache.org/repos/asf/spark/blob/5eef1e6c/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
index 3d31c78..c5d4968 100644
--- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
+++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
@@ -244,6 +244,10 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva
     assert(abs(6.0/2 - rdd.mean) < 0.01)
     assert(abs(1.0 - rdd.variance) < 0.01)
     assert(abs(1.0 - rdd.stdev) < 0.01)
+    assert(abs(rdd.variance - rdd.popVariance) < 1e-14)
+    assert(abs(rdd.stdev - rdd.popStdev) < 1e-14)
+    assert(abs(2.0 - rdd.sampleVariance) < 1e-14)
+    assert(abs(Math.sqrt(2.0) - rdd.sampleStdev) < 1e-14)
     assert(stats.max === 4.0)
     assert(stats.min === 2.0)
 

