Repository: spark Updated Branches: refs/heads/master 7c5fc28af -> ee07541e9
SPARK-2748 [MLLIB] [GRAPHX] Loss of precision for small arguments to Math.exp, Math.log In a few places in MLlib, an expression of the form `log(1.0 + p)` is evaluated. When p is so small that `1.0 + p == 1.0`, the result is 0.0. However the correct answer is very near `p`. This is why `Math.log1p` exists. Similarly for one instance of `exp(m) - 1` in GraphX; there's a special `Math.expm1` method. While the errors occur only for very small arguments, given their use in machine learning algorithms, this is entirely possible. Also note the related PR for Python: https://github.com/apache/spark/pull/1652 Author: Sean Owen <sro...@gmail.com> Closes #1659 from srowen/SPARK-2748 and squashes the following commits: c5926d4 [Sean Owen] Use log1p, expm1 for better precision for tiny arguments Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ee07541e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ee07541e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ee07541e Branch: refs/heads/master Commit: ee07541e99f0d262bf662b669b6542cf302ff39c Parents: 7c5fc28 Author: Sean Owen <sro...@gmail.com> Authored: Wed Jul 30 08:55:15 2014 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Wed Jul 30 08:55:15 2014 -0700 ---------------------------------------------------------------------- .../scala/org/apache/spark/graphx/util/GraphGenerators.scala | 6 ++++-- .../scala/org/apache/spark/mllib/optimization/Gradient.scala | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/ee07541e/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala ---------------------------------------------------------------------- diff --git a/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala b/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala index 635514f..6014954 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala @@ -100,8 +100,10 @@ object GraphGenerators { */ private def sampleLogNormal(mu: Double, sigma: Double, maxVal: Int): Int = { val rand = new Random() - val m = math.exp(mu + (sigma * sigma) / 2.0) - val s = math.sqrt((math.exp(sigma*sigma) - 1) * math.exp(2*mu + sigma*sigma)) + val sigmaSq = sigma * sigma + val m = math.exp(mu + sigmaSq / 2.0) + // expm1 is exp(m)-1 with better accuracy for tiny m + val s = math.sqrt(math.expm1(sigmaSq) * math.exp(2*mu + sigmaSq)) // Z ~ N(0, 1) var X: Double = maxVal http://git-wip-us.apache.org/repos/asf/spark/blob/ee07541e/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala index 679842f..9d82f01 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala @@ -68,9 +68,9 @@ class LogisticGradient extends Gradient { val gradient = brzData * gradientMultiplier val loss = if (label > 0) { - math.log(1 + math.exp(margin)) + math.log1p(math.exp(margin)) // log1p is log(1+p) but more accurate for small p } else { - math.log(1 + math.exp(margin)) - margin + math.log1p(math.exp(margin)) - margin } (Vectors.fromBreeze(gradient), loss) @@ -89,9 +89,9 @@ class LogisticGradient extends Gradient { brzAxpy(gradientMultiplier, brzData, cumGradient.toBreeze) if (label > 0) { - math.log(1 + math.exp(margin)) + math.log1p(math.exp(margin)) } else { - math.log(1 + math.exp(margin)) - margin + math.log1p(math.exp(margin)) - margin } } }