Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/2455#discussion_r18536682 --- Diff: core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala --- @@ -18,96 +18,547 @@ package org.apache.spark.util.random import java.util.Random +import scala.collection.mutable.ArrayBuffer import cern.jet.random.Poisson -import org.scalatest.{BeforeAndAfter, FunSuite} -import org.scalatest.mock.EasyMockSugar - -class RandomSamplerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar { - - val a = List(1, 2, 3, 4, 5, 6, 7, 8, 9) - - var random: Random = _ - var poisson: Poisson = _ - - before { - random = mock[Random] - poisson = mock[Poisson] - } - - test("BernoulliSamplerWithRange") { - expecting { - for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) { - random.nextDouble().andReturn(x) - } - } - whenExecuting(random) { - val sampler = new BernoulliSampler[Int](0.25, 0.55) - sampler.rng = random - assert(sampler.sample(a.iterator).toList == List(3, 4, 5)) - } - } - - test("BernoulliSamplerWithRangeInverse") { - expecting { - for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) { - random.nextDouble().andReturn(x) - } - } - whenExecuting(random) { - val sampler = new BernoulliSampler[Int](0.25, 0.55, true) - sampler.rng = random - assert(sampler.sample(a.iterator).toList === List(1, 2, 6, 7, 8, 9)) - } - } - - test("BernoulliSamplerWithRatio") { - expecting { - for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) { - random.nextDouble().andReturn(x) - } - } - whenExecuting(random) { - val sampler = new BernoulliSampler[Int](0.35) - sampler.rng = random - assert(sampler.sample(a.iterator).toList == List(1, 2, 3)) - } - } - - test("BernoulliSamplerWithComplement") { - expecting { - for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) { - random.nextDouble().andReturn(x) - } - } - whenExecuting(random) { - val sampler = new BernoulliSampler[Int](0.25, 0.55, true) - sampler.rng = random - 
assert(sampler.sample(a.iterator).toList == List(1, 2, 6, 7, 8, 9)) - } - } - - test("BernoulliSamplerSetSeed") { - expecting { - random.setSeed(10L) - } - whenExecuting(random) { - val sampler = new BernoulliSampler[Int](0.2) - sampler.rng = random - sampler.setSeed(10L) - } - } - - test("PoissonSampler") { - expecting { - for(x <- Seq(0, 1, 2, 0, 1, 1, 0, 0, 0)) { - poisson.nextInt().andReturn(x) - } - } - whenExecuting(poisson) { - val sampler = new PoissonSampler[Int](0.2) - sampler.rng = poisson - assert(sampler.sample(a.iterator).toList == List(2, 3, 3, 5, 6)) - } +import cern.jet.random.engine.DRand + +import org.scalatest.{FunSuite, Matchers} + +class RandomSamplerSuite extends FunSuite with Matchers { + // My statistical testing methodology is to run a Kolmogorov-Smirnov (KS) test + // between the random samplers and simple reference samplers (known to work correctly). + // The sampling gap sizes between chosen samples should show up as having the same + // distributions between test and reference, if things are working properly. That is, + // the KS test will fail to strongly reject the null hypothesis that the distributions of + // sampling gaps are the same. + // There are no actual KS tests implemented for scala (that I can find) - and so what I + // have done here is pre-compute "D" - the KS statistic - that corresponds to a "weak" + // p-value for a particular sample size. I can then test that my measured KS stats + // are less than D. Computing D-values is easy, and implemented below. + // + // I used the scipy 'kstwobign' distribution to pre-compute my D value: + // + // def ksdval(q=0.1, n=1000): + // en = np.sqrt(float(n) / 2.0) + // return stats.kstwobign.isf(float(q)) / (en + 0.12 + 0.11 / en) + // + // When comparing KS stats I take the median of a small number of independent test runs + // to compensate for the issue that any sampled statistic will show "false positive" with + // some probability. 
Even when two distributions are the same, they will register as + // different 10% of the time at a p-value of 0.1 + + // This D value is the precomputed KS statistic for p-value 0.1, sample size 1000: + val sampleSize = 1000 + val D = 0.0544280747619 + + // I'm not a big fan of fixing seeds, but unit testing based on running statistical tests + // will always fail with some nonzero probability, so I'll fix the seed to prevent these + // tests from generating random failure noise in CI testing, etc. + val rngSeed: Random = RandomSampler.rngDefault + rngSeed.setSeed(235711) + //rngSeed.setSeed(System.nanoTime) + + // Reference implementation of sampling without replacement (bernoulli) + def sample[T](data: Iterator[T], f: Double): Iterator[T] = { + val rng: Random = RandomSampler.rngDefault + rng.setSeed(rngSeed.nextLong) + data.filter(_ => (Math.random() <= f)) + } + + // Reference implementation of sampling with replacement + def sample_replace[T](data: Iterator[T], f: Double): Iterator[T] = { + val rng = new Poisson(f, new DRand(rngSeed.nextInt)) + data.flatMap(v => { + val rep = rng.nextInt() + if (rep == 0) Iterator.empty else Iterator.fill(rep)(v) + }) + } + + def gaps(data: Iterator[Int]): Iterator[Int] = { + // this function assumes samples are emitted in non-decreasing order, which + // works because that is how I generate them, and the samplers preserve input order + data.scanLeft((0,0))((l:(Int,Int), r:Int) => (r, r-l._1)).drop(2).map(_._2) + } + + def chist(hist: Array[Int]): Array[Double] = { + val n = hist.sum.toDouble + assert(n > 0.0) + hist.scanLeft(0)(_ + _).drop(1).map(_.toDouble/n) + } + + def cumulants(d1: Array[Int], d2: Array[Int], --- End diff -- need doc
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastructure@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org