Github user mengxr commented on a diff in the pull request: https://github.com/apache/spark/pull/2455#discussion_r18536682 --- Diff: core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala --- @@ -18,96 +18,547 @@ package org.apache.spark.util.random import java.util.Random +import scala.collection.mutable.ArrayBuffer import cern.jet.random.Poisson -import org.scalatest.{BeforeAndAfter, FunSuite} -import org.scalatest.mock.EasyMockSugar - -class RandomSamplerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar { - - val a = List(1, 2, 3, 4, 5, 6, 7, 8, 9) - - var random: Random = _ - var poisson: Poisson = _ - - before { - random = mock[Random] - poisson = mock[Poisson] - } - - test("BernoulliSamplerWithRange") { - expecting { - for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) { - random.nextDouble().andReturn(x) - } - } - whenExecuting(random) { - val sampler = new BernoulliSampler[Int](0.25, 0.55) - sampler.rng = random - assert(sampler.sample(a.iterator).toList == List(3, 4, 5)) - } - } - - test("BernoulliSamplerWithRangeInverse") { - expecting { - for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) { - random.nextDouble().andReturn(x) - } - } - whenExecuting(random) { - val sampler = new BernoulliSampler[Int](0.25, 0.55, true) - sampler.rng = random - assert(sampler.sample(a.iterator).toList === List(1, 2, 6, 7, 8, 9)) - } - } - - test("BernoulliSamplerWithRatio") { - expecting { - for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) { - random.nextDouble().andReturn(x) - } - } - whenExecuting(random) { - val sampler = new BernoulliSampler[Int](0.35) - sampler.rng = random - assert(sampler.sample(a.iterator).toList == List(1, 2, 3)) - } - } - - test("BernoulliSamplerWithComplement") { - expecting { - for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) { - random.nextDouble().andReturn(x) - } - } - whenExecuting(random) { - val sampler = new BernoulliSampler[Int](0.25, 0.55, true) - sampler.rng = random - 
assert(sampler.sample(a.iterator).toList == List(1, 2, 6, 7, 8, 9)) - } - } - - test("BernoulliSamplerSetSeed") { - expecting { - random.setSeed(10L) - } - whenExecuting(random) { - val sampler = new BernoulliSampler[Int](0.2) - sampler.rng = random - sampler.setSeed(10L) - } - } - - test("PoissonSampler") { - expecting { - for(x <- Seq(0, 1, 2, 0, 1, 1, 0, 0, 0)) { - poisson.nextInt().andReturn(x) - } - } - whenExecuting(poisson) { - val sampler = new PoissonSampler[Int](0.2) - sampler.rng = poisson - assert(sampler.sample(a.iterator).toList == List(2, 3, 3, 5, 6)) - } +import cern.jet.random.engine.DRand + +import org.scalatest.{FunSuite, Matchers} + +class RandomSamplerSuite extends FunSuite with Matchers { + // My statistical testing methodology is to run a Kolmogorov-Smirnov (KS) test + // between the random samplers and simple reference samplers (known to work correctly). + // The sampling gap sizes between chosen samples should show up as having the same + // distributions between test and reference, if things are working properly. That is, + // the KS test will fail to strongly reject the null hypothesis that the distributions of + // sampling gaps are the same. + // There are no actual KS tests implemented for scala (that I can find) - and so what I + // have done here is pre-compute "D" - the KS statistic - that corresponds to a "weak" + // p-value for a particular sample size. I can then test that my measured KS stats + // are less than D. Computing D-values is easy, and implemented below. + // + // I used the scipy 'kstwobign' distribution to pre-compute my D value: + // + // def ksdval(q=0.1, n=1000): + // en = np.sqrt(float(n) / 2.0) + // return stats.kstwobign.isf(float(q)) / (en + 0.12 + 0.11 / en) + // + // When comparing KS stats I take the median of a small number of independent test runs + // to compensate for the issue that any sampled statistic will show "false positive" with + // some probability. 
Even when two distributions are the same, they will register as + // different 10% of the time at a p-value of 0.1 + + // This D value is the precomputed KS statistic for p-value 0.1, sample size 1000: + val sampleSize = 1000 + val D = 0.0544280747619 + + // I'm not a big fan of fixing seeds, but unit testing based on running statistical tests + // will always fail with some nonzero probability, so I'll fix the seed to prevent these + // tests from generating random failure noise in CI testing, etc. + val rngSeed: Random = RandomSampler.rngDefault + rngSeed.setSeed(235711) + //rngSeed.setSeed(System.nanoTime) + + // Reference implementation of sampling without replacement (bernoulli) + def sample[T](data: Iterator[T], f: Double): Iterator[T] = { + val rng: Random = RandomSampler.rngDefault + rng.setSeed(rngSeed.nextLong) + data.filter(_ => (Math.random() <= f)) + } + + // Reference implementation of sampling with replacement + def sample_replace[T](data: Iterator[T], f: Double): Iterator[T] = { + val rng = new Poisson(f, new DRand(rngSeed.nextInt)) + data.flatMap(v => { + val rep = rng.nextInt() + if (rep == 0) Iterator.empty else Iterator.fill(rep)(v) + }) + } + + def gaps(data: Iterator[Int]): Iterator[Int] = { + // this function assumes samples are emitted in non-decreasing order, which + // works because that is how I generate them, and the samplers preserve input order + data.scanLeft((0,0))((l:(Int,Int), r:Int) => (r, r-l._1)).drop(2).map(_._2) + } + + def chist(hist: Array[Int]): Array[Double] = { + val n = hist.sum.toDouble + assert(n > 0.0) + hist.scanLeft(0)(_ + _).drop(1).map(_.toDouble/n) + } + + def cumulants(d1: Array[Int], d2: Array[Int], --- End diff -- need doc
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastructure@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org