Updated Branches: refs/heads/master 2576896c9 -> 14132b093
CRUNCH-98. Sampling Scala PCollection. Project: http://git-wip-us.apache.org/repos/asf/incubator-crunch/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-crunch/commit/14132b09 Tree: http://git-wip-us.apache.org/repos/asf/incubator-crunch/tree/14132b09 Diff: http://git-wip-us.apache.org/repos/asf/incubator-crunch/diff/14132b09 Branch: refs/heads/master Commit: 14132b093cd5201b0f323d7fb9c7a9ab4a58a679 Parents: 2576896 Author: Kiyan Ahmadizadeh <[email protected]> Authored: Tue Oct 16 15:29:41 2012 -0700 Committer: Kiyan Ahmadizadeh <[email protected]> Committed: Tue Oct 16 16:10:47 2012 -0700 ---------------------------------------------------------------------- .../apache/crunch/scrunch/PCollectionTest.scala | 28 +++++++++++++++ .../org/apache/crunch/scrunch/PCollection.scala | 8 ++++ 2 files changed, 36 insertions(+), 0 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/14132b09/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala ---------------------------------------------------------------------- diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala index 4c25298..94ac917 100644 --- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala +++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala @@ -69,4 +69,32 @@ class PCollectionTest extends CrunchTestSupport with JUnitSuite { assertEquals("Wrong last line in Shakespeare.", lastLineInShakespeare, lines(linesInShakespeare - 1)) } + + /** + * Tests sampling elements from a PCollection using some acceptance probability. + */ + @Test def testSampling { + // Get the collection and sample ten percent. + val shakespeare = shakespeareCollection + val sampledCollection = shakespeare.sample(0.10) + val length = sampledCollection.length().value() + // The number of lines in the sampled collection should be about ten percent of the lines in + // the original collection. We use a tolerance of +- 50. + val lower = linesInShakespeare * 0.10 - 50 + val upper = linesInShakespeare * 0.10 + 50 + assertTrue("Sampled collection contains too few elements.", lower <= length) + assertTrue("Sampled collection contains too many elements.", length <= upper) + } + + /** + * Tests sampling elements from a PCollection using some acceptance probability and a seed. + */ + @Test def testSamplingWithSeed { + // Get the collection and sample ten percent. + val shakespeare = shakespeareCollection + // With a seed of 1L, 380 elements should be sampled. + val sampledCollection = shakespeare.sample(0.10, 1L) + val length = sampledCollection.length().value() + assertEquals("Incorrect number of elements sampled with seed 1L.", 380L, length) + } } http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/14132b09/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala ---------------------------------------------------------------------- diff --git a/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala b/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala index 89959ea..ac2242f 100644 --- a/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala +++ b/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala @@ -76,6 +76,14 @@ class PCollection[S](val native: JCollection[S]) extends PCollectionLike[S, PCol def min()(implicit converter: Converter[S, S]) = PObject(Aggregate.min(native))(converter) + def sample(acceptanceProbability: Double) = { + wrap(native.sample(acceptanceProbability)) + } + + def sample(acceptanceProbability: Double, seed: Long) = { + wrap(native.sample(acceptanceProbability, seed)) + } + def pType = native.getPType() }
