Updated Branches:
  refs/heads/master 2576896c9 -> 14132b093

CRUNCH-98. Sampling Scala PCollection.


Project: http://git-wip-us.apache.org/repos/asf/incubator-crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-crunch/commit/14132b09
Tree: http://git-wip-us.apache.org/repos/asf/incubator-crunch/tree/14132b09
Diff: http://git-wip-us.apache.org/repos/asf/incubator-crunch/diff/14132b09

Branch: refs/heads/master
Commit: 14132b093cd5201b0f323d7fb9c7a9ab4a58a679
Parents: 2576896
Author: Kiyan Ahmadizadeh <[email protected]>
Authored: Tue Oct 16 15:29:41 2012 -0700
Committer: Kiyan Ahmadizadeh <[email protected]>
Committed: Tue Oct 16 16:10:47 2012 -0700

----------------------------------------------------------------------
 .../apache/crunch/scrunch/PCollectionTest.scala    |   28 +++++++++++++++
 .../org/apache/crunch/scrunch/PCollection.scala    |    8 ++++
 2 files changed, 36 insertions(+), 0 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/14132b09/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
index 4c25298..94ac917 100644
--- a/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
+++ b/crunch-scrunch/src/it/scala/org/apache/crunch/scrunch/PCollectionTest.scala
@@ -69,4 +69,32 @@ class PCollectionTest extends CrunchTestSupport with JUnitSuite {
     assertEquals("Wrong last line in Shakespeare.", lastLineInShakespeare,
         lines(linesInShakespeare - 1))
   }
+
+  /**
+   * Tests sampling elements from a PCollection using some acceptance probability.
+   */
+  @Test def testSampling {
+    // Get the collection and sample ten percent.
+    val shakespeare = shakespeareCollection
+    val sampledCollection = shakespeare.sample(0.10)
+    val length = sampledCollection.length().value()
+    // The number of lines in the sampled collection should be about ten percent of the lines in
+    // the original collection. We use a tolerance of +- 50.
+    val lower = linesInShakespeare * 0.10 - 50
+    val upper = linesInShakespeare * 0.10 + 50
+    assertTrue("Sampled collection contains too few elements.", lower <= length)
+    assertTrue("Sampled collection contains too many elements.", length <= upper)
+  }
+
+  /**
+   * Tests sampling elements from a PCollection using some acceptance probability and a seed.
+   */
+  @Test def testSamplingWithSeed {
+    // Get the collection and sample ten percent.
+    val shakespeare = shakespeareCollection
+    // With a seed of 1L, 380 elements should be sampled.
+    val sampledCollection = shakespeare.sample(0.10, 1L)
+    val length = sampledCollection.length().value()
+    assertEquals("Incorrect number of elements sampled with seed 1L.", 380L, length)
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/14132b09/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala b/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
index 89959ea..ac2242f 100644
--- a/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
+++ b/crunch-scrunch/src/main/scala/org/apache/crunch/scrunch/PCollection.scala
@@ -76,6 +76,14 @@ class PCollection[S](val native: JCollection[S]) extends PCollectionLike[S, PCol
 
   def min()(implicit converter: Converter[S, S]) = PObject(Aggregate.min(native))(converter)
 
+  def sample(acceptanceProbability: Double) = {
+    wrap(native.sample(acceptanceProbability))
+  }
+
+  def sample(acceptanceProbability: Double, seed: Long) = {
+    wrap(native.sample(acceptanceProbability, seed))
+  }
+
   def pType = native.getPType()
 }
 

Reply via email to