Repository: spark
Updated Branches:
  refs/heads/branch-1.0 618b3e6e7 -> 66fe4797a
[SPARK-1784] Add a new partitioner to allow specifying # of keys per partition

This change adds a new partitioner which allows users to specify the number of
keys per partition.

Author: Syed Hashmi <shas...@cloudera.com>

Closes #721 from syedhashmi/master and squashes the following commits:

4ca94cc [Syed Hashmi] [SPARK-1784] Add a new partitioner

(cherry picked from commit 92cebada09a7e5a00ab48bcb350a9462949c33eb)
Signed-off-by: Reynold Xin <r...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/66fe4797
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/66fe4797
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/66fe4797

Branch: refs/heads/branch-1.0
Commit: 66fe4797a845bb1a2728dcdb2d7371f0e90da867
Parents: 618b3e6
Author: Syed Hashmi <shas...@cloudera.com>
Authored: Tue May 13 21:24:23 2014 -0700
Committer: Reynold Xin <r...@apache.org>
Committed: Tue May 13 21:25:01 2014 -0700

----------------------------------------------------------------------
 .../scala/org/apache/spark/Partitioner.scala   | 61 ++++++++++++++++++++
 .../org/apache/spark/PartitioningSuite.scala   | 34 +++++++++++
 2 files changed, 95 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/66fe4797/core/src/main/scala/org/apache/spark/Partitioner.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala
index 9155159..6274796 100644
--- a/core/src/main/scala/org/apache/spark/Partitioner.scala
+++ b/core/src/main/scala/org/apache/spark/Partitioner.scala
@@ -156,3 +156,64 @@ class RangePartitioner[K : Ordering : ClassTag, V](
     false
   }
 }
+
+/**
+ * A [[org.apache.spark.Partitioner]] that assigns at most `boundary` keys to each partition
+ * (the default boundary is 1000). Once every partition holds `boundary` elements, the
+ * partitioner allocates one element per partition at a time, so eventually the smaller
+ * partitions are at most one key smaller than the larger partitions.
+ */
+class BoundaryPartitioner[K : Ordering : ClassTag, V](
+    partitions: Int,
+    @transient rdd: RDD[_ <: Product2[K,V]],
+    private val boundary: Int = 1000)
+  extends Partitioner {
+
+  // This array keeps track of the number of keys assigned to each partition:
+  // counts(0) is the number of keys in partition 0, and so on.
+  private val counts: Array[Int] = {
+    new Array[Int](numPartitions)
+  }
+
+  def numPartitions = math.abs(partitions)
+
+  /*
+   * Ideally, this would be calculated from the number of partitions and the total number of
+   * keys, but we do not call count() on the RDD here to avoid triggering an action.
+   * The user has the flexibility to call count() and pass in any appropriate boundary.
+   */
+  def keysPerPartition = boundary
+
+  var currPartition = 0
+
+  /*
+   * Pick the current partition for the key until we hit the per-partition bound,
+   * then start allocating to the next partition.
+   *
+   * NOTE: If we have, say, 2000 keys and the user asks for 3 partitions with 500
+   * passed in as the boundary, the first 500 keys go to P1, 501-1000 go to P2,
+   * 1001-1500 go to P3, and after that the remaining keys go to one partition at a
+   * time: 1501 goes to P1, 1502 goes to P2, 1503 goes to P3, and so on.
+   */
+  def getPartition(key: Any): Int = {
+    val partition = currPartition
+    counts(partition) = counts(partition) + 1
+    /*
+     * Since we fill up a partition before moving on to the next one (which helps preserve
+     * the order of keys), it is possible in certain cases to end up with empty partitions:
+     * with 3 partitions and 500 keys per partition, if the RDD has only 700 keys, one
+     * partition will be entirely empty.
+     */
+    if (counts(currPartition) >= keysPerPartition)
+      currPartition = (currPartition + 1) % numPartitions
+    partition
+  }
+
+  override def equals(other: Any): Boolean = other match {
+    case r: BoundaryPartitioner[_,_] =>
+      (r.counts.sameElements(counts) && r.boundary == boundary
+        && r.currPartition == currPartition)
+    case _ =>
+      false
+  }
+}


http://git-wip-us.apache.org/repos/asf/spark/blob/66fe4797/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
----------------------------------------------------------------------
diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
index 7c30626..7d40395 100644
--- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
+++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala
@@ -66,6 +66,40 @@ class PartitioningSuite extends FunSuite with SharedSparkContext with PrivateMet
     assert(descendingP4 != p4)
   }
 
+  test("BoundaryPartitioner equality") {
+    // Equality depends only on the boundary and the partitioner's internal counters,
+    // not on the contents of the RDD passed in.
+    val rdd = sc.parallelize(1.to(4000)).map(x => (x, x))
+
+    val p2 = new BoundaryPartitioner(2, rdd, 1000)
+    val p4 = new BoundaryPartitioner(4, rdd, 1000)
+    val anotherP4 = new BoundaryPartitioner(4, rdd)
+
+    assert(p2 === p2)
+    assert(p4 === p4)
+    assert(p2 != p4)
+    assert(p4 != p2)
+    assert(p4 === anotherP4)
+    assert(anotherP4 === p4)
+  }
+
+  test("BoundaryPartitioner getPartition") {
+    val rdd = sc.parallelize(1.to(2000)).map(x => (x, x))
+    val partitioner = new BoundaryPartitioner(4, rdd, 500)
+    1.to(2000).map { element => {
+      val partition = partitioner.getPartition(element)
+      if (element <= 500) {
+        assert(partition === 0)
+      } else if (element > 500 && element <= 1000) {
+        assert(partition === 1)
+      } else if (element > 1000 && element <= 1500) {
+        assert(partition === 2)
+      } else if (element > 1500 && element <= 2000) {
+        assert(partition === 3)
+      }
+    }}
+  }
+
   test("RangePartitioner getPartition") {
     val rdd = sc.parallelize(1.to(2000)).map(x => (x, x))
     // We have different behaviour of getPartition for partitions with less than 1000 and more than
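
For reference, the sketch below shows one way the new partitioner could be exercised from
user code. It is illustrative only and not part of this commit: the object name, app name,
local master, and single input slice are assumptions. It relies on the standard partitionBy
operation from PairRDDFunctions, which in Spark 1.0 is brought in via the SparkContext._
implicits.

  import org.apache.spark.{BoundaryPartitioner, SparkConf, SparkContext}
  import org.apache.spark.SparkContext._   // pair-RDD implicits (partitionBy) in Spark 1.0

  object BoundaryPartitionerExample {
    def main(args: Array[String]): Unit = {
      // Hypothetical local driver setup; app name and master are placeholders.
      val conf = new SparkConf().setAppName("BoundaryPartitionerExample").setMaster("local[2]")
      val sc = new SparkContext(conf)

      // 2000 (key, value) pairs in a single input slice, so the partitioner's stateful
      // allocation sees the keys in order: 1-500 -> partition 0, 501-1000 -> partition 1, etc.
      val pairs = sc.parallelize(1 to 2000, 1).map(x => (x, x))
      val partitioned = pairs.partitionBy(new BoundaryPartitioner(4, pairs, 500))

      // Report how many records landed in each of the 4 partitions (should be 500 each here).
      val sizes = partitioned.mapPartitionsWithIndex((i, it) => Iterator((i, it.size))).collect()
      sizes.sortBy(_._1).foreach { case (i, n) => println("partition " + i + ": " + n + " records") }

      sc.stop()
    }
  }

Note that because getPartition keeps mutable state (counts, currPartition), each map task works
with its own copy of those counters; with more than one input slice the global partition sizes
would likely deviate from the configured boundary.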