Repository: spark Updated Branches: refs/heads/branch-1.2 8cefb63c1 -> 3b07c483a
[SPARK-4304] [PySpark] Fix sort on empty RDD This PR fix sortBy()/sortByKey() on empty RDD. This should be back ported into 1.1/1.2 Author: Davies Liu <dav...@databricks.com> Closes #3162 from davies/fix_sort and squashes the following commits: 84f64b7 [Davies Liu] add tests 52995b5 [Davies Liu] fix sortByKey() on empty RDD (cherry picked from commit 7779109796c90d789464ab0be35917f963bbe867) Signed-off-by: Josh Rosen <joshro...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b07c483 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b07c483 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b07c483 Branch: refs/heads/branch-1.2 Commit: 3b07c483aa98965ac9dc8fdcc40e593e4edb97fd Parents: 8cefb63 Author: Davies Liu <dav...@databricks.com> Authored: Fri Nov 7 20:53:03 2014 -0800 Committer: Josh Rosen <joshro...@databricks.com> Committed: Fri Nov 7 20:53:34 2014 -0800 ---------------------------------------------------------------------- python/pyspark/rdd.py | 2 ++ python/pyspark/tests.py | 3 +++ 2 files changed, 5 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/3b07c483/python/pyspark/rdd.py ---------------------------------------------------------------------- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 879655d..08d0474 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -521,6 +521,8 @@ class RDD(object): # the key-space into bins such that the bins have roughly the same # number of (key, value) pairs falling into them rddSize = self.count() + if not rddSize: + return self # empty RDD maxSampleSize = numPartitions * 20.0 # constant from Spark's RangePartitioner fraction = min(maxSampleSize / max(rddSize, 1), 1.0) samples = self.sample(False, fraction, 1).map(lambda (k, v): k).collect() http://git-wip-us.apache.org/repos/asf/spark/blob/3b07c483/python/pyspark/tests.py ---------------------------------------------------------------------- diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 9f625c5..491e445 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -649,6 +649,9 @@ class RDDTests(ReusedPySparkTestCase): self.assertEquals(result.getNumPartitions(), 5) self.assertEquals(result.count(), 3) + def test_sort_on_empty_rdd(self): + self.assertEqual([], self.sc.parallelize(zip([], [])).sortByKey().collect()) + def test_sample(self): rdd = self.sc.parallelize(range(0, 100), 4) wo = rdd.sample(False, 0.1, 2).collect() --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org