GitHub user holdenk commented on a diff in the pull request: https://github.com/apache/spark/pull/22010#discussion_r220674552 --- Diff: core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala --- @@ -95,6 +95,18 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext { assert(!deserial.toString().isEmpty()) } + test("distinct with known partitioner preserves partitioning") { + val rdd = sc.parallelize(1.to(100), 10).map(x => (x % 10, x % 10)).sortByKey() + val initialPartitioner = rdd.partitioner + val distinctRdd = rdd.distinct() + val resultingPartitioner = distinctRdd.partitioner + assert(initialPartitioner === resultingPartitioner) + val distinctRddDifferent = rdd.distinct(5) + val distinctRddDifferentPartitioner = distinctRddDifferent.partitioner + assert(initialPartitioner != distinctRddDifferentPartitioner) + assert(distinctRdd.collect().sorted === distinctRddDifferent.collect().sorted) --- End diff -- We could, but we don't need to.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org