GitHub user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/20456#discussion_r165651046 --- Diff: python/pyspark/sql/dataframe.py --- @@ -667,6 +667,92 @@ def repartition(self, numPartitions, *cols): else: raise TypeError("numPartitions should be an int or Column") + @since("2.3.0") + def repartitionByRange(self, numPartitions, *cols): + """ + Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The + resulting DataFrame is range partitioned. + + ``numPartitions`` can be an int to specify the target number of partitions or a Column. + If it is a Column, it will be used as the first partitioning column. If not specified, + the default number of partitions is used. + + At least one partition-by expression must be specified. + When no explicit sort order is specified, "ascending nulls first" is assumed. + + >>> df.repartitionByRange(2, "age").rdd.getNumPartitions() + 2 + >>> df.show() + +---+-----+ + |age| name| + +---+-----+ + | 2|Alice| + | 5| Bob| + +---+-----+ + >>> df.repartitionByRange(1, "age").rdd.getNumPartitions() + 1 + >>> data = df.union(df) + >>> data.show() + +---+-----+ + |age| name| + +---+-----+ + | 2|Alice| + | 5| Bob| + | 2|Alice| + | 5| Bob| + +---+-----+ + >>> data = data.repartitionByRange(3, "age") + >>> data.show() + +---+-----+ + |age| name| + +---+-----+ + | 2|Alice| + | 2|Alice| + | 5| Bob| + | 5| Bob| + +---+-----+ + >>> data.rdd.getNumPartitions() + 3 + >>> data = data.repartitionByRange("age") + >>> data.rdd.getNumPartitions() + 3 + >>> data2 = df.union(df).union(df) --- End diff -- I think we don't need `union` either.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org