Repository: spark Updated Branches: refs/heads/master 99693fef0 -> a24477996
[SPARK-11690][PYSPARK] Add pivot to python api This PR adds pivot to the python api of GroupedData with the same syntax as Scala/Java. Author: Andrew Ray <ray.and...@gmail.com> Closes #9653 from aray/sql-pivot-python. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2447799 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2447799 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2447799 Branch: refs/heads/master Commit: a24477996e936b0861819ffb420f763f80f0b1da Parents: 99693fe Author: Andrew Ray <ray.and...@gmail.com> Authored: Fri Nov 13 10:31:17 2015 -0800 Committer: Yin Huai <yh...@databricks.com> Committed: Fri Nov 13 10:31:17 2015 -0800 ---------------------------------------------------------------------- python/pyspark/sql/group.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/a2447799/python/pyspark/sql/group.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 71c0bcc..227f40b 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -17,7 +17,7 @@ from pyspark import since from pyspark.rdd import ignore_unicode_prefix -from pyspark.sql.column import Column, _to_seq +from pyspark.sql.column import Column, _to_seq, _to_java_column, _create_column_from_literal from pyspark.sql.dataframe import DataFrame from pyspark.sql.types import * @@ -167,6 +167,23 @@ class GroupedData(object): [Row(sum(age)=7, sum(height)=165)] """ + @since(1.6) + def pivot(self, pivot_col, *values): + """Pivots a column of the current DataFrame and performs the specified aggregation. 
+ + :param pivot_col: Column to pivot + :param values: Optional list of values of pivot_col that will be translated to columns in + the output data frame. If values are not provided the method will do an immediate call + to .distinct() on the pivot column. + >>> df4.groupBy("year").pivot("course", "dotNET", "Java").sum("earnings").collect() + [Row(year=2012, dotNET=15000, Java=20000), Row(year=2013, dotNET=48000, Java=30000)] + >>> df4.groupBy("year").pivot("course").sum("earnings").collect() + [Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)] + """ + jgd = self._jdf.pivot(_to_java_column(pivot_col), + _to_seq(self.sql_ctx._sc, values, _create_column_from_literal)) + return GroupedData(jgd, self.sql_ctx) + def _test(): import doctest @@ -182,6 +199,11 @@ def _test(): StructField('name', StringType())])) globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80), Row(name='Bob', age=5, height=85)]).toDF() + globs['df4'] = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000), + Row(course="Java", year=2012, earnings=20000), + Row(course="dotNET", year=2012, earnings=5000), + Row(course="dotNET", year=2013, earnings=48000), + Row(course="Java", year=2013, earnings=30000)]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.group, globs=globs, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org