This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 7f724c3bc75 [SPARK-43213][PYTHON] Add `DataFrame.offset` to vanilla PySpark 7f724c3bc75 is described below commit 7f724c3bc7567b0cddc09d5bed11b79879533368 Author: Ruifeng Zheng <ruife...@apache.org> AuthorDate: Fri Apr 21 12:55:29 2023 +0800 [SPARK-43213][PYTHON] Add `DataFrame.offset` to vanilla PySpark ### What changes were proposed in this pull request? Add `DataFrame.offset` to PySpark ### Why are the changes needed? `DataFrame.offset` was supported in Scala side and Spark Connect since 3.4, but it is missing in vanilla PySpark. ### Does this PR introduce _any_ user-facing change? yes, new API ### How was this patch tested? added doctests Closes #40873 from zhengruifeng/python_df_offset. Authored-by: Ruifeng Zheng <ruife...@apache.org> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- .../source/reference/pyspark.sql/dataframe.rst | 1 + python/pyspark/sql/dataframe.py | 35 ++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/python/docs/source/reference/pyspark.sql/dataframe.rst b/python/docs/source/reference/pyspark.sql/dataframe.rst index 98bf9465f80..1d6712bb042 100644 --- a/python/docs/source/reference/pyspark.sql/dataframe.rst +++ b/python/docs/source/reference/pyspark.sql/dataframe.rst @@ -79,6 +79,7 @@ DataFrame DataFrame.melt DataFrame.na DataFrame.observe + DataFrame.offset DataFrame.orderBy DataFrame.persist DataFrame.printSchema diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 0edccb1069f..ae5fd514f27 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1273,6 +1273,41 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): jdf = self._jdf.limit(num) return DataFrame(jdf, self.sparkSession) + def offset(self, num: int) -> "DataFrame": + """Returns a new :class: `DataFrame` by skipping the first `n` rows. + + .. versionadded:: 3.5.0 + + Parameters + ---------- + num : int + Number of records to skip. + + Returns + ------- + :class:`DataFrame` + Subset of the records + + Examples + -------- + >>> df = spark.createDataFrame( + ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]) + >>> df.offset(1).show() + +---+-----+ + |age| name| + +---+-----+ + | 23|Alice| + | 16| Bob| + +---+-----+ + >>> df.offset(10).show() + +---+----+ + |age|name| + +---+----+ + +---+----+ + """ + jdf = self._jdf.offset(num) + return DataFrame(jdf, self.sparkSession) + def take(self, num: int) -> List[Row]: """Returns the first ``num`` rows as a :class:`list` of :class:`Row`. --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org