This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 4027474cc744 [SPARK-45221][PYTHON][DOCS] Refine docstring of DataFrameReader.parquet
4027474cc744 is described below

commit 4027474cc74438b29b0eae38f07ab03aeab99f5a
Author: Hyukjin Kwon <gurwls...@gmail.com>
AuthorDate: Thu Oct 12 09:24:27 2023 +0900

    [SPARK-45221][PYTHON][DOCS] Refine docstring of DataFrameReader.parquet

    ### What changes were proposed in this pull request?

    This PR refines the docstring of DataFrameReader.parquet by adding more examples.

    ### Why are the changes needed?

    To improve PySpark documentation

    ### Does this PR introduce _any_ user-facing change?

    No

    ### How was this patch tested?

    doctest

    ### Was this patch authored or co-authored using generative AI tooling?

    No

    Closes #43301 from allisonwang-db/spark-45221-refine-parquet.

    Lead-authored-by: Hyukjin Kwon <gurwls...@gmail.com>
    Co-authored-by: allisonwang-db <allison.w...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/sql/readwriter.py | 68 ++++++++++++++++++++++++++++++++++------
 1 file changed, 58 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index cfac8fdbc68b..ea429a75e157 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -495,6 +495,7 @@ class DataFrameReader(OptionUtils):
         Parameters
         ----------
         paths : str
+            One or more file paths to read the Parquet files from.
 
         Other Parameters
         ----------------
@@ -505,24 +506,71 @@ class DataFrameReader(OptionUtils):
 
             .. # noqa
 
+        Returns
+        -------
+        :class:`DataFrame`
+            A DataFrame containing the data from the Parquet files.
+
         Examples
         --------
+        Create sample dataframes.
+
+        >>> df = spark.createDataFrame(
+        ...     [(10, "Alice"), (15, "Bob"), (20, "Tom")], schema=["age", "name"])
+        >>> df2 = spark.createDataFrame([(70, "Alice"), (80, "Bob")], schema=["height", "name"])
+
         Write a DataFrame into a Parquet file and read it back.
 
         >>> import tempfile
         >>> with tempfile.TemporaryDirectory() as d:
-        ...     # Write a DataFrame into a Parquet file
-        ...     spark.createDataFrame(
-        ...         [{"age": 100, "name": "Hyukjin Kwon"}]
-        ...     ).write.mode("overwrite").format("parquet").save(d)
+        ...     # Write a DataFrame into a Parquet file.
+        ...     df.write.mode("overwrite").format("parquet").save(d)
         ...
         ...     # Read the Parquet file as a DataFrame.
-        ...     spark.read.parquet(d).show()
-        +---+------------+
-        |age|        name|
-        +---+------------+
-        |100|Hyukjin Kwon|
-        +---+------------+
+        ...     spark.read.parquet(d).orderBy("name").show()
+        +---+-----+
+        |age| name|
+        +---+-----+
+        | 10|Alice|
+        | 15|  Bob|
+        | 20|  Tom|
+        +---+-----+
+
+        Read a Parquet file with a specific column.
+
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     df.write.mode("overwrite").format("parquet").save(d)
+        ...
+        ...     # Read the Parquet file with only the 'name' column.
+        ...     spark.read.schema("name string").parquet(d).orderBy("name").show()
+        +-----+
+        | name|
+        +-----+
+        |Alice|
+        |  Bob|
+        |  Tom|
+        +-----+
+
+        Read multiple Parquet files and merge schema.
+
+        >>> with tempfile.TemporaryDirectory() as d1, tempfile.TemporaryDirectory() as d2:
+        ...     df.write.mode("overwrite").format("parquet").save(d1)
+        ...     df2.write.mode("overwrite").format("parquet").save(d2)
+        ...
+        ...     spark.read.option(
+        ...         "mergeSchema", "true"
+        ...     ).parquet(d1, d2).select(
+        ...         "name", "age", "height"
+        ...     ).orderBy("name", "age").show()
+        +-----+----+------+
+        | name| age|height|
+        +-----+----+------+
+        |Alice|NULL|    70|
+        |Alice|  10|  NULL|
+        |  Bob|NULL|    80|
+        |  Bob|  15|  NULL|
+        |  Tom|  20|  NULL|
+        +-----+----+------+
         """
         mergeSchema = options.get("mergeSchema", None)
         pathGlobFilter = options.get("pathGlobFilter", None)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org