This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new b3108279ffb  [SPARK-42022][CONNECT][PYTHON] Fix createDataFrame to autogenerate missing column names
b3108279ffb is described below

commit b3108279ffb201364d480ab3a104e835001ef0b1
Author: Takuya UESHIN <ues...@databricks.com>
AuthorDate: Wed Mar 8 09:35:47 2023 +0900

[SPARK-42022][CONNECT][PYTHON] Fix createDataFrame to autogenerate missing column names

### What changes were proposed in this pull request?

Fixes `createDataFrame` to autogenerate missing column names.

### Why are the changes needed?

Currently, when the number of column names specified to `createDataFrame` does not match the actual number of columns, it raises an error:

```py
>>> spark.createDataFrame([["a", "b"]], ["col1"])
Traceback (most recent call last):
...
ValueError: Length mismatch: Expected axis has 1 elements, new values have 2 elements
```

but it should auto-generate the missing column names.

### Does this PR introduce _any_ user-facing change?

It will auto-generate the missing column names:

```py
>>> spark.createDataFrame([["a", "b"]], ["col1"])
DataFrame[col1: string, _2: string]
```

### How was this patch tested?

Enabled the related test.

Closes #40310 from ueshin/issues/SPARK-42022/columns.

Authored-by: Takuya UESHIN <ues...@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
(cherry picked from commit 056ed5dc67a660bf0808af6edc8668715b64d2a1)
Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 python/pyspark/sql/connect/session.py                 | 7 +++++++
 python/pyspark/sql/tests/connect/test_parity_types.py | 5 -----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py
index 04e4f9e88d0..dd1c2d3c510 100644
--- a/python/pyspark/sql/connect/session.py
+++ b/python/pyspark/sql/connect/session.py
@@ -235,6 +235,10 @@ class SparkSession:
             # If no schema supplied by user then get the names of columns only
             if schema is None:
                 _cols = [str(x) if not isinstance(x, str) else x for x in data.columns]
+            elif isinstance(schema, (list, tuple)) and cast(int, _num_cols) < len(data.columns):
+                assert isinstance(_cols, list)
+                _cols.extend([f"_{i + 1}" for i in range(cast(int, _num_cols), len(data.columns))])
+                _num_cols = len(_cols)

             # Determine arrow types to coerce data when creating batches
             if isinstance(schema, StructType):
@@ -309,6 +313,9 @@ class SparkSession:

             _inferred_schema = self._inferSchemaFromList(_data, _cols)

+            if _cols is not None and cast(int, _num_cols) < len(_cols):
+                _num_cols = len(_cols)
+
             if _has_nulltype(_inferred_schema):
                 # For cases like createDataFrame([("Alice", None, 80.1)], schema)
                 # we can not infer the schema from the data itself.
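For reference, a minimal standalone sketch of the name-padding behavior added in the session.py hunks above; the inputs `data_columns` and `schema_names` are illustrative placeholders, not part of the patch:

```py
# Sketch of the column-name padding logic (illustrative inputs, not the patch itself).
data_columns = ["a", "b", "c"]   # columns actually present in the data
schema_names = ["col1"]          # user-supplied names: fewer than the data has

_cols = list(schema_names)
_num_cols = len(_cols)
if _num_cols < len(data_columns):
    # Autogenerate names for the missing positions as _2, _3, ... (1-based position)
    _cols.extend([f"_{i + 1}" for i in range(_num_cols, len(data_columns))])
    _num_cols = len(_cols)

print(_cols)  # ['col1', '_2', '_3']
```

This mirrors the padding the patch applies in `SparkSession.createDataFrame`, so that `spark.createDataFrame([["a", "b"]], ["col1"])` yields `DataFrame[col1: string, _2: string]` instead of raising a length-mismatch error.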
diff --git a/python/pyspark/sql/tests/connect/test_parity_types.py b/python/pyspark/sql/tests/connect/test_parity_types.py
index 3d54d488a5d..67d5a17660e 100644
--- a/python/pyspark/sql/tests/connect/test_parity_types.py
+++ b/python/pyspark/sql/tests/connect/test_parity_types.py
@@ -90,11 +90,6 @@ class TypesParityTests(TypesTestsMixin, ReusedConnectTestCase):
     def test_infer_schema(self):
         super().test_infer_schema()

-    # TODO(SPARK-42022): createDataFrame should autogenerate missing column names
-    @unittest.skip("Fails in Spark Connect, should enable.")
-    def test_infer_schema_not_enough_names(self):
-        super().test_infer_schema_not_enough_names()
-
     # TODO(SPARK-42020): createDataFrame with UDT
     @unittest.skip("Fails in Spark Connect, should enable.")
     def test_infer_schema_specification(self):

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org