This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push: new 6e22f1108bf4 [SPARK-48248][PYTHON] Fix nested array to respect legacy conf of inferArrayTypeFromFirstElement 6e22f1108bf4 is described below commit 6e22f1108bf4c0d28b03f2618e308cde6fc7faa0 Author: Hyukjin Kwon <gurwls...@apache.org> AuthorDate: Mon May 13 17:15:28 2024 +0900 [SPARK-48248][PYTHON] Fix nested array to respect legacy conf of inferArrayTypeFromFirstElement What changes were proposed: this PR fixes a bug, introduced by https://github.com/apache/spark/pull/36545, where `spark.sql.pyspark.legacy.inferArrayTypeFromFirstElement.enabled` was not respected when inferring the type of nested arrays. Why the changes are needed: to have a way to restore the original behaviour. User-facing change: yes, it fixes the regression when `spark.sql.pyspark.legacy.inferArrayTypeFromFirstElement.enabled` is set to `True`. How it was tested: a unit test was added. Authored using generative AI tooling: no. Closes #46548 from HyukjinKwon/SPARK-48248. Authored-by: Hyukjin Kwon <gurwls...@apache.org> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> (cherry picked from commit b2140d0f25d81e64a968df83c5da5089051acaac) Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/sql/tests/test_types.py | 7 +++++++ python/pyspark/sql/types.py | 18 ++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index c5a6a1756976..02e3476ffd3b 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -1192,6 +1192,13 @@ class TypesTestsMixin: for n, (a, e) in enumerate(zip(actual, expected)): self.assertEqual(a, e, "%s does not match with %s" % (exprs[n], expected[n])) + def test_infer_array_element_type_with_struct(self): + # SPARK-48248: Nested array to respect legacy conf of inferArrayTypeFromFirstElement + with self.sql_conf( + {"spark.sql.pyspark.legacy.inferArrayTypeFromFirstElement.enabled": True} + ): + self.assertEqual([[1, None]], self.spark.createDataFrame([[[[1, "a"]]]]).first()[0]) + class 
DataTypeTests(unittest.TestCase): # regression test for SPARK-6055 diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 21fef0a2c08a..c668c5c59eca 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1508,13 +1508,27 @@ def _infer_type( if len(obj) > 0: if infer_array_from_first_element: return ArrayType( - _infer_type(obj[0], infer_dict_as_struct, prefer_timestamp_ntz), True + _infer_type( + obj[0], + infer_dict_as_struct, + infer_array_from_first_element, + prefer_timestamp_ntz, + ), + True, ) else: return ArrayType( reduce( _merge_type, - (_infer_type(v, infer_dict_as_struct, prefer_timestamp_ntz) for v in obj), + ( + _infer_type( + v, + infer_dict_as_struct, + infer_array_from_first_element, + prefer_timestamp_ntz, + ) + for v in obj + ), ), True, ) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org