cloud-fan commented on code in PR #55962:
URL: https://github.com/apache/spark/pull/55962#discussion_r3270663535


##########
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCustomMetadataStructSuite.scala:
##########
@@ -336,6 +338,55 @@ class FileSourceCustomMetadataStructSuite extends 
SharedSparkSession {
     }
   }
 
+  test("[SPARK-56931] complex constant metadata fields (array<struct>, struct) 
on row path") {
+    withTempData("parquet", FILE_SCHEMA) { (_, f0, f1) =>
+      val permElement = StructType(Seq(
+        StructField("email", StringType),
+        StructField("role", StringType)))
+      val locationStruct = StructType(Seq(
+        StructField("country", StringType),
+        StructField("city", StringType)))
+      val complexFields = Seq(
+        FileSourceConstantMetadataStructField("perms", ArrayType(permElement, 
containsNull = true)),
+        FileSourceConstantMetadataStructField("location", locationStruct))
+      val format = new TestFileFormat(complexFields)
+
+      // Build per-file values in catalyst form.
+      def perms(email: String, role: String): InternalRow =
+        InternalRow(UTF8String.fromString(email), UTF8String.fromString(role))
+      def loc(country: String, city: String): InternalRow =
+        InternalRow(UTF8String.fromString(country), 
UTF8String.fromString(city))
+
+      val files = Seq(
+        FileStatusWithMetadata(f0, Map(
+          "perms" -> new GenericArrayData(Array[Any](perms("a@x", "r"), 
perms("b@x", "w"))),
+          "location" -> loc("US", "SFO"))),
+        FileStatusWithMetadata(f1, Map(
+          "perms" -> new GenericArrayData(Array[Any](perms("c@x", "r"))),
+          "location" -> loc("CA", "YYZ"))))
+      val df = createDF(format, files)
+
+      // Force the row materialization path (Batched=false) so we exercise the
+      // updateMetadataInternalRow -> getFileConstantMetadataColumnValue -> 
Literal.create
+      // change end-to-end. The query touches a subset of each subfield to 
also exercise
+      // the metadata-schema pruning preservation rule.
+      withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
+        checkAnswer(
+          df.selectExpr(
+            "fileNum",
+            "_metadata.perms[0].email AS first_email",
+            "_metadata.perms[1].role AS second_role",
+            "_metadata.location.country AS country",

Review Comment:
   **Test breadth — StructType preservation.** This query touches only 
`location.country`, which sits at index 0 of `location: StructType(country, 
city)`. A buggy implementation that pruned `city` from the kept sub-attribute's 
inner struct would still produce `"US"` here — the extractor's 
`InternalRow("US", "SFO")` has `"US"` at index 0 regardless. To actually 
exercise the preservation rule for the StructType case (as opposed to the array 
case, which `perms[1].role` does verify), the query needs to touch a non-first 
sub-field of `location`. Either swap `country` for `city`, or add 
`_metadata.location.city AS city` alongside, adjusting the expected rows passed 
to `checkAnswer` to match.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to