This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new a81428240e Add check for empty schema in `parquet::schema::types::from_thrift_helper` (#6990)
a81428240e is described below

commit a81428240e87f85f12ea8d1bc160598d9af1ba35
Author: Ed Seidl <[email protected]>
AuthorDate: Sun Jan 19 04:43:43 2025 -0800

    Add check for empty schema in `parquet::schema::types::from_thrift_helper` (#6990)
    
    * fix for empty schema
    
    * use Schema::empty per review comment
    
    * check schema and number of batches as well (per suggestion from @alamb)
---
 parquet/src/arrow/arrow_writer/mod.rs | 30 ++++++++++++++++++++++++++++++
 parquet/src/schema/types.rs           |  7 +++++++
 2 files changed, 37 insertions(+)

diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 41f15569fd..5e0d318322 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -3409,4 +3409,34 @@ mod tests {
             "Arrow: Incompatible type. Field 'temperature' has type Float64, 
array has type Int32"
         );
     }
+
+    #[test]
+    // https://github.com/apache/arrow-rs/issues/6988
+    fn test_roundtrip_empty_schema() {
+        // create empty record batch with empty schema
+        let empty_batch = RecordBatch::try_new_with_options(
+            Arc::new(Schema::empty()),
+            vec![],
+            &RecordBatchOptions::default().with_row_count(Some(0)),
+        )
+        .unwrap();
+
+        // write to parquet
+        let mut parquet_bytes: Vec<u8> = Vec::new();
+        let mut writer =
+            ArrowWriter::try_new(&mut parquet_bytes, empty_batch.schema(), 
None).unwrap();
+        writer.write(&empty_batch).unwrap();
+        writer.close().unwrap();
+
+        // read from parquet
+        let bytes = Bytes::from(parquet_bytes);
+        let reader = ParquetRecordBatchReaderBuilder::try_new(bytes).unwrap();
+        assert_eq!(reader.schema(), &empty_batch.schema());
+        let batches: Vec<_> = reader
+            .build()
+            .unwrap()
+            .collect::<ArrowResult<Vec<_>>>()
+            .unwrap();
+        assert_eq!(batches.len(), 0);
+    }
 }
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index d9e9b22e80..68492e19f4 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -1211,6 +1211,13 @@ fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize
         ));
     }
     let element = &elements[index];
+
+    // Check for empty schema
+    if let (true, None | Some(0)) = (is_root_node, element.num_children) {
+        let builder = Type::group_type_builder(&element.name);
+        return Ok((index + 1, Arc::new(builder.build().unwrap())));
+    }
+
     let converted_type = ConvertedType::try_from(element.converted_type)?;
     // LogicalType is only present in v2 Parquet files. ConvertedType is always
     // populated, regardless of the version of the file (v1 or v2).

Reply via email to