This is an automated email from the ASF dual-hosted git repository.

Jefffrey pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new d628e58b21 fix: handle Avro reader schema with no fields (#9611)
d628e58b21 is described below

commit d628e58b21461ef40d3f6652688710a78dd627b8
Author: Mikhail Zabaluev <[email protected]>
AuthorDate: Fri Jun 19 14:19:43 2026 +0300

    fix: handle Avro reader schema with no fields (#9611)
    
    # Which issue does this PR close?
    
    - Closes #9608.
    
    # Rationale for this change
    
    In the degenerate case when the Avro reader schema has no fields, the
    `RecordDecoder` should be able to produce empty record batches with the
    number of rows counted from the data. As an optimization for OCF, the
    reader could skip decoding altogether, relying on record counts provided
    by data blocks.
    
    # What changes are included in this PR?
    
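    A row counter is maintained in the `RecordDecoder` state: it is
    incremented as rows are decoded and used to set the row count of each
    flushed record batch, after which it is reset.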
    
    # Are these changes tested?
    
    Added tests to verify decoder behavior given an empty reader schema for
    the data files in the test suite.
    
    # Are there any user-facing changes?
    
    No.
    
    Co-authored-by: Jeffrey Vo <[email protected]>
---
 arrow-avro/src/reader/async_reader/mod.rs | 57 +++++++++++++++++++++++++++++++
 arrow-avro/src/reader/record.rs           |  8 ++++-
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/arrow-avro/src/reader/async_reader/mod.rs b/arrow-avro/src/reader/async_reader/mod.rs
index 31df7cdf11..43ff931a05 100644
--- a/arrow-avro/src/reader/async_reader/mod.rs
+++ b/arrow-avro/src/reader/async_reader/mod.rs
@@ -1532,6 +1532,38 @@ mod tests {
         assert_eq!(id.values(), &[4, 5, 6, 7, 2, 3, 0, 1]);
     }
 
+    #[tokio::test]
+    async fn test_alltypes_with_empty_schema_large_batch() {
+        // With an empty reader schema -- should count rows but produce no columns
+        let file = arrow_test_data("avro/alltypes_plain.avro");
+        let schema = Arc::new(Schema::new(Vec::<Field>::new()));
+        let batches = read_async_file(&file, 1024, None, Some(schema), None)
+            .await
+            .unwrap();
+        assert_eq!(batches.len(), 1);
+        let batch = &batches[0];
+
+        assert_eq!(batch.num_rows(), 8);
+        assert_eq!(batch.num_columns(), 0);
+    }
+
+    #[tokio::test]
+    async fn test_alltypes_with_empty_schema_small_batch() {
+        // With an empty reader schema -- should count rows but produce no columns
+        let file = arrow_test_data("avro/alltypes_plain.avro");
+        let schema = Arc::new(Schema::new(Vec::<Field>::new()));
+        let batches = read_async_file(&file, 5, None, Some(schema), None)
+            .await
+            .unwrap();
+
+        assert_eq!(batches.len(), 2);
+
+        assert_eq!(batches[0].num_rows(), 5);
+        assert_eq!(batches[0].num_columns(), 0);
+        assert_eq!(batches[1].num_rows(), 3);
+        assert_eq!(batches[1].num_columns(), 0);
+    }
+
     #[tokio::test]
     async fn test_nested_no_schema_no_projection() {
         // No reader schema, no projection
@@ -1597,6 +1629,31 @@ mod tests {
         assert_eq!(batch.schema().field(2).name(), "f1");
     }
 
+    #[tokio::test]
+    async fn test_nested_with_empty_schema() {
+        // With an empty reader schema -- should count rows but produce no columns
+        let file = arrow_test_data("avro/nested_records.avro");
+        let schema = Arc::new(
+            Schema::new(Vec::<Field>::new()).with_metadata(HashMap::from([(
+                SCHEMA_METADATA_KEY.into(),
+                r#"{
+                    "type": "record",
+                    "namespace": "ns1",
+                    "name": "record1",
+                    "fields": []
+                }"#
+                .to_owned(),
+            )])),
+        );
+        let batches = read_async_file(&file, 1024, None, Some(schema), None)
+            .await
+            .unwrap();
+        let batch = &batches[0];
+
+        assert_eq!(batch.num_rows(), 2);
+        assert_eq!(batch.num_columns(), 0);
+    }
+
     #[tokio::test]
     async fn test_projection_error_out_of_bounds() {
         let file = arrow_test_data("avro/alltypes_plain.avro");
diff --git a/arrow-avro/src/reader/record.rs b/arrow-avro/src/reader/record.rs
index b71a6bdc7c..306c777182 100644
--- a/arrow-avro/src/reader/record.rs
+++ b/arrow-avro/src/reader/record.rs
@@ -94,6 +94,7 @@ pub(crate) struct RecordDecoder {
     schema: SchemaRef,
     fields: Vec<Decoder>,
     projector: Option<Projector>,
+    row_count: usize,
 }
 
 impl RecordDecoder {
@@ -136,6 +137,7 @@ impl RecordDecoder {
                     schema: Arc::new(ArrowSchema::new(arrow_fields)),
                     fields: encodings,
                     projector,
+                    row_count: 0,
                 })
             }
             other => Err(AvroError::ParseError(format!(
@@ -166,6 +168,7 @@ impl RecordDecoder {
                 }
             }
         }
+        self.row_count += count;
         Ok(cursor.position())
     }
 
@@ -176,7 +179,10 @@ impl RecordDecoder {
             .iter_mut()
             .map(|x| x.flush(None))
             .collect::<Result<Vec<_>, _>>()?;
-        RecordBatch::try_new(self.schema.clone(), arrays).map_err(Into::into)
+        let batch_options = RecordBatchOptions::new().with_row_count(Some(self.row_count));
+        self.row_count = 0;
+        RecordBatch::try_new_with_options(self.schema.clone(), arrays, &batch_options)
+            .map_err(Into::into)
     }
 }
 

Reply via email to