This is an automated email from the ASF dual-hosted git repository.

Jefffrey pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new f36bfbbd3a fix(parquet_derive): support raw identifiers as column 
names (#10113)
f36bfbbd3a is described below

commit f36bfbbd3a2275ae205435ccb4d314bb65fce4cd
Author: cbmixx <[email protected]>
AuthorDate: Fri Jun 19 17:48:44 2026 +0800

    fix(parquet_derive): support raw identifiers as column names (#10113)
    
    # Which issue does this PR close?
    
    - Closes #10112.
    
    # Rationale for this change
    
    `#[derive(ParquetRecordReader)]` and `#[derive(ParquetRecordWriter)]`
    could not
    handle a Parquet column whose name is a Rust keyword (e.g. `type`). The
    only way
    to spell such a field in Rust is a raw identifier (`r#type`), but the
    derives
    stringified the identifier including the `r#` prefix:
    
    - The reader's column-index lookup used
    `name_to_index.get(stringify!(#field_names))`,
      and `stringify!(r#type)` yields `"r#type"`, so reading failed with
    `ParquetError::General("column name 'r#type' is not found in parquet
    file!")`.
    - The writer's `Field::parquet_type()` used `self.ident.to_string()`,
    which keeps
    the `r#` prefix, so the written schema got a column literally named
    `r#type`.
    
    This made it impossible to read or write Parquet columns whose names are
    Rust
    keywords, e.g. files produced by other Parquet writers with a column
    named `type`.
    
    # What changes are included in this PR?
    
    Unraw the identifier (via `syn::ext::IdentExt::unraw`, already available
    through
    the existing `syn` dependency) wherever it is used as a column name,
    while keeping
    the raw identifier for field access in the generated code:
    
    - `parquet_derive/src/lib.rs`: the reader derive builds a parallel list
    of unrawed
    field-name strings for the `name_to_index` lookup and its error message.
    - `parquet_derive/src/parquet_field.rs`: `Field::parquet_type()` uses
      `self.ident.unraw().to_string()` for the schema column name.
    
    # Are these changes tested?
    
    Yes. Added a unit test (`test_parquet_type_with_raw_identifier`) and an
    integration round-trip test (`test_parquet_derive_raw_identifiers`)
    covering a
    struct with a raw-identifier field (`r#type`) alongside a normal field,
    asserting
    the schema columns are named `type`/`count`. I verified both tests fail
    without
    the fix (the writer emits a column named `r#type`) and pass with it.
    
    # Are there any user-facing changes?
    
    Structs with raw-identifier fields now read and write columns named
    without the
    `r#` prefix. This is a bug fix; there are no public API changes. Code
    that somehow
    relied on the previous `r#`-prefixed column names would change behavior,
    but such
    names could not be produced by any other Parquet writer.
    
    ---
    
    *AI disclosure (per CONTRIBUTING.md): this change was developed with the
    assistance of an AI coding tool. I reviewed every line, verified the fix
    against
    the failing/passing tests described above, and own the change.*
    
    Co-authored-by: Claude Fable 5 <[email protected]>
---
 parquet_derive/src/lib.rs           | 18 +++++++++++---
 parquet_derive/src/parquet_field.rs | 25 ++++++++++++++++++-
 parquet_derive_test/src/lib.rs      | 49 +++++++++++++++++++++++++++++++++++++
 3 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/parquet_derive/src/lib.rs b/parquet_derive/src/lib.rs
index 1aaa1abfd2..a959507d90 100644
--- a/parquet_derive/src/lib.rs
+++ b/parquet_derive/src/lib.rs
@@ -34,7 +34,7 @@ extern crate quote;
 
 extern crate parquet;
 
-use ::syn::{Data, DataStruct, DeriveInput, parse_macro_input};
+use ::syn::{Data, DataStruct, DeriveInput, ext::IdentExt, parse_macro_input};
 
 mod parquet_field;
 
@@ -234,6 +234,18 @@ pub fn parquet_record_reader(input: 
proc_macro::TokenStream) -> proc_macro::Toke
 
     let field_infos: Vec<_> = 
fields.iter().map(parquet_field::Field::from).collect();
     let field_names: Vec<_> = fields.iter().map(|f| f.ident.clone()).collect();
+    // unraw the identifiers, so raw identifiers like `r#type` are looked
+    // up by their column name `type` in the parquet file
+    let field_names_str: Vec<_> = fields
+        .iter()
+        .map(|f| {
+            f.ident
+                .as_ref()
+                .expect("Only structs with named fields are currently 
supported")
+                .unraw()
+                .to_string()
+        })
+        .collect();
     let reader_snippets: Vec<proc_macro2::TokenStream> =
         field_infos.iter().map(|x| x.reader_snippet()).collect();
 
@@ -270,10 +282,10 @@ pub fn parquet_record_reader(input: 
proc_macro::TokenStream) -> proc_macro::Toke
 
         #(
           {
-              let idx: usize = match 
name_to_index.get(stringify!(#field_names)) {
+              let idx: usize = match name_to_index.get(#field_names_str) {
                 Some(&col_idx) => col_idx,
                 None => {
-                  let error_msg = format!("column name '{}' is not found in 
parquet file!", stringify!(#field_names));
+                  let error_msg = format!("column name '{}' is not found in 
parquet file!", #field_names_str);
                   return 
Err(::parquet::errors::ParquetError::General(error_msg));
                 }
               };
diff --git a/parquet_derive/src/parquet_field.rs 
b/parquet_derive/src/parquet_field.rs
index 17b8d85437..e332ea21aa 100644
--- a/parquet_derive/src/parquet_field.rs
+++ b/parquet_derive/src/parquet_field.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use syn::ext::IdentExt;
+
 #[derive(Debug, PartialEq)]
 pub struct Field {
     ident: syn::Ident,
@@ -293,7 +295,9 @@ impl Field {
         // TODO: Support group types
         // TODO: Add length if dealing with fixedlenbinary
 
-        let field_name = &self.ident.to_string();
+        // unraw the identifier, so a raw identifier like `r#type`
+        // becomes a column named `type` in the parquet schema
+        let field_name = self.ident.unraw().to_string();
         let physical_type = match self.ty.physical_type() {
             parquet::basic::Type::BOOLEAN => quote! {
                 ::parquet::basic::Type::BOOLEAN
@@ -880,6 +884,25 @@ mod test {
         )
     }
 
+    #[test]
+    fn test_parquet_type_with_raw_identifier() {
+        let snippet: proc_macro2::TokenStream = quote! {
+          struct ABoringStruct {
+            r#type: i32,
+          }
+        };
+
+        let fields = extract_fields(snippet);
+        let r#type = Field::from(&fields[0]);
+
+        // the raw identifier `r#type` is named `type` in the parquet schema
+        let snippet = r#type.parquet_type().to_string();
+        assert!(
+            snippet.contains("primitive_type_builder (\"type\""),
+            "{snippet}"
+        );
+    }
+
     #[test]
     fn test_optional_to_writer_snippet() {
         let struct_def: proc_macro2::TokenStream = quote! {
diff --git a/parquet_derive_test/src/lib.rs b/parquet_derive_test/src/lib.rs
index fe96fa0e61..e8462874f3 100644
--- a/parquet_derive_test/src/lib.rs
+++ b/parquet_derive_test/src/lib.rs
@@ -109,6 +109,15 @@ struct APrunedRecord {
     pub isize: isize,
 }
 
+// This struct has a field declared with a raw identifier,
+// which maps to a parquet column named without the `r#` prefix
+// (e.g. a column named `type`, as written by other tools)
+#[derive(PartialEq, ParquetRecordWriter, ParquetRecordReader, Debug)]
+struct ARecordWithRawIdentifiers {
+    pub r#type: i32,
+    pub count: i32,
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -356,6 +365,46 @@ mod tests {
         assert_eq!(drs[0].isize, out[0].isize);
     }
 
+    #[test]
+    fn test_parquet_derive_raw_identifiers() {
+        let file = get_temp_file("test_parquet_derive_raw_identifiers", &[]);
+        let drs = vec![ARecordWithRawIdentifiers {
+            r#type: 456,
+            count: 123,
+        }];
+
+        let generated_schema = drs.as_slice().schema().unwrap();
+
+        // raw identifiers are written without the `r#` prefix,
+        // while normal identifiers are unchanged
+        assert_eq!(
+            vec!["type", "count"],
+            generated_schema
+                .get_fields()
+                .iter()
+                .map(|field| field.name())
+                .collect::<Vec<_>>()
+        );
+
+        let props = Default::default();
+        let mut writer =
+            SerializedFileWriter::new(file.try_clone().unwrap(), 
generated_schema, props).unwrap();
+
+        let mut row_group = writer.next_row_group().unwrap();
+        drs.as_slice().write_to_row_group(&mut row_group).unwrap();
+        row_group.close().unwrap();
+        writer.close().unwrap();
+
+        use parquet::file::{reader::FileReader, 
serialized_reader::SerializedFileReader};
+        let reader = SerializedFileReader::new(file).unwrap();
+        let mut out: Vec<ARecordWithRawIdentifiers> = Vec::new();
+
+        let mut row_group = reader.get_row_group(0).unwrap();
+        out.read_from_row_group(&mut *row_group, 1).unwrap();
+
+        assert_eq!(drs, out);
+    }
+
     #[test]
     fn test_aliased_result() {
         // Issue 7547, Where aliasing the `Result` led to

Reply via email to