This is an automated email from the ASF dual-hosted git repository.
Jefffrey pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new f36bfbbd3a fix(parquet_derive): support raw identifiers as column
names (#10113)
f36bfbbd3a is described below
commit f36bfbbd3a2275ae205435ccb4d314bb65fce4cd
Author: cbmixx <[email protected]>
AuthorDate: Fri Jun 19 17:48:44 2026 +0800
fix(parquet_derive): support raw identifiers as column names (#10113)
# Which issue does this PR close?
- Closes #10112.
# Rationale for this change
`#[derive(ParquetRecordReader)]` and `#[derive(ParquetRecordWriter)]`
could not
handle a Parquet column whose name is a Rust keyword (e.g. `type`). The
only way
to spell such a field in Rust is a raw identifier (`r#type`), but the
derives
stringified the identifier including the `r#` prefix:
- The reader's column-index lookup used
`name_to_index.get(stringify!(#field_names))`,
and `stringify!(r#type)` yields `"r#type"`, so reading failed with
`ParquetError::General("column name 'r#type' is not found in parquet
file!")`.
- The writer's `Field::parquet_type()` used `self.ident.to_string()`,
which keeps
the `r#` prefix, so the written schema got a column literally named
`r#type`.
This made it impossible to read or write Parquet columns whose names are
Rust
keywords, e.g. files produced by other Parquet writers with a column
named `type`.
# What changes are included in this PR?
Unraw the identifier (via `syn::ext::IdentExt::unraw`, already available
through
the existing `syn` dependency) wherever it is used as a column name,
while keeping
the raw identifier for field access in the generated code:
- `parquet_derive/src/lib.rs`: the reader derive builds a parallel list
of unrawed
field-name strings for the `name_to_index` lookup and its error message.
- `parquet_derive/src/parquet_field.rs`: `Field::parquet_type()` uses
`self.ident.unraw().to_string()` for the schema column name.
# Are these changes tested?
Yes. Added a unit test (`test_parquet_type_with_raw_identifier`) and an
integration round-trip test (`test_parquet_derive_raw_identifiers`)
covering a
struct with a raw-identifier field (`r#type`) alongside a normal field,
asserting
the schema columns are named `type`/`count`. I verified both tests fail
without
the fix (the writer emits a column named `r#type`) and pass with it.
# Are there any user-facing changes?
Structs with raw-identifier fields now read and write columns named
without the
`r#` prefix. This is a bug fix; there are no public API changes. Code
that somehow
relied on the previous `r#`-prefixed column names would change behavior,
but such
names could not be produced by any other Parquet writer.
---
*AI disclosure (per CONTRIBUTING.md): this change was developed with the
assistance of an AI coding tool. I reviewed every line, verified the fix
against
the failing/passing tests described above, and own the change.*
Co-authored-by: Claude Fable 5 <[email protected]>
---
parquet_derive/src/lib.rs | 18 +++++++++++---
parquet_derive/src/parquet_field.rs | 25 ++++++++++++++++++-
parquet_derive_test/src/lib.rs | 49 +++++++++++++++++++++++++++++++++++++
3 files changed, 88 insertions(+), 4 deletions(-)
diff --git a/parquet_derive/src/lib.rs b/parquet_derive/src/lib.rs
index 1aaa1abfd2..a959507d90 100644
--- a/parquet_derive/src/lib.rs
+++ b/parquet_derive/src/lib.rs
@@ -34,7 +34,7 @@ extern crate quote;
extern crate parquet;
-use ::syn::{Data, DataStruct, DeriveInput, parse_macro_input};
+use ::syn::{Data, DataStruct, DeriveInput, ext::IdentExt, parse_macro_input};
mod parquet_field;
@@ -234,6 +234,18 @@ pub fn parquet_record_reader(input:
proc_macro::TokenStream) -> proc_macro::Toke
let field_infos: Vec<_> =
fields.iter().map(parquet_field::Field::from).collect();
let field_names: Vec<_> = fields.iter().map(|f| f.ident.clone()).collect();
+ // unraw the identifiers, so raw identifiers like `r#type` are looked
+ // up by their column name `type` in the parquet file
+ let field_names_str: Vec<_> = fields
+ .iter()
+ .map(|f| {
+ f.ident
+ .as_ref()
+ .expect("Only structs with named fields are currently
supported")
+ .unraw()
+ .to_string()
+ })
+ .collect();
let reader_snippets: Vec<proc_macro2::TokenStream> =
field_infos.iter().map(|x| x.reader_snippet()).collect();
@@ -270,10 +282,10 @@ pub fn parquet_record_reader(input:
proc_macro::TokenStream) -> proc_macro::Toke
#(
{
- let idx: usize = match
name_to_index.get(stringify!(#field_names)) {
+ let idx: usize = match name_to_index.get(#field_names_str) {
Some(&col_idx) => col_idx,
None => {
- let error_msg = format!("column name '{}' is not found in
parquet file!", stringify!(#field_names));
+ let error_msg = format!("column name '{}' is not found in
parquet file!", #field_names_str);
return
Err(::parquet::errors::ParquetError::General(error_msg));
}
};
diff --git a/parquet_derive/src/parquet_field.rs
b/parquet_derive/src/parquet_field.rs
index 17b8d85437..e332ea21aa 100644
--- a/parquet_derive/src/parquet_field.rs
+++ b/parquet_derive/src/parquet_field.rs
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+use syn::ext::IdentExt;
+
#[derive(Debug, PartialEq)]
pub struct Field {
ident: syn::Ident,
@@ -293,7 +295,9 @@ impl Field {
// TODO: Support group types
// TODO: Add length if dealing with fixedlenbinary
- let field_name = &self.ident.to_string();
+ // unraw the identifier, so a raw identifier like `r#type`
+ // becomes a column named `type` in the parquet schema
+ let field_name = self.ident.unraw().to_string();
let physical_type = match self.ty.physical_type() {
parquet::basic::Type::BOOLEAN => quote! {
::parquet::basic::Type::BOOLEAN
@@ -880,6 +884,25 @@ mod test {
)
}
+ #[test]
+ fn test_parquet_type_with_raw_identifier() {
+ let snippet: proc_macro2::TokenStream = quote! {
+ struct ABoringStruct {
+ r#type: i32,
+ }
+ };
+
+ let fields = extract_fields(snippet);
+ let r#type = Field::from(&fields[0]);
+
+ // the raw identifier `r#type` is named `type` in the parquet schema
+ let snippet = r#type.parquet_type().to_string();
+ assert!(
+ snippet.contains("primitive_type_builder (\"type\""),
+ "{snippet}"
+ );
+ }
+
#[test]
fn test_optional_to_writer_snippet() {
let struct_def: proc_macro2::TokenStream = quote! {
diff --git a/parquet_derive_test/src/lib.rs b/parquet_derive_test/src/lib.rs
index fe96fa0e61..e8462874f3 100644
--- a/parquet_derive_test/src/lib.rs
+++ b/parquet_derive_test/src/lib.rs
@@ -109,6 +109,15 @@ struct APrunedRecord {
pub isize: isize,
}
+// This struct has a field declared with a raw identifier,
+// which maps to a parquet column named without the `r#` prefix
+// (e.g. a column named `type`, as written by other tools)
+#[derive(PartialEq, ParquetRecordWriter, ParquetRecordReader, Debug)]
+struct ARecordWithRawIdentifiers {
+ pub r#type: i32,
+ pub count: i32,
+}
+
#[cfg(test)]
mod tests {
use super::*;
@@ -356,6 +365,46 @@ mod tests {
assert_eq!(drs[0].isize, out[0].isize);
}
+ #[test]
+ fn test_parquet_derive_raw_identifiers() {
+ let file = get_temp_file("test_parquet_derive_raw_identifiers", &[]);
+ let drs = vec![ARecordWithRawIdentifiers {
+ r#type: 456,
+ count: 123,
+ }];
+
+ let generated_schema = drs.as_slice().schema().unwrap();
+
+ // raw identifiers are written without the `r#` prefix,
+ // while normal identifiers are unchanged
+ assert_eq!(
+ vec!["type", "count"],
+ generated_schema
+ .get_fields()
+ .iter()
+ .map(|field| field.name())
+ .collect::<Vec<_>>()
+ );
+
+ let props = Default::default();
+ let mut writer =
+ SerializedFileWriter::new(file.try_clone().unwrap(),
generated_schema, props).unwrap();
+
+ let mut row_group = writer.next_row_group().unwrap();
+ drs.as_slice().write_to_row_group(&mut row_group).unwrap();
+ row_group.close().unwrap();
+ writer.close().unwrap();
+
+ use parquet::file::{reader::FileReader,
serialized_reader::SerializedFileReader};
+ let reader = SerializedFileReader::new(file).unwrap();
+ let mut out: Vec<ARecordWithRawIdentifiers> = Vec::new();
+
+ let mut row_group = reader.get_row_group(0).unwrap();
+ out.read_from_row_group(&mut *row_group, 1).unwrap();
+
+ assert_eq!(drs, out);
+ }
+
#[test]
fn test_aliased_result() {
// Issue 7547, Where aliasing the `Result` led to