albertlockett opened a new issue, #8004:
URL: https://github.com/apache/arrow-rs/issues/8004

   **Is your feature request related to a problem or challenge? Please describe 
what you are trying to do.**
   I'm reading from a stream of `RecordBatches` and writing to a parquet file. 
In my record batches, the encoding of certain columns can change between 
dictionary encoding and the native array type. 
   
   **Describe the solution you'd like**
   I'd like to be able to write these batches using the `ArrowWriter` and have it handle this without an error. Internally, the column writer takes the values out of the dictionary anyway, so I think a given `ArrowColumnWriter` should be able to handle both the dictionary-encoded and the native type.
   
   ```rs
    use std::sync::Arc;

    use arrow_array::{DictionaryArray, RecordBatch, StringArray, UInt8Array};
    use arrow_schema::{DataType, Field, Schema};
    use parquet::arrow::{arrow_reader::ParquetRecordBatchReader, ArrowWriter};
    use tempfile::tempfile;

    fn main() {
        let schema = Arc::new(Schema::new(vec![Field::new(
            "a",
            DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)),
            false,
        )]));

        let rb1 = RecordBatch::try_new(
            schema.clone(),
            vec![Arc::new(DictionaryArray::new(
                UInt8Array::from_iter_values(vec![0, 1, 0]),
                Arc::new(StringArray::from_iter_values(vec!["parquet", "barquet"])),
            ))],
        )
        .unwrap();

        let file = tempfile().unwrap();
        let mut writer =
            ArrowWriter::try_new(file.try_clone().unwrap(), rb1.schema(), None).unwrap();
        writer.write(&rb1).unwrap();

        // check we can append another record batch where the field has the same type
        // as the dictionary values from the first batch
        let schema2 = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)]));
        let rb2 = RecordBatch::try_new(
            schema2,
            vec![Arc::new(StringArray::from_iter_values(vec![
                "barquet", "curious",
            ]))],
        )
        .unwrap();
        writer.write(&rb2).unwrap();

        writer.close().unwrap();

        let mut record_batch_reader =
            ParquetRecordBatchReader::try_new(file.try_clone().unwrap(), 1024).unwrap();
        let actual_batch = record_batch_reader.next().unwrap().unwrap();

        let expected_batch = RecordBatch::try_new(
            schema,
            vec![Arc::new(DictionaryArray::new(
                UInt8Array::from_iter_values(vec![0, 1, 0, 1, 2]),
                Arc::new(StringArray::from_iter_values(vec![
                    "parquet", "barquet", "curious",
                ])),
            ))],
        )
        .unwrap();

        assert_eq!(actual_batch, expected_batch);
    }
   ```
   
   Currently this returns an error like:
   ```
   Incompatible type. Field 'a' has type Dictionary(UInt8, Utf8), array has type Utf8
   ```
   
   **Describe alternatives you've considered**
   I could convert the array to the writer's expected type before passing it to the writer, but I'd like to avoid allocating a re-encoded copy of each array just to write it (a rough sketch of that workaround is below).
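   
   For reference, here is a minimal sketch of that workaround, assuming the cast kernel from `arrow-cast` is used to coerce each column to the type declared in the schema the `ArrowWriter` was created with, before calling `ArrowWriter::write`. The `coerce_batch` helper is hypothetical, not an existing API:
   ```rs
   use std::sync::Arc;

   use arrow_array::{ArrayRef, RecordBatch};
   use arrow_cast::cast::cast;
   use arrow_schema::{ArrowError, SchemaRef};

   /// Hypothetical helper: cast every column of `batch` to the type declared in
   /// `schema` (e.g. Utf8 -> Dictionary(UInt8, Utf8)) so that `ArrowWriter::write`
   /// accepts the batch. The cast re-encodes the data, which is the extra
   /// allocation I'd like to avoid.
   fn coerce_batch(schema: &SchemaRef, batch: &RecordBatch) -> Result<RecordBatch, ArrowError> {
       let columns = schema
           .fields()
           .iter()
           .zip(batch.columns())
           .map(|(field, column)| {
               if column.data_type() == field.data_type() {
                   // already the expected type, keep the array as-is
                   Ok(Arc::clone(column))
               } else {
                   // re-encode to the type declared in the writer schema
                   cast(column.as_ref(), field.data_type())
               }
           })
           .collect::<Result<Vec<ArrayRef>, ArrowError>>()?;
       RecordBatch::try_new(Arc::clone(schema), columns)
   }
   ```
   Each incoming batch would then go through `coerce_batch(&writer_schema, &batch)` before `writer.write(...)`; this works today, but pays for the re-encoding on every batch.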
   
   I also considered computing the levels directly and managing the `ArrowColumnWriter` instances myself, but I'd rather not duplicate the logic in `ArrowWriter`:
   ```rs
    use std::{fs::File, sync::Arc};

    use arrow_array::{ArrayRef, DictionaryArray, Int32Array, RecordBatch, StringArray, UInt8Array};
    use arrow_schema::{DataType, Field, Schema};
    use parquet::{
        arrow::{
            arrow_reader::ParquetRecordBatchReader,
            arrow_writer::{compute_leaves, get_column_writers},
            ArrowSchemaConverter,
        },
        file::{
            properties::WriterProperties,
            reader::SerializedFileReader,
            writer::{SerializedFileWriter, SerializedRowGroupWriter},
        },
    };

    fn main() {
        let schema1 = Arc::new(Schema::new(vec![Field::new(
            "a",
            DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)),
            false,
        )]));

        let writer_props = Arc::new(WriterProperties::default());
        let parquet_schema = ArrowSchemaConverter::new()
            .with_coerce_types(writer_props.coerce_types())
            .convert(&schema1)
            .unwrap();

        let mut col_writers =
            get_column_writers(&parquet_schema, &writer_props, &schema1).unwrap();

        let root_schema = parquet_schema.root_schema_ptr();

        let mut file = File::create_new("/tmp/test.parquet").unwrap();
        let mut writer =
            SerializedFileWriter::new(&mut file, root_schema, writer_props.clone()).unwrap();

        // write the dictionary-encoded batch
        let col1_arr = Arc::new(DictionaryArray::new(
            UInt8Array::from_iter_values(vec![0, 1, 1]),
            Arc::new(StringArray::from_iter_values(vec!["barquet", "curious"])),
        )) as ArrayRef;
        let leaves = compute_leaves(schema1.field(0), &col1_arr).unwrap();
        for leaf in leaves {
            let col_writer = col_writers.get_mut(0).unwrap();
            col_writer.write(&leaf).unwrap();
        }

        // write the same column again as a native Utf8 array
        let col1_arr =
            Arc::new(StringArray::from_iter_values(vec!["barquet", "very curious"])) as ArrayRef;
        let col1_field = Field::new("a", DataType::Utf8, false);

        // let col1_arr = Arc::new(Int32Array::from_iter_values(vec![1, 2, 3])) as ArrayRef;
        // let col1_field = Field::new("a", DataType::Int32, false);

        let leaves = compute_leaves(&col1_field, &col1_arr).unwrap();
        for leaf in leaves {
            let col_writer = col_writers.get_mut(0).unwrap();
            col_writer.write(&leaf).unwrap();
        }

        let mut row_group_writer: SerializedRowGroupWriter<'_, _> =
            writer.next_row_group().unwrap();
        for col_writer in col_writers {
            let chunk = col_writer.close().unwrap();
            chunk.append_to_row_group(&mut row_group_writer).unwrap();
        }

        row_group_writer.close().unwrap();
        writer.close().unwrap();

        // read back
        let reader = ParquetRecordBatchReader::try_new(file.try_clone().unwrap(), 1024).unwrap();
        for batch in reader {
            println!("got batch: {:?}", batch.unwrap());
        }
    }
   
   ```
   
   **Additional context**
   This is related to otel-arrow's parquet exporter
   https://github.com/open-telemetry/otel-arrow/blob/main/rust/otap-dataflow/crates/otap/src/parquet_exporter.rs
   which receives a stream of adaptive arrays similar to what is described here:
   https://arrow.apache.org/blog/2023/06/26/our-journey-at-f5-with-apache-arrow-part-2/
   

