devinjdangelo commented on code in PR #4859:
URL: https://github.com/apache/arrow-rs/pull/4859#discussion_r1338444801
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -376,31 +388,56 @@ impl ArrowRowGroupWriter {
props: &WriterPropertiesPtr,
arrow: &SchemaRef,
) -> Result<Self> {
- let mut writers = Vec::with_capacity(arrow.fields.len());
+ let mut writers_and_buffers = Vec::with_capacity(arrow.fields.len());
let mut leaves = parquet.columns().iter();
for field in &arrow.fields {
- get_arrow_column_writer(field.data_type(), props, &mut leaves,
&mut writers)?;
+ get_arrow_column_writer(
+ field.data_type(),
+ props,
+ &mut leaves,
+ &mut writers_and_buffers,
+ )?;
}
+ let (shared_buffers, writers): (Vec<_>, Vec<_>) =
+ writers_and_buffers.into_iter().unzip();
Ok(Self {
writers,
+ shared_buffers,
schema: arrow.clone(),
buffered_rows: 0,
})
}
pub fn write(&mut self, batch: &RecordBatch) -> Result<()> {
self.buffered_rows += batch.num_rows();
- let mut writers = self.writers.iter_mut().map(|(_, x)| x);
+ let mut writers = self.writers.iter_mut();
for (array, field) in batch.columns().iter().zip(&self.schema.fields) {
let mut levels = calculate_array_levels(array, field)?.into_iter();
write_leaves(&mut writers, &mut levels, array.as_ref())?;
}
Ok(())
}
+ pub fn schema(&self) -> &Arc<Schema> {
+ &self.schema
+ }
+
+ /// Takes ownership of all [ArrowColumnWriter]s from this
[ArrowRowGroupWriter]
+ /// Caller must restore ownership with give_col_writers before calling
close method.
+ pub fn take_col_writers(&mut self) -> Vec<ArrowColumnWriter> {
+ self.writers.drain(..).collect()
+ }
+
+ /// Restores ownership of all [ArrowColumnWriter]s. Caller is responsible
for
+ /// returning the [Vec] in the same order returned by take_col_writers
method.
+ pub fn give_col_writers(&mut self, writers: Vec<ArrowColumnWriter>) {
+ self.writers = writers;
+ }
Review Comment:
I agree. Let me know if you come up with something more elegant!
My original attempt was just to provide a mutable reference to the writers,
but handling mutable references safely in parallel async tasks is more
challenging than passing ownership (I considered giving
https://docs.rs/async-scoped/latest/async_scoped/ a try but decided against
it).
I moved away from entirely deconstructing the `ArrowRowGroupWriter` in order
to keep the `SharedColumnChunk` private.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]