comphead commented on code in PR #2929:
URL: https://github.com/apache/datafusion-comet/pull/2929#discussion_r2646579257


##########
native/core/src/execution/operators/parquet_writer.rs:
##########
@@ -49,6 +53,134 @@ use parquet::{
 
 use crate::execution::shuffle::CompressionCodec;
 
+/// Enum representing different types of Arrow writers based on storage backend
+enum ParquetWriter {
+    /// Writer for local file system
+    LocalFile(ArrowWriter<File>),
+    /// Writer for HDFS or other remote storage (writes to in-memory buffer)
+    /// Contains the arrow writer, HDFS operator, and destination path
+    /// The Arrow writer writes the data, converted to Parquet format,
+    /// to an in-memory buffer
+    /// The opendal::Writer is created lazily on first write
+    Remote(
+        ArrowWriter<Cursor<Vec<u8>>>,
+        Option<opendal::Writer>,
+        Operator,
+        String,
+    ),
+}
+
+impl ParquetWriter {
+    /// Write a RecordBatch to the underlying writer
+    async fn write(
+        &mut self,
+        batch: &RecordBatch,
+    ) -> std::result::Result<(), parquet::errors::ParquetError> {
+        match self {
+            ParquetWriter::LocalFile(writer) => writer.write(batch),
+            ParquetWriter::Remote(
+                arrow_parquet_buffer_writer,
+                hdfs_writer_opt,
+                op,
+                output_path,
+            ) => {
+                // Write batch to in-memory buffer
+                arrow_parquet_buffer_writer.write(batch)?;
+
+                // Flush and get the current buffer content
+                arrow_parquet_buffer_writer.flush()?;
+                let cursor = arrow_parquet_buffer_writer.inner_mut();
+                let current_data = cursor.get_ref().clone();
+
+                // Create HDFS writer lazily on first write
+                if hdfs_writer_opt.is_none() {
+                    let writer = 
op.writer(output_path.as_str()).await.map_err(|e| {

Review Comment:
   Thanks @wForget, referring to a generic interface rather than a specific 
fs implementation sounds really good. I will address it in a follow-up PR. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to