Andy Grove created ARROW-10920: ---------------------------------- Summary: [Rust] Segmentation fault in Arrow Parquet writer with huge arrays Key: ARROW-10920 URL: https://issues.apache.org/jira/browse/ARROW-10920 Project: Apache Arrow Issue Type: Bug Components: Rust Reporter: Andy Grove
I stumbled across this by chance. I am not too surprised that this fails, but I would expect it to fail gracefully rather than with a segmentation fault. {code:rust} use std::fs::File; use std::sync::Arc; use arrow::array::StringBuilder; use arrow::datatypes::{DataType, Field, Schema}; use arrow::error::Result; use arrow::record_batch::RecordBatch; use parquet::arrow::ArrowWriter; fn main() -> Result<()> { let schema = Schema::new(vec![ Field::new("c0", DataType::Utf8, false), Field::new("c1", DataType::Utf8, true), ]); let batch_size = 2500000; let repeat_count = 140; let file = File::create("/tmp/test.parquet")?; let mut writer = ArrowWriter::try_new(file, Arc::new(schema.clone()), None).unwrap(); let mut c0_builder = StringBuilder::new(batch_size); let mut c1_builder = StringBuilder::new(batch_size); println!("Start of loop"); for i in 0..batch_size { let c0_value = format!("{:032}", i); let c1_value = c0_value.repeat(repeat_count); c0_builder.append_value(&c0_value)?; c1_builder.append_value(&c1_value)?; } println!("Finish building c0"); let c0 = Arc::new(c0_builder.finish()); println!("Finish building c1"); let c1 = Arc::new(c1_builder.finish()); println!("Creating RecordBatch"); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![c0, c1])?; // write the batch to parquet println!("Writing RecordBatch"); writer.write(&batch).unwrap(); println!("Closing writer"); writer.close().unwrap(); Ok(()) } {code} output: {code:java} Start of loop Finish building c0 Finish building c1 Creating RecordBatch Writing RecordBatch Segmentation fault (core dumped) {code} -- This message was sent by Atlassian Jira (v8.3.4#803005)