hugokitano opened a new issue, #1224:
URL: https://github.com/apache/iceberg-rust/issues/1224
### Apache Iceberg Rust version
0.4.0 (latest version)
### Describe the bug
Writing zero record batches should still produce a valid, empty Parquet file,
but the file that is created cannot be read using `pyarrow.parquet`.
### To Reproduce
```
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
run_iceberg_write().await
}
async fn run_iceberg_write() -> Result<(), Box<dyn std::error::Error>> {
use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
// Create an in-memory file IO
let namespace_name = "test_namespace";
let table_name = "test_table";
let tmp_dir = std::env::temp_dir().to_str().unwrap().to_string();
let warehouse_location = format!("file://{}/iceberg_warehouse",
tmp_dir.trim_end_matches('/'));
println!("\nwriting to {}", warehouse_location);
std::fs::create_dir_all(format!("{}/iceberg_warehouse", tmp_dir))?;
let file_io = IcebergFileIOBuilder::new("file").build()?;
let mut metadata = HashMap::new();
metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), "1".to_string());
// Create schema with the metadata on the "id" field
let id_field = arrow_schema::Field::new("id",
arrow_schema::DataType::Int32, false)
.with_metadata(metadata);
let schema = ArrowSchema::new(vec![id_field]);
// Create array and batch
let id_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]));
let batch = RecordBatch::try_new(Arc::new(schema.clone()),
vec![id_array])?;
// Create an in-memory catalog
let catalog =
iceberg_catalog_memory::MemoryCatalog::new(file_io.clone(),
Some(warehouse_location.to_string()));
// Create the namespace and table
let namespace = NamespaceIdent::new(namespace_name.to_string());
catalog.create_namespace(&namespace, HashMap::new()).await?;
let table_ident = TableIdent::new(namespace.clone(),
table_name.to_string());
// Convert schema to Iceberg schema
let iceberg_schema = iceberg::spec::Schema::try_from(&schema)?;
let table = catalog.create_table(&namespace, TableCreation::builder()
.name(table_name.to_string())
.schema(iceberg_schema.clone())
.properties(HashMap::new())
.build()).await?;
// Create location generator
let location_generator =
DefaultLocationGenerator::new(table.metadata().clone())?;
// Create file name generator
let writer_id = format!("file-{}", Uuid::new_v4());
let file_name_generator = DefaultFileNameGenerator::new(
writer_id,
None,
iceberg::spec::DataFileFormat::Parquet
);
// Create Parquet writer builder
let parquet_props =
parquet::file::properties::WriterProperties::builder().build();
let parquet_writer_builder = ParquetWriterBuilder::new(
parquet_props,
Arc::new(iceberg_schema),
file_io.clone(),
location_generator,
file_name_generator,
);
// Create data file writer
let data_file_writer_builder =
DataFileWriterBuilder::new(parquet_writer_builder, None);
let mut writer = data_file_writer_builder.build().await?;
// DO NOT write the batch
// writer.write(batch).await?;
// Close the writer and get data files
let data_files = writer.close().await?;
// Create a transaction to append data files
let tx = Transaction::new(&table);
let mut fast_append = tx.fast_append(None, vec![])?;
fast_append.add_data_files(data_files)?;
fast_append.apply().await?;
println!("Successfully wrote table!");
Ok(())
}
```
This run completes successfully and creates a Parquet file. When I perform
```
pq.ParquetFile('path/to/parquet').read()
```
on the file, I get an error:
```
File ~/***/site-packages/pyarrow/_parquet.pyx:1722, in
pyarrow._parquet.ParquetReader.read_all()
File ~/***/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()
OSError: The file only has 0 row groups, requested metadata for row group: -1
```
### Expected behavior
I expect to receive an empty table like
```
pyarrow.Table
id: int64
----
id: [[]]
```
### Willingness to contribute
None
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]