jecsand838 commented on code in PR #9291:
URL: https://github.com/apache/arrow-rs/pull/9291#discussion_r2806575841
##########
arrow-avro/src/writer/mod.rs:
##########
@@ -3252,4 +3260,1631 @@ mod tests {
assert_eq!(expected_str, actual_str);
Ok(())
}
+
+ /// Helper to roundtrip a RecordBatch through OCF writer/reader
+ fn roundtrip_ocf(batch: &RecordBatch) -> Result<RecordBatch, AvroError> {
+ let schema = batch.schema();
+ let mut buffer = Vec::<u8>::new();
+ let mut writer = AvroWriter::new(&mut buffer,
schema.as_ref().clone())?;
+ writer.write(batch)?;
+ writer.finish()?;
+ drop(writer);
+ let reader = ReaderBuilder::new()
+ .build(Cursor::new(buffer))
+ .expect("build reader for roundtrip OCF");
+ // Get the Avro schema JSON from the OCF header
+ let avro_schema_json = reader
+ .avro_header()
+ .get(SCHEMA_METADATA_KEY)
+ .map(|raw| std::str::from_utf8(raw).expect("valid
UTF-8").to_string());
+ // Get the Arrow schema and add the Avro schema metadata
+ let arrow_schema = reader.schema();
+ let rt_schema = if let Some(json) = avro_schema_json {
+ let mut metadata = arrow_schema.metadata().clone();
+ metadata.insert(SCHEMA_METADATA_KEY.to_string(), json);
+ Arc::new(Schema::new_with_metadata(
+ arrow_schema.fields().clone(),
+ metadata,
+ ))
+ } else {
+ arrow_schema
+ };
+ let rt_batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>,
_>>()?;
+ Ok(arrow::compute::concat_batches(&rt_schema,
&rt_batches).expect("concat roundtrip"))
+ }
+
+ #[cfg(feature = "avro_custom_types")]
+ #[test]
+ fn test_roundtrip_int8_custom_types() -> Result<(), AvroError> {
+ use arrow_array::Int8Array;
+
+ let schema = Schema::new(vec![Field::new("val", DataType::Int8,
true)]);
+ let values: Vec<Option<i8>> = vec![
+ Some(i8::MIN),
+ Some(-1),
+ Some(0),
+ None,
+ Some(1),
+ Some(i8::MAX),
+ ];
+ let array = Int8Array::from(values.clone());
+ let batch = RecordBatch::try_new(Arc::new(schema),
vec![Arc::new(array) as ArrayRef])?;
+
+ let roundtrip = roundtrip_ocf(&batch)?;
+
+ // With avro_custom_types: expect exact Int8 type
+ assert_eq!(
+ roundtrip.schema().field(0).data_type(),
+ &DataType::Int8,
+ "Expected Int8 type with avro_custom_types enabled"
+ );
+ let got = roundtrip
+ .column(0)
+ .as_any()
+ .downcast_ref::<Int8Array>()
+ .expect("Int8Array");
Review Comment:
This was a great call-out. I went ahead and made these changes to the tests.
##########
arrow-avro/src/writer/mod.rs:
##########
@@ -3252,4 +3260,1631 @@ mod tests {
assert_eq!(expected_str, actual_str);
Ok(())
}
+
+ /// Helper to roundtrip a RecordBatch through OCF writer/reader
+ fn roundtrip_ocf(batch: &RecordBatch) -> Result<RecordBatch, AvroError> {
+ let schema = batch.schema();
+ let mut buffer = Vec::<u8>::new();
+ let mut writer = AvroWriter::new(&mut buffer,
schema.as_ref().clone())?;
+ writer.write(batch)?;
+ writer.finish()?;
+ drop(writer);
+ let reader = ReaderBuilder::new()
+ .build(Cursor::new(buffer))
+ .expect("build reader for roundtrip OCF");
+ // Get the Avro schema JSON from the OCF header
+ let avro_schema_json = reader
+ .avro_header()
+ .get(SCHEMA_METADATA_KEY)
+ .map(|raw| std::str::from_utf8(raw).expect("valid
UTF-8").to_string());
+ // Get the Arrow schema and add the Avro schema metadata
+ let arrow_schema = reader.schema();
+ let rt_schema = if let Some(json) = avro_schema_json {
+ let mut metadata = arrow_schema.metadata().clone();
+ metadata.insert(SCHEMA_METADATA_KEY.to_string(), json);
+ Arc::new(Schema::new_with_metadata(
+ arrow_schema.fields().clone(),
+ metadata,
+ ))
+ } else {
+ arrow_schema
+ };
+ let rt_batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>,
_>>()?;
+ Ok(arrow::compute::concat_batches(&rt_schema,
&rt_batches).expect("concat roundtrip"))
+ }
+
+ #[cfg(feature = "avro_custom_types")]
+ #[test]
+ fn test_roundtrip_int8_custom_types() -> Result<(), AvroError> {
+ use arrow_array::Int8Array;
+
+ let schema = Schema::new(vec![Field::new("val", DataType::Int8,
true)]);
+ let values: Vec<Option<i8>> = vec![
+ Some(i8::MIN),
+ Some(-1),
+ Some(0),
+ None,
+ Some(1),
+ Some(i8::MAX),
+ ];
+ let array = Int8Array::from(values.clone());
+ let batch = RecordBatch::try_new(Arc::new(schema),
vec![Arc::new(array) as ArrayRef])?;
+
+ let roundtrip = roundtrip_ocf(&batch)?;
+
+ // With avro_custom_types: expect exact Int8 type
+ assert_eq!(
+ roundtrip.schema().field(0).data_type(),
+ &DataType::Int8,
+ "Expected Int8 type with avro_custom_types enabled"
+ );
+ let got = roundtrip
+ .column(0)
+ .as_any()
+ .downcast_ref::<Int8Array>()
+ .expect("Int8Array");
+ assert_eq!(got, &Int8Array::from(values));
+ Ok(())
+ }
+
+ #[cfg(not(feature = "avro_custom_types"))]
+ #[test]
+ fn test_roundtrip_int8_no_custom_widens_to_int32() -> Result<(),
AvroError> {
Review Comment:
Pushed up these changes as well.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]