nathaniel-d-ef commented on code in PR #8353:
URL: https://github.com/apache/arrow-rs/pull/8353#discussion_r2353710434


##########
arrow-avro/src/writer/mod.rs:
##########
@@ -415,4 +415,222 @@ mod tests {
         );
         Ok(())
     }
+
+    #[test]
+    fn test_round_trip_simple_fixed_ocf() -> Result<(), ArrowError> {
+        let path = arrow_test_data("avro/simple_fixed.avro");
+        let rdr_file = File::open(&path).expect("open avro/simple_fixed.avro");
+        let mut reader = ReaderBuilder::new()
+            .build(BufReader::new(rdr_file))
+            .expect("build avro reader");
+        let schema = reader.schema();
+        let input_batches = reader.collect::<Result<Vec<_>, _>>()?;
+        let original =
+            arrow::compute::concat_batches(&schema, 
&input_batches).expect("concat input");
+        let tmp = NamedTempFile::new().expect("create temp file");
+        let out_file = File::create(tmp.path()).expect("create temp avro");
+        let mut writer = AvroWriter::new(out_file, 
original.schema().as_ref().clone())?;
+        writer.write(&original)?;
+        writer.finish()?;
+        drop(writer);
+        let rt_file = File::open(tmp.path()).expect("open round_trip avro");
+        let mut rt_reader = ReaderBuilder::new()
+            .build(BufReader::new(rt_file))
+            .expect("build round_trip reader");
+        let rt_schema = rt_reader.schema();
+        let rt_batches = rt_reader.collect::<Result<Vec<_>, _>>()?;
+        let round_trip =
+            arrow::compute::concat_batches(&rt_schema, 
&rt_batches).expect("concat round_trip");
+        assert_eq!(round_trip, original);
+        Ok(())
+    }
+
+    #[cfg(not(feature = "canonical_extension_types"))]
+    #[test]
+    fn test_round_trip_duration_and_uuid_ocf() -> Result<(), ArrowError> {
+        let in_file =
+            File::open("test/data/duration_uuid.avro").expect("open 
test/data/duration_uuid.avro");
+        let mut reader = ReaderBuilder::new()
+            .build(BufReader::new(in_file))
+            .expect("build reader for duration_uuid.avro");
+        let in_schema = reader.schema();
+        let has_mdn = in_schema.fields().iter().any(|f| {
+            matches!(
+                f.data_type(),
+                DataType::Interval(IntervalUnit::MonthDayNano)
+            )
+        });
+        assert!(
+            has_mdn,
+            "expected at least one Interval(MonthDayNano) field in 
duration_uuid.avro"
+        );
+        let has_uuid_fixed = in_schema
+            .fields()
+            .iter()
+            .any(|f| matches!(f.data_type(), DataType::FixedSizeBinary(16)));
+        assert!(
+            has_uuid_fixed,
+            "expected at least one FixedSizeBinary(16) (uuid) field in 
duration_uuid.avro"
+        );
+        let input_batches = reader.collect::<Result<Vec<_>, _>>()?;
+        let input =
+            arrow::compute::concat_batches(&in_schema, 
&input_batches).expect("concat input");
+        let tmp = NamedTempFile::new().expect("create temp file");
+        {
+            let out_file = File::create(tmp.path()).expect("create temp avro");
+            let mut writer = AvroWriter::new(out_file, 
in_schema.as_ref().clone())?;
+            writer.write(&input)?;
+            writer.finish()?;
+        }
+        let rt_file = File::open(tmp.path()).expect("open round_trip avro");
+        let mut rt_reader = ReaderBuilder::new()
+            .build(BufReader::new(rt_file))
+            .expect("build round_trip reader");
+        let rt_schema = rt_reader.schema();
+        let rt_batches = rt_reader.collect::<Result<Vec<_>, _>>()?;
+        let round_trip =
+            arrow::compute::concat_batches(&rt_schema, 
&rt_batches).expect("concat round_trip");
+        assert_eq!(round_trip, input);
+        Ok(())
+    }
+
+    // This test reads the same 'nonnullable.impala.avro' used by the reader tests,
+    // writes it back out with the writer (hitting Map encoding paths), then reads it
+    // again and asserts exact Arrow equivalence.
+    #[test]
+    fn test_nonnullable_impala_roundtrip_writer() -> Result<(), ArrowError> {
+        // Load source Avro with Map fields
+        let path = arrow_test_data("avro/nonnullable.impala.avro");
+        let rdr_file = File::open(&path).expect("open 
avro/nonnullable.impala.avro");
+        let mut reader = ReaderBuilder::new()
+            .build(BufReader::new(rdr_file))
+            .expect("build reader for nonnullable.impala.avro");
+        // Collect all input batches and concatenate to a single RecordBatch
+        let in_schema = reader.schema();
+        // Sanity: ensure the file actually contains at least one Map field
+        let has_map = in_schema
+            .fields()
+            .iter()
+            .any(|f| matches!(f.data_type(), DataType::Map(_, _)));
+        assert!(
+            has_map,
+            "expected at least one Map field in avro/nonnullable.impala.avro"
+        );
+
+        let input_batches = reader.collect::<Result<Vec<_>, _>>()?;
+        let original =
+            arrow::compute::concat_batches(&in_schema, 
&input_batches).expect("concat input");
+        // Write out using the OCF writer into an in-memory Vec<u8>
+        let buffer = Vec::<u8>::new();
+        let mut writer = AvroWriter::new(buffer, in_schema.as_ref().clone())?;
+        writer.write(&original)?;
+        writer.finish()?;
+        let out_bytes = writer.into_inner();
+        // Read the produced bytes back with the Reader
+        let mut rt_reader = ReaderBuilder::new()
+            .build(Cursor::new(out_bytes))
+            .expect("build reader for round-tripped in-memory OCF");
+        let rt_schema = rt_reader.schema();
+        let rt_batches = rt_reader.collect::<Result<Vec<_>, _>>()?;
+        let roundtrip =
+            arrow::compute::concat_batches(&rt_schema, 
&rt_batches).expect("concat roundtrip");
+        // Exact value fidelity (schema + data)
+        assert_eq!(
+            roundtrip, original,
+            "Round-trip Avro map data mismatch for nonnullable.impala.avro"
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn test_roundtrip_decimals_via_writer() -> Result<(), ArrowError> {
+        // (file, resolve via ARROW_TEST_DATA?)
+        let files: [(&str, bool); 8] = [
+            ("avro/fixed_length_decimal.avro", true), // fixed-backed -> 
Decimal128(25,2)
+            ("avro/fixed_length_decimal_legacy.avro", true), // legacy 
fixed[8] -> Decimal64(13,2)
+            ("avro/int32_decimal.avro", true),        // bytes-backed -> 
Decimal32(4,2)
+            ("avro/int64_decimal.avro", true),        // bytes-backed -> 
Decimal64(10,2)
+            ("test/data/int256_decimal.avro", false), // bytes-backed -> 
Decimal256(76,2)
+            ("test/data/fixed256_decimal.avro", false), // fixed[32]-backed -> 
Decimal256(76,10)
+            ("test/data/fixed_length_decimal_legacy_32.avro", false), // 
legacy fixed[4] -> Decimal32(9,2)
+            ("test/data/int128_decimal.avro", false), // bytes-backed -> 
Decimal128(38,2)
+        ];
+        for (rel, in_test_data_dir) in files {
+            // Resolve path the same way as reader::test_decimal
+            let path: String = if in_test_data_dir {
+                arrow_test_data(rel)
+            } else {
+                PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+                    .join(rel)
+                    .to_string_lossy()
+                    .into_owned()
+            };
+            // Read original file into a single RecordBatch for comparison
+            let f_in = File::open(&path).expect("open input avro");

Review Comment:
   Good suggestion for sure. I'll add this as a note and we can tackle it along 
with the in-memory recommendation from earlier. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to