alamb commented on code in PR #7367:
URL: https://github.com/apache/arrow-rs/pull/7367#discussion_r2028639282
##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -1563,6 +1562,106 @@ mod tests {
        })
    }
+    #[test]
+    fn test_int96_from_spark_file_with_provided_schema() {
+        // int96_from_spark.parquet was written based on Spark's microsecond timestamps, which
+        // trade range for resolution compared to a nanosecond timestamp. We must provide a
+        // schema with microsecond resolution for the Parquet reader to interpret these values
+        // correctly.
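+        // For scale: an i64 of nanoseconds can only represent timestamps up to the year
+        // 2262, while an i64 of microseconds covers roughly +/-292,000 years around the
+        // Unix epoch.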
+        use arrow_schema::DataType::Timestamp;
+        let test_data = arrow::util::test_util::parquet_test_data();
+        let path = format!("{test_data}/int96_from_spark.parquet");
+        let file = File::open(path).unwrap();
+
+        let supplied_schema = Arc::new(Schema::new(vec![Field::new(
+            "a",
+            Timestamp(TimeUnit::Microsecond, None),
+            true,
+        )]));
+        let options = ArrowReaderOptions::new().with_schema(supplied_schema.clone());
+
+        let mut record_reader =
+            ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)
+                .unwrap()
+                .build()
+                .unwrap();
+
+        let batch = record_reader.next().unwrap().unwrap();
+        assert_eq!(batch.num_columns(), 1);
+        let column = batch.column(0);
+        assert_eq!(column.data_type(), &Timestamp(TimeUnit::Microsecond, None));
+
+        let expected = Arc::new(Int64Array::from(vec![
+            Some(1704141296123456),
+            Some(1704070800000000),
+            Some(253402225200000000),
+            Some(1735599600000000),
+            None,
+            Some(9089380393200000000),
+        ]));
+
+        // arrow-rs relies on the chrono library to convert between timestamps and strings, so
+        // instead compare as Int64. The underlying type should be a PrimitiveArray of Int64
+        // anyway, so this should be a zero-copy non-modifying cast.
+
+        let binding = arrow_cast::cast(batch.column(0), &arrow_schema::DataType::Int64).unwrap();
+        let casted_timestamps = binding.as_primitive::<types::Int64Type>();
+
+        assert_eq!(casted_timestamps.len(), expected.len());
+
+        casted_timestamps
+            .iter()
+            .zip(expected.iter())
+            .for_each(|(lhs, rhs)| {
+                assert_eq!(lhs, rhs);
+            });
+    }
+
+    #[test]
+    fn test_int96_from_spark_file_without_provided_schema() {
+        // int96_from_spark.parquet was written based on Spark's microsecond timestamps, which
+        // trade range for resolution compared to a nanosecond timestamp. Without a provided
+        // schema, some values overflow when read at nanosecond resolution and come back as
+        // garbage values.
+        use arrow_schema::DataType::Timestamp;
+        let test_data = arrow::util::test_util::parquet_test_data();
+        let path = format!("{test_data}/int96_from_spark.parquet");
+        let file = File::open(path).unwrap();
+
+        let mut record_reader = ParquetRecordBatchReaderBuilder::try_new(file)
+            .unwrap()
+            .build()
+            .unwrap();
+
+        let batch = record_reader.next().unwrap().unwrap();
+        assert_eq!(batch.num_columns(), 1);
+        let column = batch.column(0);
+        assert_eq!(column.data_type(), &Timestamp(TimeUnit::Nanosecond, None));
+
+        let expected = Arc::new(Int64Array::from(vec![
+            Some(1704141296123456000),  // Reads as nanoseconds fine (note 3 extra 0s)
+            Some(1704070800000000000),  // Reads as nanoseconds fine (note 3 extra 0s)
+            Some(-4852191831933722624), // Cannot be represented as a nanosecond timestamp (year 9999)
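+            // Overflow arithmetic: 253402225200000000 us * 1000 = 253402225200000000000 ns,
+            // which exceeds i64::MAX (9223372036854775807) and wraps to -4852191831933722624.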
Review Comment:
nice
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]