alamb commented on code in PR #10592:
URL: https://github.com/apache/datafusion/pull/10592#discussion_r1610641830


##########
datafusion/core/tests/parquet/arrow_statistics.rs:
##########
@@ -624,20 +624,281 @@ async fn test_dates_64_diff_rg_sizes() {
     .run("date64");
 }
 
+// BUG:
+// https://github.com/apache/datafusion/issues/10604
+#[tokio::test]
+async fn test_uint() {
+    let row_per_group = 4;
+
+    // This creates a parquet file of 4 columns named "u8", "u16", "u32", "u64"
+    // "u8" --> UInt8Array
+    // "u16" --> UInt16Array
+    // "u32" --> UInt32Array
+    // "u64" --> UInt64Array
+
+    // The file is created from 4 record batches (each containing a null row), each with 5 rows, which are then split into 5 row groups of size 4
+    let reader = parquet_file_many_columns(Scenario::UInt, row_per_group).await;
+
+    // u8
+    // BUG: expect UInt8Array but returns Int32Array
+    Test {
+        reader,
+        expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // should be UInt8Array
+        expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // should be UInt8Array
+        expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]),
+        expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]),
+    }
+    .run("u8");
+
+    // u16
+    // BUG: expect UInt16Array but returns Int32Array
+    let reader = parquet_file_many_columns(Scenario::UInt, row_per_group).await;
+    Test {
+        reader,
+        expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // should be UInt16Array
+        expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // should be UInt16Array
+        expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]),
+        expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]),
+    }
+    .run("u16");
+
+    // u32
+    // BUG: expect UInt32Array but returns Int32Array
+    let reader = parquet_file_many_columns(Scenario::UInt, row_per_group).await;
+    Test {
+        reader,
+        expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // should be UInt32Array
+        expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // should be UInt32Array
+        expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]),
+        expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]),
+    }
+    .run("u32");
+
+    // u64
+    // BUG: expect UInt64Array but returns Int64Array
+    let reader = parquet_file_many_columns(Scenario::UInt, row_per_group).await;
+    Test {
+        reader,
+        expected_min: Arc::new(Int64Array::from(vec![0, 1, 4, 7, 251])), // should be UInt64Array
+        expected_max: Arc::new(Int64Array::from(vec![3, 4, 6, 250, 254])), // should be UInt64Array
+        expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]),
+        expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]),
+    }
+    .run("u64");
+}
+
+#[tokio::test]
+async fn test_int32_range() {
+    let row_per_group = 5;
+    // This creates a parquet file of 1 column "i"
+    // The file has 2 record batches, each with 2 rows; they are saved into one row group
+    let reader = parquet_file_many_columns(Scenario::Int32Range, row_per_group).await;
+
+    Test {
+        reader,
+        expected_min: Arc::new(Int32Array::from(vec![0])),
+        expected_max: Arc::new(Int32Array::from(vec![300000])),
+        expected_null_counts: UInt64Array::from(vec![0]),
+        expected_row_counts: UInt64Array::from(vec![4]),
+    }
+    .run("i");
+}
+
+// BUG: expect UInt32Array but returns Int32Array
+// https://github.com/apache/datafusion/issues/10604
+#[tokio::test]
+async fn test_uint32_range() {
+    let row_per_group = 5;
+    // This creates a parquet file of 1 column "u"
+    // The file has 2 record batches, each with 2 rows; they are saved into one row group
+    let reader = parquet_file_many_columns(Scenario::UInt32Range, row_per_group).await;
+
+    Test {
+        reader,
+        expected_min: Arc::new(Int32Array::from(vec![0])), // should be UInt32Array
+        expected_max: Arc::new(Int32Array::from(vec![300000])), // should be UInt32Array
+        expected_null_counts: UInt64Array::from(vec![0]),
+        expected_row_counts: UInt64Array::from(vec![4]),
+    }
+    .run("u");
+}
+
+#[tokio::test]
+async fn test_float64() {
+    let row_per_group = 5;
+    // This creates a parquet file of 1 column "f"
+    // The file has 4 record batches, each with 5 rows; they are saved into 4 row groups
+    let reader = parquet_file_many_columns(Scenario::Float64, row_per_group).await;
+
+    Test {
+        reader,
+        expected_min: Arc::new(Float64Array::from(vec![-5.0, -4.0, -0.0, 5.0])),
+        expected_max: Arc::new(Float64Array::from(vec![-1.0, 0.0, 4.0, 9.0])),
+        expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]),
+        expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
+    }
+    .run("f");
+}
+
+#[tokio::test]
+async fn test_decimal() {
+    let row_per_group = 5;
+    // This creates a parquet file of 1 column "decimal_col" with decimal data type, precision 9, scale 2
+    // The file has 3 record batches, each with 5 rows; they are saved into 3 row groups
+    let reader = parquet_file_many_columns(Scenario::Decimal, row_per_group).await;

Review Comment:
   I was thinking smaller precisions -- I can't remember the details, but I vaguely recall that Spark stores decimals of different scales with different underlying data types or something.
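
   For context on that memory: the Parquet spec itself ties the DECIMAL physical encoding to precision, which is likely what Spark follows. A rough sketch of that mapping (the enum and function names here are illustrative, not a datafusion or parquet-rs API):

```rust
/// Illustrative enum (not a real API): the physical types the Parquet
/// spec allows for DECIMAL(precision, scale).
#[derive(Debug, PartialEq)]
enum DecimalPhysical {
    Int32,             // precision 1..=9 fits in 4 bytes
    Int64,             // precision 10..=18 fits in 8 bytes
    FixedLenByteArray, // larger precisions
}

fn decimal_physical(precision: u8) -> DecimalPhysical {
    match precision {
        1..=9 => DecimalPhysical::Int32,
        10..=18 => DecimalPhysical::Int64,
        _ => DecimalPhysical::FixedLenByteArray,
    }
}

fn main() {
    // The test above uses precision 9, which fits in INT32; adding columns
    // with precision > 9 and > 18 would exercise the other encodings too.
    assert_eq!(decimal_physical(9), DecimalPhysical::Int32);
    assert_eq!(decimal_physical(18), DecimalPhysical::Int64);
    assert_eq!(decimal_physical(38), DecimalPhysical::FixedLenByteArray);
}
```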



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

