alamb commented on code in PR #8930:
URL: https://github.com/apache/arrow-datafusion/pull/8930#discussion_r1462422283


##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -209,16 +214,51 @@ impl PruningStatistics for BloomFilterStatistics {
 
         let known_not_present = values
             .iter()
-            .map(|value| match value {
-                ScalarValue::Utf8(Some(v)) => sbbf.check(&v.as_str()),
-                ScalarValue::Boolean(Some(v)) => sbbf.check(v),
-                ScalarValue::Float64(Some(v)) => sbbf.check(v),
-                ScalarValue::Float32(Some(v)) => sbbf.check(v),
-                ScalarValue::Int64(Some(v)) => sbbf.check(v),
-                ScalarValue::Int32(Some(v)) => sbbf.check(v),
-                ScalarValue::Int16(Some(v)) => sbbf.check(v),
-                ScalarValue::Int8(Some(v)) => sbbf.check(v),
-                _ => true,
+            .map(|value| {
+                match value {
+                    ScalarValue::Utf8(Some(v)) => sbbf.check(&v.as_str()),
+                    ScalarValue::Boolean(Some(v)) => sbbf.check(v),
+                    ScalarValue::Float64(Some(v)) => sbbf.check(v),
+                    ScalarValue::Float32(Some(v)) => sbbf.check(v),
+                    ScalarValue::Int64(Some(v)) => sbbf.check(v),
+                    ScalarValue::Int32(Some(v)) => sbbf.check(v),
+                    ScalarValue::Int16(Some(v)) => sbbf.check(v),
+                    ScalarValue::Int8(Some(v)) => sbbf.check(v),
+                    ScalarValue::Decimal128(Some(v), p, s) => match 
parquet_type {
+                        Type::INT32 => {
+                            
//https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/Encodings.md?plain=1#L35-L42
+                            // All physical type  are little-endian
+                            let b = (*v as i32).to_le_bytes();
+                            let decimal = Decimal::Int32 {
+                                value: b,
+                                precision: *p as i32,
+                                scale: *s as i32,
+                            };
+                            sbbf.check(&decimal)

Review Comment:
   I think it would be clearer to use the constructors instead: 
https://docs.rs/parquet/latest/parquet/data_type/enum.Decimal.html#method.from_i32
   
   ```suggestion
                               let decimal = Decimal::from_i32(* v as i32, *p 
as i32, *s as i32)'
                               sbbf.check(&decimal)
   ```
   
   (the same applies to the other types here as well)



##########
test-utils/src/data_gen.rs:
##########
@@ -169,7 +179,19 @@ impl BatchBuilder {
             .append_option(rng.gen_bool(0.9).then(|| rng.gen()));
         self.response_status
             .append_value(status[rng.gen_range(0..status.len())]);
-        self.prices_status.append_value(self.row_count as i128);
+        self.decimal128_price.append_value(self.row_count as i128);
+
+        let val = rng.gen_range(0..100);

Review Comment:
   I think the fact that this code picks a new random number is why all the 
values in the test need to change
   
   I wonder if you could reuse the same value that was created for some other 
field to avoid havint od this
   
   For example, maybe you could reuse the value of the first ip byte
   
   ```rust
          let first_ip_byte = rng.gen::<u8>()
           self.client_addr.append_value(format!(
               "{}.{}.{}.{}",
               first_ip_byte,
               rng.gen::<u8>(),
               rng.gen::<u8>(),
               rng.gen::<u8>()
           ));
   ...
           // Avoid row group min_max to prune this, so skip 50 between 0..99
           if first_ip_byte == 50 {
               self.decimal128_id_int32.append_value(1);
               self.decimal128_id_int64.append_value(1);
               self.decimal128_id_fixed_len_byte.append_value(1);
           } else {
               self.decimal128_id_int32.append_value(first_ip_byte);
               self.decimal128_id_int64.append_value(first_ip_byte);
               self.decimal128_id_fixed_len_byte.append_value(first_ip_byte);
           }
   ...
   



##########
datafusion/core/tests/parquet/filter_pushdown.rs:
##########
@@ -80,15 +81,15 @@ async fn single_file() {
         // request_method = 'GET'
         .with_filter(col("request_method").eq(lit("GET")))
         .with_pushdown_expected(PushdownExpected::Some)
-        .with_expected_rows(688);
+        .with_expected_rows(745);

Review Comment:
   I am not quite sure what you are asking here @Ted-Jiang . It seems like most 
of the tests in this file have been updated as well



##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -209,16 +214,51 @@ impl PruningStatistics for BloomFilterStatistics {
 
         let known_not_present = values
             .iter()
-            .map(|value| match value {
-                ScalarValue::Utf8(Some(v)) => sbbf.check(&v.as_str()),
-                ScalarValue::Boolean(Some(v)) => sbbf.check(v),
-                ScalarValue::Float64(Some(v)) => sbbf.check(v),
-                ScalarValue::Float32(Some(v)) => sbbf.check(v),
-                ScalarValue::Int64(Some(v)) => sbbf.check(v),
-                ScalarValue::Int32(Some(v)) => sbbf.check(v),
-                ScalarValue::Int16(Some(v)) => sbbf.check(v),
-                ScalarValue::Int8(Some(v)) => sbbf.check(v),
-                _ => true,
+            .map(|value| {
+                match value {
+                    ScalarValue::Utf8(Some(v)) => sbbf.check(&v.as_str()),
+                    ScalarValue::Boolean(Some(v)) => sbbf.check(v),
+                    ScalarValue::Float64(Some(v)) => sbbf.check(v),
+                    ScalarValue::Float32(Some(v)) => sbbf.check(v),
+                    ScalarValue::Int64(Some(v)) => sbbf.check(v),
+                    ScalarValue::Int32(Some(v)) => sbbf.check(v),
+                    ScalarValue::Int16(Some(v)) => sbbf.check(v),
+                    ScalarValue::Int8(Some(v)) => sbbf.check(v),
+                    ScalarValue::Decimal128(Some(v), p, s) => match 
parquet_type {
+                        Type::INT32 => {

Review Comment:
   Is the value of the i128 the same as the value of this i32 (aka are we sure 
the top 12 bytes are always 0) as this code is ignores those bytes?
   
   The same basic question applies to the Int64 case too (are we sure its top 8 
bytes are always zero)



##########
datafusion/core/tests/parquet/mod.rs:
##########
@@ -66,7 +66,9 @@ enum Scenario {
     Int32Range,
     Float64,
     Decimal,
+    DecimalBloomFilter,

Review Comment:
   There are only two scenarios here but three encodings (Int32, Int64 and 
Bytes) handled by the code. Is there some way to over all three encodings?



##########
datafusion/core/tests/parquet/filter_pushdown.rs:
##########
@@ -240,100 +241,151 @@ async fn single_file_small_data_pages() {
     // docker run -v /tmp:/tmp nathanhowell/parquet-tools dump -d -c pod -n 
/tmp/.tmppkTohR/data.parquet
     //
     // ```
-    // row group 0
-    //     
--------------------------------------------------------------------------------
-    //     pod:  BINARY UNCOMPRESSED DO:782 FPO:1215 SZ:744/744/1.00 VC:2048 
ENC:RLE,RLE_DICTIONARY,PLAIN ST:[min: azvagebjesrqboyqxmgaskvpwddebuptqyy, max: 
zamirxzhihhfqdvhuxeziuukkqyutmczbhfgx, num_nulls not defined]
+    //  row group 0

Review Comment:
   Thank you for updating these values. I think it would make the PR easier to 
review if we can avoid changing the existing tests if possible (I left a 
suggestion on the data generator about a potential way to do this)



##########
datafusion/core/tests/parquet/row_group_pruning.rs:
##########
@@ -578,6 +578,27 @@ async fn prune_decimal_in_list() {
         6,
     )
     .await;
+    // test data -> r1: {1,2,3,4,5}, r2: {1,2,3,4,6}, r3: {1,2,3,4,6}
+    test_row_group_prune(
+        Scenario::DecimalBloomFilter,
+        "SELECT * FROM t where decimal_col in (5)",
+        Some(0),
+        Some(0),
+        Some(2),

Review Comment:
   Related PR https://github.com/apache/arrow-datafusion/pull/8941 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to