This is an automated email from the ASF dual-hosted git repository.

liurenjie1024 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git


The following commit(s) were added to refs/heads/main by this push:
     new 330f21da8 feat(datafusion): Add LIKE predicate pushdown for StartsWith 
patterns (#2014)
330f21da8 is described below

commit 330f21da894948fc10b57d541cb2d6f32c8bdbb8
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Mon Jan 26 19:03:58 2026 +0800

    feat(datafusion): Add LIKE predicate pushdown for StartsWith patterns 
(#2014)
    
    Implement pushdown for LIKE predicates with simple prefix patterns
    (e.g., 'prefix%') to Iceberg's StartsWith operator. This optimization
    allows filtering to be performed at the storage layer, significantly
    improving query performance for prefix searches.
    
    Changes:
    - Add support for Expr::Like in predicate conversion
    - Convert LIKE 'prefix%' patterns to StartsWith operator
    - Convert NOT LIKE 'prefix%' patterns to NotStartsWith operator
    - Handle edge cases: empty prefix, unicode, special characters
    - Reject complex patterns that cannot be pushed down (wildcards in
    middle, underscore, ILIKE)
    - Add 8 comprehensive unit tests covering various scenarios
    
    Implementation details:
    - Only simple prefix patterns ending with % are converted
    - Patterns with % or _ wildcards in the prefix are not pushed down
    - Case-insensitive LIKE (ILIKE) is not supported for pushdown
    - Escape characters are not supported for pushdown
    - Works seamlessly with other predicates in AND/OR expressions
    
    ## Which issue does this PR close?
    
    <!--
    We generally require a GitHub issue to be filed for all bug fixes and
    enhancements and this helps us generate change logs for our releases.
    You can link an issue to this PR using the GitHub syntax. For example
    `Closes #123` indicates that this PR will close issue #123.
    -->
    
    - Closes #.
    
    ## What changes are included in this PR?
    
    <!--
    Provide a summary of the modifications in this PR. List the main changes
    such as new features, bug fixes, refactoring, or any other updates.
    -->
    
    ## Are these changes tested?
    
    <!--
    Specify what test covers (unit test, integration test, etc.).
    
    If tests are not included in your PR, please explain why (for example,
    are they covered by existing tests)?
    -->
    
    ---------
    
    Co-authored-by: Claude Sonnet 4.5 <[email protected]>
    Co-authored-by: Renjie Liu <[email protected]>
---
 .../src/physical_plan/expr_to_predicate.rs         | 147 ++++++++++++++++++++-
 .../sqllogictest/testdata/schedules/df_test.toml   |   4 +
 .../slts/df_test/like_predicate_pushdown.slt       | 113 ++++++++++++++++
 3 files changed, 262 insertions(+), 2 deletions(-)

diff --git 
a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs 
b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs
index fc95be5db..2468606b4 100644
--- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs
+++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs
@@ -18,10 +18,10 @@
 use std::vec;
 
 use datafusion::arrow::datatypes::DataType;
-use datafusion::logical_expr::{Expr, Operator};
+use datafusion::logical_expr::{Expr, Like, Operator};
 use datafusion::scalar::ScalarValue;
 use iceberg::expr::{BinaryExpression, Predicate, PredicateOperator, Reference, 
UnaryExpression};
-use iceberg::spec::Datum;
+use iceberg::spec::{Datum, PrimitiveLiteral};
 
 // A datafusion expression could be an Iceberg predicate, column, or literal.
 enum TransformedResult {
@@ -128,6 +128,56 @@ fn to_iceberg_predicate(expr: &Expr) -> TransformedResult {
             }
             to_iceberg_predicate(&c.expr)
         }
+        Expr::Like(Like {
+            negated,
+            expr,
+            pattern,
+            escape_char,
+            case_insensitive,
+        }) => {
+            // Only support simple prefix patterns (e.g., 'prefix%')
+            // Note: Iceberg's StartsWith operator is case-sensitive, so we 
cannot
+            // push down case-insensitive LIKE (ILIKE) patterns
+            // Escape characters are also not supported for pushdown
+            if escape_char.is_some() || *case_insensitive {
+                return TransformedResult::NotTransformed;
+            }
+
+            // Extract the pattern string
+            let pattern_str = match to_iceberg_predicate(pattern) {
+                TransformedResult::Literal(d) => match d.literal() {
+                    PrimitiveLiteral::String(s) => s.clone(),
+                    _ => return TransformedResult::NotTransformed,
+                },
+                _ => return TransformedResult::NotTransformed,
+            };
+
+            // Check if it's a simple prefix pattern (ends with % and no other 
wildcards)
+            if pattern_str.ends_with('%')
+                && !pattern_str[..pattern_str.len() - 1].contains(['%', '_'])
+            {
+                // Extract the prefix (remove trailing %)
+                let prefix = pattern_str[..pattern_str.len() - 1].to_string();
+
+                // Get the column reference
+                let column = match to_iceberg_predicate(expr) {
+                    TransformedResult::Column(r) => r,
+                    _ => return TransformedResult::NotTransformed,
+                };
+
+                // Create the appropriate predicate
+                let predicate = if *negated {
+                    column.not_starts_with(Datum::string(prefix))
+                } else {
+                    column.starts_with(Datum::string(prefix))
+                };
+
+                TransformedResult::Predicate(predicate)
+            } else {
+                // Complex LIKE patterns cannot be pushed down
+                TransformedResult::NotTransformed
+            }
+        }
         _ => TransformedResult::NotTransformed,
     }
 }
@@ -458,4 +508,97 @@ mod tests {
             .and(Reference::new("bar").equal_to(Datum::binary(vec![1u8, 
2u8])));
         assert_eq!(predicate, expected_predicate);
     }
+
+    #[test]
+    fn test_predicate_conversion_with_like_starts_with() {
+        let sql = "bar LIKE 'test%'";
+        let predicate = convert_to_iceberg_predicate(sql).unwrap();
+        assert_eq!(
+            predicate,
+            Reference::new("bar").starts_with(Datum::string("test"))
+        );
+    }
+
+    #[test]
+    fn test_predicate_conversion_with_not_like_starts_with() {
+        let sql = "bar NOT LIKE 'test%'";
+        let predicate = convert_to_iceberg_predicate(sql).unwrap();
+        assert_eq!(
+            predicate,
+            Reference::new("bar").not_starts_with(Datum::string("test"))
+        );
+    }
+
+    #[test]
+    fn test_predicate_conversion_with_like_empty_prefix() {
+        let sql = "bar LIKE '%'";
+        let predicate = convert_to_iceberg_predicate(sql).unwrap();
+        assert_eq!(
+            predicate,
+            Reference::new("bar").starts_with(Datum::string(""))
+        );
+    }
+
+    #[test]
+    fn test_predicate_conversion_with_like_complex_pattern() {
+        // Patterns with wildcards in the middle cannot be pushed down
+        let sql = "bar LIKE 'te%st'";
+        let predicate = convert_to_iceberg_predicate(sql);
+        assert_eq!(predicate, None);
+    }
+
+    #[test]
+    fn test_predicate_conversion_with_like_underscore_wildcard() {
+        // Patterns with underscore wildcard cannot be pushed down
+        let sql = "bar LIKE 'test_'";
+        let predicate = convert_to_iceberg_predicate(sql);
+        assert_eq!(predicate, None);
+    }
+
+    #[test]
+    fn test_predicate_conversion_with_like_no_wildcard() {
+        // Patterns without trailing % cannot be pushed down as StartsWith
+        let sql = "bar LIKE 'test'";
+        let predicate = convert_to_iceberg_predicate(sql);
+        assert_eq!(predicate, None);
+    }
+
+    #[test]
+    fn test_predicate_conversion_with_ilike() {
+        // Case-insensitive LIKE (ILIKE) is not supported
+        let sql = "bar ILIKE 'test%'";
+        let predicate = convert_to_iceberg_predicate(sql);
+        assert_eq!(predicate, None);
+    }
+
+    #[test]
+    fn test_predicate_conversion_with_like_and_other_conditions() {
+        let sql = "bar LIKE 'test%' AND foo > 1";
+        let predicate = convert_to_iceberg_predicate(sql).unwrap();
+        let expected_predicate = Predicate::and(
+            Reference::new("bar").starts_with(Datum::string("test")),
+            Reference::new("foo").greater_than(Datum::long(1)),
+        );
+        assert_eq!(predicate, expected_predicate);
+    }
+
+    #[test]
+    fn test_predicate_conversion_with_like_special_characters() {
+        // Test LIKE with special characters in prefix
+        let sql = "bar LIKE 'test-abc_123%'";
+        let predicate = convert_to_iceberg_predicate(sql);
+        // This should not be pushed down because it contains underscore
+        assert_eq!(predicate, None);
+    }
+
+    #[test]
+    fn test_predicate_conversion_with_like_unicode() {
+        // Test LIKE with unicode characters in prefix
+        let sql = "bar LIKE '测试%'";
+        let predicate = convert_to_iceberg_predicate(sql).unwrap();
+        assert_eq!(
+            predicate,
+            Reference::new("bar").starts_with(Datum::string("测试"))
+        );
+    }
 }
diff --git a/crates/sqllogictest/testdata/schedules/df_test.toml 
b/crates/sqllogictest/testdata/schedules/df_test.toml
index 1a1a0ad33..8f4860043 100644
--- a/crates/sqllogictest/testdata/schedules/df_test.toml
+++ b/crates/sqllogictest/testdata/schedules/df_test.toml
@@ -34,6 +34,10 @@ slt = "df_test/insert_into.slt"
 engine = "df"
 slt = "df_test/binary_predicate_pushdown.slt"
 
+[[steps]]
+engine = "df"
+slt = "df_test/like_predicate_pushdown.slt"
+
 [[steps]]
 engine = "df"
 slt = "df_test/drop_table.slt"
diff --git 
a/crates/sqllogictest/testdata/slts/df_test/like_predicate_pushdown.slt 
b/crates/sqllogictest/testdata/slts/df_test/like_predicate_pushdown.slt
new file mode 100644
index 000000000..41d8ec027
--- /dev/null
+++ b/crates/sqllogictest/testdata/slts/df_test/like_predicate_pushdown.slt
@@ -0,0 +1,113 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Test LIKE predicate pushdown for StartsWith patterns
+# This validates that LIKE 'prefix%' patterns are converted to StartsWith 
operator
+# Note: Iceberg's StartsWith operator is case-sensitive
+
+# Note: test_unpartitioned_table already contains data from insert_into.slt:
+# (1, 'Alice'), (2, 'Bob'), (3, 'Charlie'), (4, NULL)
+# We'll insert additional test data for LIKE testing
+query I
+INSERT INTO default.default.test_unpartitioned_table VALUES (5, 'alice'), (6, 
'Albert'), (7, 'Bobby'), (8, 'Carol')
+----
+4
+
+# Test basic LIKE prefix pattern pushdown - verify predicate is pushed to 
IcebergTableScan
+query TT
+EXPLAIN SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 
'Al%'
+----
+logical_plan
+01)Filter: default.default.test_unpartitioned_table.name LIKE Utf8("Al%")
+02)--TableScan: default.default.test_unpartitioned_table projection=[id, 
name], partial_filters=[default.default.test_unpartitioned_table.name LIKE 
Utf8("Al%")]
+physical_plan
+01)CoalesceBatchesExec: target_batch_size=8192
+02)--FilterExec: name@1 LIKE Al%
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+04)------CooperativeExec
+05)--------IcebergTableScan projection:[id,name] predicate:[name STARTS WITH 
"Al"]
+
+# Test LIKE filtering with case-sensitive match
+query IT rowsort
+SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%'
+----
+1 Alice
+6 Albert
+
+# Test NOT LIKE prefix pattern pushdown
+query TT
+EXPLAIN SELECT * FROM default.default.test_unpartitioned_table WHERE name NOT 
LIKE 'Al%'
+----
+logical_plan
+01)Filter: default.default.test_unpartitioned_table.name NOT LIKE Utf8("Al%")
+02)--TableScan: default.default.test_unpartitioned_table projection=[id, 
name], partial_filters=[default.default.test_unpartitioned_table.name NOT LIKE 
Utf8("Al%")]
+physical_plan
+01)CoalesceBatchesExec: target_batch_size=8192
+02)--FilterExec: name@1 NOT LIKE Al%
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+04)------CooperativeExec
+05)--------IcebergTableScan projection:[id,name] predicate:[name NOT STARTS 
WITH "Al"]
+
+# Test NOT LIKE filtering
+query IT rowsort
+SELECT * FROM default.default.test_unpartitioned_table WHERE name NOT LIKE 
'Al%'
+----
+2 Bob
+3 Charlie
+5 alice
+7 Bobby
+8 Carol
+
+# Test case sensitivity - lowercase 'al%' should not match 'Alice' or 'Albert'
+query IT rowsort
+SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'al%'
+----
+5 alice
+
+# Test LIKE with empty prefix (matches everything except NULL)
+query IT rowsort
+SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE '%'
+----
+1 Alice
+2 Bob
+3 Charlie
+5 alice
+6 Albert
+7 Bobby
+8 Carol
+
+# Test LIKE with single character prefix
+query IT rowsort
+SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'B%'
+----
+2 Bob
+7 Bobby
+
+# Test LIKE combined with other predicates
+query IT rowsort
+SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%' 
AND id > 1
+----
+6 Albert
+
+# Test LIKE in OR expression
+query IT rowsort
+SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%' 
OR name LIKE 'Bo%'
+----
+1 Alice
+2 Bob
+6 Albert
+7 Bobby

Reply via email to