This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new b89842374a [core] Add support for IN predicate pushdown in ORC format (#6047)
b89842374a is described below
commit b89842374a7056870bd0f991cf696c7c6b4f4d1a
Author: xiaochen <[email protected]>
AuthorDate: Mon Aug 11 17:41:36 2025 +0800
[core] Add support for IN predicate pushdown in ORC format (#6047)
---
.../paimon/format/orc/filter/OrcFilters.java | 29 +++++++++
.../orc/filter/OrcPredicateFunctionVisitor.java | 20 +++++-
.../format/orc/filter/OrcFilterConverterTest.java | 73 +++++++++++++++++++++-
3 files changed, 118 insertions(+), 4 deletions(-)
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/filter/OrcFilters.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/filter/OrcFilters.java
index 4227137bab..81036452e1 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/filter/OrcFilters.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/filter/OrcFilters.java
@@ -304,4 +304,33 @@ public class OrcFilters {
return "AND(" + Arrays.toString(preds) + ")";
}
}
+
+ /** An IN predicate that can be evaluated by the OrcInputFormat; {@code add} registers the (cast) literals through {@code SearchArgument.Builder#in}. */
+ public static class In extends ColumnPredicate {
+ private final Object[] literals;
+
+ /** Creates an IN predicate on {@code columnName} with candidate values {@code literals} of ORC type {@code literalType}; the array is stored as passed, not copied. */
+ public In(String columnName, PredicateLeaf.Type literalType, Object...
literals) {
+ super(columnName, literalType);
+ this.literals = literals;
+ }
+
+ @Override
+ public SearchArgument.Builder add(SearchArgument.Builder builder) {
+ Object[] castedLiterals = new Object[literals.length];
+ for (int i = 0; i < literals.length; i++) {
+ if (literals[i] instanceof Serializable) {
+ castedLiterals[i] = castLiteral((Serializable)
literals[i]);
+ } else { // non-Serializable values (null included) are handed to the builder unchanged
+ castedLiterals[i] = literals[i];
+ }
+ }
+ return builder.in(columnName, literalType, castedLiterals);
+ }
+
+ @Override
+ public String toString() {
+ return columnName + " IN " + Arrays.toString(literals);
+ }
+ }
}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/filter/OrcPredicateFunctionVisitor.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/filter/OrcPredicateFunctionVisitor.java
index 935b482109..23612edd71 100644
--- a/paimon-format/src/main/java/org/apache/paimon/format/orc/filter/OrcPredicateFunctionVisitor.java
+++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/filter/OrcPredicateFunctionVisitor.java
@@ -122,12 +122,28 @@ public class OrcPredicateFunctionVisitor
@Override
public Optional<OrcFilters.Predicate> visitIn(FieldRef fieldRef,
List<Object> literals) {
- return Optional.empty();
+ PredicateLeaf.Type colType = toOrcType(fieldRef.type());
+ if (colType == null) { // toOrcType gave no ORC predicate type for this field, so nothing is pushed down
+ return Optional.empty();
+ }
+
+ Object[] orcLiterals = new Object[literals.size()];
+ for (int i = 0; i < literals.size(); i++) {
+ Object orcLiteral = toOrcObject(colType, literals.get(i));
+ if (orcLiteral == null && literals.get(i) != null) {
+ // A non-null literal that converts to null is treated as a failed conversion: give up on the whole IN predicate
+ return Optional.empty();
+ }
+ orcLiterals[i] = orcLiteral; // a null result is kept only when the input literal was null as well
+ }
+
+ return Optional.of(new OrcFilters.In(fieldRef.name(), colType,
orcLiterals));
}
@Override
public Optional<OrcFilters.Predicate> visitNotIn(FieldRef fieldRef,
List<Object> literals) {
- return Optional.empty();
+ Optional<OrcFilters.Predicate> inPredicate = visitIn(fieldRef,
literals);
+ return inPredicate.map(OrcFilters.Not::new); // NOT IN = Not(IN); stays empty whenever visitIn returned empty
}
@Override
diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/filter/OrcFilterConverterTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/filter/OrcFilterConverterTest.java
index c9a972f150..78bd01be96 100644
--- a/paimon-format/src/test/java/org/apache/paimon/format/orc/filter/OrcFilterConverterTest.java
+++ b/paimon-format/src/test/java/org/apache/paimon/format/orc/filter/OrcFilterConverterTest.java
@@ -53,6 +53,7 @@ import java.math.BigDecimal;
import java.time.LocalDateTime;
import java.util.Arrays;
import java.util.Collections;
+import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.LongStream;
@@ -109,7 +110,7 @@ public class OrcFilterConverterTest {
.collect(Collectors.toList()))
.visit(OrcPredicateFunctionVisitor.VISITOR)
.isPresent())
- .isFalse();
+ .isTrue();
assertThat(
builder.notIn(
@@ -119,7 +120,7 @@ public class OrcFilterConverterTest {
.collect(Collectors.toList()))
.visit(OrcPredicateFunctionVisitor.VISITOR)
.isPresent())
- .isFalse();
+ .isTrue();
}
@ParameterizedTest
@@ -165,6 +166,74 @@ public class OrcFilterConverterTest {
builder.greaterOrEqual(0, tuple4.value),
new OrcFilters.Not(new OrcFilters.LessThan("fieldName",
tuple4.type, tuple4.value)),
tuple4.canPushDown);
+
+ test(
+ builder.in(0, Collections.singletonList(tuple4.value)),
+ new OrcFilters.Equals("fieldName", tuple4.type, tuple4.value),
+ tuple4.canPushDown);
+
+ test(
+ builder.notIn(0, Collections.singletonList(tuple4.value)),
+ new OrcFilters.Not(new OrcFilters.Equals("fieldName",
tuple4.type, tuple4.value)),
+ tuple4.canPushDown);
+ }
+
+ @Test
+ public void testInPredicateWithMultipleValues() { // IN / NOT IN with three literals on a single BIGINT field "testField"
+ PredicateBuilder builder =
+ new PredicateBuilder(
+ new RowType(
+ Collections.singletonList(
+ new DataField(0, "testField", new
BigIntType()))));
+
+ // Test IN with multiple values (≤20 values should be converted to OR
of EQUALS)
+ test(
+ builder.in(0, Arrays.asList(1L, 2L, 3L)),
+ new OrcFilters.Or(
+ new OrcFilters.Or(
+ new OrcFilters.Equals("testField",
PredicateLeaf.Type.LONG, 1L),
+ new OrcFilters.Equals("testField",
PredicateLeaf.Type.LONG, 2L)),
+ new OrcFilters.Equals("testField",
PredicateLeaf.Type.LONG, 3L)),
+ true); // canPushDown: the expected filter is a left-nested Or of Equals
+
+ // Test NOT IN with multiple values (should be converted to AND of NOT
EQUALS)
+ test(
+ builder.notIn(0, Arrays.asList(1L, 2L, 3L)),
+ new OrcFilters.And(
+ new OrcFilters.And(
+ new OrcFilters.Not(
+ new OrcFilters.Equals(
+ "testField",
PredicateLeaf.Type.LONG, 1L)),
+ new OrcFilters.Not(
+ new OrcFilters.Equals(
+ "testField",
PredicateLeaf.Type.LONG, 2L))),
+ new OrcFilters.Not(
+ new OrcFilters.Equals("testField",
PredicateLeaf.Type.LONG, 3L))),
+ true); // canPushDown: the expected filter is a left-nested And of Not(Equals)
+ }
+
+ @Test
+ public void testInPredicateWithManyValues() {
+ PredicateBuilder builder =
+ new PredicateBuilder(
+ new RowType(
+ Collections.singletonList(
+ new DataField(0, "testField", new
BigIntType()))));
+
+ // Test IN with >20 values (should use real IN operation); manyValues holds the 21 longs 1..21 (range end is exclusive)
+ List<Object> manyValues = LongStream.range(1L,
22L).boxed().collect(Collectors.toList());
+ test(
+ builder.in(0, manyValues),
+ new OrcFilters.In("testField", PredicateLeaf.Type.LONG,
manyValues.toArray()),
+ true);
+
+ // Test NOT IN with >20 values (should use real NOT IN operation), expected as OrcFilters.Not wrapping OrcFilters.In
+ test(
+ builder.notIn(0, manyValues),
+ new OrcFilters.Not(
+ new OrcFilters.In(
+ "testField", PredicateLeaf.Type.LONG,
manyValues.toArray())),
+ true);
}
private void test(Predicate predicate, OrcFilters.Predicate orcPredicate,
boolean canPushDown) {