[ https://issues.apache.org/jira/browse/PARQUET-2237?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17687586#comment-17687586 ]
ASF GitHub Bot commented on PARQUET-2237:
-----------------------------------------
yabola commented on code in PR #1023:
URL: https://github.com/apache/parquet-mr/pull/1023#discussion_r1103820351
##########
parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestRowGroupFilterExactly.java:
##########
@@ -0,0 +1,303 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.hadoop;
+
+import static org.apache.parquet.filter2.compat.PredicateEvaluation.BLOCK_CANNOT_MATCH;
+import static org.apache.parquet.filter2.compat.PredicateEvaluation.BLOCK_MUST_MATCH;
+import static org.apache.parquet.filter2.predicate.FilterApi.and;
+import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
+import static org.apache.parquet.filter2.predicate.FilterApi.doubleColumn;
+import static org.apache.parquet.filter2.predicate.FilterApi.eq;
+import static org.apache.parquet.filter2.predicate.FilterApi.gt;
+import static org.apache.parquet.filter2.predicate.FilterApi.gtEq;
+import static org.apache.parquet.filter2.predicate.FilterApi.in;
+import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;
+import static org.apache.parquet.filter2.predicate.FilterApi.lt;
+import static org.apache.parquet.filter2.predicate.FilterApi.ltEq;
+import static org.apache.parquet.filter2.predicate.FilterApi.notEq;
+import static org.apache.parquet.filter2.predicate.FilterApi.notIn;
+import static org.apache.parquet.filter2.predicate.FilterApi.or;
+import static org.apache.parquet.hadoop.ParquetFileWriter.Mode.OVERWRITE;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.junit.After;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.parquet.ParquetReadOptions;
+import org.apache.parquet.column.ParquetProperties;
+import org.apache.parquet.filter2.compat.FilterCompat;
+import org.apache.parquet.filter2.compat.PredicateEvaluation;
+import org.apache.parquet.filter2.predicate.FilterPredicate;
+import org.apache.parquet.filter2.recordlevel.PhoneBookWriter;
+import org.apache.parquet.hadoop.example.ExampleParquetWriter;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.io.api.Binary;
+
+import com.google.common.collect.Sets;
+
+@RunWith(Parameterized.class)
+public class TestRowGroupFilterExactly {
+ private final Path FILE = createTempFile();
+ private ParquetProperties.WriterVersion WRITER_VERSION;
+ private final Random RANDOM = new Random(42);
+ private final List<PhoneBookWriter.User> DATA = Collections.unmodifiableList(generateData(10000));
+ private final long MAX_ID = DATA.size() - 1;
+ private final long MIN_ID = 0;
+ private final TestPredicateEvaluation testEvaluation = new TestPredicateEvaluation();
+
+ @Parameterized.Parameters(name = "Run parquet version {index} ")
+ public static Collection<Object[]> params() {
+ return Arrays.asList(
+ new Object[]{ParquetProperties.WriterVersion.PARQUET_1_0},
+ new Object[]{ParquetProperties.WriterVersion.PARQUET_2_0});
+ }
+
+ public TestRowGroupFilterExactly(ParquetProperties.WriterVersion WRITER_VERSION) throws IOException {
+ this.WRITER_VERSION = WRITER_VERSION;
+ deleteFile(FILE);
+ writePhoneBookToFile(FILE, this.WRITER_VERSION);
+ }
+
+ @After
+ public void deleteFiles() throws IOException {
+ deleteFile(FILE);
+ testEvaluation.setTestExactPredicate(new ArrayList<>(Arrays.asList(BLOCK_MUST_MATCH, BLOCK_CANNOT_MATCH)));
+ }
+
+ @Test
+ public void testFiltering() throws IOException {
+
+ Set<Binary> existValues = new HashSet<>();
+ existValues.add(Binary.fromString("miller"));
+ existValues.add(Binary.fromString("anderson"));
+
+ assertCorrectFiltering(eq(binaryColumn("name"), null));
+ assertCorrectFiltering(eq(binaryColumn("name"), Binary.fromString("miller")));
+ assertCorrectFiltering(eq(longColumn("id"), 1234L));
+ assertCorrectFiltering(eq(binaryColumn("name"), Binary.fromString("noneExistName")));
+ assertCorrectFiltering(eq(doubleColumn("location.lat"), 99.9));
+
+ assertCorrectFiltering(notEq(binaryColumn("name"), null));
+ assertCorrectFiltering(notEq(binaryColumn("name"), Binary.fromString("miller")));
+ assertCorrectFiltering(notEq(binaryColumn("name"), Binary.fromString("noneExistName")));
+
+ assertCorrectFiltering(in(binaryColumn("name"), existValues));
+ assertCorrectFiltering(in(binaryColumn("name"), Sets.newHashSet(Binary.fromString("miller"),
+ Binary.fromString("noneExistName"), null)));
+
+ assertCorrectFiltering(notIn(binaryColumn("name"),
+ Sets.newHashSet(Binary.fromString("miller"), Binary.fromString("anderson"))));
+ assertCorrectFiltering(notIn(binaryColumn("name"),
+ Sets.newHashSet(Binary.fromString("miller"), Binary.fromString("noneExistName"), null)));
+
+ assertCorrectFiltering(lt(longColumn("id"), MAX_ID + 1L));
+ assertCorrectFiltering(lt(longColumn("id"), MAX_ID));
+ assertCorrectFiltering(lt(longColumn("id"), 1234L));
+ assertCorrectFiltering(lt(longColumn("id"), MIN_ID));
+ assertCorrectFiltering(lt(longColumn("id"), MIN_ID - 1L));
+ // cases where the dictionary alone can exactly decide "less than" around `miller`
+ assertCorrectFiltering(lt(binaryColumn("name"), Binary.fromString("ailler")));
+ assertCorrectFiltering(lt(binaryColumn("name"), Binary.fromString("miller")));
+
+ assertCorrectFiltering(ltEq(longColumn("id"), MAX_ID + 1L));
+ assertCorrectFiltering(ltEq(longColumn("id"), MAX_ID));
+ assertCorrectFiltering(ltEq(longColumn("id"), 1234L));
+ assertCorrectFiltering(ltEq(longColumn("id"), MIN_ID));
+ assertCorrectFiltering(ltEq(longColumn("id"), MIN_ID - 1L));
+
+ assertCorrectFiltering(gt(longColumn("id"), MAX_ID + 1L));
+ assertCorrectFiltering(gt(longColumn("id"), MAX_ID));
+ assertCorrectFiltering(gt(longColumn("id"), 1234L));
+ assertCorrectFiltering(gt(longColumn("id"), MIN_ID));
+ assertCorrectFiltering(gt(longColumn("id"), MIN_ID - 1L));
+
+ assertCorrectFiltering(gtEq(longColumn("id"), MAX_ID + 1L));
+ assertCorrectFiltering(gtEq(longColumn("id"), MAX_ID));
+ assertCorrectFiltering(gtEq(longColumn("id"), 1234L));
+ assertCorrectFiltering(gtEq(longColumn("id"), MIN_ID));
+ assertCorrectFiltering(gtEq(longColumn("id"), MIN_ID - 1L));
+
+ assertCorrectFiltering(and(eq(binaryColumn("name"), Binary.fromString("noneExistName")),
+ lt(longColumn("id"), -99L)));
+ assertCorrectFiltering(and(eq(binaryColumn("name"), Binary.fromString("miller")),
+ lt(longColumn("id"), 1234L)));
+ assertCorrectFiltering(and(eq(binaryColumn("name"), Binary.fromString("noneExistName")),
+ lt(longColumn("id"), 1234L)));
+
+ assertCorrectFiltering(or(eq(binaryColumn("name"), Binary.fromString("noneExistName")),
+ lt(longColumn("id"), -99L)));
+ assertCorrectFiltering(or(eq(binaryColumn("name"), Binary.fromString("miller")),
+ lt(longColumn("id"), 1234L)));
+ assertCorrectFiltering(or(eq(binaryColumn("name"), Binary.fromString("noneExistName")),
+ lt(longColumn("id"), 1234L)));
+ }
+
+ private void assertCorrectFiltering(FilterPredicate filter) throws IOException {
+ ParquetReadOptions readOptions = ParquetReadOptions.builder()
+ .withRecordFilter(FilterCompat.get(filter)).build();
+
+ // simulate the previous behavior, only skip other filters when predicate is BLOCK_CANNOT_MATCH
+ testEvaluation.setTestExactPredicate(Collections.singletonList(BLOCK_CANNOT_MATCH));
Review Comment:
simulate the previous behavior, only skip other filters when predicate is BLOCK_CANNOT_MATCH
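
For context, a minimal runnable sketch of the short-circuiting this test simulates, assuming a three-level chain (statistics, then dictionary, then bloom filter). The enum, class, and helper names below are illustrative assumptions, not the actual parquet-mr API from this PR:

import java.util.List;
import java.util.function.Function;

public class RowGroupPruningSketch {

  // Possible outcomes of evaluating one filter level against a row group.
  enum Evaluation { BLOCK_CANNOT_MATCH, BLOCK_MUST_MATCH, BLOCK_MIGHT_MATCH }

  // Run the filter levels (statistics, then dictionary, then bloom filter)
  // in order, stopping as soon as one level yields an exact answer.
  static <B> boolean keepRowGroup(B block, List<Function<B, Evaluation>> levels) {
    for (Function<B, Evaluation> level : levels) {
      Evaluation result = level.apply(block);
      if (result == Evaluation.BLOCK_CANNOT_MATCH) {
        return false; // previous behavior: only a definite non-match short-circuited
      }
      if (result == Evaluation.BLOCK_MUST_MATCH) {
        return true;  // PR behavior: a definite match also skips the costlier levels
      }
      // BLOCK_MIGHT_MATCH: inconclusive, fall through to the next level
    }
    return true; // no level could rule the row group out
  }

  public static void main(String[] args) {
    // The statistics level answers exactly, so the dictionary level never runs.
    List<Function<String, Evaluation>> levels = List.of(
        block -> Evaluation.BLOCK_MUST_MATCH,
        block -> { throw new AssertionError("dictionary level should be skipped"); });
    System.out.println("kept = " + keepRowGroup("row-group-0", levels)); // kept = true
  }
}

Restricting the exact sentinels to BLOCK_CANNOT_MATCH, as the test setup above does, reproduces the previous behavior in which only a definite non-match could short-circuit the chain.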
> Improve performance when filters in RowGroupFilter can match exactly
> --------------------------------------------------------------------
>
> Key: PARQUET-2237
> URL: https://issues.apache.org/jira/browse/PARQUET-2237
> Project: Parquet
> Issue Type: Improvement
> Reporter: Mars
> Priority: Major
>
> If the min/max statistics can decide a predicate exactly, we no longer need
> to load the dictionary from the filesystem and compare values one by one.
> Similarly, the BloomFilter has to be loaded from the filesystem, which may
> cost time and memory. If we can exactly determine the existence or
> nonexistence of a value from the min/max or dictionary filters, we can skip
> the BloomFilter entirely and improve performance.
> For example:
> # When reading data greater than {{x1}} from a block, if the min/max
> statistics show that all values are greater than {{x1}}, we don't need to
> read the dictionary and compare one by one (sketched below).
> # If we already have the page dictionaries and have compared value by value,
> we don't need to read the BloomFilter and compare again.
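
A minimal sketch of example 1 above, assuming a block with long min/max statistics. The class name, the evaluateGt helper, and its Optional-based contract are illustrative assumptions, not the PR's actual implementation:

import java.util.Optional;

public class MinMaxExactGt {

  // Exact evaluation of `value > x1` from min/max statistics alone, when possible.
  static Optional<Boolean> evaluateGt(long min, long max, long x1) {
    if (min > x1) {
      return Optional.of(true);   // every value in the block matches
    }
    if (max <= x1) {
      return Optional.of(false);  // no value in the block can match
    }
    return Optional.empty();      // inconclusive: fall back to dictionary/bloom filter
  }

  public static void main(String[] args) {
    System.out.println(evaluateGt(100, 200, 50));  // Optional[true]  -> skip dictionary read
    System.out.println(evaluateGt(100, 200, 300)); // Optional[false] -> drop the row group
    System.out.println(evaluateGt(100, 200, 150)); // Optional.empty  -> check dictionary next
  }
}

Example 2 in the description follows the same shape: once the dictionary comparison has returned an exact answer, the BloomFilter level is simply never invoked.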