This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 2438f21 ORC: Simplify logic to determine which columns have stats
(#1167)
2438f21 is described below
commit 2438f21a37ca269eb80f6149893ea2975a40d2b7
Author: Ryan Blue <[email protected]>
AuthorDate: Sun Jul 5 13:29:48 2020 -0700
ORC: Simplify logic to determine which columns have stats (#1167)
---
.../java/org/apache/iceberg/orc/OrcMetrics.java | 72 ++++------------------
1 file changed, 11 insertions(+), 61 deletions(-)
diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java
b/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java
index 86b7697..78138a4 100644
--- a/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java
+++ b/orc/src/main/java/org/apache/iceberg/orc/OrcMetrics.java
@@ -24,8 +24,8 @@ import java.nio.ByteBuffer;
import java.sql.Timestamp;
import java.util.List;
import java.util.Map;
+import java.util.Objects;
import java.util.Optional;
-import java.util.Queue;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
@@ -36,11 +36,8 @@ import org.apache.iceberg.common.DynFields;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.hadoop.HadoopInputFile;
import org.apache.iceberg.io.InputFile;
-import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
-import org.apache.iceberg.relocated.com.google.common.collect.Queues;
-import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
@@ -87,7 +84,7 @@ public class OrcMetrics {
private static Metrics buildOrcMetrics(final long numOfRows, final
TypeDescription orcSchema,
final ColumnStatistics[] colStats) {
final Schema schema = ORCSchemaUtil.convert(orcSchema);
- final Set<TypeDescription> columnsInContainers =
findColumnsInContainers(schema, orcSchema);
+ final Set<Integer> statsColumns = statsColumns(orcSchema);
Map<Integer, Long> columnSizes =
Maps.newHashMapWithExpectedSize(colStats.length);
Map<Integer, Long> valueCounts =
Maps.newHashMapWithExpectedSize(colStats.length);
Map<Integer, Long> nullCounts =
Maps.newHashMapWithExpectedSize(colStats.length);
@@ -106,7 +103,7 @@ public class OrcMetrics {
columnSizes.put(fieldId, colStat.getBytesOnDisk());
- if (!columnsInContainers.contains(orcCol)) {
+ if (statsColumns.contains(fieldId)) {
// Since ORC does not track null values nor repeated ones, the value
count for columns in
// containers (maps, list) may be larger than what it actually is,
however these are not
// used in expressions right now. For such cases, we use the value
number of values
@@ -209,64 +206,17 @@ public class OrcMetrics {
return Optional.ofNullable(Conversions.toByteBuffer(column.type(), max));
}
- private static Set<TypeDescription> findColumnsInContainers(Schema schema,
- TypeDescription
orcSchema) {
- ColumnsInContainersVisitor visitor = new ColumnsInContainersVisitor();
- OrcSchemaWithTypeVisitor.visit(schema, orcSchema, visitor);
- return visitor.getColumnsInContainers();
+ private static Set<Integer> statsColumns(TypeDescription schema) {
+ return OrcSchemaVisitor.visit(schema, new StatsColumnsVisitor());
}
- private static class ColumnsInContainersVisitor extends
OrcSchemaWithTypeVisitor<TypeDescription> {
-
- private final Set<TypeDescription> columnsInContainers;
-
- private ColumnsInContainersVisitor() {
- columnsInContainers = Sets.newHashSet();
- }
-
- public Set<TypeDescription> getColumnsInContainers() {
- return columnsInContainers;
- }
-
- private Set<TypeDescription> flatten(TypeDescription rootType) {
- if (rootType == null) {
- return ImmutableSet.of();
- }
-
- final Set<TypeDescription> flatTypes =
Sets.newHashSetWithExpectedSize(rootType.getMaximumId());
- final Queue<TypeDescription> queue = Queues.newLinkedBlockingQueue();
- queue.add(rootType);
- while (!queue.isEmpty()) {
- TypeDescription type = queue.remove();
- flatTypes.add(type);
-
queue.addAll(Optional.ofNullable(type.getChildren()).orElse(ImmutableList.of()));
- }
- return flatTypes;
- }
-
- @Override
- public TypeDescription record(Types.StructType iStruct, TypeDescription
record,
- List<String> names, List<TypeDescription>
fields) {
- return record;
- }
-
- @Override
- public TypeDescription list(Types.ListType iList, TypeDescription array,
TypeDescription element) {
- columnsInContainers.addAll(flatten(element));
- return array;
- }
-
- @Override
- public TypeDescription map(Types.MapType iMap, TypeDescription map,
- TypeDescription key, TypeDescription value) {
- columnsInContainers.addAll(flatten(key));
- columnsInContainers.addAll(flatten(value));
- return map;
- }
-
+ private static class StatsColumnsVisitor extends
OrcSchemaVisitor<Set<Integer>> {
@Override
- public TypeDescription primitive(Type.PrimitiveType iPrimitive,
TypeDescription primitive) {
- return primitive;
+ public Set<Integer> record(TypeDescription record, List<String> names,
List<Set<Integer>> fields) {
+ ImmutableSet.Builder<Integer> result = ImmutableSet.builder();
+ fields.stream().filter(Objects::nonNull).forEach(result::addAll);
+
record.getChildren().stream().map(ORCSchemaUtil::fieldId).forEach(result::add);
+ return result.build();
}
}