Github user vdiravka commented on a diff in the pull request:
https://github.com/apache/drill/pull/1214#discussion_r183644277
--- Diff:
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScanStatistics.java
---
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.drill.exec.store.parquet;
+
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.exec.physical.base.GroupScan;
+import org.apache.parquet.schema.OriginalType;
+import org.apache.parquet.schema.PrimitiveType;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static
org.apache.drill.exec.store.parquet.metadata.MetadataBase.ColumnMetadata;
+import static
org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetTableMetadataBase;
+import static
org.apache.drill.exec.store.parquet.metadata.Metadata_V3.ColumnTypeMetadata_v3;
+import static
org.apache.drill.exec.store.parquet.metadata.Metadata_V3.ParquetTableMetadata_v3;
+
+/**
+ * Holds common statistics about data in parquet group scan,
+ * including information about total row count, columns counts, partition columns.
+ */
+public class ParquetGroupScanStatistics {
+
+ // map from file names to maps of column name to partition value mappings
+ private Map<String, Map<SchemaPath, Object>> partitionValueMap;
+ // only for partition columns : value is unique for each partition
+ private Map<SchemaPath, TypeProtos.MajorType> partitionColTypeMap;
+ // total number of non-null value for each column in parquet files
+ private Map<SchemaPath, Long> columnValueCounts;
+ // total number of rows (obtained from parquet footer)
+ private long rowCount;
+
+
+ public ParquetGroupScanStatistics(List<RowGroupInfo> rowGroupInfos,
ParquetTableMetadataBase parquetTableMetadata) {
+ collect(rowGroupInfos, parquetTableMetadata);
+ }
+
+ public ParquetGroupScanStatistics(ParquetGroupScanStatistics that) {
+ this.partitionValueMap = new HashMap<>(that.partitionValueMap);
+ this.partitionColTypeMap = new HashMap<>(that.partitionColTypeMap);
+ this.columnValueCounts = new HashMap<>(that.columnValueCounts);
+ this.rowCount = that.rowCount;
+ }
+
+ public long getColumnValueCount(SchemaPath column) {
+ return columnValueCounts.containsKey(column) ?
columnValueCounts.get(column) : 0;
+ }
+
+ public List<SchemaPath> getPartitionColumns() {
+ return new ArrayList<>(partitionColTypeMap.keySet());
+ }
+
+ public TypeProtos.MajorType getTypeForColumn(SchemaPath schemaPath) {
+ return partitionColTypeMap.get(schemaPath);
+ }
+
+ public long getRowCount() {
+ return rowCount;
+ }
+
+ public Object getPartitionValue(String path, SchemaPath column) {
+ return partitionValueMap.get(path).get(column);
+ }
+
+ public void collect(List<RowGroupInfo> rowGroupInfos,
ParquetTableMetadataBase parquetTableMetadata) {
+ resetHolders();
+ boolean first = true;
+ for (RowGroupInfo rowGroup : rowGroupInfos) {
+ long rowCount = rowGroup.getRowCount();
+ for (ColumnMetadata column : rowGroup.getColumns()) {
+ SchemaPath schemaPath =
SchemaPath.getCompoundPath(column.getName());
+ Long previousCount = columnValueCounts.get(schemaPath);
+ if (previousCount != null) {
+ if (previousCount != GroupScan.NO_COLUMN_STATS) {
+ if (column.getNulls() != null) {
--- End diff --
Combine this `if` statement with the one above.
---