ihuzenko commented on a change in pull request #1810: DRILL-7271: Refactor
Metadata interfaces and classes to contain all needed information for the File
based Metastore
URL: https://github.com/apache/drill/pull/1810#discussion_r296229552
##########
File path:
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetTableMetadataUtils.java
##########
@@ -148,112 +142,71 @@ private ParquetTableMetadataUtils() {
public static RowGroupMetadata
getRowGroupMetadata(MetadataBase.ParquetTableMetadataBase tableMetadata,
MetadataBase.RowGroupMetadata rowGroupMetadata, int rgIndexInFile, Path
location) {
Map<SchemaPath, ColumnStatistics> columnsStatistics =
getRowGroupColumnStatistics(tableMetadata, rowGroupMetadata);
- Map<StatisticsKind, Object> rowGroupStatistics = new HashMap<>();
- rowGroupStatistics.put(TableStatisticsKind.ROW_COUNT,
rowGroupMetadata.getRowCount());
- rowGroupStatistics.put(() -> ExactStatisticsConstants.START,
rowGroupMetadata.getStart());
- rowGroupStatistics.put(() -> ExactStatisticsConstants.LENGTH,
rowGroupMetadata.getLength());
+ List<StatisticsHolder> rowGroupStatistics = new ArrayList<>();
+ rowGroupStatistics.add(new
StatisticsHolder<>(rowGroupMetadata.getRowCount(),
TableStatisticsKind.ROW_COUNT));
+ rowGroupStatistics.add(new StatisticsHolder<>(rowGroupMetadata.getStart(),
new BaseStatisticsKind(ExactStatisticsConstants.START, true)));
+ rowGroupStatistics.add(new
StatisticsHolder<>(rowGroupMetadata.getLength(), new
BaseStatisticsKind(ExactStatisticsConstants.LENGTH, true)));
Map<SchemaPath, TypeProtos.MajorType> columns =
getRowGroupFields(tableMetadata, rowGroupMetadata);
TupleSchema schema = new TupleSchema();
columns.forEach((schemaPath, majorType) ->
MetadataUtils.addColumnMetadata(schema, schemaPath, majorType));
- return new RowGroupMetadata(
- schema, columnsStatistics, rowGroupStatistics,
rowGroupMetadata.getHostAffinity(), rgIndexInFile, location);
- }
+ MetadataInfo metadataInfo = new MetadataInfo(MetadataType.ROW_GROUP,
MetadataInfo.GENERAL_INFO_KEY, null);
- /**
- * Merges list of specified metadata into the map of {@link
ColumnStatistics} with columns as keys.
- *
- * @param <T> type of metadata to collect
- * @param metadataList list of metadata to be merged
- * @param columns set of columns whose statistics should be
merged
- * @param statisticsToCollect kinds of statistics that should be collected
- * @param parquetTableMetadata ParquetTableMetadata object to fetch the
non-interesting columns
- * @return list of merged metadata
- */
- @SuppressWarnings("unchecked")
- public static <T extends BaseMetadata> Map<SchemaPath, ColumnStatistics>
mergeColumnsStatistics(
- Collection<T> metadataList, Set<SchemaPath> columns,
List<CollectableColumnStatisticsKind> statisticsToCollect,
MetadataBase.ParquetTableMetadataBase parquetTableMetadata) {
- Map<SchemaPath, ColumnStatistics> columnsStatistics = new HashMap<>();
-
- for (SchemaPath column : columns) {
- List<ColumnStatistics> statisticsList = new ArrayList<>();
- for (T metadata : metadataList) {
- ColumnStatistics statistics =
metadata.getColumnsStatistics().get(column);
- if (statistics == null) {
- // schema change happened, set statistics which represents all nulls
- statistics = new ColumnStatisticsImpl(
- ImmutableMap.of(ColumnStatisticsKind.NULLS_COUNT,
metadata.getStatistic(TableStatisticsKind.ROW_COUNT)),
- getNaturalNullsFirstComparator());
- }
- statisticsList.add(statistics);
- }
- Map<StatisticsKind, Object> statisticsMap = new HashMap<>();
- for (CollectableColumnStatisticsKind statisticsKind :
statisticsToCollect) {
- Object mergedStatistic =
statisticsKind.mergeStatistics(statisticsList);
- statisticsMap.put(statisticsKind, mergedStatistic);
- }
- columnsStatistics.put(column, new ColumnStatisticsImpl(statisticsMap,
statisticsList.iterator().next().getValueComparator()));
- }
- return columnsStatistics;
+ return new RowGroupMetadata(TableInfo.UNKNOWN_TABLE_INFO, metadataInfo,
+ schema, columnsStatistics, rowGroupStatistics,
rowGroupMetadata.getHostAffinity(), rgIndexInFile, location);
}
/**
* Returns {@link FileMetadata} instance received by merging specified
{@link RowGroupMetadata} list.
*
* @param rowGroups list of {@link RowGroupMetadata} to be merged
- * @param tableName name of the table
- * @param parquetTableMetadata the source of column metadata for
non-interesting column's statistics
* @return {@link FileMetadata} instance
*/
- public static FileMetadata getFileMetadata(List<RowGroupMetadata> rowGroups,
String tableName,
- MetadataBase.ParquetTableMetadataBase parquetTableMetadata) {
+ public static FileMetadata getFileMetadata(List<RowGroupMetadata> rowGroups)
{
if (rowGroups.isEmpty()) {
return null;
}
- Map<StatisticsKind, Object> fileStatistics = new HashMap<>();
- fileStatistics.put(TableStatisticsKind.ROW_COUNT,
TableStatisticsKind.ROW_COUNT.mergeStatistics(rowGroups));
+ List<StatisticsHolder> fileStatistics = new ArrayList<>();
+ fileStatistics.add(new
StatisticsHolder<>(TableStatisticsKind.ROW_COUNT.mergeStatistics(rowGroups),
TableStatisticsKind.ROW_COUNT));
+
+ RowGroupMetadata rowGroupMetadata = rowGroups.iterator().next();
+ TupleMetadata schema = rowGroupMetadata.getSchema();
+
+ Set<SchemaPath> columns = rowGroupMetadata.getColumnsStatistics().keySet();
- TupleMetadata schema = rowGroups.iterator().next().getSchema();
+ MetadataInfo metadataInfo = new MetadataInfo(MetadataType.FILE,
MetadataInfo.GENERAL_INFO_KEY, null);
- return new FileMetadata(rowGroups.iterator().next().getLocation(), schema,
- mergeColumnsStatistics(rowGroups,
rowGroups.iterator().next().getColumnsStatistics().keySet(),
PARQUET_STATISTICS, parquetTableMetadata),
- fileStatistics, tableName, -1);
+ return new FileMetadata(rowGroupMetadata.getTableInfo(), metadataInfo,
rowGroupMetadata.getPath(), schema,
+ TableMetadataUtils.mergeColumnsStatistics(rowGroups, columns,
PARQUET_COLUMN_STATISTICS),
+ fileStatistics, BaseTableMetadata.NON_DEFINED_LAST_MODIFIED_TIME);
}
/**
* Returns {@link PartitionMetadata} instance received by merging specified
{@link FileMetadata} list.
*
* @param partitionColumn partition column
* @param files list of files to be merged
- * @param tableName name of the table
* @return {@link PartitionMetadata} instance
*/
- public static PartitionMetadata getPartitionMetadata(SchemaPath
partitionColumn, List<FileMetadata> files, String tableName) {
+ public static PartitionMetadata getPartitionMetadata(SchemaPath
partitionColumn, List<FileMetadata> files) {
Set<Path> locations = new HashSet<>();
Set<SchemaPath> columns = new HashSet<>();
for (FileMetadata file : files) {
columns.addAll(file.getColumnsStatistics().keySet());
- locations.add(file.getLocation());
+ locations.add(file.getPath());
}
- Map<StatisticsKind, Object> partStatistics = new HashMap<>();
- partStatistics.put(TableStatisticsKind.ROW_COUNT,
TableStatisticsKind.ROW_COUNT.mergeStatistics(files));
+ FileMetadata fileMetadata = files.iterator().next();
- return new PartitionMetadata(partitionColumn,
files.iterator().next().getSchema(),
- mergeColumnsStatistics(files, columns, PARQUET_STATISTICS, null),
partStatistics, locations, tableName, -1);
- }
+ MetadataInfo metadataInfo = new MetadataInfo(MetadataType.PARTITION,
MetadataInfo.GENERAL_INFO_KEY, null);
- /**
- * Returns "natural order" comparator which threads nulls as min values.
- *
- * @param <T> type to compare
- * @return "natural order" comparator
- */
- public static <T extends Comparable<T>> Comparator<T>
getNaturalNullsFirstComparator() {
- return Comparator.nullsFirst(Comparator.naturalOrder());
+ return new PartitionMetadata(fileMetadata.getTableInfo(), metadataInfo,
partitionColumn, fileMetadata.getSchema(),
Review comment:
Maybe use a builder here?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services