This is an automated email from the ASF dual-hosted git repository.

szita pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 5ad2c80 HIVE-25689: Remove deprecated DataUtil from iceberg-handler (Adam Szita, reviewed by Laszlo Pinter and Stamatis Zampetakis) 5ad2c80 is described below commit 5ad2c8042ddc9fc1719c641e55a9f3d2faa9a6c4 Author: Adam Szita <40628386+sz...@users.noreply.github.com> AuthorDate: Fri Nov 19 09:25:44 2021 +0100 HIVE-25689: Remove deprecated DataUtil from iceberg-handler (Adam Szita, reviewed by Laszlo Pinter and Stamatis Zampetakis) --- .../java/org/apache/iceberg/mr/hive/DataUtil.java | 211 --------------------- .../org/apache/iceberg/mr/hive/HiveTableUtil.java | 5 +- .../alter_multi_part_table_to_iceberg.q.out | 2 +- .../positive/alter_part_table_to_iceberg.q.out | 2 +- .../results/positive/alter_table_to_iceberg.q.out | 2 +- .../truncate_partitioned_iceberg_table.q.out | 2 +- 6 files changed, 7 insertions(+), 217 deletions(-) diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/DataUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/DataUtil.java deleted file mode 100644 index 85398d1..0000000 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/DataUtil.java +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.iceberg.mr.hive; - -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; -import org.apache.hive.iceberg.org.apache.parquet.hadoop.ParquetFileReader; -import org.apache.hive.iceberg.org.apache.parquet.hadoop.metadata.ParquetMetadata; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.MetricsConfig; -import org.apache.iceberg.PartitionField; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.hadoop.HadoopInputFile; -import org.apache.iceberg.mapping.NameMapping; -import org.apache.iceberg.orc.OrcMetrics; -import org.apache.iceberg.parquet.ParquetUtil; - -/** - * @deprecated use org.apache.iceberg.data.DataUtil once Iceberg 0.12 is released. - */ -@Deprecated -public class DataUtil { - - private DataUtil() { - } - - private static final PathFilter HIDDEN_PATH_FILTER = - p -> !p.getName().startsWith("_") && !p.getName().startsWith("."); - - /** - * Returns the data files in a partition by listing the partition location. - * - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. - * @deprecated use org.apache.iceberg.data.DataUtil#listPartition() once Iceberg 0.12 is released. 
- * - * @param partitionKeys partition key, e.g., "a=1/b=2" - * @param uri partition location URI - * @param format partition format, avro, parquet or orc - * @param spec a partition spec - * @param conf a Hadoop conf - * @param metricsConfig a metrics conf - * @return a List of DataFile - */ - @Deprecated - public static List<DataFile> listPartition(Map<String, String> partitionKeys, String uri, String format, - PartitionSpec spec, Configuration conf, MetricsConfig metricsConfig) { - return listPartition(partitionKeys, uri, format, spec, conf, metricsConfig, null); - } - - /** - * Returns the data files in a partition by listing the partition location. - * <p> - * For Parquet and ORC partitions, this will read metrics from the file footer. For Avro partitions, - * metrics are set to null. - * <p> - * Note: certain metrics, like NaN counts, that are only supported by iceberg file writers but not file footers, - * will not be populated. - * @deprecated use org.apache.iceberg.data.DataUtil#listPartition() once Iceberg 0.12 is released. 
- * - * @param partitionKeys partition key, e.g., "a=1/b=2" - * @param uri partition location URI - * @param format partition format, avro, parquet or orc - * @param spec a partition spec - * @param conf a Hadoop conf - * @param metricsConfig a metrics conf - * @param mapping a name mapping - * @return a List of DataFile - */ - @Deprecated - public static List<DataFile> listPartition(Map<String, String> partitionKeys, String uri, String format, - PartitionSpec spec, Configuration conf, MetricsConfig metricsConfig, - NameMapping mapping) { - if (format.contains("avro")) { - return listAvroPartition(partitionKeys, uri, spec, conf); - } else if (format.contains("parquet")) { - return listParquetPartition(partitionKeys, uri, spec, conf, metricsConfig, mapping); - } else if (format.contains("orc")) { - return listOrcPartition(partitionKeys, uri, spec, conf, metricsConfig, mapping); - } else { - throw new UnsupportedOperationException("Unknown partition format: " + format); - } - } - - private static List<DataFile> listAvroPartition(Map<String, String> partitionPath, String partitionUri, - PartitionSpec spec, Configuration conf) { - try { - Path partition = new Path(partitionUri); - FileSystem fs = partition.getFileSystem(conf); - return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER)) - .filter(FileStatus::isFile) - .map(stat -> { - // Avro file statistics cannot be calculated without reading the file. - // Setting the rowCount to 0 is just a workaround so that the DataFiles.Builder.build() doesn't fail. 
- Metrics metrics = new Metrics(0L, null, null, null); - String partitionKey = spec.fields().stream() - .map(PartitionField::name) - .map(name -> String.format("%s=%s", name, partitionPath.get(name))) - .collect(Collectors.joining("/")); - - return DataFiles.builder(spec) - .withPath(stat.getPath().toString()) - .withFormat("avro") - .withFileSizeInBytes(stat.getLen()) - .withMetrics(metrics) - .withPartitionPath(partitionKey) - .build(); - - }).collect(Collectors.toList()); - } catch (IOException e) { - throw new RuntimeException("Unable to list files in partition: " + partitionUri, e); - } - } - - private static List<DataFile> listParquetPartition(Map<String, String> partitionPath, String partitionUri, - PartitionSpec spec, Configuration conf, - MetricsConfig metricsSpec, NameMapping mapping) { - try { - Path partition = new Path(partitionUri); - FileSystem fs = partition.getFileSystem(conf); - - return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER)) - .filter(FileStatus::isFile) - .map(stat -> { - Metrics metrics; - try { - ParquetMetadata metadata = ParquetFileReader.readFooter(conf, stat); - metrics = ParquetUtil.footerMetrics(metadata, Stream.empty(), metricsSpec, mapping); - } catch (IOException e) { - throw new RuntimeException("Unable to read the footer of the parquet file: " + - stat.getPath(), e); - } - String partitionKey = spec.fields().stream() - .map(PartitionField::name) - .map(name -> String.format("%s=%s", name, partitionPath.get(name))) - .collect(Collectors.joining("/")); - - return DataFiles.builder(spec) - .withPath(stat.getPath().toString()) - .withFormat("parquet") - .withFileSizeInBytes(stat.getLen()) - .withMetrics(metrics) - .withPartitionPath(partitionKey) - .build(); - }).collect(Collectors.toList()); - } catch (IOException e) { - throw new RuntimeException("Unable to list files in partition: " + partitionUri, e); - } - } - - private static List<DataFile> listOrcPartition(Map<String, String> partitionPath, String 
partitionUri, - PartitionSpec spec, Configuration conf, - MetricsConfig metricsSpec, NameMapping mapping) { - try { - Path partition = new Path(partitionUri); - FileSystem fs = partition.getFileSystem(conf); - - return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER)) - .filter(FileStatus::isFile) - .map(stat -> { - Metrics metrics = OrcMetrics.fromInputFile(HadoopInputFile.fromPath(stat.getPath(), conf), - metricsSpec, mapping); - String partitionKey = spec.fields().stream() - .map(PartitionField::name) - .map(name -> String.format("%s=%s", name, partitionPath.get(name))) - .collect(Collectors.joining("/")); - - return DataFiles.builder(spec) - .withPath(stat.getPath().toString()) - .withFormat("orc") - .withFileSizeInBytes(stat.getLen()) - .withMetrics(metrics) - .withPartitionPath(partitionKey) - .build(); - - }).collect(Collectors.toList()); - } catch (IOException e) { - throw new RuntimeException("Unable to list files in partition: " + partitionUri, e); - } - } - - -} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveTableUtil.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveTableUtil.java index fccea42..a975cd6 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveTableUtil.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveTableUtil.java @@ -45,6 +45,7 @@ import org.apache.iceberg.MetricsConfig; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.TableMigrationUtil; import org.apache.iceberg.mapping.NameMapping; import org.apache.iceberg.mapping.NameMappingParser; import org.apache.iceberg.mr.Catalogs; @@ -128,8 +129,8 @@ public class HiveTableUtil { if (fileName.startsWith(".") || fileName.startsWith("_")) { continue; } - dataFiles.addAll(DataUtil.listPartition(partitionKeys, fileStatus.getPath().toString(), format, spec, conf, - 
metricsConfig, nameMapping)); + dataFiles.addAll(TableMigrationUtil.listPartition(partitionKeys, fileStatus.getPath().toString(), format, spec, + conf, metricsConfig, nameMapping)); } return dataFiles; } diff --git a/iceberg/iceberg-handler/src/test/results/positive/alter_multi_part_table_to_iceberg.q.out b/iceberg/iceberg-handler/src/test/results/positive/alter_multi_part_table_to_iceberg.q.out index 2d00101..7c71d12 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/alter_multi_part_table_to_iceberg.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/alter_multi_part_table_to_iceberg.q.out @@ -668,7 +668,7 @@ Table Parameters: #### A masked pattern was here #### metadata_location hdfs://### HDFS PATH ### numFiles 7 - numRows 0 + numRows 15 previous_metadata_location hdfs://### HDFS PATH ### storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler table_type ICEBERG diff --git a/iceberg/iceberg-handler/src/test/results/positive/alter_part_table_to_iceberg.q.out b/iceberg/iceberg-handler/src/test/results/positive/alter_part_table_to_iceberg.q.out index 46e4797..6bc2192 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/alter_part_table_to_iceberg.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/alter_part_table_to_iceberg.q.out @@ -530,7 +530,7 @@ Table Parameters: #### A masked pattern was here #### metadata_location hdfs://### HDFS PATH ### numFiles 4 - numRows 0 + numRows 9 previous_metadata_location hdfs://### HDFS PATH ### storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler table_type ICEBERG diff --git a/iceberg/iceberg-handler/src/test/results/positive/alter_table_to_iceberg.q.out b/iceberg/iceberg-handler/src/test/results/positive/alter_table_to_iceberg.q.out index d278e75..d69c983 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/alter_table_to_iceberg.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/alter_table_to_iceberg.q.out @@ -389,7 +389,7 @@ Table 
Parameters: #### A masked pattern was here #### metadata_location hdfs://### HDFS PATH ### numFiles 1 - numRows 0 + numRows 5 previous_metadata_location hdfs://### HDFS PATH ### rawDataSize 0 storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler diff --git a/iceberg/iceberg-handler/src/test/results/positive/truncate_partitioned_iceberg_table.q.out b/iceberg/iceberg-handler/src/test/results/positive/truncate_partitioned_iceberg_table.q.out index 0a63478..d67d125 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/truncate_partitioned_iceberg_table.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/truncate_partitioned_iceberg_table.q.out @@ -100,7 +100,7 @@ Table Parameters: #### A masked pattern was here #### metadata_location hdfs://### HDFS PATH ### numFiles 4 - numRows 0 + numRows 9 previous_metadata_location hdfs://### HDFS PATH ### storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler table_type ICEBERG