amrishlal commented on code in PR #8645: URL: https://github.com/apache/hudi/pull/8645#discussion_r1190629741
########## hudi-utilities/src/main/java/org/apache/hudi/utilities/TableSizeStats.java: ########## @@ -0,0 +1,411 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.utilities; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.view.FileSystemViewManager; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.TableNotFoundException; +import org.apache.hudi.metadata.HoodieTableMetadata; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import 
com.codahale.metrics.Histogram; +import com.codahale.metrics.Snapshot; +import com.codahale.metrics.UniformReservoir; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Serializable; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * Calculate and output file size stats of data files that were modified in the half-open interval [start date (--start-date parameter), Review Comment: Fixed. ########## hudi-utilities/src/main/java/org/apache/hudi/utilities/TableSizeStats.java: ########## @@ -0,0 +1,411 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.utilities; + +import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.HoodieMetadataConfig; +import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieLocalEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieBaseFile; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.view.FileSystemViewManager; +import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.TableNotFoundException; +import org.apache.hudi.metadata.HoodieTableMetadata; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import com.codahale.metrics.Histogram; +import com.codahale.metrics.Snapshot; +import com.codahale.metrics.UniformReservoir; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Serializable; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * Calculate and output file size stats of data files that were modified in the half-open 
interval [start date (--start-date parameter), + * end date (--end-date parameter)). --num-days parameter can be used to select data files over last --num-days. If --start-date is + * specified, --num-days will be ignored. If none of the date parameters are set, stats will be computed over all data files of all + * partitions in the table. Note that date filtering is carried out only if the partition name has the format '[column name=]yyyy-M-d' or + * '[column name=]yyyy/M/d'. By default, only table level file size stats are printed. If --partition-stats option is used, partition + * level file size stats also get printed. + * <br><br> + * The following stats are calculated: + * Number of files. + * Total table size. + * Minimum file size + * Maximum file size + * Average file size + * Median file size + * p50 file size + * p90 file size + * p95 file size + * p99 file size + * <br><br> + * Sample spark-submit command: + * ./bin/spark-submit \ + * --class org.apache.hudi.utilities.TableSizeStats \ + * $HUDI_DIR/packaging/hudi-utilities-bundle/target/hudi-utilities-bundle_2.11-0.14.0-SNAPSHOT.jar \ + * --base-path <base-path> \ + * --num-days <number-of-days> + */ +public class TableSizeStats implements Serializable { + + private static final Logger LOG = LoggerFactory.getLogger(TableSizeStats.class); + + // Date formatter for parsing partition dates (example: 2023/5/5/ or 2023-5-5). + private static final DateTimeFormatter DATE_FORMATTER = + (new DateTimeFormatterBuilder()).appendOptional(DateTimeFormatter.ofPattern("yyyy/M/d")).appendOptional(DateTimeFormatter.ofPattern("yyyy-M-d")).toFormatter(); + + // File size stats will be displayed in the units specified below. + private static final String[] FILE_SIZE_UNITS = {"B", "KB", "MB", "GB", "TB"}; + + // Spark context + private transient JavaSparkContext jsc; + // config + private Config cfg; + // Properties with source, hoodie client, key generator etc. 
+ private TypedProperties props; + + public TableSizeStats(JavaSparkContext jsc, Config cfg) { + this.jsc = jsc; + this.cfg = cfg; + + this.props = cfg.propsFilePath == null + ? UtilHelpers.buildProperties(cfg.configs) + : readConfigFromFileSystem(jsc, cfg); + } + + /** + * Reads config from the file system. + * + * @param jsc {@link JavaSparkContext} instance. + * @param cfg {@link Config} instance. + * @return the {@link TypedProperties} instance. + */ + private TypedProperties readConfigFromFileSystem(JavaSparkContext jsc, Config cfg) { + return UtilHelpers.readConfig(jsc.hadoopConfiguration(), new Path(cfg.propsFilePath), cfg.configs) + .getProps(true); + } + + public static class Config implements Serializable { + @Parameter(names = {"--base-path", "-sp"}, description = "Base path for the table", required = false) + public String basePath = null; + + @Parameter(names = {"--num-days", "-nd"}, description = "Consider files modified within this many days.", required = false) + public long numDays = 0; + + @Parameter(names = {"--start-date", "-sd"}, description = "Consider files modified on or after this date.", required = false) + public String startDate = null; + + @Parameter(names = {"--end-date", "-ed"}, description = "Consider files modified before this date.", required = false) + public String endDate = null; + + @Parameter(names = {"--partition-stats", "-ps"}, description = "Show partition-level stats besides table-level stats.", required = false) Review Comment: Fixed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org