bvaradar commented on a change in pull request #2064: URL: https://github.com/apache/hudi/pull/2064#discussion_r497149559
########## File path: hudi-cli/src/main/java/org/apache/hudi/cli/commands/MetadataCommand.java ########## @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.cli.commands; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.cli.HoodieCLI; +import org.apache.hudi.cli.utils.SparkUtil; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.metadata.HoodieMetadata; +import org.apache.spark.api.java.JavaSparkContext; +import org.springframework.shell.core.CommandMarker; +import org.springframework.shell.core.annotation.CliCommand; +import org.springframework.shell.core.annotation.CliOption; +import org.springframework.stereotype.Component; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +/** + * CLI commands to operate on the Metadata Table. 
+ */ +@Component +public class MetadataCommand implements CommandMarker { + private JavaSparkContext jsc; + + @CliCommand(value = "metadata set", help = "Set options for Metadata Table") + public String set(@CliOption(key = {"metadataDir"}, + help = "Directory to read/write metadata table (can be different from dataset)", unspecifiedDefaultValue = "") + final String metadataDir) { + if (!metadataDir.isEmpty()) { + HoodieMetadata.setMetadataBaseDirectory(metadataDir); + } + + return String.format("Ok"); + } + + @CliCommand(value = "metadata create", help = "Create the Metadata Table if it does not exist") + public String create() throws IOException { + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); + Path metadataPath = new Path(HoodieMetadata.getMetadataTableBasePath(HoodieCLI.basePath)); + try { + FileStatus[] statuses = HoodieCLI.fs.listStatus(metadataPath); + if (statuses.length > 0) { + throw new RuntimeException("Metadata directory (" + metadataPath.toString() + ") not empty."); + } + } catch (FileNotFoundException e) { + // Metadata directory does not exist yet + HoodieCLI.fs.mkdirs(metadataPath); + } + + long t1 = System.currentTimeMillis(); + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath(HoodieCLI.basePath) + .withUseFileListingMetadata(true).build(); + initJavaSparkContext(); + HoodieMetadata.init(jsc, writeConfig); + long t2 = System.currentTimeMillis(); + + return String.format("Created Metadata Table in %s (duration=%.2fsec)", metadataPath, (t2 - t1) / 1000.0); + } + + @CliCommand(value = "metadata delete", help = "Remove the Metadata Table") + public String delete() throws Exception { + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); + Path metadataPath = new Path(HoodieMetadata.getMetadataTableBasePath(HoodieCLI.basePath)); + try { + FileStatus[] statuses = HoodieCLI.fs.listStatus(metadataPath); + if (statuses.length > 0) { + HoodieCLI.fs.delete(metadataPath, true); + } + } catch 
(FileNotFoundException e) { + // Metadata directory does not exist + } + + HoodieMetadata.remove(HoodieCLI.basePath); + + return String.format("Removed Metdata Table from %s", metadataPath); + } + + @CliCommand(value = "metadata init", help = "Update the metadata table from commits since the creation") + public String init(@CliOption(key = {"readonly"}, unspecifiedDefaultValue = "false", + help = "Open in read-only mode") final boolean readOnly) throws Exception { + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); + Path metadataPath = new Path(HoodieMetadata.getMetadataTableBasePath(HoodieCLI.basePath)); + try { + FileStatus[] statuses = HoodieCLI.fs.listStatus(metadataPath); + } catch (FileNotFoundException e) { + // Metadata directory does not exist + throw new RuntimeException("Metadata directory (" + metadataPath.toString() + ") does not exist."); + } + + HoodieMetadata.remove(HoodieCLI.basePath); + + long t1 = System.currentTimeMillis(); + if (readOnly) { + HoodieMetadata.init(HoodieCLI.conf, HoodieCLI.basePath); + } else { + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath(HoodieCLI.basePath) + .withUseFileListingMetadata(true).build(); + initJavaSparkContext(); + HoodieMetadata.init(jsc, writeConfig); + } + long t2 = System.currentTimeMillis(); + + String action = readOnly ? 
"Opened" : "Initialized"; + return String.format(action + " Metadata Table in %s (duration=%.2fsec)", metadataPath, (t2 - t1) / 1000.0); + } + + @CliCommand(value = "metadata stats", help = "Print stats about the metadata") + public String stats() throws IOException { + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); + Map<String, String> stats = HoodieMetadata.getStats(HoodieCLI.basePath, true); + + StringBuffer out = new StringBuffer("\n"); + out.append(String.format("Base path: %s\n", HoodieMetadata.getMetadataTableBasePath(HoodieCLI.basePath))); + for (Map.Entry<String, String> entry : stats.entrySet()) { + out.append(String.format("%s: %s\n", entry.getKey(), entry.getValue())); + } + + return out.toString(); + } + + @CliCommand(value = "metadata list-partitions", help = "Print a list of all partitions from the metadata") + public String listPartitions() throws IOException { + HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient(); Review comment: > > I instrumented the code to track writeConfig.useFileListingMetadata. It looks like it is being disabled for a dataset that has the metadata dataset > > 20/09/26 02:58:11 ERROR HoodieMetadata: writeConfig.useFileListingMetadata() =true > > 20/09/26 02:58:14 ERROR HoodieMetadata: writeConfig.useFileListingMetadata() =false > > The above line is coming from HoodieMetadata.init() method. I am unable to test using spark.write() as it seems to fallback to listing > > I think the second log (writeConfig.useFileListingMetadata() =false) is coming from the Hoodie Metadata Table's internal writeConfig. Since the HUDI Metadata Table is also a HUDI Table, it also has a HoodieWriteConfig wherein the useFileListingMetadata is disabled. > > Fallback to listing is by design to prevent issues during adoption. Got it. So, does spark.write.xxxx() work? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org