bhasudha commented on a change in pull request #689: [HUDI-25] Optimize HoodieInputFormat.listStatus for faster Hive Incremental queries URL: https://github.com/apache/incubator-hudi/pull/689#discussion_r291480583
########## File path: hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/InputPathHandler.java ########## @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. ([email protected]) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.hadoop; + +import static com.uber.hoodie.hadoop.HoodieInputFormat.getTableMetaClient; + +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.exception.DatasetNotFoundException; +import com.uber.hoodie.exception.InvalidDatasetException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; + +/** + * InputPathHandler takes in a set of input paths and incremental tables list. Then, classifies the + * input paths to incremental, non-incremental paths and non-hoodie paths. This is then accessed + * later to mutate the JobConf before processing incremental mode queries and snapshot queries. 
+ */ +public class InputPathHandler { + + public static final Log LOG = LogFactory.getLog(InputPathHandler.class); + + private final Configuration conf; + // tablename to metadata mapping for all Hoodie tables(both incremental & non-incremental) + private final Map<String, HoodieTableMetaClient> tableMetaClientMap; + private final Map<HoodieTableMetaClient, List<Path>> groupedIncrementalPaths; + private final List<Path> nonIncrementalPaths; + private final List<Path> nonHoodieInputPaths; + + InputPathHandler(Configuration conf, Path[] inputPaths, List<String> incrementalTables) throws IOException { + this.conf = conf; + tableMetaClientMap = new HashMap<>(); + nonIncrementalPaths = new ArrayList<>(); + nonHoodieInputPaths = new ArrayList<>(); + groupedIncrementalPaths = new HashMap<>(); + parseInputPaths(inputPaths, incrementalTables); + } + + /** + * Takes in the original InputPaths and classifies each of them into incremental, non-incremental + * and non-hoodie InputPaths. The logic is as follows: + * + * 1. Check if an inputPath starts with the same basepath as any of the metadata basepaths we know + * 1a. If yes, this belongs to a Hoodie table that we already know about. Simply classify this + * as incremental or non incremental - We can get the table name of this inputPath from the + * metadata. Then based on the list of incrementalTables, we can classify this inputPath. + * 1b. If no, this could be a new Hoodie Table we haven't seen yet or a non-Hoodie Input Path. + * Try creating the HoodieTableMetadataClient. + * - If it succeeds, further classify as incremental on non-incremental as described in + * step 1a above. 
+ * - If DatasetNotFoundException/InvalidDatasetException is caught, this is a + * non-Hoodie inputPath + * @param inputPaths - InputPaths from the original jobConf that was passed to HoodieInputFormat + * @param incrementalTables - List of all incremental tables extracted from the config + * `hoodie.<table-name>.consume.mode=INCREMENTAL` + * @throws IOException + */ + private void parseInputPaths(Path[] inputPaths, List<String> incrementalTables) + throws IOException { + for (int i = 0; i < inputPaths.length; i++) { + Path inputPath = inputPaths[i]; + boolean basePathKnown = false; + for (HoodieTableMetaClient metaClient : tableMetaClientMap.values()) { + if (inputPath.toString().startsWith(metaClient.getBasePath())) { + // We already know this base path. Review comment: No specific reason. I'll change it to be consistent with other places. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services
