Author: jlowe Date: Tue Jul 2 22:00:43 2013 New Revision: 1499127 URL: http://svn.apache.org/r1499127 Log: svn merge -c 1499125 FIXES: MAPREDUCE-3193. FileInputFormat doesn't read files recursively in the input path dir. Contributed by Devaraj K
Added: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/ - copied from r1499125, hadoop/common/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestFileInputFormat.java - copied unchanged from r1499125, hadoop/common/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestFileInputFormat.java Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/util/ConfigUtil.java hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestFileInputFormat.java Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt?rev=1499127&r1=1499126&r2=1499127&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt (original) +++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/CHANGES.txt Tue Jul 2 22:00:43 2013 @@ -15,6 +15,9 @@ Release 2.3.0 - UNRELEASED MAPREDUCE-5316. job -list-attempt-ids command does not handle illegal task-state (Ashwin Shankar via jlowe) + MAPREDUCE-3193. FileInputFormat doesn't read files recursively in the + input path dir (Devaraj K via jlowe) + Release 2.2.0 - UNRELEASED INCOMPATIBLE CHANGES @@ -1062,6 +1065,9 @@ Release 0.23.10 - UNRELEASED BUG FIXES + MAPREDUCE-3193. FileInputFormat doesn't read files recursively in the + input path dir (Devaraj K via jlowe) + Release 0.23.9 - 2013-07-08 INCOMPATIBLE CHANGES Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java?rev=1499127&r1=1499126&r2=1499127&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java (original) +++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/FileInputFormat.java Tue Jul 2 22:00:43 2013 @@ -69,6 +69,10 @@ public abstract class FileInputFormat<K, public static final String NUM_INPUT_FILES = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.NUM_INPUT_FILES; + + public static final String INPUT_DIR_RECURSIVE = + org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR_RECURSIVE; + private static final double SPLIT_SLOP = 1.1; // 10% slop @@ -192,7 +196,7 @@ public abstract class FileInputFormat<K, TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job); // Whether we need to recursive look into the directory structure - boolean recursive = job.getBoolean("mapred.input.dir.recursive", false); + boolean recursive = job.getBoolean(INPUT_DIR_RECURSIVE, false); List<FileStatus> result = new ArrayList<FileStatus>(); List<IOException> errors = new ArrayList<IOException>(); Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java?rev=1499127&r1=1499126&r2=1499127&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java (original) +++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.java Tue Jul 2 22:00:43 2013 @@ -64,6 +64,8 @@ public abstract class FileInputFormat<K, "mapreduce.input.pathFilter.class"; public static final String NUM_INPUT_FILES = "mapreduce.input.fileinputformat.numinputfiles"; + public static final String INPUT_DIR_RECURSIVE = + "mapreduce.input.fileinputformat.input.dir.recursive"; private static final Log LOG = LogFactory.getLog(FileInputFormat.class); @@ -102,6 +104,27 @@ public abstract class FileInputFormat<K, return true; } } + + /** + * @param job + * the job to modify + * @param inputDirRecursive + */ + public static void setInputDirRecursive(Job job, + boolean inputDirRecursive) { + job.getConfiguration().setBoolean(INPUT_DIR_RECURSIVE, + inputDirRecursive); + } + + /** + * @param job + * the job to look at. + * @return should the files to be read recursively? + */ + public static boolean getInputDirRecursive(JobContext job) { + return job.getConfiguration().getBoolean(INPUT_DIR_RECURSIVE, + false); + } /** * Get the lower bound on split size imposed by the format. @@ -210,6 +233,9 @@ public abstract class FileInputFormat<K, TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration()); + // Whether we need to recursive look into the directory structure + boolean recursive = getInputDirRecursive(job); + List<IOException> errors = new ArrayList<IOException>(); // creates a MultiPathFilter with the hiddenFileFilter and the @@ -235,7 +261,11 @@ public abstract class FileInputFormat<K, if (globStat.isDirectory()) { for(FileStatus stat: fs.listStatus(globStat.getPath(), inputFilter)) { - result.add(stat); + if (recursive && stat.isDirectory()) { + addInputPathRecursively(result, fs, stat.getPath(), inputFilter); + } else { + result.add(stat); + } } } else { result.add(globStat); @@ -252,6 +282,31 @@ public abstract class FileInputFormat<K, } /** + * Add files in the input path recursively into the results. + * @param result + * The List to store all files. + * @param fs + * The FileSystem. + * @param path + * The input path. + * @param inputFilter + * The input filter that can be used to filter files/dirs. + * @throws IOException + */ + protected void addInputPathRecursively(List<FileStatus> result, + FileSystem fs, Path path, PathFilter inputFilter) + throws IOException { + for(FileStatus stat: fs.listStatus(path, inputFilter)) { + if (stat.isDirectory()) { + addInputPathRecursively(result, fs, stat.getPath(), inputFilter); + } else { + result.add(stat); + } + } + } + + + /** * A factory that makes the split for this class. It can be overridden * by sub-classes to make sub-types */ Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/util/ConfigUtil.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/util/ConfigUtil.java?rev=1499127&r1=1499126&r2=1499127&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/util/ConfigUtil.java (original) +++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/util/ConfigUtil.java Tue Jul 2 22:00:43 2013 @@ -23,6 +23,7 @@ import org.apache.hadoop.conf.Configurat import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.MRConfig; import org.apache.hadoop.mapreduce.MRJobConfig; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.server.jobtracker.JTConfig; import org.apache.hadoop.mapreduce.server.tasktracker.TTConfig; @@ -528,6 +529,8 @@ public class ConfigUtil { MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST); Configuration.addDeprecation(JTConfig.JT_MAX_JOB_SPLIT_METAINFO_SIZE, MRJobConfig.SPLIT_METAINFO_MAXSIZE); + Configuration.addDeprecation("mapred.input.dir.recursive", + FileInputFormat.INPUT_DIR_RECURSIVE); } public static void main(String[] args) { Modified: hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestFileInputFormat.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestFileInputFormat.java?rev=1499127&r1=1499126&r2=1499127&view=diff ============================================================================== --- hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestFileInputFormat.java (original) +++ hadoop/common/branches/branch-2/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestFileInputFormat.java Tue Jul 2 22:00:43 2013 @@ -190,7 +190,7 @@ public class TestFileInputFormat extends + "directory with directories inside.", exceptionThrown); // Enable multi-level/recursive inputs - job.setBoolean("mapred.input.dir.recursive", true); + job.setBoolean(FileInputFormat.INPUT_DIR_RECURSIVE, true); InputSplit[] splits = inFormat.getSplits(job, 1); assertEquals(splits.length, 2); }