Author: lewismc Date: Wed Jul 22 12:51:05 2015 New Revision: 1692268 URL: http://svn.apache.org/r1692268 Log: NUTCH-2063 Add -mimeStats flag to FileDumper tool
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1692268&r1=1692267&r2=1692268&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jul 22 12:51:05 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2063 Add -mimeStats flag to FileDumper tool (Mike Joyce via lewismc) + * NUTCH-2021 Use protocol-selenium to Capture Screenshots of the Page as it is Fetched (lewismc) * NUTCH-2058 Indexer plugin that allows RegEx replacements on the NutchDocument Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1692268&r1=1692267&r2=1692268&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Jul 22 12:51:05 2015 @@ -126,9 +126,12 @@ public class FileDumper { * @param mimeTypes * an array of mime types we have to dump, all others will be * filtered out. + * @param mimeTypeStats + * a flag indicating whether mimetype stats should be displayed + * instead of dumping files. * @throws Exception */ - public void dump(File outputDir, File segmentRootDir, String[] mimeTypes) + public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean mimeTypeStats) throws Exception { if (mimeTypes == null) LOG.info("Accepting all mimetypes."); @@ -206,24 +209,25 @@ public class FileDumper { } if (filter) { - String md5Ofurl = DumpFileUtil.getUrlMD5(url); - String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl); - - if (!Strings.isNullOrEmpty(fullDir)) { - String outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension)); - File outputFile = new File(outputFullPath); - - if (!outputFile.exists()) { - LOG.info("Writing: [" + outputFullPath + "]"); - FileOutputStream output = new FileOutputStream(outputFile); - IOUtils.write(content.getContent(), output); - fileCount++; - } else { - LOG.info("Skipping writing: [" + outputFullPath - + "]: file already exists"); + if (!mimeTypeStats) { + String md5Ofurl = DumpFileUtil.getUrlMD5(url); + String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl); + + if (!Strings.isNullOrEmpty(fullDir)) { + String outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension)); + File outputFile = new File(outputFullPath); + + if (!outputFile.exists()) { + LOG.info("Writing: [" + outputFullPath + "]"); + FileOutputStream output = new FileOutputStream(outputFile); + IOUtils.write(content.getContent(), output); + fileCount++; + } else { + LOG.info("Skipping writing: [" + outputFullPath + + "]: file already exists"); + } } } - } } reader.close(); @@ -240,6 +244,10 @@ public class FileDumper { LOG.info("Dumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts)); + if (mimeTypeStats) { + System.out.println("Dumper File Stats: " + + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts)); + } } /** @@ -271,6 +279,12 @@ public class FileDumper { .withDescription( "an optional list of mimetypes to dump, excluding all others. Defaults to all.") .create("mimetype"); + @SuppressWarnings("static-access") + Option mimeStat = OptionBuilder + .withArgName("mimeStats") + .withDescription( + "only display mimetype stats for the segment(s) instead of dumping file.") + .create("mimeStats"); // create the options Options options = new Options(); @@ -278,6 +292,7 @@ public class FileDumper { options.addOption(outputOpt); options.addOption(segOpt); options.addOption(mimeOpt); + options.addOption(mimeStat); CommandLineParser parser = new GnuParser(); try { @@ -292,17 +307,22 @@ public class FileDumper { File outputDir = new File(line.getOptionValue("outputDir")); File segmentRootDir = new File(line.getOptionValue("segment")); String[] mimeTypes = line.getOptionValues("mimetype"); + boolean shouldDisplayStats = false; + if (line.hasOption("mimeStats")) + shouldDisplayStats = true; if (!outputDir.exists()) { LOG.warn("Output directory: [" + outputDir.getAbsolutePath() + "]: does not exist, creating it."); - if (!outputDir.mkdirs()) - throw new Exception("Unable to create: [" + if (!shouldDisplayStats) { + if (!outputDir.mkdirs()) + throw new Exception("Unable to create: [" + outputDir.getAbsolutePath() + "]"); + } } FileDumper dumper = new FileDumper(); - dumper.dump(outputDir, segmentRootDir, mimeTypes); + dumper.dump(outputDir, segmentRootDir, mimeTypes, shouldDisplayStats); } catch (Exception e) { LOG.error("FileDumper: " + StringUtils.stringifyException(e)); e.printStackTrace();