Author: lewismc Date: Wed Mar 4 18:48:32 2015 New Revision: 1664109 URL: http://svn.apache.org/r1664109 Log: NUTCH-1949 Dump out the Nutch data into the Common Crawl format
Added: nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/ivy/ivy.xml nutch/trunk/src/bin/nutch nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1664109&r1=1664108&r2=1664109&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Mar 4 18:48:32 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1949 Dump out the Nutch data into the Common Crawl format (Giuseppe Totaro via lewismc) + * NUTCH-1950 File name too long (Jiaheng Zhang, Chong Li via mattmann) * NUTCH-1921 Optionally disable HTTP if-modified-since header (markus) Modified: nutch/trunk/ivy/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1664109&r1=1664108&r2=1664109&view=diff ============================================================================== --- nutch/trunk/ivy/ivy.xml (original) +++ nutch/trunk/ivy/ivy.xml Wed Mar 4 18:48:32 2015 @@ -49,7 +49,8 @@ rev="3.1" conf="*->master" /> <dependency org="commons-codec" name="commons-codec" rev="1.3" conf="*->default" /> - + <dependency org="org.apache.commons" name="commons-compress" rev="1.9" + conf="*->default" /> <dependency org="org.apache.hadoop" name="hadoop-core" rev="1.2.0" conf="*->default"> <exclude org="hsqldb" name="hsqldb" 
/> @@ -70,6 +71,9 @@ <dependency org="com.google.guava" name="guava" rev="11.0.2" /> <dependency org="com.google.code.crawler-commons" name="crawler-commons" rev="0.5" /> + + <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.5.1" /> + <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" /> <!--Configuration: test --> Modified: nutch/trunk/src/bin/nutch URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1664109&r1=1664108&r2=1664109&view=diff ============================================================================== --- nutch/trunk/src/bin/nutch (original) +++ nutch/trunk/src/bin/nutch Wed Mar 4 18:48:32 2015 @@ -71,7 +71,8 @@ if [ $# = 0 ]; then echo " mergelinkdb merge linkdb-s, with optional filtering" echo " index run the plugin-based indexer on parsed segments and linkdb" echo " dedup deduplicate entries in the crawldb and give them a special status" - echo " dump exports cralwed data from segments into files" + echo " dump exports crawled data from segments into files" + echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR" echo " solrindex run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead" echo " solrdedup remove duplicates from solr - DEPRECATED use the dedup command instead" echo " solrclean remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead" @@ -233,6 +234,8 @@ elif [ "$COMMAND" = "mergelinkdb" ] ; th CLASS=org.apache.nutch.crawl.LinkDbMerger elif [ "$COMMAND" = "dump" ] ; then CLASS=org.apache.nutch.tools.FileDumper +elif [ "$COMMAND" = "commoncrawldump" ] ; then + CLASS=org.apache.nutch.tools.CommonCrawlDataDumper elif [ "$COMMAND" = "solrindex" ] ; then CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1" shift Added: nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java?rev=1664109&view=auto ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java (added) +++ nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java Wed Mar 4 18:48:32 2015 @@ -0,0 +1,84 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.tools;

import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;

/**
 * Skeleton implementation of {@link CommonCrawlFormat}. It stores the crawled
 * record (URL, raw content, metadata) together with the configuration, and
 * routes {@link #getJsonData(boolean)} to one of two template methods that
 * concrete formatters implement.
 */
public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
  /** URL of the crawled resource. */
  protected String url;

  /** Raw bytes fetched for {@link #url}. */
  protected byte[] content;

  /** Metadata stored alongside the content. */
  protected Metadata metadata;

  /** Configuration the formatter reads its settings from. */
  protected Configuration conf;

  /**
   * Stores the record to be formatted.
   *
   * @param url the URL of the crawled resource
   * @param content the raw content
   * @param metadata the metadata stored with the content
   * @param conf the configuration
   */
  public AbstractCommonCrawlFormat(String url, byte[] content, Metadata metadata, Configuration conf) {
    this.url = url;
    this.content = content;
    this.metadata = metadata;
    this.conf = conf;
  }

  /**
   * Returns the JSON document for this record.
   *
   * @param mapAll {@code true} delegates to {@link #getJsonDataAll()},
   *          {@code false} to {@link #getJsonDataSet()}
   * @return the JSON data
   * @throws IOException if the selected template method fails
   */
  @Override
  public String getJsonData(boolean mapAll) throws IOException {
    return mapAll ? getJsonDataAll() : getJsonDataSet();
  }

  /** Builds the JSON document returned when {@code mapAll} is {@code false}. */
  protected abstract String getJsonDataSet() throws IOException;

  /** Builds the JSON document returned when {@code mapAll} is {@code true}. */
  protected abstract String getJsonDataAll() throws IOException;

  /**
   * Replaces {@code null} with the empty string.
   *
   * @param value the value to check, may be {@code null}
   * @return {@code value}, or {@code ""} when it is {@code null}
   */
  protected String ifNullString(String value) {
    if (value == null) {
      return "";
    }
    return value;
  }

  /**
   * @return the host name of the local machine, or {@code ""} when it cannot
   *         be resolved
   */
  protected static String getHostName() {
    try {
      return InetAddress.getLocalHost().getHostName();
    } catch (UnknownHostException ignored) {
      // best effort: the caller receives an empty string instead
      return "";
    }
  }

  /**
   * @return the textual IP address of the local machine, or {@code ""} when
   *         it cannot be resolved
   */
  protected static String getHostAddress() {
    try {
      return InetAddress.getLocalHost().getHostAddress();
    } catch (UnknownHostException ignored) {
      // best effort: the caller receives an empty string instead
      return "";
    }
  }
}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java?rev=1664109&r1=1664108&r2=1664109&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java Wed Mar 4 18:48:32 2015 @@ -53,6 +53,7 @@ public class Benchmark extends Configure System.exit(res); } + @SuppressWarnings("unused") private static String getDate() { return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System .currentTimeMillis())); Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1664109&view=auto ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (added) +++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Wed Mar 4 18:48:32 2015 @@ -0,0 +1,470 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.tools;

//JDK imports
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

//Commons imports
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;

//Hadoop
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;

//Tika imports
import org.apache.tika.Tika;

//Jackson CBOR imports
import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * <p>
 * The Common Crawl Data Dumper tool enables one to reverse generate the raw
 * content from Nutch segment data directories into a common crawling data
 * format, consumed by many applications. The data is then serialized as <a
 * href="http://cbor.io">CBOR</a>.
 * </p>
 * <p>
 * Text content will be stored in a structured document format. Below is a
 * schema for storage of data and metadata related to a crawling request, with
 * the response body truncated for readability. This document must be encoded
 * using CBOR and should be compressed with gzip after encoding. The timestamped
 * URL key for these records follows the same layout as the media file
 * directory structure, with underscores in place of directory separators.
 * </p>
 * <p>
 * Thus, the timestamped url key for the record is provided below followed by an
 * example record:
 * </p>
 *
 * <pre>
 * {@code
 * com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000
 *
 * {
 *   "url": "http:\/\/somepage.com\/22\/14560817",
 *   "timestamp": "1411623696000",
 *   "request": {
 *     "method": "GET",
 *     "client": {
 *       "hostname": "crawler01.local",
 *       "address": "74.347.129.200",
 *       "software": "Apache Nutch v1.10",
 *       "robots": "classic",
 *       "contact": {
 *         "name": "Nutch Admin",
 *         "email": "nutch....@nutchadmin.org"
 *       }
 *     },
 *     "headers": {
 *       "Accept": "text\/html,application\/xhtml+xml,application\/xml",
 *       "Accept-Encoding": "gzip,deflate,sdch",
 *       "Accept-Language": "en-US,en",
 *       "User-Agent": "Mozilla\/5.0",
 *       "...": "..."
 *     },
 *     "body": null
 *   },
 *   "response": {
 *     "status": "200",
 *     "server": {
 *       "hostname": "somepage.com",
 *       "address": "55.33.51.19"
 *     },
 *     "headers": {
 *       "Content-Encoding": "gzip",
 *       "Content-Type": "text\/html",
 *       "Date": "Thu, 25 Sep 2014 04:16:58 GMT",
 *       "Expires": "Thu, 25 Sep 2014 04:16:57 GMT",
 *       "Server": "nginx",
 *       "...": "..."
 *     },
 *     "body": "\r\n <!DOCTYPE html PUBLIC ... \r\n\r\n \r\n </body>\r\n </html>\r\n \r\n\r\n"
 *   },
 *   "key": "com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000",
 *   "imported": "1411623698000"
 * }
 * }
 * </pre>
 *
 * <p>
 * Upon successful completion the tool logs a snippet detailing the mimetype
 * classifications and the counts of documents which fall into those
 * classifications. An example is as follows:
 * </p>
 *
 * <pre>
 * {@code
 * INFO: File Types:
 *   TOTAL Stats: {
 *     {"mimeType":"application/xml","count":19}
 *     {"mimeType":"image/png","count":47}
 *     {"mimeType":"image/jpeg","count":141}
 *     {"mimeType":"image/vnd.microsoft.icon","count":4}
 *     {"mimeType":"text/plain","count":89}
 *     {"mimeType":"video/quicktime","count":2}
 *     {"mimeType":"image/gif","count":63}
 *     {"mimeType":"application/xhtml+xml","count":1670}
 *     {"mimeType":"application/octet-stream","count":40}
 *     {"mimeType":"text/html","count":1863}
 *   }
 * }
 * </pre>
 */
public class CommonCrawlDataDumper {

  private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlDataDumper.class.getName());

  /** Shared CBOR factory; building one per record is needless work. */
  private static final CBORFactory CBOR_FACTORY = new CBORFactory();

  /**
   * Main method for invoking this tool.
   *
   * @param args
   *          1) output directory (which will be created if it does not
   *          already exist) to host the CBOR data and 2) a directory
   *          containing one or more segments from which we wish to generate
   *          CBOR data. Optionally, 3) a list of mimetypes and 4) the gzip
   *          option may be provided.
   * @throws Exception
   *           declared for compatibility; failures are logged, not rethrown
   */
  @SuppressWarnings("static-access")
  public static void main(String[] args) throws Exception {
    Option helpOpt = new Option("h", "help", false,
        "show this help message");
    // argument options
    Option outputOpt = OptionBuilder
        .withArgName("outputDir")
        .hasArg()
        .withDescription(
            "output directory (which will be created) to host the CBOR data")
        .create("outputDir");
    Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
        .withDescription("the segment(s) to use").create("segment");
    // optional mimetype filter and gzip switch
    Option mimeOpt = OptionBuilder
        .isRequired(false)
        .withArgName("mimetype")
        .hasArgs()
        .withDescription(
            "an optional list of mimetypes to dump, excluding all others. Defaults to all.")
        .create("mimetype");
    Option gzipOpt = OptionBuilder
        .isRequired(false)
        .hasArg(false)
        .withDescription(
            "an optional flag indicating whether to additionally gzip the data")
        .create("gzip");

    // create the options
    Options options = new Options();
    options.addOption(helpOpt);
    options.addOption(outputOpt);
    options.addOption(segOpt);
    options.addOption(mimeOpt);
    options.addOption(gzipOpt);

    CommandLineParser parser = new GnuParser();
    try {
      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("outputDir") || (!line.hasOption("segment"))) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(CommonCrawlDataDumper.class.getName(), options, true);
        return;
      }

      File outputDir = new File(line.getOptionValue("outputDir"));
      File segmentRootDir = new File(line.getOptionValue("segment"));
      String[] mimeTypes = line.getOptionValues("mimetype");
      boolean gzip = line.hasOption("gzip");

      if (!outputDir.exists()) {
        LOG.warn("Output directory: [" + outputDir.getAbsolutePath() + "]: does not exist, creating it.");
        if (!outputDir.mkdirs())
          throw new Exception("Unable to create: [" + outputDir.getAbsolutePath() + "]");
      }

      CommonCrawlDataDumper dumper = new CommonCrawlDataDumper();

      dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes);

    } catch (Exception e) {
      // the stringified exception already carries the full stack trace
      LOG.error(CommonCrawlDataDumper.class.getName() + ": " + StringUtils.stringifyException(e));
      return;
    }
  }

  /**
   * Dumps the reverse engineered CBOR content of every segment found directly
   * under {@code segmentRootDir}. Without {@code gzip} each record becomes
   * one file in {@code outputDir}; with {@code gzip} all records are stored
   * in a single timestamped {@code .tar.gz} archive in that directory.
   * Records whose file name has already been written are skipped.
   *
   * @param outputDir
   *          the directory you wish to dump the raw content to; it must
   *          already exist when this method is called.
   * @param segmentRootDir
   *          a directory containing one or more segments.
   * @param gzip
   *          a boolean flag indicating whether the CBOR content should also
   *          be gzipped.
   * @param mimeTypes
   *          an array of mime types we have to dump, all others will be
   *          filtered out; {@code null} dumps every type.
   * @throws Exception
   *           if a segment cannot be read or the output cannot be written
   */
  public void dump(File outputDir, File segmentRootDir, boolean gzip, String[] mimeTypes) throws Exception {
    if (!gzip) {
      LOG.info("Gzipping CBOR data has been skipped");
    }
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<String, Integer>();
    // filtered file counters
    Map<String, Integer> filteredCounts = new HashMap<String, Integer>();

    File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() {
      @Override
      public boolean accept(File file) {
        return file.canRead() && file.isDirectory();
      }
    });

    if (segmentDirs == null) {
      LOG.error("No segment directories found in [" + segmentRootDir.getAbsolutePath() + "]");
      System.exit(1);
    }

    // null means "no filter"; a set avoids rebuilding a list for every record
    Set<String> wantedTypes = (mimeTypes == null) ? null : new HashSet<String>(Arrays.asList(mimeTypes));
    // one detector for the whole run instead of one per record
    Tika tika = new Tika();

    Configuration conf = NutchConfiguration.create();
    FileSystem fs = FileSystem.get(conf);

    FileOutputStream fileOutput = null;
    TarArchiveOutputStream tarOutput = null;
    Set<String> archivedNames = null;

    try {
      if (gzip) {
        // HH (24-hour clock): with hh a morning and an afternoon run on the
        // same day would produce the same archive name
        String archiveName = new SimpleDateFormat("yyyyMMddHHmm'.tar.gz'").format(new Date());
        fileOutput = new FileOutputStream(new File(outputDir, archiveName));
        tarOutput = new TarArchiveOutputStream(
            new GzipCompressorOutputStream(new BufferedOutputStream(fileOutput)));
        // URL-derived names routinely exceed the 100-character ustar limit,
        // which would otherwise abort the dump with a RuntimeException
        tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
        archivedNames = new HashSet<String>();
      }

      for (File segment : segmentDirs) {
        LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
        String segmentContentPath = segment.getAbsolutePath() + File.separator + Content.DIR_NAME + "/part-00000/data";
        Path file = new Path(segmentContentPath);

        if (!new File(file.toString()).exists()) {
          LOG.warn("Skipping segment: [" + segmentContentPath + "]: no data directory present");
          continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
        try {
          Writable key = (Writable) reader.getKeyClass().newInstance();

          while (reader.next(key)) {
            Content content = new Content();
            reader.getCurrentValue(content);
            String url = key.toString();

            String mimeType = null;
            try {
              mimeType = tika.detect(content.getContent());
              collectStats(typeCounts, mimeType);
            } catch (Exception e) {
              LOG.warn("Tika is unable to detect type for: [" + url + "]", e);
            }

            // encode all filetypes if no mimetypes have been given
            boolean selected = (wantedTypes == null);
            if ((mimeType != null) && (wantedTypes != null) && wantedTypes.contains(mimeType)) {
              // collects statistics for the given mimetypes
              collectStats(filteredCounts, mimeType);
              selected = true;
            }
            if (!selected) {
              continue;
            }

            byte[] byteData;
            try {
              // maps the record to the JSON-based structure, then to CBOR
              CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url,
                  content.getContent(), content.getMetadata(), conf);
              byteData = serializeCBORData(format.getJsonData(false));
            } catch (IOException e) {
              // skip the record rather than writing an empty document
              LOG.warn("Unable to map [" + url + "] to the Common Crawl format", e);
              continue;
            }
            if (byteData == null) {
              // the encoding failure has already been logged
              continue;
            }

            writeRecord(outputDir, buildFileName(url), byteData, tarOutput, archivedNames);
          }
        } finally {
          // close only the reader here: the FileSystem is shared by all
          // segments and must stay open until the last one has been read
          reader.close();
        }
      }

      if (tarOutput != null) {
        tarOutput.finish();
        tarOutput.close();
      }
    } finally {
      // no-ops after a clean close; release the file handle after a failure
      IOUtils.closeQuietly(tarOutput);
      IOUtils.closeQuietly(fileOutput);
      fs.close();
    }

    LOG.info("CommonsCrawlDataDumper File Stats: " + displayFileTypes(typeCounts, filteredCounts));
  }

  /**
   * Derives the output file name from the last path element of the URL,
   * defaulting the extension to {@code html} when the URL has none.
   */
  private static String buildFileName(String url) {
    String baseName = FilenameUtils.getBaseName(url);
    String extension = FilenameUtils.getExtension(url);
    if (extension == null || extension.equals("")) {
      extension = "html";
    }
    return baseName + "." + extension;
  }

  /**
   * Writes one encoded record, either as a stand-alone file in
   * {@code outputDir} (when {@code tarOutput} is {@code null}) or as an entry
   * of the archive. A name that has already been written is skipped.
   */
  private void writeRecord(File outputDir, String filename, byte[] byteData,
      TarArchiveOutputStream tarOutput, Set<String> archivedNames) throws IOException {
    if (tarOutput == null) {
      String outputFullPath = outputDir + File.separator + filename;
      File outputFile = new File(outputFullPath);
      if (outputFile.exists()) {
        LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
        return;
      }
      LOG.info("Writing: [" + outputFullPath + "]");
      FileOutputStream out = new FileOutputStream(outputFile);
      try {
        out.write(byteData);
      } finally {
        out.close();
      }
      return;
    }

    if (!archivedNames.add(filename)) {
      LOG.info("Skipping compressing: [" + filename + "]: file already exists");
      return;
    }
    LOG.info("Compressing: [" + filename + "]");
    TarArchiveEntry tarEntry = new TarArchiveEntry(filename);
    tarEntry.setSize(byteData.length);
    tarOutput.putArchiveEntry(tarEntry);
    tarOutput.write(byteData);
    tarOutput.closeArchiveEntry();
  }

  /**
   * Encodes the JSON document as a single CBOR text string.
   *
   * @return the encoded bytes, or {@code null} when encoding failed (the
   *         failure is logged)
   */
  private byte[] serializeCBORData(String jsonData) {
    ByteArrayOutputStream stream = new ByteArrayOutputStream();
    CBORGenerator generator = null;

    try {
      generator = CBOR_FACTORY.createGenerator(stream);
      generator.writeString(jsonData);
      generator.flush();
      return stream.toByteArray();
    } catch (Exception e) {
      LOG.warn("CBOR encoding failed: " + e.getMessage());
      return null;
    } finally {
      // the generator is null when createGenerator itself failed
      if (generator != null) {
        try {
          generator.close();
        } catch (IOException ignored) {
          // nothing to do: the bytes have already been captured or discarded
        }
      }
    }
  }

  /** Increments the counter kept for {@code mimeType}. */
  private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
    Integer current = typeCounts.get(mimeType);
    typeCounts.put(mimeType, (current == null) ? 1 : current + 1);
  }

  /**
   * Renders the per-mimetype counters; the FILTERED section is only present
   * when a mimetype filter matched at least one record.
   */
  private String displayFileTypes(Map<String, Integer> typeCounts, Map<String, Integer> filteredCounts) {
    StringBuilder builder = new StringBuilder();
    // print total stats
    appendStats(builder, "\n TOTAL Stats:\n", typeCounts);
    // filtered types stats
    if (!filteredCounts.isEmpty()) {
      appendStats(builder, "\n FILTERED Stats:\n", filteredCounts);
    }
    return builder.toString();
  }

  /** Appends one titled section with a JSON object per mimetype. */
  private static void appendStats(StringBuilder builder, String title, Map<String, Integer> counts) {
    builder.append(title);
    builder.append(" {\n");
    for (Map.Entry<String, Integer> entry : counts.entrySet()) {
      builder.append(" {\"mimeType\":\"");
      builder.append(entry.getKey());
      builder.append("\",\"count\":");
      builder.append(entry.getValue());
      // the count is a bare number, so no quote before the closing brace
      builder.append("}\n");
    }
    builder.append("}\n");
  }
}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java?rev=1664109&view=auto ==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java (added) +++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java Wed Mar 4 18:48:32 2015 @@ -0,0 +1,37 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.tools;

import java.io.IOException;

/**
 * Interface for all CommonCrawl formatters. It provides the signature for the
 * method used to get JSON data.
 *
 * @author gtotaro
 *
 */
public interface CommonCrawlFormat {

  /**
   * Maps the crawled record to its JSON representation.
   *
   * @param mapAll If {@code true} maps all metadata on the JSON structure.
   * @return the JSON data
   * @throws IOException if the JSON document cannot be generated
   */
  public String getJsonData(boolean mapAll) throws IOException;
}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java?rev=1664109&view=auto ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java (added) +++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java Wed Mar 4 18:48:32 2015 @@ -0,0 +1,56 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.tools;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;

/**
 * Factory class that creates new {@link CommonCrawlFormat} objects (a.k.a.
 * formatters) that map crawled files to the CommonCrawl format.
 *
 */
public class CommonCrawlFormatFactory {

  /**
   * Returns a new instance of a {@link CommonCrawlFormat} object specifying
   * the type of formatter. The type name is matched case-insensitively
   * against {@code "jackson"}, {@code "jettinson"} and {@code "simple"}.
   *
   * @param formatType the type of formatter to be created.
   * @param url the url.
   * @param content the content.
   * @param metadata the metadata.
   * @param conf the configuration.
   * @return the new {@link CommonCrawlFormat} object, or {@code null} when
   *         {@code formatType} is {@code null} or not one of the known names.
   */
  public static CommonCrawlFormat getCommonCrawlFormat(String formatType, String url, byte[] content,
      Metadata metadata, Configuration conf) {
    if (formatType == null) {
      return null;
    }

    if (formatType.equalsIgnoreCase("jackson")) {
      return new CommonCrawlFormatJackson(url, content, metadata, conf);
    }
    if (formatType.equalsIgnoreCase("jettinson")) {
      return new CommonCrawlFormatJettinson(url, content, metadata, conf);
    }
    if (formatType.equalsIgnoreCase("simple")) {
      return new CommonCrawlFormatSimple(url, content, metadata, conf);
    }

    return null;
  }
}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java?rev=1664109&view=auto ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java (added) +++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java Wed Mar 4 18:48:32 2015 @@ -0,0 +1,253 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.tools;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonGenerator;

/**
 * This class provides methods to map crawled data on JSON using Jackson
 * Streaming APIs.
 *
 */
public class CommonCrawlFormatJackson extends AbstractCommonCrawlFormat {

  private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());

  /** Charset Jackson writes to the byte stream; used to decode it again. */
  private static final Charset UTF_8 = Charset.forName("UTF-8");

  /** Shared generator factory, reused by every formatter instance. */
  private static final JsonFactory FACTORY = new JsonFactory();

  public CommonCrawlFormatJackson(String url, byte[] content,
      Metadata metadata, Configuration conf) {
    super(url, content, metadata, conf);
  }

  /**
   * Builds the JSON document with every metadata entry listed under
   * {@code response.headers}.
   */
  @Override
  protected String getJsonDataAll() throws IOException {
    return writeJson(true);
  }

  /**
   * Builds the JSON document with only {@code Content-Encoding},
   * {@code Content-Type}, {@code Date} and {@code Server} listed under
   * {@code response.headers}.
   */
  @Override
  protected String getJsonDataSet() throws IOException {
    return writeJson(false);
  }

  /**
   * Writes the record as a pretty-printed JSON document. Both public variants
   * share this code and differ only in the response headers they emit.
   *
   * @param allHeaders {@code true} to emit every metadata entry as a response
   *          header, {@code false} to emit the fixed subset
   * @return the JSON document
   * @throws IOException if the document cannot be generated
   */
  private String writeJson(boolean allHeaders) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    JsonGenerator generator = null;

    try {
      generator = FACTORY.createGenerator(out);
      generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT

      generator.writeStartObject();

      generator.writeStringField("url", url);
      // a missing Last-Modified value is written as JSON null
      generator.writeStringField("timestamp", metadata.get(Metadata.LAST_MODIFIED));

      // request
      generator.writeObjectFieldStart("request");
      generator.writeStringField("method", "GET");

      generator.writeObjectFieldStart("client");
      generator.writeStringField("hostname", getHostName());
      generator.writeStringField("address", getHostAddress());
      generator.writeStringField("software", conf.get("http.agent.version", ""));
      generator.writeStringField("robots", "classic");
      generator.writeObjectFieldStart("contact");
      generator.writeStringField("name", conf.get("http.agent.name", ""));
      generator.writeStringField("email", conf.get("http.agent.email", ""));
      generator.writeEndObject(); // contact
      // closing "client" here keeps "headers" and "body" inside "request" and
      // "response", "key" and "imported" at the top level
      generator.writeEndObject(); // client

      generator.writeObjectFieldStart("headers");
      // NOTE(review): "accept" and "http.robots.agents" look like unusual
      // keys for the Accept and User-Agent values - confirm against
      // nutch-default.xml
      generator.writeStringField("Accept", conf.get("accept", ""));
      generator.writeStringField("Accept-Encoding", ""); // TODO
      generator.writeStringField("Accept-Language", conf.get("http.accept.language", ""));
      generator.writeStringField("User-Agent", conf.get("http.robots.agents", ""));
      generator.writeEndObject(); // headers

      generator.writeNullField("body");
      generator.writeEndObject(); // request

      // response
      generator.writeObjectFieldStart("response");
      generator.writeStringField("status", ifNullString(metadata.get("status")));

      generator.writeObjectFieldStart("server");
      generator.writeStringField("hostname", URLUtil.getHost(url));
      generator.writeStringField("address", ifNullString(metadata.get("_ip_")));
      generator.writeEndObject(); // server

      generator.writeObjectFieldStart("headers");
      if (allHeaders) {
        for (String name : metadata.names()) {
          generator.writeStringField(name, ifNullString(metadata.get(name)));
        }
      } else {
        generator.writeStringField("Content-Encoding", ifNullString(metadata.get("Content-Encoding")));
        generator.writeStringField("Content-Type", ifNullString(metadata.get("Content-Type")));
        generator.writeStringField("Date", ifNullString(metadata.get("Date")));
        generator.writeStringField("Server", ifNullString(metadata.get("Server")));
      }
      generator.writeEndObject(); // headers

      // decode with a fixed charset so the output does not depend on the
      // platform default
      generator.writeStringField("body", new String(content, UTF_8));
      generator.writeEndObject(); // response

      generator.writeStringField("key", url);
      generator.writeStringField("imported", ""); // TODO

      generator.writeEndObject(); // root

      generator.flush();

      // Jackson encodes as UTF-8 by default; decode with the same charset
      return new String(out.toByteArray(), UTF_8);

    } catch (IOException ioe) {
      LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
      throw new IOException("Error in generating JSON using Jackson:" + ioe.getMessage(), ioe);
    } finally {
      if (generator != null) {
        try {
          generator.close();
        } catch (IOException ignored) {
          // the document has already been returned or the failure reported
        }
      }
    }
  }
}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java?rev=1664109&view=auto ============================================================================== ---
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java (added) +++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java Wed Mar 4 18:48:32 2015 @@ -0,0 +1,168 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tools; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.util.URLUtil; +import org.codehaus.jettison.json.JSONException; +import org.codehaus.jettison.json.JSONObject; +import org.mortbay.log.Log; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class provides methods to map crawled data on JSON using Jettinson APIs. 
+ * + */
public class CommonCrawlFormatJettinson extends AbstractCommonCrawlFormat {

  private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlFormatJettinson.class.getName());

  /** Response headers written by {@link #getJsonDataSet()}. */
  private static final String[] SELECTED_HEADERS = {
      "Content-Encoding", "Content-Type", "Date", "Server" };

  /**
   * Creates a formatter for one crawled document. All arguments are passed
   * unchanged to the superclass constructor.
   *
   * @param url written as the "url" and "key" fields
   * @param content written as the response "body"
   * @param metadata source of the timestamp, status, address and response headers
   * @param conf source of the agent and Accept request properties
   */
  public CommonCrawlFormatJettinson(String url, byte[] content,
      Metadata metadata, Configuration conf) {
    super(url, content, metadata, conf);
  }

  /**
   * Builds the JSON document with every metadata entry written as a response
   * header.
   *
   * @return the JSON document, indented by two spaces
   * @throws IOException if the JSON cannot be generated
   */
  @Override
  protected String getJsonDataAll() throws IOException {
    return toJson(true);
  }

  /**
   * Builds the JSON document with only the Content-Encoding, Content-Type,
   * Date and Server response headers.
   *
   * @return the JSON document, indented by two spaces
   * @throws IOException if the JSON cannot be generated
   */
  @Override
  protected String getJsonDataSet() throws IOException {
    return toJson(false);
  }

  /** Assembles the top-level document; the two public variants differ only in the response headers. */
  private String toJson(boolean allHeaders) throws IOException {
    try {
      JSONObject object = new JSONObject();

      // url
      object.put("url", url);

      // timestamp
      object.put("timestamp", metadata.get(Metadata.LAST_MODIFIED));

      // request
      object.put("request", buildRequest());

      // response
      object.put("response", buildResponse(allHeaders));

      // key
      object.put("key", url);

      // imported
      object.put("imported", ""); // TODO

      return object.toString(2); // INDENTED OUTPUT

    } catch (JSONException jsone) {
      LOG.warn("Error in processing file " + url + ": " + jsone.getMessage());
      // Keep the JSONException as the cause so the stack trace is not lost.
      throw new IOException("Error in generating JSON using Jettinson:" + jsone.getMessage(), jsone);
    }
  }

  /** Builds the "request" object: method, client (with contact), headers and a null body. */
  private JSONObject buildRequest() throws JSONException {
    JSONObject requestObject = new JSONObject();
    requestObject.put("method", "GET");

    JSONObject clientObject = new JSONObject();
    clientObject.put("hostname", getHostName());
    clientObject.put("address", getHostAddress());
    clientObject.put("software", conf.get("http.agent.version", ""));
    clientObject.put("robots", "CLASSIC");
    JSONObject contactObject = new JSONObject();
    contactObject.put("name", conf.get("http.agent.name", ""));
    contactObject.put("email", conf.get("http.agent.email", ""));
    clientObject.put("contact", contactObject);
    requestObject.put("client", clientObject);

    JSONObject reqHeadersObject = new JSONObject();
    reqHeadersObject.put("Accept", conf.get("http.accept", ""));
    reqHeadersObject.put("Accept-Encoding", ""); // TODO
    reqHeadersObject.put("Accept-Language", conf.get("http.accept.language", ""));
    reqHeadersObject.put("User-Agent", conf.get("http.robots.agents", ""));
    requestObject.put("headers", reqHeadersObject);

    requestObject.put("body", JSONObject.NULL);
    return requestObject;
  }

  /** Builds the "response" object: status, server, headers (all or the fixed subset) and body. */
  private JSONObject buildResponse(boolean allHeaders) throws JSONException {
    JSONObject responseObject = new JSONObject();
    responseObject.put("status", ifNullString(metadata.get("status")));

    JSONObject serverObject = new JSONObject();
    serverObject.put("hostname", URLUtil.getHost(url));
    serverObject.put("address", ifNullString(metadata.get("_ip_")));
    // The remote host is the server; the "client" key belongs to the request.
    responseObject.put("server", serverObject);

    JSONObject respHeadersObject = new JSONObject();
    if (allHeaders) {
      for (String name : metadata.names()) {
        respHeadersObject.put(name, ifNullString(metadata.get(name)));
      }
    } else {
      for (String name : SELECTED_HEADERS) {
        respHeadersObject.put(name, ifNullString(metadata.get(name)));
      }
    }
    responseObject.put("headers", respHeadersObject);

    // NOTE(review): the bytes are decoded with the JVM default charset;
    // confirm whether the fetched content's own encoding should be used.
    responseObject.put("body", new String(content));
    return responseObject;
  }
}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java?rev=1664109&view=auto ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java (added) +++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java Wed Mar 4 18:48:32 2015 @@ -0,0 +1,152 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tools; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.util.URLUtil; +
/**
 * This class provides methods to map crawled data to JSON using a
 * {@link StringBuilder} object, without any JSON library.
 *
 * The document layout is: url, timestamp, request (method, client, headers,
 * body), response (status, server, headers, body), key, imported. Output is
 * indented with tab characters and all names and values are JSON-escaped.
 */
public class CommonCrawlFormatSimple extends AbstractCommonCrawlFormat {

  /** Response headers written by {@link #getJsonDataSet()}. */
  private static final String[] SELECTED_HEADERS = {
      "Content-Encoding", "Content-Type", "Date", "Server" };

  /**
   * Creates a formatter for one crawled document. All arguments are passed
   * unchanged to the superclass constructor.
   *
   * @param url written as the "url" and "key" fields
   * @param content written as the response "body"
   * @param metadata source of the timestamp, status, address and response headers
   * @param conf source of the agent and Accept request properties
   */
  public CommonCrawlFormatSimple(String url, byte[] content, Metadata metadata,
      Configuration conf) {
    super(url, content, metadata, conf);
  }

  /**
   * Builds the JSON document with every metadata entry written as a response
   * header.
   *
   * @return the JSON document
   */
  @Override
  protected String getJsonDataAll() {
    return buildJson(true);
  }

  /**
   * Builds the JSON document with only the Content-Encoding, Content-Type,
   * Date and Server response headers.
   *
   * @return the JSON document
   */
  @Override
  protected String getJsonDataSet() {
    return buildJson(false);
  }

  /** Assembles the whole document; the two public variants differ only in the response headers. */
  private String buildJson(boolean allHeaders) {
    StringBuilder sb = new StringBuilder();
    sb.append("{\n");

    // url
    appendField(sb, "\t", "url", url, true);

    // timestamp
    appendField(sb, "\t", "timestamp", metadata.get(Metadata.LAST_MODIFIED), true);

    // request
    sb.append("\t\"request\": {\n");
    appendField(sb, "\t\t", "method", "GET", true);
    sb.append("\t\t\"client\": {\n");
    appendField(sb, "\t\t\t", "hostname", getHostName(), true);
    appendField(sb, "\t\t\t", "address", getHostAddress(), true);
    appendField(sb, "\t\t\t", "software", conf.get("http.agent.version", ""), true);
    appendField(sb, "\t\t\t", "robots", "CLASSIC", true);
    sb.append("\t\t\t\"contact\": {\n");
    appendField(sb, "\t\t\t\t", "name", conf.get("http.agent.name", ""), true);
    appendField(sb, "\t\t\t\t", "email", conf.get("http.agent.email", ""), false);
    sb.append("\t\t\t}\n");
    sb.append("\t\t},\n");
    sb.append("\t\t\"headers\": {\n");
    appendField(sb, "\t\t\t", "Accept", conf.get("http.accept", ""), true);
    appendField(sb, "\t\t\t", "Accept-Encoding", "", true); // TODO
    appendField(sb, "\t\t\t", "Accept-Language", conf.get("http.accept.language", ""), true);
    appendField(sb, "\t\t\t", "User-Agent", conf.get("http.robots.agents", ""), false);
    sb.append("\t\t},\n");
    // Same null request body as the Jackson and Jettinson formats.
    sb.append("\t\t\"body\": null\n");
    sb.append("\t},\n");

    // response
    sb.append("\t\"response\": {\n");
    appendField(sb, "\t\t", "status", ifNullString(metadata.get("status")), true);
    sb.append("\t\t\"server\": {\n");
    appendField(sb, "\t\t\t", "hostname", URLUtil.getHost(url), true);
    appendField(sb, "\t\t\t", "address", ifNullString(metadata.get("_ip_")), false);
    sb.append("\t\t},\n");
    sb.append("\t\t\"headers\": {\n");
    appendResponseHeaders(sb, allHeaders);
    sb.append("\t\t},\n");
    // NOTE(review): the bytes are decoded with the JVM default charset;
    // confirm whether the fetched content's own encoding should be used.
    appendField(sb, "\t\t", "body", new String(content), false);
    sb.append("\t},\n");

    // key
    appendField(sb, "\t", "key", url, true);

    // imported
    appendField(sb, "\t", "imported", "", false); // TODO

    sb.append("}");

    return sb.toString();
  }

  /** Writes the response headers, comma-separated, for either all metadata names or the fixed subset. */
  private void appendResponseHeaders(StringBuilder sb, boolean allHeaders) {
    boolean first = true;
    if (allHeaders) {
      for (String name : metadata.names()) {
        first = appendHeader(sb, name, first);
      }
    } else {
      for (String name : SELECTED_HEADERS) {
        first = appendHeader(sb, name, first);
      }
    }
    if (!first) {
      sb.append("\n");
    }
  }

  /** Writes one response header, preceded by a separator unless it is the first; always returns false. */
  private boolean appendHeader(StringBuilder sb, String name, boolean first) {
    if (!first) {
      sb.append(",\n");
    }
    appendPair(sb, "\t\t\t", name, ifNullString(metadata.get(name)));
    return false;
  }

  /** Writes one name/value line, followed by a comma when more members follow in the same object. */
  private static void appendField(StringBuilder sb, String indent, String name,
      String value, boolean more) {
    appendPair(sb, indent, name, value);
    sb.append(more ? ",\n" : "\n");
  }

  /** Writes an indented, escaped name/value pair; a null value becomes the JSON literal null. */
  private static void appendPair(StringBuilder sb, String indent, String name,
      String value) {
    sb.append(indent).append('"').append(escape(name)).append("\": ");
    if (value == null) {
      sb.append("null");
    } else {
      sb.append('"').append(escape(value)).append('"');
    }
  }

  /** Escapes quotes, backslashes and control characters so the text is a valid JSON string body. */
  private static String escape(String value) {
    StringBuilder escaped = new StringBuilder(value.length() + 16);
    for (int i = 0; i < value.length(); i++) {
      char c = value.charAt(i);
      switch (c) {
      case '"':
        escaped.append("\\\"");
        break;
      case '\\':
        escaped.append("\\\\");
        break;
      case '\b':
        escaped.append("\\b");
        break;
      case '\f':
        escaped.append("\\f");
        break;
      case '\n':
        escaped.append("\\n");
        break;
      case '\r':
        escaped.append("\\r");
        break;
      case '\t':
        escaped.append("\\t");
        break;
      default:
        if (c < 0x20) {
          // Remaining control characters must be written as \\uXXXX in JSON.
          escaped.append(String.format("\\u%04x", (int) c));
        } else {
          escaped.append(c);
        }
      }
    }
    return escaped.toString();
  }

}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1664109&r1=1664108&r2=1664109&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Mar 4 18:48:32 2015 @@
-100,7 +100,7 @@ import org.slf4j.LoggerFactory; * } * </pre> * <p> - * In the case above the tool would have been run with the <b>-mimeType + * In the case above, the tool would have been run with the <b>-mimeType * image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b> * flag and corresponding values activated. *