I'm trying to patch Nutch to allow the page cache to be added to the Solr index when using the SolrIndexer tool. Is there any reason this is not done by default? The Solr schema even has the "cache" field but it is left empty.
I'm enclosing a patch of the changes I have made. I have done some testing and this seems to work fine. Can someone please take a look at it and let me know if I'm doing anything wrong? I'm especially not sure about the character encoding to assume when converting the Content (which is stored as byte[]) to a String; I'm getting the encoding from Metadata (using the key Metadata.ORIGINAL_CHAR_ENCODING) but it is always null. Thanks, Siddhartha
Index: src/java/org/apache/nutch/indexer/IndexerMapReduce.java =================================================================== --- src/java/org/apache/nutch/indexer/IndexerMapReduce.java (revision 774282) +++ src/java/org/apache/nutch/indexer/IndexerMapReduce.java (working copy) @@ -46,6 +46,7 @@ import org.apache.nutch.parse.ParseText; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.protocol.Content; public class IndexerMapReduce extends Configured implements Mapper<Text, Writable, Text, NutchWritable>, @@ -75,6 +76,7 @@ CrawlDatum fetchDatum = null; ParseData parseData = null; ParseText parseText = null; + Content content = null; while (values.hasNext()) { final Writable value = values.next().get(); // unwrap if (value instanceof Inlinks) { @@ -97,6 +99,8 @@ parseData = (ParseData)value; } else if (value instanceof ParseText) { parseText = (ParseText)value; + } else if (value instanceof Content) { + content = (Content)value; } else if (LOG.isWarnEnabled()) { LOG.warn("Unrecognized type: "+value.getClass()); } @@ -155,6 +159,14 @@ // store boost for use by explain and dedup doc.add("boost", Float.toString(boost)); + String encoding = metadata.get(Metadata.ORIGINAL_CHAR_ENCODING); + System.out.println(encoding); + if (encoding == null) { + encoding = "UTF-8"; + } + doc.removeField("cache"); + doc.add("cache", new String(content.getContent(), encoding)); + output.collect(key, doc); } @@ -173,6 +185,7 @@ FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME)); FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME)); FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME)); + FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME)); } FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));