Author: fenglu Date: Sat Aug 24 15:21:20 2013 New Revision: 1517147 URL: http://svn.apache.org/r1517147 Log: NUTCH-1619 Writes DMOZ description and title information to the db when the -snippet argument is given.
Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1517147&r1=1517146&r2=1517147&view=diff ============================================================================== --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Sat Aug 24 15:21:20 2013 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1619 Writes Dmoz Description and Title information to db with snippet argument ( Yasin Kılınç via feng) + * NUTCH-1631 Display Document Count Added To Solr Server (Furkan KAMACI via lewismc) * NUTCH-1629 Injector skips empty lines in seed files (kaveh minooie via jnioche) Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java?rev=1517147&r1=1517146&r2=1517147&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java Sat Aug 24 15:21:20 2013 @@ -18,6 +18,7 @@ package org.apache.nutch.tools; import java.io.*; +import java.nio.ByteBuffer; import java.util.*; import java.util.regex.*; @@ -29,10 +30,15 @@ import org.apache.xerces.util.XMLChar; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.avro.util.Utf8; +import org.apache.gora.store.DataStore; import org.apache.hadoop.io.*; import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.storage.StorageUtils; +import org.apache.nutch.storage.WebPage; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.TableUtil; /** Utility that converts DMOZ RDF into a flat file of URLs to be injected. 
*/ @@ -40,7 +46,8 @@ public class DmozParser { public static final Logger LOG = LoggerFactory.getLogger(DmozParser.class); long pages = 0; - + private static DataStore<String, WebPage> store = null; + /** * This filter fixes characters that might offend our parser. * This lets us be tolerant of errors that might appear in the input XML. @@ -104,18 +111,20 @@ public class DmozParser { int subsetDenom; int hashSkew; boolean includeAdult; + boolean snippet; Locator location; /** * Pass in an XMLReader, plus a flag as to whether we * should include adult material. */ - public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, int skew, Pattern topicPattern) throws IOException { + public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, int skew, Pattern topicPattern, boolean snippet) throws IOException { this.reader = reader; this.subsetDenom = subsetDenom; this.includeAdult = includeAdult; this.topicPattern = topicPattern; - + this.snippet = snippet; + this.hashSkew = skew != 0 ? skew : new Random().nextInt(); } @@ -179,20 +188,44 @@ public class DmozParser { // Inc the number of pages, insert the page, and // possibly print status. // - System.out.println(curURL); - pages++; - - // - // Clear out the link text. This is what - // you would use for adding to the linkdb. 
- // - if (title.length() > 0) { - title.delete(0, title.length()); - } - if (desc.length() > 0) { - desc.delete(0, desc.length()); + if(snippet){ + try { + String reversedUrl = TableUtil.reverseUrl(curURL); + WebPage row = store.get(reversedUrl); + + if(row!=null){ + if (desc.length() > 0) { + row.putToMetadata(new Utf8("_dmoz_desc_"), ByteBuffer.wrap(desc.toString().getBytes())); + desc.delete(0, desc.length()); + } + if (title.length() > 0) { + row.putToMetadata(new Utf8("_dmoz_title_"), ByteBuffer.wrap(title.toString().getBytes())); + title.delete(0, title.length()); + } + store.put(reversedUrl, row); + store.flush(); + } + + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } else { + System.out.println(curURL); + + // + // Clear out the link text. This is what + // you would use for adding to the linkdb. + // + if (desc.length() > 0) { + desc.delete(0, desc.length()); + } + if (title.length() > 0) { + title.delete(0, title.length()); + } } - + pages++; + // Null out the URL. curURL = null; } else if ("d:Title".equals(qName)) { @@ -215,6 +248,11 @@ public class DmozParser { */ public void endDocument() { LOG.info("Completed parse. 
Found " + pages + " pages."); + try { + store.close(); + } catch (IOException e) { + e.printStackTrace(); + } } /** @@ -268,7 +306,8 @@ public class DmozParser { public void parseDmozFile(File dmozFile, int subsetDenom, boolean includeAdult, int skew, - Pattern topicPattern) + Pattern topicPattern, + boolean snippet) throws IOException, SAXException, ParserConfigurationException { @@ -279,7 +318,7 @@ public class DmozParser { // Create our own processor to receive SAX events RDFProcessor rp = new RDFProcessor(reader, subsetDenom, includeAdult, - skew, topicPattern); + skew, topicPattern, snippet); reader.setContentHandler(rp); reader.setErrorHandler(rp); LOG.info("skew = " + rp.hashSkew); @@ -331,7 +370,7 @@ public class DmozParser { */ public static void main(String argv[]) throws Exception { if (argv.length < 1) { - System.err.println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]"); + System.err.println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-snippet] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]"); return; } @@ -343,10 +382,12 @@ public class DmozParser { int skew = 0; String dmozFile = argv[0]; boolean includeAdult = false; + boolean snippet = false; Pattern topicPattern = null; Vector<String> topics = new Vector<String>(); Configuration conf = NutchConfiguration.create(); + store = StorageUtils.createWebStore(conf,String.class, WebPage.class); FileSystem fs = FileSystem.get(conf); try { for (int i = 1; i < argv.length; i++) { @@ -364,6 +405,8 @@ public class DmozParser { } else if ("-skew".equals(argv[i])) { skew = Integer.parseInt(argv[i+1]); i++; + }else if ("-snippet".equals(argv[i])) { + snippet = true; } } @@ -383,7 +426,7 @@ public class DmozParser { } parser.parseDmozFile(new File(dmozFile), subsetDenom, - includeAdult, skew, topicPattern); + 
includeAdult, skew, topicPattern, snippet); } finally { fs.close();