Author: dogacan Date: Sun Jun 24 03:04:30 2007 New Revision: 550196 URL: http://svn.apache.org/viewvc?view=rev&rev=550196 Log: NUTCH-504 - Parsing during fetching is broken.
Added: lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/test/crawl-tests.xml lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=550196&r1=550195&r2=550196 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Jun 24 03:04:30 2007 @@ -62,6 +62,8 @@ 19. NUTCH-468 - Scoring filter should distribute score to all outlinks at once. (dogacan) +20. NUTCH-504 - NUTCH-443 broke parsing during fetching. (dogacan) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=550196&r1=550195&r2=550196 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sun Jun 24 03:04:30 2007 @@ -311,41 +311,7 @@ LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e)); } - if (parseResult != null) { - for (Entry<Text, Parse> entry : parseResult) { - Text url = entry.getKey(); - Parse parse = entry.getValue(); - ParseStatus parseStatus = parse.getData().getStatus(); - - if (!parseStatus.isSuccess()) { - LOG.warn("Error parsing: " + key + ": " + parseStatus); - parse = parseStatus.getEmptyParse(getConf()); - } - - // Calculate page signature. For non-parsing fetchers this will - // be done in ParseSegment - byte[] signature = - SignatureFactory.getSignature(getConf()).calculate(content, parse); - // Ensure segment name and score are in parseData metadata - parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, - segmentName); - parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, - StringUtil.toHexString(signature)); - // Pass fetch time to content meta - parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY, - Long.toString(datum.getFetchTime())); - if (url.equals(key)) - datum.setSignature(signature); - try { - scfilters.passScoreAfterParsing(url, content, parse); - } catch (Exception e) { - if (LOG.isWarnEnabled()) { - e.printStackTrace(LogUtil.getWarnStream(LOG)); - LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); - } - } - } - } else { + if (parseResult == null) { byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, new ParseStatus().getEmptyParse(conf)); @@ -360,8 +326,40 @@ output.collect(key, new ObjectWritable(content)); if (parseResult != null) { for (Entry<Text, Parse> entry : parseResult) { - output.collect(entry.getKey(), - new ObjectWritable(new ParseImpl(entry.getValue()))); + Text url = entry.getKey(); + Parse parse = entry.getValue(); + ParseStatus parseStatus = parse.getData().getStatus(); + + if (!parseStatus.isSuccess()) { + LOG.warn("Error parsing: " + key + ": " + parseStatus); + parse = parseStatus.getEmptyParse(getConf()); + } + + // Calculate page signature. For non-parsing fetchers this will + // be done in ParseSegment + byte[] signature = + SignatureFactory.getSignature(getConf()).calculate(content, parse); + // Ensure segment name and score are in parseData metadata + parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, + segmentName); + parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, + StringUtil.toHexString(signature)); + // Pass fetch time to content meta + parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY, + Long.toString(datum.getFetchTime())); + if (url.equals(key)) + datum.setSignature(signature); + try { + scfilters.passScoreAfterParsing(url, content, parse); + } catch (Exception e) { + if (LOG.isWarnEnabled()) { + e.printStackTrace(LogUtil.getWarnStream(LOG)); + LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); + } + } + output.collect(url, new ObjectWritable( + new ParseImpl(new ParseText(parse.getText()), + parse.getData(), parse.isCanonical()))); } } } catch (IOException e) { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diff&rev=550196&r1=550195&r2=550196 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Sun Jun 24 03:04:30 2007 @@ -685,41 +685,7 @@ LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e)); } - if (parseResult != null) { - for (Entry<Text, Parse> entry : parseResult) { - Text url = entry.getKey(); - Parse parse = entry.getValue(); - ParseStatus parseStatus = parse.getData().getStatus(); - - if (!parseStatus.isSuccess()) { - LOG.warn("Error parsing: " + key + ": " + parseStatus); - parse = parseStatus.getEmptyParse(getConf()); - } - - // Calculate page signature. For non-parsing fetchers this will - // be done in ParseSegment - byte[] signature = - SignatureFactory.getSignature(getConf()).calculate(content, parse); - // Ensure segment name and score are in parseData metadata - parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, - segmentName); - parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, - StringUtil.toHexString(signature)); - // Pass fetch time to content meta - parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY, - Long.toString(datum.getFetchTime())); - if (url.equals(key)) - datum.setSignature(signature); - try { - scfilters.passScoreAfterParsing(url, content, parse); - } catch (Exception e) { - if (LOG.isWarnEnabled()) { - e.printStackTrace(LogUtil.getWarnStream(LOG)); - LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); - } - } - } - } else { + if (parseResult == null) { byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, new ParseStatus().getEmptyParse(conf)); @@ -730,12 +696,44 @@ try { output.collect(key, new ObjectWritable(datum)); - if (storingContent) + if (content != null && storingContent) output.collect(key, new ObjectWritable(content)); if (parseResult != null) { for (Entry<Text, Parse> entry : parseResult) { - output.collect(entry.getKey(), - new ObjectWritable(new ParseImpl(entry.getValue()))); + Text url = entry.getKey(); + Parse parse = entry.getValue(); + ParseStatus parseStatus = parse.getData().getStatus(); + + if (!parseStatus.isSuccess()) { + LOG.warn("Error parsing: " + key + ": " + parseStatus); + parse = parseStatus.getEmptyParse(getConf()); + } + + // Calculate page signature. For non-parsing fetchers this will + // be done in ParseSegment + byte[] signature = + SignatureFactory.getSignature(getConf()).calculate(content, parse); + // Ensure segment name and score are in parseData metadata + parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, + segmentName); + parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, + StringUtil.toHexString(signature)); + // Pass fetch time to content meta + parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY, + Long.toString(datum.getFetchTime())); + if (url.equals(key)) + datum.setSignature(signature); + try { + scfilters.passScoreAfterParsing(url, content, parse); + } catch (Exception e) { + if (LOG.isWarnEnabled()) { + e.printStackTrace(LogUtil.getWarnStream(LOG)); + LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); + } + } + output.collect(url, new ObjectWritable( + new ParseImpl(new ParseText(parse.getText()), + parse.getData(), parse.isCanonical()))); } } } catch (IOException e) { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=550196&r1=550195&r2=550196 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Sun Jun 24 03:04:30 2007 @@ -186,6 +186,10 @@ || parseText == null || parseData == null) { return; // only have inlinks } + + if (!parseData.getStatus().isSuccess()) { + return; + } Document doc = new Document(); Metadata metadata = parseData.getContentMeta(); Modified: lucene/nutch/trunk/src/test/crawl-tests.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/crawl-tests.xml?view=diff&rev=550196&r1=550195&r2=550196 ============================================================================== --- lucene/nutch/trunk/src/test/crawl-tests.xml (original) +++ lucene/nutch/trunk/src/test/crawl-tests.xml Sun Jun 24 03:04:30 2007 @@ -28,4 +28,10 @@ <value>test-nutch</value> </property> -</configuration> \ No newline at end of file +<property> + <name>http.robots.agents</name> + <value>test-nutch,*</value> +</property> + +</configuration> + Modified: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?view=diff&rev=550196&r1=550195&r2=550196 ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Sun Jun 24 03:04:30 2007 @@ -28,6 +28,9 @@ import org.apache.nutch.crawl.CrawlDBTestUtil; import org.apache.nutch.crawl.Generator; import org.apache.nutch.crawl.Injector; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.parse.ParseData; import org.apache.nutch.protocol.Content; import org.mortbay.jetty.Server; @@ -78,6 +81,7 @@ addUrl(urls,"pagea.html"); addUrl(urls,"pageb.html"); addUrl(urls,"dup_of_pagea.html"); + addUrl(urls,"exception.html"); CrawlDBTestUtil.generateSeedList(fs, urlPath, urls); @@ -102,17 +106,17 @@ int minimumTime=(int) ((urls.size()+1)*1000*conf.getFloat("fetcher.server.delay",5)); assertTrue(time > minimumTime); - //verify results + //verify content Path content=new Path(new Path(generatedSegment, Content.DIR_NAME),"part-00000/data"); SequenceFile.Reader reader=new SequenceFile.Reader(fs, content, conf); ArrayList<String> handledurls=new ArrayList<String>(); - READ: + READ_CONTENT: do { Text key=new Text(); Content value=new Content(); - if(!reader.next(key, value)) break READ; + if(!reader.next(key, value)) break READ_CONTENT; String contentString=new String(value.getContent()); if(contentString.indexOf("Nutch fetcher test page")!=-1) { handledurls.add(key.toString()); @@ -130,7 +134,33 @@ //verify that correct pages were handled assertTrue(handledurls.containsAll(urls)); assertTrue(urls.containsAll(handledurls)); + + handledurls.clear(); + //verify parse data + Path parseData = new Path(new Path(generatedSegment, ParseData.DIR_NAME),"part-00000/data"); + reader = new SequenceFile.Reader(fs, parseData, conf); + + READ_PARSE_DATA: + do { + Text key = new Text(); + ParseData value = new ParseData(); + if(!reader.next(key, value)) break READ_PARSE_DATA; + // make sure they all contain "nutch.segment.name" and "nutch.content.digest" + // keys in parse metadata + Metadata contentMeta = value.getContentMeta(); + if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null + && contentMeta.get(Nutch.SIGNATURE_KEY) != null) { + handledurls.add(key.toString()); + } + } while(true); + + Collections.sort(handledurls); + + assertEquals(urls.size(), handledurls.size()); + + assertTrue(handledurls.containsAll(urls)); + assertTrue(urls.containsAll(handledurls)); } private void addUrl(ArrayList<String> urls, String page) { Added: lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html?view=auto&rev=550196 ============================================================================== --- lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html (added) +++ lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html Sun Jun 24 03:04:30 2007 @@ -0,0 +1,13 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"> +<HTML> +<HEAD> +<TITLE>Exception</TITLE> +<META http-equiv="Content-Type" content="text/html; charset=unicode"> +</HEAD> +<BODY> +!!Trying to parse this one will fail with a MalformedInputException!! + +Nutch fetcher test page. +</BODY> +</HTML> + ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs