Author: kubes Date: Sat Mar 10 09:40:20 2007 New Revision: 516757 URL: http://svn.apache.org/viewvc?view=rev&rev=516757 Log: NUTCH-436 resolved. Fixed the behavior of URLs with param (i.e. ;xxxx) information. Finally found a workaround for the problems that I was experiencing with EOL characters.
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?view=diff&rev=516757&r1=516756&r2=516757 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original) +++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Sat Mar 10 09:40:20 2007 @@ -282,6 +282,51 @@ return false; } + + /** + * Handles cases where the url param information is encoded into the base + * url as opposed to the target. + * <p> + * If the target contains params (i.e. ';xxxx') information then the target + * params information is assumed to be correct and any base params information + * is ignored. If the base contains params information but the target does + * not, then the params information is moved to the target allowing it to be + * correctly determined by the java.net.URL class. + * + * @param base The base URL. + * @param target The target path from the base URL. + * + * @return URL A URL with the params information correctly encoded. + * + * @throws MalformedURLException If the url is not a well formed URL. 
+ */ + private URL fixEmbeddedParams(URL base, String target) + throws MalformedURLException{ + + // the target contains params information or the base doesn't then no + // conversion necessary, return regular URL + if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) { + return new URL(base, target); + } + + // get the base url and it params information + String baseURL = base.toString(); + int startParams = baseURL.indexOf(';'); + String params = baseURL.substring(startParams); + + // if the target has a query string then put the params information after + // any path but before the query string, otherwise just append to the path + int startQS = target.indexOf('?'); + if (startQS >= 0) { + target = target.substring(0, startQS) + params + + target.substring(startQS); + } + else { + target += params; + } + + return new URL(base, target); + } /** * This method finds all anchors below the supplied DOM @@ -333,7 +378,9 @@ } if (target != null && !noFollow && !post) try { - URL url = new URL(base, target); + + URL url = (base.toString().indexOf(';') > 0) ? 
+ fixEmbeddedParams(base, target) : new URL(base, target); outlinks.add(new Outlink(url.toString(), linkText.toString().trim(), conf)); } catch (MalformedURLException e) { Modified: lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?view=diff&rev=516757&r1=516756&r2=516757 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original) +++ lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Sat Mar 10 09:40:20 2007 @@ -134,6 +134,20 @@ + "<input type=submit><p>test1</p></form>" + "<form method='GET' action='/dummy.jsp'><input type=text>" + "<input type=submit><p>test2</p></form></body></html>"), + new String("<html><head><title> title </title>" + + "</head><body>" + + "<a href=\";x\">anchor1</a>" + + "<a href=\"g;x\">anchor2</a>" + + "<a href=\"g;x?y#s\">anchor3</a>" + + "</body></html>"), + new String("<html><head><title> title </title>" + + "</head><body>" + + "<a href=\"g\">anchor1</a>" + + "<a href=\"g?y#s\">anchor2</a>" + + "<a href=\"?y=1\">anchor3</a>" + + "<a href=\"?y=1#s\">anchor4</a>" + + "<a href=\"?y=1;somethingelse\">anchor5</a>" + + "</body></html>"), }; private static int SKIP = 9; @@ -149,6 +163,8 @@ "http://www.nutch.org//", "http://www.nutch.org/", "http://www.nutch.org/", + "http://www.nutch.org/", + "http://www.nutch.org/;something" }; private static final DocumentFragment testDOMs[]= @@ -173,7 +189,9 @@ + "End this madness ! . . . 
.", "ignore ignore", "test1 test2", - "test1 test2" + "test1 test2", + "title anchor1 anchor2 anchor3", + "title anchor1 anchor2 anchor3 anchor4 anchor5" }; private static final String[] answerTitle= { @@ -186,7 +204,9 @@ "my title", "", "", - "" + "", + "title", + "title" }; // note: should be in page-order @@ -258,6 +278,18 @@ new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf), }, { + }, + { + new Outlink("http://www.nutch.org/;x", "anchor1", conf), + new Outlink("http://www.nutch.org/g;x", "anchor2", conf), + new Outlink("http://www.nutch.org/g;x?y#s", "anchor3", conf) + }, + { + new Outlink("http://www.nutch.org/g;something", "anchor1", conf), + new Outlink("http://www.nutch.org/g;something?y#s", "anchor2", conf), + new Outlink("http://www.nutch.org/;something?y=1", "anchor3", conf), + new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4", conf), + new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5", conf) } }; ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs