Modified: nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original) +++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Jan 9 06:34:33 2015 @@ -36,326 +36,253 @@ import org.junit.Before; import org.junit.Test; import static org.junit.Assert.*; -/** +/** * Unit tests for DOMContentUtils. */ public class TestDOMContentUtils { - private static final String[] testPages= { - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"http://www.nutch.org\">" - + " anchor </a><!--comment-->" - + "</body></html>"), - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"/\">" - + " home </a><!--comment-->" - + "<style> style </style>" - + " <a href=\"bot.html\">" - + " bots </a>" - + "</body></html>"), - new String("<html><head><title> </title>" - + "</head><body> " - + "<a href=\"/\"> separate this " - + "<a href=\"ok\"> from this" - + "</a></a>" - + "</body></html>"), - // this one relies on certain neko fixup behavior, possibly - // distributing the anchors into the LI's-but not the other - // anchors (outside of them, instead)! So you get a tree that - // looks like: - // ... 
<li> <a href=/> home </a> </li> - // <li> <a href=/> <a href="1"> 1 </a> </a> </li> - // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> - new String("<html><head><title> my title </title>" - + "</head><body> body " - + "<ul>" - + "<li> <a href=\"/\"> home" - + "<li> <a href=\"1\"> 1" - + "<li> <a href=\"2\"> 2" - + "</ul>" - + "</body></html>"), - // test frameset link extraction. The invalid frame in the middle will be - // fixed to a third standalone frame. - new String("<html><head><title> my title </title>" - + "</head><frameset rows=\"20,*\"> " - + "<frame src=\"top.html\">" - + "</frame>" - + "<frameset cols=\"20,*\">" - + "<frame src=\"left.html\">" - + "<frame src=\"invalid.html\"/>" - + "</frame>" - + "<frame src=\"right.html\">" - + "</frame>" - + "</frameset>" - + "</frameset>" - + "</body></html>"), - // test <area> and <iframe> link extraction + url normalization - new String("<html><head><title> my title </title>" - + "</head><body>" - + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" - + "<map name=\"green\">" - + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" - + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" - + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" - + "</map>" - + "<a name=\"bottom\"/><h1> the bottom </h1> " - + "<iframe src=\"../docs/index.html\"/>" - + "</body></html>"), - // test whitespace processing for plain text extraction - new String("<html><head>\n <title> my\t\n title\r\n </title>\n" - + " </head>\n" - + " <body>\n" - + " <h1> Whitespace\ttest </h1> \n" - + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" - + " <p> This is<span> a whitespace<span></span> test</span>. 
Newlines\n" - + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" - + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" - + "<table>" - + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" - + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" - + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" - + "</table>put some text here<Br>and there." - + "<h2>End\tthis\rmadness\n!</h2>\r\n" - + " . . . ." - + "</body> </html>"), - - // test that <a rel=nofollow> links are not returned - new String("<html><head></head><body>" - + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" - + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" - + "</body></html>"), - // test that POST form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - // test that all form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - new String("<html><head><title> title </title>" - + "</head><body>" - + "<a href=\";x\">anchor1</a>" - + "<a href=\"g;x\">anchor2</a>" - + "<a href=\"g;x?y#s\">anchor3</a>" - + "</body></html>"), - new String("<html><head><title> title </title>" - + "</head><body>" - + "<a href=\"g\">anchor1</a>" - + "<a href=\"g?y#s\">anchor2</a>" - + "<a href=\"?y=1\">anchor3</a>" - + "<a href=\"?y=1#s\">anchor4</a>" - + "<a href=\"?y=1;somethingelse\">anchor5</a>" - + "</body></html>"), - }; - + private static final String[] testPages = { + new String("<html><head><title> title </title><script> 
script </script>" + + "</head><body> body <a href=\"http://www.nutch.org\">" + + " anchor </a><!--comment-->" + "</body></html>"), + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->" + + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>" + + "</body></html>"), + new String("<html><head><title> </title>" + "</head><body> " + + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this" + + "</a></a>" + "</body></html>"), + // this one relies on certain neko fixup behavior, possibly + // distributing the anchors into the LI's-but not the other + // anchors (outside of them, instead)! So you get a tree that + // looks like: + // ... <li> <a href=/> home </a> </li> + // <li> <a href=/> <a href="1"> 1 </a> </a> </li> + // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> + new String("<html><head><title> my title </title>" + + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home" + + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>" + + "</body></html>"), + // test frameset link extraction. The invalid frame in the middle will be + // fixed to a third standalone frame. 
+ new String("<html><head><title> my title </title>" + + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">" + + "</frame>" + "<frameset cols=\"20,*\">" + + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>" + + "</frame>" + "<frame src=\"right.html\">" + "</frame>" + + "</frameset>" + "</frameset>" + "</body></html>"), + // test <area> and <iframe> link extraction + url normalization + new String( + "<html><head><title> my title </title>" + + "</head><body>" + + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" + + "<map name=\"green\">" + + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" + + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" + + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" + + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> " + + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"), + // test whitespace processing for plain text extraction + new String( + "<html><head>\n <title> my\t\n title\r\n </title>\n" + + " </head>\n" + + " <body>\n" + + " <h1> Whitespace\ttest </h1> \n" + + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" + + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" + + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" + + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" + + "<table>" + + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" + + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" + + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" + + "</table>put some text here<Br>and there." + + "<h2>End\tthis\rmadness\n!</h2>\r\n" + + " . . . ." 
+ "</body> </html>"), + + // test that <a rel=nofollow> links are not returned + new String("<html><head></head><body>" + + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" + + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" + + "</body></html>"), + // test that POST form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + // test that all form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>" + + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>" + + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>" + + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), }; + private static int SKIP = 9; - private static String[] testBaseHrefs= { - "http://www.nutch.org", - "http://www.nutch.org/docs/foo.html", - "http://www.nutch.org/docs/", - "http://www.nutch.org/docs/", - "http://www.nutch.org/frames/", - "http://www.nutch.org/maps/", - "http://www.nutch.org/whitespace/", - "http://www.nutch.org//", - "http://www.nutch.org/", - "http://www.nutch.org/", - "http://www.nutch.org/", - "http://www.nutch.org/;something" - }; - - private static final DocumentFragment testDOMs[]= - new DocumentFragment[testPages.length]; - - private static URL[] testBaseHrefURLs= - new 
URL[testPages.length]; - - - private static final String[] answerText= { - "title body anchor", - "title body home bots", - "separate this from this", - "my title body home 1 2", - "my title", - "my title the bottom", - "my title Whitespace test whitespace test " - + "This is a whitespace test . Newlines should appear as space too. " - + "Tabs are spaces too. This is a break -> and the line after break . " - + "one two three space here space there no space " - + "one two two three three four put some text here and there. " - + "End this madness ! . . . .", - "ignore ignore", - "test1 test2", - "test1 test2", - "title anchor1 anchor2 anchor3", - "title anchor1 anchor2 anchor3 anchor4 anchor5" - }; - - private static final String[] answerTitle= { - "title", - "title", - "", - "my title", - "my title", - "my title", - "my title", - "", - "", - "", - "title", - "title" - }; + private static String[] testBaseHrefs = { "http://www.nutch.org", + "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/", + "http://www.nutch.org/docs/", "http://www.nutch.org/frames/", + "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", + "http://www.nutch.org//", "http://www.nutch.org/", + "http://www.nutch.org/", "http://www.nutch.org/", + "http://www.nutch.org/;something" }; + + private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length]; + + private static URL[] testBaseHrefURLs = new URL[testPages.length]; + + private static final String[] answerText = { + "title body anchor", + "title body home bots", + "separate this from this", + "my title body home 1 2", + "my title", + "my title the bottom", + "my title Whitespace test whitespace test " + + "This is a whitespace test . Newlines should appear as space too. " + + "Tabs are spaces too. This is a break -> and the line after break . " + + "one two three space here space there no space " + + "one two two three three four put some text here and there. " + + "End this madness ! . . . 
.", "ignore ignore", "test1 test2", + "test1 test2", "title anchor1 anchor2 anchor3", + "title anchor1 anchor2 anchor3 anchor4 anchor5" }; + + private static final String[] answerTitle = { "title", "title", "", + "my title", "my title", "my title", "my title", "", "", "", "title", + "title" }; // note: should be in page-order private static Outlink[][] answerOutlinks; - + private static Configuration conf; private static DOMContentUtils utils = null; - + @Before public void setup() { conf = NutchConfiguration.create(); conf.setBoolean("parser.html.form.use_action", true); utils = new DOMContentUtils(conf); - DOMFragmentParser parser= new DOMFragmentParser(); + DOMFragmentParser parser = new DOMFragmentParser(); try { - parser.setFeature( - "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", - true); - } catch (SAXException e) {} - for (int i= 0; i < testPages.length; i++) { - DocumentFragment node= - new HTMLDocumentImpl().createDocumentFragment(); - try { - parser.parse( - new InputSource( - new ByteArrayInputStream(testPages[i].getBytes()) ), + parser + .setFeature( + "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", + true); + } catch (SAXException e) { + } + for (int i = 0; i < testPages.length; i++) { + DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); + try { + parser.parse( + new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), node); - testBaseHrefURLs[i]= new URL(testBaseHrefs[i]); - } catch (Exception e) { - assertTrue("caught exception: " + e, false); - } - testDOMs[i]= node; + testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); + } catch (Exception e) { + assertTrue("caught exception: " + e, false); + } + testDOMs[i] = node; } try { - answerOutlinks = new Outlink[][]{ - { - new Outlink("http://www.nutch.org", "anchor"), - }, - { - new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/bot.html", "bots"), - }, - { - new 
Outlink("http://www.nutch.org/", "separate this"), - new Outlink("http://www.nutch.org/docs/ok", "from this"), - }, - { - new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/1", "1"), - new Outlink("http://www.nutch.org/docs/2", "2"), - }, - { - new Outlink("http://www.nutch.org/frames/top.html", ""), - new Outlink("http://www.nutch.org/frames/left.html", ""), - new Outlink("http://www.nutch.org/frames/invalid.html", ""), - new Outlink("http://www.nutch.org/frames/right.html", ""), - }, - { - new Outlink("http://www.nutch.org/maps/logo.gif", ""), - new Outlink("http://www.nutch.org/index.html", ""), - new Outlink("http://www.nutch.org/maps/#bottom", ""), - new Outlink("http://www.nutch.org/bot.html", ""), - new Outlink("http://www.nutch.org/docs/index.html", ""), - }, - { - new Outlink("http://www.nutch.org/index.html", "whitespace test"), - }, - { - }, - { - new Outlink("http://www.nutch.org/dummy.jsp", "test2"), - }, - { - }, - { - new Outlink("http://www.nutch.org/;x", "anchor1"), - new Outlink("http://www.nutch.org/g;x", "anchor2"), - new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") - }, - { - // this is tricky - see RFC3986 section 5.4.1 example 7 - new Outlink("http://www.nutch.org/g", "anchor1"), - new Outlink("http://www.nutch.org/g?y#s", "anchor2"), - new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), - new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), - new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5") - } - }; + answerOutlinks = new Outlink[][] { + { new Outlink("http://www.nutch.org", "anchor"), }, + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, + { new Outlink("http://www.nutch.org/", "separate this"), + new Outlink("http://www.nutch.org/docs/ok", "from this"), }, + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/1", "1"), + new 
Outlink("http://www.nutch.org/docs/2", "2"), }, + { new Outlink("http://www.nutch.org/frames/top.html", ""), + new Outlink("http://www.nutch.org/frames/left.html", ""), + new Outlink("http://www.nutch.org/frames/invalid.html", ""), + new Outlink("http://www.nutch.org/frames/right.html", ""), }, + { new Outlink("http://www.nutch.org/maps/logo.gif", ""), + new Outlink("http://www.nutch.org/index.html", ""), + new Outlink("http://www.nutch.org/maps/#bottom", ""), + new Outlink("http://www.nutch.org/bot.html", ""), + new Outlink("http://www.nutch.org/docs/index.html", ""), }, + { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, + {}, + { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, + {}, + { new Outlink("http://www.nutch.org/;x", "anchor1"), + new Outlink("http://www.nutch.org/g;x", "anchor2"), + new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, + { + // this is tricky - see RFC3986 section 5.4.1 example 7 + new Outlink("http://www.nutch.org/g", "anchor1"), + new Outlink("http://www.nutch.org/g?y#s", "anchor2"), + new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), + new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), + new Outlink("http://www.nutch.org/;something?y=1;somethingelse", + "anchor5") } }; } catch (MalformedURLException e) { - - } + + } } private static boolean equalsIgnoreWhitespace(String s1, String s2) { - StringTokenizer st1= new StringTokenizer(s1); - StringTokenizer st2= new StringTokenizer(s2); + StringTokenizer st1 = new StringTokenizer(s1); + StringTokenizer st2 = new StringTokenizer(s2); while (st1.hasMoreTokens()) { - if (!st2.hasMoreTokens()) + if (!st2.hasMoreTokens()) return false; - if ( ! 
st1.nextToken().equals(st2.nextToken()) ) + if (!st1.nextToken().equals(st2.nextToken())) return false; } - if (st2.hasMoreTokens()) + if (st2.hasMoreTokens()) return false; return true; } @Test public void testGetText() { - if (testDOMs[0] == null) + if (testDOMs[0] == null) setup(); - for (int i= 0; i < testPages.length; i++) { - StringBuilder sb= new StringBuilder(); + for (int i = 0; i < testPages.length; i++) { + StringBuilder sb = new StringBuilder(); utils.getText(sb, testDOMs[i]); - String text= sb.toString(); - assertTrue("expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") - + "got text: "+ text, - equalsIgnoreWhitespace(answerText[i], text)); + String text = sb.toString(); + assertTrue( + "expecting text: " + answerText[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, + equalsIgnoreWhitespace(answerText[i], text)); } } @Test public void testGetTitle() { - if (testDOMs[0] == null) + if (testDOMs[0] == null) setup(); - for (int i= 0; i < testPages.length; i++) { - StringBuilder sb= new StringBuilder(); + for (int i = 0; i < testPages.length; i++) { + StringBuilder sb = new StringBuilder(); utils.getTitle(sb, testDOMs[i]); - String text= sb.toString(); - assertTrue("expecting text: " + answerText[i] - + System.getProperty("line.separator") - + System.getProperty("line.separator") - + "got text: "+ text, - equalsIgnoreWhitespace(answerTitle[i], text)); + String text = sb.toString(); + assertTrue( + "expecting text: " + answerText[i] + + System.getProperty("line.separator") + + System.getProperty("line.separator") + "got text: " + text, + equalsIgnoreWhitespace(answerTitle[i], text)); } } @Test public void testGetOutlinks() { - if (testDOMs[0] == null) + if (testDOMs[0] == null) setup(); - for (int i= 0; i < testPages.length; i++) { - ArrayList<Outlink> outlinks= new ArrayList<Outlink>(); + for (int i = 0; i < testPages.length; i++) { + 
ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); if (i == SKIP) { conf.setBoolean("parser.html.form.use_action", false); utils.setConf(conf); @@ -364,52 +291,48 @@ public class TestDOMContentUtils { utils.setConf(conf); } utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]); - Outlink[] outlinkArr= new Outlink[outlinks.size()]; - outlinkArr= outlinks.toArray(outlinkArr); + Outlink[] outlinkArr = new Outlink[outlinks.size()]; + outlinkArr = outlinks.toArray(outlinkArr); compareOutlinks(answerOutlinks[i], outlinkArr); } } private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { - for (int i= 0; i < o.length; i++) { + for (int i = 0; i < o.length; i++) { sb.append(o[i].toString()); sb.append(System.getProperty("line.separator")); } } private static final String outlinksString(Outlink[] o) { - StringBuffer sb= new StringBuffer(); + StringBuffer sb = new StringBuffer(); appendOutlinks(sb, o); return sb.toString(); } private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) { if (o1.length != o2.length) { - assertTrue("got wrong number of outlinks (expecting " + o1.length - + ", got " + o2.length + ")" - + System.getProperty("line.separator") - + "answer: " + System.getProperty("line.separator") - + outlinksString(o1) - + System.getProperty("line.separator") - + "got: " + System.getProperty("line.separator") - + outlinksString(o2) - + System.getProperty("line.separator"), - false - ); + assertTrue( + "got wrong number of outlinks (expecting " + o1.length + ", got " + + o2.length + ")" + System.getProperty("line.separator") + + "answer: " + System.getProperty("line.separator") + + outlinksString(o1) + System.getProperty("line.separator") + + "got: " + System.getProperty("line.separator") + + outlinksString(o2) + System.getProperty("line.separator"), + false); } - for (int i= 0; i < o1.length; i++) { + for (int i = 0; i < o1.length; i++) { if (!o1[i].equals(o2[i])) { - assertTrue("got wrong outlinks at position " + i - + 
System.getProperty("line.separator") - + "answer: " + System.getProperty("line.separator") - + "'" + o1[i].getToUrl() + "', anchor: '" + o1[i].getAnchor() + "'" - + System.getProperty("line.separator") - + "got: " + System.getProperty("line.separator") - + "'" + o2[i].getToUrl() + "', anchor: '" + o2[i].getAnchor() + "'", - false - ); - + assertTrue( + "got wrong outlinks at position " + i + + System.getProperty("line.separator") + "answer: " + + System.getProperty("line.separator") + "'" + o1[i].getToUrl() + + "', anchor: '" + o1[i].getAnchor() + "'" + + System.getProperty("line.separator") + "got: " + + System.getProperty("line.separator") + "'" + o2[i].getToUrl() + + "', anchor: '" + o2[i].getAnchor() + "'", false); + } } }
Modified: nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java (original) +++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java Fri Jan 9 06:34:33 2015 @@ -36,71 +36,55 @@ import static org.junit.Assert.*; public class TestHtmlParser { - public static final Logger LOG = LoggerFactory.getLogger(TestHtmlParser.class); + public static final Logger LOG = LoggerFactory + .getLogger(TestHtmlParser.class); + + private static final String encodingTestKeywords = "français, español, русский язык, čeština, ελληνικά"; + private static final String encodingTestBody = "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>"; + private static final String encodingTestContent = "<title>" + + encodingTestKeywords + "</title>\n" + + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + + "</meta>\n" + "</head>\n<body>" + encodingTestBody + "</body>\n</html>"; + + private static String[][] encodingTestPages = { + { + "HTML4, utf-8, meta http-equiv, no quotes", + "utf-8", + "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" " + + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" + + "<html>\n<head>\n" + + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />" + + encodingTestContent }, + { + "HTML4, utf-8, meta http-equiv, single quotes", + "utf-8", + "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" " + + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" + + "<html>\n<head>\n" + + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />" + + 
encodingTestContent }, + { + "XHTML, utf-8, meta http-equiv, double quotes", + "utf-8", + "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">" + + "<html>\n<head>\n" + + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />" + + encodingTestContent }, + { + "HTML5, utf-8, meta charset", + "utf-8", + "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">" + + encodingTestContent }, + { "HTML5, utf-8, BOM", "utf-8", + "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent }, + { "HTML5, utf-16, BOM", "utf-16", + "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } }; - private static final String encodingTestKeywords = - "français, español, русский язык, čeština, ελληνικά"; - private static final String encodingTestBody = - "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>"; - private static final String encodingTestContent = - "<title>" + encodingTestKeywords + "</title>\n" - + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "</meta>\n" - + "</head>\n<body>" + encodingTestBody + "</body>\n</html>"; - - private static String[][] encodingTestPages= { - { - "HTML4, utf-8, meta http-equiv, no quotes", - "utf-8", - "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" " - + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" - + "<html>\n<head>\n" - + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />" - + encodingTestContent - }, - { - "HTML4, utf-8, meta http-equiv, single quotes", - "utf-8", - "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" " - + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" - + "<html>\n<head>\n" - + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />" - + encodingTestContent - }, - { - "XHTML, utf-8, meta http-equiv, double quotes", - "utf-8", - "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">" - + "<html>\n<head>\n" - + "<meta 
http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />" - + encodingTestContent - }, - { - "HTML5, utf-8, meta charset", - "utf-8", - "<!DOCTYPE html>\n<html>\n<head>\n" - + "<meta charset=\"utf-8\">" - + encodingTestContent - }, - { - "HTML5, utf-8, BOM", - "utf-8", - "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" - + encodingTestContent - }, - { - "HTML5, utf-16, BOM", - "utf-16", - "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" - + encodingTestContent - } - }; - private Configuration conf; private Parser parser; - + private static final String dummyUrl = "http://dummy.url/"; - @Before public void setup() { conf = NutchConfiguration.create(); @@ -115,25 +99,25 @@ public class TestHtmlParser { page.setContentType(new Utf8("text/html")); return page; } - + protected Parse parse(WebPage page) { return parser.getParse(dummyUrl, page); } - @Test public void testEncodingDetection() { for (String[] testPage : encodingTestPages) { String name = testPage[0]; Charset charset = Charset.forName(testPage[1]); byte[] contentBytes = testPage[2].getBytes(charset); - //Parse parse = parse(contentBytes); + // Parse parse = parse(contentBytes); WebPage page = page(contentBytes); Parse parse = parse(page); String text = parse.getText(); String title = parse.getTitle(); - //String keywords = parse.getMeta("keywords"); - String keywords = Bytes.toString(page.getMetadata().get(new Utf8("keywords"))); + // String keywords = parse.getMeta("keywords"); + String keywords = Bytes.toString(page.getMetadata().get( + new Utf8("keywords"))); LOG.info(name); LOG.info("title:\t" + title); LOG.info("keywords:\t" + keywords); Modified: nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java?rev=1650447&r1=1650446&r2=1650447&view=diff 
============================================================================== --- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java (original) +++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java Fri Jan 9 06:34:33 2015 @@ -34,120 +34,96 @@ import static org.junit.Assert.*; public class TestRobotsMetaProcessor { /* - - some sample tags: - - <meta name="robots" content="index,follow"> - <meta name="robots" content="noindex,follow"> - <meta name="robots" content="index,nofollow"> - <meta name="robots" content="noindex,nofollow"> - - <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> - - */ - - - public static String[] tests= - { - "<html><head><title>test page</title>" - + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " - + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"all\"> " - + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " - + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"none\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,nofollow\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,follow\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,nofollow\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - 
"<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,follow\"> " - + "<base href=\"http://www.nutch.org/\">" - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\"> " - + "<base href=\"http://www.nutch.org/base/\">" - + "</head><body>" - + " some text" - + "</body></html>", + * + * some sample tags: + * + * <meta name="robots" content="index,follow"> <meta name="robots" + * content="noindex,follow"> <meta name="robots" content="index,nofollow"> + * <meta name="robots" content="noindex,nofollow"> + * + * <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> + */ + + public static String[] tests = { + "<html><head><title>test page</title>" + + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " + + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"all\"> " + + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " + + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"none\"> " + "</head><body>" + + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"noindex,nofollow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"noindex,follow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"index,nofollow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" 
content=\"index,follow\"> " + + "<base href=\"http://www.nutch.org/\">" + "</head><body>" + + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + "<meta name=\"robots\"> " + + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>" + + " some text" + "</body></html>", }; - public static final boolean[][] answers= { - {true, true, true}, // NONE - {false, false, true}, // all - {true, true, true}, // nOnE - {true, true, false}, // none - {true, true, false}, // noindex,nofollow - {true, false, false}, // noindex,follow - {false, true, false}, // index,nofollow - {false, false, false}, // index,follow - {false, false, false}, // missing! + public static final boolean[][] answers = { { true, true, true }, // NONE + { false, false, true }, // all + { true, true, true }, // nOnE + { true, true, false }, // none + { true, true, false }, // noindex,nofollow + { true, false, false }, // noindex,follow + { false, true, false }, // index,nofollow + { false, false, false }, // index,follow + { false, false, false }, // missing! 
}; private URL[][] currURLsAndAnswers; @Test public void testRobotsMetaProcessor() { - DOMFragmentParser parser= new DOMFragmentParser();; + DOMFragmentParser parser = new DOMFragmentParser(); + ; - try { - currURLsAndAnswers= new URL[][] { - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org/foo/"), - new URL("http://www.nutch.org/")}, - {new URL("http://www.nutch.org"), - new URL("http://www.nutch.org/base/")} - }; + try { + currURLsAndAnswers = new URL[][] { + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org/foo/"), + new URL("http://www.nutch.org/") }, + { new URL("http://www.nutch.org"), + new URL("http://www.nutch.org/base/") } }; } catch (Exception e) { assertTrue("couldn't make test URLs!", false); } - for (int i= 0; i < tests.length; i++) { - byte[] bytes= tests[i].getBytes(); + for (int i = 0; i < tests.length; i++) { + byte[] bytes = tests[i].getBytes(); DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); @@ -157,24 +133,22 @@ public class TestRobotsMetaProcessor { e.printStackTrace(); } - HTMLMetaTags robotsMeta= new HTMLMetaTags(); - HTMLMetaProcessor.getMetaTags(robotsMeta, node, - currURLsAndAnswers[i][0]); + HTMLMetaTags robotsMeta = new HTMLMetaTags(); + HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]); assertTrue("got index wrong on test " + i, - robotsMeta.getNoIndex() == answers[i][0]); + robotsMeta.getNoIndex() == answers[i][0]); 
assertTrue("got follow wrong on test " + i, - robotsMeta.getNoFollow() == answers[i][1]); + robotsMeta.getNoFollow() == answers[i][1]); assertTrue("got cache wrong on test " + i, - robotsMeta.getNoCache() == answers[i][2]); - assertTrue("got base href wrong on test " + i + " (got " - + robotsMeta.getBaseHref() + ")", - ( (robotsMeta.getBaseHref() == null) - && (currURLsAndAnswers[i][1] == null) ) - || ( (robotsMeta.getBaseHref() != null) - && robotsMeta.getBaseHref().equals( - currURLsAndAnswers[i][1]) ) ); - + robotsMeta.getNoCache() == answers[i][2]); + assertTrue( + "got base href wrong on test " + i + " (got " + + robotsMeta.getBaseHref() + ")", + ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) + || ((robotsMeta.getBaseHref() != null) && robotsMeta + .getBaseHref().equals(currURLsAndAnswers[i][1]))); + } } Modified: nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original) +++ nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Fri Jan 9 06:34:33 2015 @@ -56,11 +56,10 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; /** - * This class is a heuristic link extractor for JavaScript files and - * code snippets. The general idea of a two-pass regex matching comes from - * Heritrix. Parts of the code come from OutlinkExtractor.java - * by Stephan Strittmatter. - * + * This class is a heuristic link extractor for JavaScript files and code + * snippets. The general idea of a two-pass regex matching comes from Heritrix. + * Parts of the code come from OutlinkExtractor.java by Stephan Strittmatter. 
+ * * @author Andrzej Bialecki <a...@getopt.org> */ public class JSParseFilter implements ParseFilter, Parser { @@ -72,11 +71,17 @@ public class JSParseFilter implements Pa /** * Scan the JavaScript looking for possible {@link Outlink}'s - * @param url URL of the {@link WebPage} to be parsed - * @param page {@link WebPage} object relative to the URL - * @param parse {@link Parse} object holding parse status - * @param metatags within the {@link NutchDocument} - * @param doc The {@link NutchDocument} object + * + * @param url + * URL of the {@link WebPage} to be parsed + * @param page + * {@link WebPage} object relative to the URL + * @param parse + * {@link Parse} object holding parse status + * @param metatags + * within the {@link NutchDocument} + * @param doc + * The {@link NutchDocument} object * @return parse the actual {@link Parse} object */ @Override @@ -98,28 +103,34 @@ public class JSParseFilter implements Pa return parse; } - private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List<Outlink> outlinks) { + private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, + List<Outlink> outlinks) { if (n instanceof Element) { String name = n.getNodeName(); if (name.equalsIgnoreCase("script")) { @SuppressWarnings("unused") String lang = null; Node lNode = n.getAttributes().getNamedItem("language"); - if (lNode == null) lang = "javascript"; - else lang = lNode.getNodeValue(); + if (lNode == null) + lang = "javascript"; + else + lang = lNode.getNodeValue(); StringBuffer script = new StringBuffer(); NodeList nn = n.getChildNodes(); if (nn.getLength() > 0) { for (int i = 0; i < nn.getLength(); i++) { - if (i > 0) script.append('\n'); + if (i > 0) + script.append('\n'); script.append(nn.item(i).getNodeValue()); } // This logging makes the output very messy. 
- //if (LOG.isInfoEnabled()) { - // LOG.info("script: language=" + lang + ", text: " + script.toString()); - //} + // if (LOG.isInfoEnabled()) { + // LOG.info("script: language=" + lang + ", text: " + + // script.toString()); + // } Outlink[] links = getJSLinks(script.toString(), "", base); - if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links)); + if (links != null && links.length > 0) + outlinks.addAll(Arrays.asList(links)); // no other children of interest here, go one level up. return; } @@ -131,7 +142,8 @@ public class JSParseFilter implements Pa // Window: onload,onunload // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus // Keyboard: onkeydown,onkeypress,onkeyup - // Mouse: onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup + // Mouse: + // onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup Node anode = attrs.item(i); Outlink[] links = null; if (anode.getNodeName().startsWith("on")) { @@ -142,7 +154,8 @@ public class JSParseFilter implements Pa links = getJSLinks(val, "", base); } } - if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links)); + if (links != null && links.length > 0) + outlinks.addAll(Arrays.asList(links)); } } } @@ -154,42 +167,51 @@ public class JSParseFilter implements Pa /** * Set the {@link Configuration} object - * @param url URL of the {@link WebPage} which is parsed - * @param page {@link WebPage} object relative to the URL + * + * @param url + * URL of the {@link WebPage} which is parsed + * @param page + * {@link WebPage} object relative to the URL * @return parse the actual {@link Parse} object */ @Override public Parse getParse(String url, WebPage page) { String type = TableUtil.toString(page.getContentType()); - if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript")) - return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED_INVALID_FORMAT, - "Content not JavaScript: '" + type + "'", getConf()); + if 
(type != null && !type.trim().equals("") + && !type.toLowerCase().startsWith("application/x-javascript")) + return ParseStatusUtils.getEmptyParse( + ParseStatusCodes.FAILED_INVALID_FORMAT, "Content not JavaScript: '" + + type + "'", getConf()); String script = Bytes.toString(page.getContent()); Outlink[] outlinks = getJSLinks(script, "", url); - if (outlinks == null) outlinks = new Outlink[0]; + if (outlinks == null) + outlinks = new Outlink[0]; // Title? use the first line of the script... String title; int idx = script.indexOf('\n'); if (idx != -1) { - if (idx > MAX_TITLE_LEN) idx = MAX_TITLE_LEN; + if (idx > MAX_TITLE_LEN) + idx = MAX_TITLE_LEN; title = script.substring(0, idx); } else { idx = Math.min(MAX_TITLE_LEN, script.length()); title = script.substring(0, idx); } - Parse parse = - new Parse(script, title, outlinks, ParseStatusUtils.STATUS_SUCCESS); + Parse parse = new Parse(script, title, outlinks, + ParseStatusUtils.STATUS_SUCCESS); return parse; } private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)"; // A simple pattern. This allows also invalid URL characters. private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)"; + // Alternative pattern, which limits valid url characters. - //private static final String URI_PATTERN = "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)"; + // private static final String URI_PATTERN = + // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)"; /** - * This method extracts URLs from literals embedded in JavaScript. + * This method extracts URLs from literals embedded in JavaScript. 
*/ private Outlink[] getJSLinks(String plainText, String anchor, String base) { @@ -199,8 +221,8 @@ public class JSParseFilter implements Pa try { baseURL = new URL(base); } catch (Exception e) { - if (LOG.isErrorEnabled()) { - LOG.error("error assigning base URL", e); + if (LOG.isErrorEnabled()) { + LOG.error("error assigning base URL", e); } } @@ -208,10 +230,10 @@ public class JSParseFilter implements Pa final PatternCompiler cp = new Perl5Compiler(); final Pattern pattern = cp.compile(STRING_PATTERN, Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK - | Perl5Compiler.MULTILINE_MASK); + | Perl5Compiler.MULTILINE_MASK); final Pattern pattern1 = cp.compile(URI_PATTERN, Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK - | Perl5Compiler.MULTILINE_MASK); + | Perl5Compiler.MULTILINE_MASK); final PatternMatcher matcher = new Perl5Matcher(); final PatternMatcher matcher1 = new Perl5Matcher(); @@ -220,28 +242,28 @@ public class JSParseFilter implements Pa MatchResult result; String url; - //loop the matches + // loop the matches while (matcher.contains(input, pattern)) { result = matcher.getMatch(); url = result.group(2); PatternMatcherInput input1 = new PatternMatcherInput(url); if (!matcher1.matches(input1, pattern1)) { - if (LOG.isTraceEnabled()) { - LOG.trace(" - invalid '" + url + "'"); + if (LOG.isTraceEnabled()) { + LOG.trace(" - invalid '" + url + "'"); } continue; } if (url.startsWith("www.")) { url = "http://" + url; } else { - // See if candidate URL is parseable. If not, pass and move on to + // See if candidate URL is parseable. If not, pass and move on to // the next match. 
try { url = new URL(baseURL, url).toString(); } catch (MalformedURLException ex) { if (LOG.isTraceEnabled()) { - LOG.trace(" - failed URL parse '" + url + "' and baseURL '" + - baseURL + "'", ex); + LOG.trace(" - failed URL parse '" + url + "' and baseURL '" + + baseURL + "'", ex); } continue; } @@ -255,14 +277,14 @@ public class JSParseFilter implements Pa } catch (Exception ex) { // if it is a malformed URL we just throw it away and continue with // extraction. - if (LOG.isErrorEnabled()) { - LOG.error(" - invalid or malformed URL", ex); + if (LOG.isErrorEnabled()) { + LOG.error(" - invalid or malformed URL", ex); } } final Outlink[] retval; - //create array of the Outlinks + // create array of the Outlinks if (outlinks != null && outlinks.size() > 0) { retval = outlinks.toArray(new Outlink[0]); } else { @@ -273,8 +295,10 @@ public class JSParseFilter implements Pa } /** - * Main method which can be run from command line with the plugin option. - * The method takes two arguments e.g. o.a.n.parse.js.JSParseFilter file.js baseURL + * Main method which can be run from command line with the plugin option. The + * method takes two arguments e.g. o.a.n.parse.js.JSParseFilter file.js + * baseURL + * * @param args * @throws Exception */ @@ -287,7 +311,8 @@ public class JSParseFilter implements Pa BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8")); StringBuffer sb = new StringBuffer(); String line = null; - while ((line = br.readLine()) != null) sb.append(line + "\n"); + while ((line = br.readLine()) != null) + sb.append(line + "\n"); JSParseFilter parseFilter = new JSParseFilter(); parseFilter.setConf(NutchConfiguration.create()); Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]); @@ -311,10 +336,9 @@ public class JSParseFilter implements Pa } /** - * Gets all the fields for a given {@link WebPage} - * Many datastores need to setup the mapreduce job by specifying the fields - * needed. 
All extensions that work on WebPage are able to specify what fields - * they need. + * Gets all the fields for a given {@link WebPage} Many datastores need to + * setup the mapreduce job by specifying the fields needed. All extensions + * that work on WebPage are able to specify what fields they need. */ @Override public Collection<WebPage.Field> getFields() { Modified: nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java (original) +++ nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java Fri Jan 9 06:34:33 2015 @@ -20,3 +20,4 @@ * from JavaScript files and embedded JavaScript code snippets. */ package org.apache.nutch.parse.js; + Modified: nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java (original) +++ nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java Fri Jan 9 06:34:33 2015 @@ -38,9 +38,9 @@ import java.nio.ByteBuffer; import static org.junit.Assert.assertEquals; /** - * JUnit test case for {@link JSParseFilter} which tests - * 1. That 5 outlinks are extracted from JavaScript snippets embedded in HTML - * 2. 
That X outlinks are extracted from a pure JavaScript file (this is temporarily disabled) + * JUnit test case for {@link JSParseFilter} which tests 1. That 5 outlinks are + * extracted from JavaScript snippets embedded in HTML 2. That X outlinks are + * extracted from a pure JavaScript file (this is temporarily disabled) * * @author lewismc */ @@ -54,47 +54,53 @@ public class TestJSParseFilter { // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/parse-js/build.xml during plugin compilation. - private String[] sampleFiles = { "parse_pure_js_test.js", "parse_embedded_js_test.html" }; - + private String[] sampleFiles = { "parse_pure_js_test.js", + "parse_embedded_js_test.html" }; + private Configuration conf; - + @Before public void setUp() { conf = NutchConfiguration.create(); conf.set("file.content.limit", "-1"); } - public Outlink[] getOutlinks(String[] sampleFiles) throws ProtocolException, ParseException, IOException { + public Outlink[] getOutlinks(String[] sampleFiles) throws ProtocolException, + ParseException, IOException { String urlString; Parse parse; - + urlString = "file:" + sampleDir + fileSeparator + sampleFiles; File file = new File(urlString); byte[] bytes = new byte[(int) file.length()]; DataInputStream dip = new DataInputStream(new FileInputStream(file)); dip.readFully(bytes); dip.close(); - + WebPage page = WebPage.newBuilder().build(); page.setBaseUrl(new Utf8(urlString)); page.setContent(ByteBuffer.wrap(bytes)); MimeUtil mutil = new MimeUtil(conf); String mime = mutil.getMimeType(file); page.setContentType(new Utf8(mime)); - + parse = new ParseUtil(conf).parse(urlString, page); return parse.getOutlinks(); } - + @Test - public void testOutlinkExtraction() throws ProtocolException, ParseException, IOException { + public void testOutlinkExtraction() throws ProtocolException, ParseException, + IOException { String[] filenames = new File(sampleDir).list(); for (int i = 0; i < filenames.length; i++) { if 
(filenames[i].endsWith(".js") == true) { - assertEquals("number of outlinks in .js test file should be 5", 5, getOutlinks(sampleFiles)); - // temporarily disabled as a suitable pure JS file could not be be found. - //} else { - //assertEquals("number of outlinks in .html file should be X", 5, getOutlinks(sampleFiles)); + assertEquals("number of outlinks in .js test file should be 5", 5, + getOutlinks(sampleFiles)); + // temporarily disabled as a suitable pure JS file could not be be + // found. + // } else { + // assertEquals("number of outlinks in .html file should be X", 5, + // getOutlinks(sampleFiles)); } } } Modified: nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java (original) +++ nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java Fri Jan 9 06:34:33 2015 @@ -21,3 +21,4 @@ * (see {@link org.apache.nutch.indexer.metadata}). 
*/ package org.apache.nutch.parse.metatags; + Modified: nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java (original) +++ nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java Fri Jan 9 06:34:33 2015 @@ -59,14 +59,15 @@ public class TestMetaTagsParser { /** * - * + * * @param fileName * This variable set test file. * @param useUtil * If value is True method use ParseUtil * @return If successfully document parsed, it return metatags */ - public Map<CharSequence, ByteBuffer> parseMetaTags(String fileName, boolean useUtil) { + public Map<CharSequence, ByteBuffer> parseMetaTags(String fileName, + boolean useUtil) { try { Configuration conf = NutchConfiguration.create(); String urlString = "file:" + sampleDir + fileSeparator + fileName;