Modified: nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java Thu Jan 29 05:38:59 2015 @@ -42,129 +42,129 @@ public class TestDOMContentUtils { private static final String[] testPages = { - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"http://www.nutch.org\">" - + " anchor </a><!--comment-->" + "</body></html>"), - - new String("<html><head><title> title </title><script> script </script>" - + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->" - + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>" - + "</body></html>"), - - new String("<html><head><title> </title>" + "</head><body> " - + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this" - + "</a></a>" + "</body></html>"), - - // this one relies on certain neko fixup behavior, possibly - // distributing the anchors into the LI's-but not the other - // anchors (outside of them, instead)! So you get a tree that - // looks like: - // ... <li> <a href=/> home </a> </li> - // <li> <a href=/> <a href="1"> 1 </a> </a> </li> - // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> - new String("<html><head><title> my title </title>" - + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home" - + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>" - + "</body></html>"), - - // test frameset link extraction. The invalid frame in the middle - // will be - // fixed to a third standalone frame. - new String("<html><head><title> my title </title>" - + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">" - + "</frame>" + "<frameset cols=\"20,*\">" - + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>" - + "</frame>" + "<frame src=\"right.html\">" + "</frame>" - + "</frameset>" + "</frameset>" + "</body></html>"), - - // test <area> and <iframe> link extraction + url normalization - new String( - "<html><head><title> my title </title>" - + "</head><body>" - + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" - + "<map name=\"green\">" - + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" - + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" - + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" - + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> " - + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"), - - // test whitespace processing for plain text extraction - new String( - "<html><head>\n <title> my\t\n title\r\n </title>\n" - + " </head>\n" - + " <body>\n" - + " <h1> Whitespace\ttest </h1> \n" - + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" - + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" - + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" - + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" - + "<table>" - + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" - + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" - + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" - + "</table>put some text here<Br>and there." - + "<h2>End\tthis\rmadness\n!</h2>\r\n" - + " . . . ." + "</body> </html>"), - - // test that <a rel=nofollow> links are not returned - new String("<html><head></head><body>" - + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" - + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" - + "</body></html>"), - // test that POST form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - // test that all form actions are skipped - new String("<html><head></head><body>" - + "<form method='POST' action='/search.jsp'><input type=text>" - + "<input type=submit><p>test1</p></form>" - + "<form method='GET' action='/dummy.jsp'><input type=text>" - + "<input type=submit><p>test2</p></form></body></html>"), - new String("<html><head><title> title </title>" + "</head><body>" - + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>" - + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"), - new String("<html><head><title> title </title>" + "</head><body>" - + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>" - + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>" - + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), }; + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"http://www.nutch.org\">" + + " anchor </a><!--comment-->" + "</body></html>"), + + new String("<html><head><title> title </title><script> script </script>" + + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->" + + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>" + + "</body></html>"), + + new String("<html><head><title> </title>" + "</head><body> " + + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this" + + "</a></a>" + "</body></html>"), + + // this one relies on certain neko fixup behavior, possibly + // distributing the anchors into the LI's-but not the other + // anchors (outside of them, instead)! So you get a tree that + // looks like: + // ... <li> <a href=/> home </a> </li> + // <li> <a href=/> <a href="1"> 1 </a> </a> </li> + // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li> + new String("<html><head><title> my title </title>" + + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home" + + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>" + + "</body></html>"), + + // test frameset link extraction. The invalid frame in the middle + // will be + // fixed to a third standalone frame. + new String("<html><head><title> my title </title>" + + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">" + + "</frame>" + "<frameset cols=\"20,*\">" + + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>" + + "</frame>" + "<frame src=\"right.html\">" + "</frame>" + + "</frameset>" + "</frameset>" + "</body></html>"), + + // test <area> and <iframe> link extraction + url normalization + new String( + "<html><head><title> my title </title>" + + "</head><body>" + + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">" + + "<map name=\"green\">" + + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">" + + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">" + + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">" + + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> " + + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"), + + // test whitespace processing for plain text extraction + new String( + "<html><head>\n <title> my\t\n title\r\n </title>\n" + + " </head>\n" + + " <body>\n" + + " <h1> Whitespace\ttest </h1> \n" + + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n" + + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n" + + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>" + + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n" + + "<table>" + + " <tr><td>one</td><td>two</td><td>three</td></tr>\n" + + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>" + + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n" + + "</table>put some text here<Br>and there." + + "<h2>End\tthis\rmadness\n!</h2>\r\n" + + " . . . ." + "</body> </html>"), + + // test that <a rel=nofollow> links are not returned + new String("<html><head></head><body>" + + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>" + + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>" + + "</body></html>"), + // test that POST form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + // test that all form actions are skipped + new String("<html><head></head><body>" + + "<form method='POST' action='/search.jsp'><input type=text>" + + "<input type=submit><p>test1</p></form>" + + "<form method='GET' action='/dummy.jsp'><input type=text>" + + "<input type=submit><p>test2</p></form></body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>" + + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"), + new String("<html><head><title> title </title>" + "</head><body>" + + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>" + + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>" + + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), }; private static int SKIP = 9; private static String[] testBaseHrefs = { "http://www.nutch.org", - "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/", - "http://www.nutch.org/docs/", "http://www.nutch.org/frames/", - "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", - "http://www.nutch.org//", "http://www.nutch.org/", - "http://www.nutch.org/", "http://www.nutch.org/", - "http://www.nutch.org/;something" }; + "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/", + "http://www.nutch.org/docs/", "http://www.nutch.org/frames/", + "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/", + "http://www.nutch.org//", "http://www.nutch.org/", + "http://www.nutch.org/", "http://www.nutch.org/", + "http://www.nutch.org/;something" }; private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length]; private static URL[] testBaseHrefURLs = new URL[testPages.length]; private static final String[] answerText = { - "title body anchor", - "title body home bots", - "separate this from this", - "my title body home 1 2", - "my title", - "my title the bottom", - "my title Whitespace test whitespace test " - + "This is a whitespace test . Newlines should appear as space too. " - + "Tabs are spaces too. This is a break -> and the line after break . " - + "one two three space here space there no space " - + "one two two three three four put some text here and there. " - + "End this madness ! . . . .", "ignore ignore", "test1 test2", - "test1 test2", "title anchor1 anchor2 anchor3", - "title anchor1 anchor2 anchor3 anchor4 anchor5" }; + "title body anchor", + "title body home bots", + "separate this from this", + "my title body home 1 2", + "my title", + "my title the bottom", + "my title Whitespace test whitespace test " + + "This is a whitespace test . Newlines should appear as space too. " + + "Tabs are spaces too. This is a break -> and the line after break . " + + "one two three space here space there no space " + + "one two two three three four put some text here and there. " + + "End this madness ! . . . .", "ignore ignore", "test1 test2", + "test1 test2", "title anchor1 anchor2 anchor3", + "title anchor1 anchor2 anchor3 anchor4 anchor5" }; private static final String[] answerTitle = { "title", "title", "", - "my title", "my title", "my title", "my title", "", "", "", "title", - "title" }; + "my title", "my title", "my title", "my title", "", "", "", "title", + "title" }; // note: should be in page-order private static Outlink[][] answerOutlinks; @@ -196,36 +196,36 @@ public class TestDOMContentUtils { answerOutlinks = new Outlink[][] { { new Outlink("http://www.nutch.org", "anchor"), }, { new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, - { new Outlink("http://www.nutch.org/", "separate this"), + new Outlink("http://www.nutch.org/docs/bot.html", "bots"), }, + { new Outlink("http://www.nutch.org/", "separate this"), new Outlink("http://www.nutch.org/docs/ok", "from this"), }, - { new Outlink("http://www.nutch.org/", "home"), - new Outlink("http://www.nutch.org/docs/1", "1"), - new Outlink("http://www.nutch.org/docs/2", "2"), }, - { new Outlink("http://www.nutch.org/frames/top.html", ""), - new Outlink("http://www.nutch.org/frames/left.html", ""), - new Outlink("http://www.nutch.org/frames/invalid.html", ""), - new Outlink("http://www.nutch.org/frames/right.html", ""), }, - { new Outlink("http://www.nutch.org/maps/logo.gif", ""), - new Outlink("http://www.nutch.org/index.html", ""), - new Outlink("http://www.nutch.org/maps/#bottom", ""), - new Outlink("http://www.nutch.org/bot.html", ""), - new Outlink("http://www.nutch.org/docs/index.html", ""), }, - { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, - {}, - { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, - {}, - { new Outlink("http://www.nutch.org/;x", "anchor1"), - new Outlink("http://www.nutch.org/g;x", "anchor2"), - new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, - { - // this is tricky - see RFC3986 section 5.4.1 example 7 - new Outlink("http://www.nutch.org/g", "anchor1"), - new Outlink("http://www.nutch.org/g?y#s", "anchor2"), - new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), - new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), - new Outlink("http://www.nutch.org/;something?y=1;somethingelse", - "anchor5") } }; + { new Outlink("http://www.nutch.org/", "home"), + new Outlink("http://www.nutch.org/docs/1", "1"), + new Outlink("http://www.nutch.org/docs/2", "2"), }, + { new Outlink("http://www.nutch.org/frames/top.html", ""), + new Outlink("http://www.nutch.org/frames/left.html", ""), + new Outlink("http://www.nutch.org/frames/invalid.html", ""), + new Outlink("http://www.nutch.org/frames/right.html", ""), }, + { new Outlink("http://www.nutch.org/maps/logo.gif", ""), + new Outlink("http://www.nutch.org/index.html", ""), + new Outlink("http://www.nutch.org/maps/#bottom", ""), + new Outlink("http://www.nutch.org/bot.html", ""), + new Outlink("http://www.nutch.org/docs/index.html", ""), }, + { new Outlink("http://www.nutch.org/index.html", "whitespace test"), }, + {}, + { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), }, + {}, + { new Outlink("http://www.nutch.org/;x", "anchor1"), + new Outlink("http://www.nutch.org/g;x", "anchor2"), + new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") }, + { + // this is tricky - see RFC3986 section 5.4.1 example 7 + new Outlink("http://www.nutch.org/g", "anchor1"), + new Outlink("http://www.nutch.org/g?y#s", "anchor2"), + new Outlink("http://www.nutch.org/;something?y=1", "anchor3"), + new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"), + new Outlink("http://www.nutch.org/;something?y=1;somethingelse", + "anchor5") } }; } @@ -256,7 +256,7 @@ public class TestDOMContentUtils { "expecting text: " + answerText[i] + System.getProperty("line.separator") + System.getProperty("line.separator") + "got text: " + text, - equalsIgnoreWhitespace(answerText[i], text)); + equalsIgnoreWhitespace(answerText[i], text)); } } @@ -272,7 +272,7 @@ public class TestDOMContentUtils { "expecting text: " + answerText[i] + System.getProperty("line.separator") + System.getProperty("line.separator") + "got text: " + text, - equalsIgnoreWhitespace(answerTitle[i], text)); + equalsIgnoreWhitespace(answerTitle[i], text)); } } @@ -318,19 +318,19 @@ public class TestDOMContentUtils { + outlinksString(o1) + System.getProperty("line.separator") + "got: " + System.getProperty("line.separator") + outlinksString(o2) + System.getProperty("line.separator"), - false); + false); } for (int i = 0; i < o1.length; i++) { if (!o1[i].equals(o2[i])) { Assert.assertTrue( "got wrong outlinks at position " + i - + System.getProperty("line.separator") + "answer: " - + System.getProperty("line.separator") + "'" + o1[i].getToUrl() - + "', anchor: '" + o1[i].getAnchor() + "'" - + System.getProperty("line.separator") + "got: " - + System.getProperty("line.separator") + "'" + o2[i].getToUrl() - + "', anchor: '" + o2[i].getAnchor() + "'", false); + + System.getProperty("line.separator") + "answer: " + + System.getProperty("line.separator") + "'" + o1[i].getToUrl() + + "', anchor: '" + o1[i].getAnchor() + "'" + + System.getProperty("line.separator") + "got: " + + System.getProperty("line.separator") + "'" + o2[i].getToUrl() + + "', anchor: '" + o2[i].getAnchor() + "'", false); } } }
Modified: nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java Thu Jan 29 05:38:59 2015 @@ -80,8 +80,8 @@ public class TestFeedParser { protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", - content).get(content.getUrl()); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); // check that there are 2 outlinks: // unlike the original parse-rss Modified: nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java Thu Jan 29 05:38:59 2015 @@ -31,18 +31,16 @@ import org.apache.nutch.crawl.CrawlDatum import org.junit.Assert; import org.junit.Test; -/** +/** * Test extraction of image metadata */ public class TestImageMetadata { private String fileSeparator = System.getProperty("file.separator"); // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data","."); + private String sampleDir = System.getProperty("test.data", "."); // Make sure sample files are copied to "test.data" as specified in - private String[] sampleFiles = { - "nutch_logo_tm.gif", - }; + private String[] sampleFiles = { "nutch_logo_tm.gif", }; @Test public void testIt() throws ProtocolException, ParseException { @@ -56,8 +54,10 @@ public class TestImageMetadata { Configuration conf = NutchConfiguration.create(); protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl()); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); Assert.assertEquals("121", parse.getData().getMeta("width")); Assert.assertEquals("48", parse.getData().getMeta("height")); Modified: nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java Thu Jan 29 05:38:59 2015 @@ -34,20 +34,20 @@ import org.junit.Test; import java.io.File; -/** +/** * Unit tests for MSWordParser. - * + * * @author John Xing */ public class TestMSWordParser { private String fileSeparator = System.getProperty("file.separator"); // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data","."); + private String sampleDir = System.getProperty("test.data", "."); // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/parse-msword/build.xml during plugin compilation. // Check ./src/plugin/parse-msword/sample/README.txt for what they are. - private String[] sampleFiles = {"word97.doc"}; + private String[] sampleFiles = { "word97.doc" }; private String expectedText = "This is a sample doc file prepared for nutch."; @@ -59,19 +59,23 @@ public class TestMSWordParser { conf.set("file.content.limit", "-1"); } - public String getTextContent(String fileName) throws ProtocolException, ParseException { + public String getTextContent(String fileName) throws ProtocolException, + ParseException { String urlString = "file:" + sampleDir + fileSeparator + fileName; Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); - Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl()); + Content content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); return parse.getText(); } @Test public void testIt() throws ProtocolException, ParseException { - for (int i=0; i<sampleFiles.length; i++) { + for (int i = 0; i < sampleFiles.length; i++) { String found = getTextContent(sampleFiles[i]); - Assert.assertTrue("text found : '"+found+"'",found.startsWith(expectedText)); + Assert.assertTrue("text found : '" + found + "'", + found.startsWith(expectedText)); } } @@ -79,8 +83,10 @@ public class TestMSWordParser { public void testOpeningDocs() throws ProtocolException, ParseException { String[] filenames = new File(sampleDir).list(); for (int i = 0; i < filenames.length; i++) { - if (filenames[i].endsWith(".doc")==false) continue; - Assert.assertTrue("cann't read content of " + filenames[i], getTextContent(filenames[i]).length() > 0); - } + if (filenames[i].endsWith(".doc") == false) + continue; + Assert.assertTrue("cann't read content of " + filenames[i], + getTextContent(filenames[i]).length() > 0); + } } } Modified: nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java Thu Jan 29 05:38:59 2015 @@ -31,22 +31,22 @@ import org.apache.nutch.util.NutchConfig import org.junit.Assert; import org.junit.Test; -/** +/** * Unit tests for OOParser. - * + * * @author Andrzej Bialecki */ public class TestOOParser { private String fileSeparator = System.getProperty("file.separator"); // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data","."); + private String sampleDir = System.getProperty("test.data", "."); // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/parse-oo/build.xml during plugin compilation. - private String[] sampleFiles = {"ootest.odt", "ootest.sxw"}; + private String[] sampleFiles = { "ootest.odt", "ootest.sxw" }; private String expectedText; - + private String sampleText = "ootest.txt"; @Test @@ -58,31 +58,36 @@ public class TestOOParser { Protocol protocol; ProtocolFactory factory = new ProtocolFactory(conf); - System.out.println("Expected : "+expectedText); + System.out.println("Expected : " + expectedText); - for (int i=0; i<sampleFiles.length; i++) { + for (int i = 0; i < sampleFiles.length; i++) { urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - if (sampleFiles[i].startsWith("ootest")==false) continue; + if (sampleFiles[i].startsWith("ootest") == false) + continue; protocol = factory.getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl()); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); - // simply test for the presence of a text - the ordering of the elements may differ from what was expected + // simply test for the presence of a text - the ordering of the elements + // may differ from what was expected // in the previous tests - Assert.assertTrue(text!=null && text.length() > 0); + Assert.assertTrue(text != null && text.length() > 0); - System.out.println("Found "+sampleFiles[i]+": "+text); + System.out.println("Found " + sampleFiles[i] + ": " + text); } } - public TestOOParser() { + public TestOOParser() { try { // read the test string - FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + sampleText); + FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + + sampleText); StringBuffer sb = new StringBuffer(); int len = 0; InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); @@ -99,5 +104,4 @@ public class TestOOParser { } } - } Modified: nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java Thu Jan 29 05:38:59 2015 @@ -31,23 +31,20 @@ import org.apache.nutch.crawl.CrawlDatum import org.junit.Assert; import org.junit.Test; -/** +/** * Unit tests for PdfParser. - * + * * @author John Xing */ public class TestPdfParser { private String fileSeparator = System.getProperty("file.separator"); // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data","."); + private String sampleDir = System.getProperty("test.data", "."); // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/parse-pdf/build.xml during plugin compilation. // Check ./src/plugin/parse-pdf/sample/README.txt for what they are. - private String[] sampleFiles = { - "pdftest.pdf", - "encrypted.pdf" - }; + private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" }; private String expectedText = "A VERY SMALL PDF FILE"; @@ -63,8 +60,10 @@ public class TestPdfParser { Configuration conf = NutchConfiguration.create(); protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl()); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); int index = parse.getText().indexOf(expectedText); Assert.assertTrue(index > 0); Modified: nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java Thu Jan 29 05:38:59 2015 @@ -63,12 +63,13 @@ public class TestRTFParser { Configuration conf = NutchConfiguration.create(); urlString = "file:" + sampleDir + fileSeparator + rtfFile; protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); + content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) + .getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get( + content.getUrl()); String text = parse.getText(); - Assert.assertEquals("The quick brown fox jumps over the lazy dog", text.trim()); + Assert.assertEquals("The quick brown fox jumps over the lazy dog", + text.trim()); String title = parse.getData().getTitle(); Metadata meta = parse.getData().getParseMeta(); Modified: nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java Thu Jan 29 05:38:59 2015 @@ -34,120 +34,96 @@ import org.junit.Test; public class TestRobotsMetaProcessor { /* - - some sample tags: - - <meta name="robots" content="index,follow"> - <meta name="robots" content="noindex,follow"> - <meta name="robots" content="index,nofollow"> - <meta name="robots" content="noindex,nofollow"> - - <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> - + * + * some sample tags: + * + * <meta name="robots" content="index,follow"> <meta name="robots" + * content="noindex,follow"> <meta name="robots" content="index,nofollow"> + * <meta name="robots" content="noindex,nofollow"> + * + * <META HTTP-EQUIV="Pragma" CONTENT="no-cache"> */ + public static String[] tests = { + "<html><head><title>test page</title>" + + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " + + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"all\"> " + + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " + + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"none\"> " + "</head><body>" + + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"noindex,nofollow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"noindex,follow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"index,nofollow\"> " + + "</head><body>" + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + + "<meta name=\"robots\" content=\"index,follow\"> " + + "<base href=\"http://www.nutch.org/\">" + "</head><body>" + + " some text" + "</body></html>", + + "<html><head><title>test page</title>" + "<meta name=\"robots\"> " + + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>" + + " some text" + "</body></html>", + + }; - public static String[] tests= - { - "<html><head><title>test page</title>" - + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> " - + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"all\"> " - + "<meta http-equiv=\"pragma\" content=\"no-cache\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> " - + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"none\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,nofollow\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"noindex,follow\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,nofollow\"> " - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\" content=\"index,follow\"> " - + "<base href=\"http://www.nutch.org/\">" - + "</head><body>" - + " some text" - + "</body></html>", - - "<html><head><title>test page</title>" - + "<meta name=\"robots\"> " - + "<base href=\"http://www.nutch.org/base/\">" - + "</head><body>" - + " some text" - + "</body></html>", - - }; - - public static final boolean[][] answers= { - {true, true, true}, // NONE - {false, false, true}, // all - {true, true, true}, // nOnE - {true, true, false}, // none - {true, true, false}, // noindex,nofollow - {true, false, false}, // noindex,follow - {false, true, false}, // index,nofollow - {false, false, false}, // index,follow - {false, false, false}, // missing! + public static final boolean[][] answers = { { true, true, true }, // NONE + { false, false, true }, // all + { true, true, true }, // nOnE + { true, true, false }, // none + { true, true, false }, // noindex,nofollow + { true, false, false }, // noindex,follow + { false, true, false }, // index,nofollow + { false, false, false }, // index,follow + { false, false, false }, // missing! }; private URL[][] currURLsAndAnswers; @Test public void testRobotsMetaProcessor() { - DOMFragmentParser parser= new DOMFragmentParser();; + DOMFragmentParser parser = new DOMFragmentParser(); + ; - try { - currURLsAndAnswers= new URL[][] { - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org"), null}, - {new URL("http://www.nutch.org/foo/"), - new URL("http://www.nutch.org/")}, - {new URL("http://www.nutch.org"), - new URL("http://www.nutch.org/base/")} - }; + try { + currURLsAndAnswers = new URL[][] { + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org"), null }, + { new URL("http://www.nutch.org/foo/"), + new URL("http://www.nutch.org/") }, + { new URL("http://www.nutch.org"), + new URL("http://www.nutch.org/base/") } }; } catch (Exception e) { Assert.assertTrue("couldn't make test URLs!", false); } - for (int i= 0; i < tests.length; i++) { - byte[] bytes= tests[i].getBytes(); + for (int i = 0; i < tests.length; i++) { + byte[] bytes = tests[i].getBytes(); DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); @@ -157,9 +133,8 @@ public class TestRobotsMetaProcessor { e.printStackTrace(); } - HTMLMetaTags robotsMeta= new HTMLMetaTags(); - HTMLMetaProcessor.getMetaTags(robotsMeta, node, - currURLsAndAnswers[i][0]); + HTMLMetaTags robotsMeta = new HTMLMetaTags(); + HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]); Assert.assertTrue("got index wrong on test " + i, robotsMeta.getNoIndex() == answers[i][0]); @@ -167,13 +142,13 @@ public class TestRobotsMetaProcessor { robotsMeta.getNoFollow() == answers[i][1]); Assert.assertTrue("got cache wrong on test " + i, robotsMeta.getNoCache() == answers[i][2]); - Assert.assertTrue("got base href wrong on test " + i + " (got " - + robotsMeta.getBaseHref() + ")", - ( (robotsMeta.getBaseHref() == null) - && (currURLsAndAnswers[i][1] == null) ) - || ( (robotsMeta.getBaseHref() != null) - && robotsMeta.getBaseHref().equals( - currURLsAndAnswers[i][1]) ) ); + Assert + .assertTrue( + "got base href wrong on test " + i + " (got " + + robotsMeta.getBaseHref() + ")", + ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) + || ((robotsMeta.getBaseHref() != null) && robotsMeta + .getBaseHref().equals(currURLsAndAnswers[i][1]))); } } Modified: nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original) +++ nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Thu Jan 29 05:38:59 2015 @@ -55,9 +55,12 @@ public class ZipParser implements Parser List<Outlink> outLinksList = new ArrayList<Outlink>(); try { - final String contentLen = content.getMetadata().get(Response.CONTENT_LENGTH); + final String contentLen = content.getMetadata().get( + Response.CONTENT_LENGTH); final int len = Integer.parseInt(contentLen); - if (LOG.isDebugEnabled()) { LOG.debug("ziplen: " + len); } + if (LOG.isDebugEnabled()) { + LOG.debug("ziplen: " + len); + } final byte[] contentInBytes = content.getContent(); if (contentLen != null && contentInBytes.length != len) { @@ -76,7 +79,8 @@ public class ZipParser implements Parser } catch (Exception e) { return new ParseStatus(ParseStatus.FAILED, - "Can't be handled as Zip document. " + e).getEmptyParseResult(content.getUrl(), getConf()); + "Can't be handled as Zip document. " + e).getEmptyParseResult( + content.getUrl(), getConf()); } if (resultText == null) { @@ -89,11 +93,13 @@ public class ZipParser implements Parser outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]); final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, - resultTitle, outlinks, - content.getMetadata()); + resultTitle, outlinks, content.getMetadata()); - if (LOG.isTraceEnabled()) { LOG.trace("Zip file parsed sucessfully !!"); } - return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData)); + if (LOG.isTraceEnabled()) { + LOG.trace("Zip file parsed sucessfully !!"); + } + return ParseResult.createParseResult(content.getUrl(), new ParseImpl( + resultText, parseData)); } public void setConf(Configuration conf) { Modified: nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original) +++ nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Thu Jan 29 05:38:59 2015 @@ -44,12 +44,13 @@ import org.apache.nutch.protocol.Content import org.apache.tika.Tika; /** - * + * * @author Rohit Kulkarni & Ashish Vaidya */ public class ZipTextExtractor { - - public static final Logger LOG = LoggerFactory.getLogger(ZipTextExtractor.class); + + public static final Logger LOG = LoggerFactory + .getLogger(ZipTextExtractor.class); private Configuration conf; @@ -57,21 +58,22 @@ public class ZipTextExtractor { public ZipTextExtractor(Configuration conf) { this.conf = conf; } - - public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException { + + public String extractText(InputStream input, String url, + List<Outlink> outLinksList) throws IOException { String resultText = ""; ZipInputStream zin = new ZipInputStream(input); ZipEntry entry; - + while ((entry = zin.getNextEntry()) != null) { - + if (!entry.isDirectory()) { int size = (int) entry.getSize(); byte[] b = new byte[size]; - for(int x = 0; x < size; x++) { + for (int x = 0; x < size; x++) { int err = zin.read(); - if(err != -1) { - b[x] = (byte)err; + if (err != -1) { + b[x] = (byte) err; } } String newurl = url + "/"; @@ -86,29 +88,33 @@ public class ZipTextExtractor { String contentType = tika.detect(fname); try { Metadata metadata = new Metadata(); - metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize())); + metadata.set(Response.CONTENT_LENGTH, + Long.toString(entry.getSize())); metadata.set(Response.CONTENT_TYPE, contentType); - Content content = new Content(newurl, base, b, contentType, metadata, this.conf); - Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl()); + Content content = new Content(newurl, base, b, contentType, + metadata, this.conf); + Parse parse = new ParseUtil(this.conf).parse(content).get( + content.getUrl()); ParseData theParseData = parse.getData(); Outlink[] theOutlinks = theParseData.getOutlinks(); - - for(int count = 0; count < theOutlinks.length; count++) { - outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor())); + + for (int count = 0; count < theOutlinks.length; count++) { + outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), + theOutlinks[count].getAnchor())); } - + resultText += entry.getName() + " " + parse.getText() + " "; } catch (ParseException e) { - if (LOG.isInfoEnabled()) { - LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage()); + if (LOG.isInfoEnabled()) { + LOG.info("fetch okay, but can't parse " + fname + ", reason: " + + e.getMessage()); } } } } } - + return resultText; } - -} +} Modified: nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java (original) +++ nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * Parse ZIP files: embedded files are recursively passed to appropriate parsers. */ package org.apache.nutch.parse.zip; + Modified: nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (original) +++ nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Thu Jan 29 05:38:59 2015 @@ -31,20 +31,20 @@ import org.apache.nutch.crawl.CrawlDatum import org.junit.Assert; import org.junit.Test; -/** +/** * Based on Unit tests for MSWordParser by John Xing - * + * * @author Rohit Kulkarni & Ashish Vaidya */ public class TestZipParser { private String fileSeparator = System.getProperty("file.separator"); // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data","."); - + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" - - private String[] sampleFiles = {"test.zip"}; + + private String[] sampleFiles = { "test.zip" }; private String expectedText = "textfile.txt This is text file number 1 "; @@ -60,8 +60,10 @@ public class TestZipParser { urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl()); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get( + content.getUrl()); Assert.assertTrue(parse.getText().equals(expectedText)); } } Modified: nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original) +++ nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Thu Jan 29 05:38:59 2015 @@ -37,11 +37,12 @@ import org.apache.nutch.util.NutchConfig import crawlercommons.robots.BaseRobotRules; /** - * This class is a protocol plugin used for file: scheme. - * It creates {@link FileResponse} object and gets the content of the url from it. - * Configurable parameters are {@code file.content.limit} and {@code file.crawl.parent} - * in nutch-default.xml defined under "file properties" section. - * + * This class is a protocol plugin used for file: scheme. It creates + * {@link FileResponse} object and gets the content of the url from it. + * Configurable parameters are {@code file.content.limit} and + * {@code file.crawl.parent} in nutch-default.xml defined under + * "file properties" section. + * * @author John Xing */ public class File implements Protocol { @@ -61,7 +62,8 @@ public class File implements Protocol { private Configuration conf; - public File() {} + public File() { + } /** * Set the {@link Configuration} object @@ -80,52 +82,59 @@ public class File implements Protocol { public Configuration getConf() { return this.conf; } - - /** - * Set the length after at which content is truncated. + + /** + * Set the length after at which content is truncated. */ public void setMaxContentLength(int maxContentLength) { this.maxContentLength = maxContentLength; } - /** - * Creates a {@link FileResponse} object corresponding to the url and - * return a {@link ProtocolOutput} object as per the content received + /** + * Creates a {@link FileResponse} object corresponding to the url and return a + * {@link ProtocolOutput} object as per the content received * - * @param url Text containing the url - * @param datum The CrawlDatum object corresponding to the url + * @param url + * Text containing the url + * @param datum + * The CrawlDatum object corresponding to the url * - * @return {@link ProtocolOutput} object for the content of the file indicated by url + * @return {@link ProtocolOutput} object for the content of the file indicated + * by url */ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { String urlString = url.toString(); try { URL u = new URL(urlString); - + int redirects = 0; - + while (true) { FileResponse response; - response = new FileResponse(u, datum, this, getConf()); // make a request - + response = new FileResponse(u, datum, this, getConf()); // make a + // request + int code = response.getCode(); - - if (code == 200) { // got a good response - return new ProtocolOutput(response.toContent()); // return it - - } else if (code == 304) { // got not modified - return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTMODIFIED); - } else if (code == 401) { // access denied / no read permissions - return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.ACCESS_DENIED)); + if (code == 200) { // got a good response + return new ProtocolOutput(response.toContent()); // return it - } else if (code == 404) { // no such file - return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTFOUND); + } else if (code == 304) { // got not modified + return new ProtocolOutput(response.toContent(), + ProtocolStatus.STATUS_NOTMODIFIED); + + } else if (code == 401) { // access denied / no read permissions + return new ProtocolOutput(response.toContent(), new ProtocolStatus( + ProtocolStatus.ACCESS_DENIED)); + + } else if (code == 404) { // no such file + return new ProtocolOutput(response.toContent(), + ProtocolStatus.STATUS_NOTFOUND); - } else if (code >= 300 && code < 400) { // handle redirect + } else if (code >= 300 && code < 400) { // handle redirect u = new URL(response.getHeader("Location")); if (LOG.isTraceEnabled()) { - LOG.trace("redirect to " + u); + LOG.trace("redirect to " + u); } if (symlinksAsRedirects) { return new ProtocolOutput(response.toContent(), new ProtocolStatus( @@ -136,18 +145,18 @@ public class File implements Protocol { ProtocolStatus.REDIR_EXCEEDED, u)); } redirects++; - - } else { // convert to exception + + } else { // convert to exception throw new FileError(code); } - } + } } catch (Exception e) { e.printStackTrace(); return new ProtocolOutput(null, new ProtocolStatus(e)); } } - /** + /** * Quick way for running this class. Useful for debugging. */ public static void main(String[] args) throws Exception { @@ -162,7 +171,7 @@ public class File implements Protocol { System.err.println(usage); System.exit(-1); } - + for (int i = 0; i < args.length; i++) { if (args[i].equals("-logLevel")) { logLevel = args[++i]; @@ -170,7 +179,7 @@ public class File implements Protocol { maxContentLength = Integer.parseInt(args[++i]); } else if (args[i].equals("-dumpContent")) { dumpContent = true; - } else if (i != args.length-1) { + } else if (i != args.length - 1) { System.err.println(usage); System.exit(-1); } else @@ -184,18 +193,19 @@ public class File implements Protocol { file.setMaxContentLength(maxContentLength); // set log level - //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); + // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); - ProtocolOutput output = file.getProtocolOutput(new Text(urlString), new CrawlDatum()); + ProtocolOutput output = file.getProtocolOutput(new Text(urlString), + new CrawlDatum()); Content content = output.getContent(); System.err.println("URL: " + content.getUrl()); System.err.println("Status: " + output.getStatus()); System.err.println("Content-Type: " + content.getContentType()); - System.err.println("Content-Length: " + - content.getMetadata().get(Response.CONTENT_LENGTH)); - System.err.println("Last-Modified: " + - content.getMetadata().get(Response.LAST_MODIFIED)); + System.err.println("Content-Length: " + + content.getMetadata().get(Response.CONTENT_LENGTH)); + System.err.println("Last-Modified: " + + content.getMetadata().get(Response.LAST_MODIFIED)); String redirectLocation = content.getMetadata().get("Location"); if (redirectLocation != null) { System.err.println("Location: " + redirectLocation); @@ -208,12 +218,11 @@ public class File implements Protocol { file = null; } - /** - * No robots parsing is done for file protocol. - * So this returns a set of empty rules which will allow every url. + /** + * No robots parsing is done for file protocol. So this returns a set of empty + * rules which will allow every url. */ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) { return RobotRulesParser.EMPTY_RULES; } } - Modified: nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java (original) +++ nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java Thu Jan 29 05:38:59 2015 @@ -17,13 +17,16 @@ package org.apache.nutch.protocol.file; -/** Thrown for File error codes. +/** + * Thrown for File error codes. */ public class FileError extends FileException { private int code; - - public int getCode(int code) { return code; } + + public int getCode(int code) { + return code; + } public FileError(int code) { super("File Error: " + code); Modified: nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original) +++ nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Thu Jan 29 05:38:59 2015 @@ -95,6 +95,7 @@ public class FileResponse { /** * Default public constructor + * * @param url * @param datum * @param file @@ -103,13 +104,13 @@ public class FileResponse { * @throws IOException */ public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) - throws FileException, IOException { + throws FileException, IOException { this.orig = url.toString(); this.base = url.toString(); this.file = file; this.conf = conf; - + MIME = new MimeUtil(conf); tika = new Tika(); @@ -139,16 +140,16 @@ public class FileResponse { this.content = null; // url.toURI() is only in j2se 1.5.0 - //java.io.File f = new java.io.File(url.toURI()); + // java.io.File f = new java.io.File(url.toURI()); java.io.File f = new java.io.File(path); if (!f.exists()) { - this.code = 404; // http Not Found + this.code = 404; // http Not Found return; } if (!f.canRead()) { - this.code = 401; // http Unauthorized + this.code = 401; // http Unauthorized return; } @@ -157,20 +158,23 @@ public class FileResponse { // where case is insensitive if (!f.equals(f.getCanonicalFile())) { // set headers - //hdrs.put("Location", f.getCanonicalFile().toURI()); + // hdrs.put("Location", f.getCanonicalFile().toURI()); // - // we want to automatically escape characters that are illegal in URLs. - // It is recommended that new code convert an abstract pathname into a URL - // by first converting it into a URI, via the toURI method, and then + // we want to automatically escape characters that are illegal in URLs. + // It is recommended that new code convert an abstract pathname into a + // URL + // by first converting it into a URI, via the toURI method, and then // converting the URI into a URL via the URI.toURL method. - headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString()); + headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL() + .toString()); - this.code = 300; // http redirect + this.code = 300; // http redirect return; } if (f.lastModified() <= datum.getModifiedTime()) { this.code = 304; - this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified())); + this.headers.set("Last-Modified", + HttpDateFormat.toString(f.lastModified())); return; } @@ -240,6 +244,7 @@ public class FileResponse { /** * get dir list as http response + * * @param f * @throws IOException */ @@ -265,6 +270,7 @@ public class FileResponse { /** * generate html page from dir list + * * @param list * @param path * @param includeDotDot Modified: nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java (original) +++ nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java Thu Jan 29 05:38:59 2015 @@ -49,7 +49,8 @@ public class TestProtocolFile { private String sampleDir = System.getProperty("test.data", "."); private static final String[] testTextFiles = new String[] { - "testprotocolfile.txt", "testprotocolfile_(encoded).txt", "testprotocolfile_%28encoded%29.txt" }; + "testprotocolfile.txt", "testprotocolfile_(encoded).txt", + "testprotocolfile_%28encoded%29.txt" }; private static final CrawlDatum datum = new CrawlDatum(); @@ -90,8 +91,8 @@ public class TestProtocolFile { Assert.assertNotNull(output.getContent().getContentType()); Assert.assertEquals(expectedMimeType, output.getContent().getContentType()); Assert.assertNotNull(output.getContent().getMetadata()); - Assert.assertEquals(expectedMimeType, - output.getContent().getMetadata().get(Response.CONTENT_TYPE)); + Assert.assertEquals(expectedMimeType, output.getContent().getMetadata() + .get(Response.CONTENT_TYPE)); }