p...

lewismc Wed, 28 Jan 2015 21:39:30 -0800

Modified: 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
 Thu Jan 29 05:38:59 2015
@@ -42,129 +42,129 @@ public class TestDOMContentUtils {
 
   private static final String[] testPages = {
 
-    new String("<html><head><title> title </title><script> script </script>"
-        + "</head><body> body <a href=\"http://www.nutch.org\">"
-        + " anchor </a><!--comment-->" + "</body></html>"),
-
-        new String("<html><head><title> title </title><script> script 
</script>"
-            + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
-            + "<style> style </style>" + " <a href=\"bot.html\">" + " bots 
</a>"
-            + "</body></html>"),
-
-            new String("<html><head><title> </title>" + "</head><body> "
-                + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
-                + "</a></a>" + "</body></html>"),
-
-                // this one relies on certain neko fixup behavior, possibly
-                // distributing the anchors into the LI's-but not the other
-                // anchors (outside of them, instead)! So you get a tree that
-                // looks like:
-                // ... <li> <a href=/> home </a> </li>
-                // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
-                // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> 
</li>
-                new String("<html><head><title> my title </title>"
-                    + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> 
home"
-                    + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + 
"</ul>"
-                    + "</body></html>"),
-
-                    // test frameset link extraction. The invalid frame in the 
middle
-                    // will be
-                    // fixed to a third standalone frame.
-                    new String("<html><head><title> my title </title>"
-                        + "</head><frameset rows=\"20,*\"> " + "<frame 
src=\"top.html\">"
-                        + "</frame>" + "<frameset cols=\"20,*\">"
-                        + "<frame src=\"left.html\">" + "<frame 
src=\"invalid.html\"/>"
-                        + "</frame>" + "<frame src=\"right.html\">" + 
"</frame>"
-                        + "</frameset>" + "</frameset>" + "</body></html>"),
-
-                        // test <area> and <iframe> link extraction + url 
normalization
-                        new String(
-                            "<html><head><title> my title </title>"
-                                + "</head><body>"
-                                + "<img src=\"logo.gif\" usemap=\"#green\" 
border=\"0\">"
-                                + "<map name=\"green\">"
-                                + "<area shape=\"polygon\" 
coords=\"19,44,45,11,87\" href=\"../index.html\">"
-                                + "<area shape=\"rect\" 
coords=\"128,132,241,179\" href=\"#bottom\">"
-                                + "<area shape=\"circle\" coords=\"68,211,35\" 
href=\"../bot.html\">"
-                                + "</map>" + "<a name=\"bottom\"/><h1> the 
bottom </h1> "
-                                + "<iframe src=\"../docs/index.html\"/>" + 
"</body></html>"),
-
-                                // test whitespace processing for plain text 
extraction
-                                new String(
-                                    "<html><head>\n <title> my\t\n  title\r\n 
</title>\n"
-                                        + " </head>\n"
-                                        + " <body>\n"
-                                        + "    <h1> Whitespace\ttest  </h1> \n"
-                                        + "\t<a href=\"../index.html\">\n  
\twhitespace  test\r\n\t</a>  \t\n"
-                                        + "    <p> This is<span> a 
whitespace<span></span> test</span>. Newlines\n"
-                                        + "should appear as space 
too.</p><p>Tabs\tare spaces too.\n</p>"
-                                        + "    This\t<b>is a</b> break 
-&gt;<br>and the line after<i> break</i>.<br>\n"
-                                        + "<table>"
-                                        + "    
<tr><td>one</td><td>two</td><td>three</td></tr>\n"
-                                        + "    <tr><td>space here </td><td> 
space there</td><td>no space</td></tr>"
-                                        + 
"\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
-                                        + "</table>put some text here<Br>and 
there."
-                                        + "<h2>End\tthis\rmadness\n!</h2>\r\n"
-                                        + "         .        .        .        
 ." + "</body>  </html>"),
-
-                                        // test that <a rel=nofollow> links 
are not returned
-                                        new String("<html><head></head><body>"
-                                            + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
-                                            + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
-                                            + "</body></html>"),
-                                            // test that POST form actions are 
skipped
-                                            new 
String("<html><head></head><body>"
-                                                + "<form method='POST' 
action='/search.jsp'><input type=text>"
-                                                + "<input 
type=submit><p>test1</p></form>"
-                                                + "<form method='GET' 
action='/dummy.jsp'><input type=text>"
-                                                + "<input 
type=submit><p>test2</p></form></body></html>"),
-                                                // test that all form actions 
are skipped
-                                                new 
String("<html><head></head><body>"
-                                                    + "<form method='POST' 
action='/search.jsp'><input type=text>"
-                                                    + "<input 
type=submit><p>test1</p></form>"
-                                                    + "<form method='GET' 
action='/dummy.jsp'><input type=text>"
-                                                    + "<input 
type=submit><p>test2</p></form></body></html>"),
-                                                    new 
String("<html><head><title> title </title>" + "</head><body>"
-                                                        + "<a 
href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
-                                                        + "<a 
href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
-                                                        new 
String("<html><head><title> title </title>" + "</head><body>"
-                                                            + "<a 
href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
-                                                            + "<a 
href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
-                                                            + "<a 
href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), };
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"http://www.nutch.org\">"
+          + " anchor </a><!--comment-->" + "</body></html>"),
+
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+          + "</body></html>"),
+
+      new String("<html><head><title> </title>" + "</head><body> "
+          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+          + "</a></a>" + "</body></html>"),
+
+      // this one relies on certain neko fixup behavior, possibly
+      // distributing the anchors into the LI's-but not the other
+      // anchors (outside of them, instead)! So you get a tree that
+      // looks like:
+      // ... <li> <a href=/> home </a> </li>
+      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+      new String("<html><head><title> my title </title>"
+          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+          + "</body></html>"),
+
+      // test frameset link extraction. The invalid frame in the middle
+      // will be
+      // fixed to a third standalone frame.
+      new String("<html><head><title> my title </title>"
+          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+          + "</frame>" + "<frameset cols=\"20,*\">"
+          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+          + "</frameset>" + "</frameset>" + "</body></html>"),
+
+      // test <area> and <iframe> link extraction + url normalization
+      new String(
+          "<html><head><title> my title </title>"
+              + "</head><body>"
+              + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+              + "<map name=\"green\">"
+              + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+              + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+              + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+              + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+              + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+
+      // test whitespace processing for plain text extraction
+      new String(
+          "<html><head>\n <title> my\t\n  title\r\n </title>\n"
+              + " </head>\n"
+              + " <body>\n"
+              + "    <h1> Whitespace\ttest  </h1> \n"
+              + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  
\t\n"
+              + "    <p> This is<span> a whitespace<span></span> test</span>. 
Newlines\n"
+              + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+              + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> 
break</i>.<br>\n"
+              + "<table>"
+              + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+              + "    <tr><td>space here </td><td> space there</td><td>no 
space</td></tr>"
+              + 
"\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+              + "</table>put some text here<Br>and there."
+              + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+              + "         .        .        .         ." + "</body>  </html>"),
+
+      // test that <a rel=nofollow> links are not returned
+      new String("<html><head></head><body>"
+          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+          + "</body></html>"),
+      // test that POST form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      // test that all form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), };
 
   private static int SKIP = 9;
 
   private static String[] testBaseHrefs = { "http://www.nutch.org",
-    "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
-    "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
-    "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
-    "http://www.nutch.org//", "http://www.nutch.org/",
-    "http://www.nutch.org/", "http://www.nutch.org/",
-  "http://www.nutch.org/;something" };
+      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+      "http://www.nutch.org//", "http://www.nutch.org/",
+      "http://www.nutch.org/", "http://www.nutch.org/",
+      "http://www.nutch.org/;something" };
 
   private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
 
   private static URL[] testBaseHrefURLs = new URL[testPages.length];
 
   private static final String[] answerText = {
-    "title body anchor",
-    "title body home bots",
-    "separate this from this",
-    "my title body home 1 2",
-    "my title",
-    "my title the bottom",
-    "my title Whitespace test whitespace test "
-        + "This is a whitespace test . Newlines should appear as space too. "
-        + "Tabs are spaces too. This is a break -> and the line after break . "
-        + "one two three space here space there no space "
-        + "one two two three three four put some text here and there. "
-        + "End this madness ! . . . .", "ignore ignore", "test1 test2",
-        "test1 test2", "title anchor1 anchor2 anchor3",
-  "title anchor1 anchor2 anchor3 anchor4 anchor5" };
+      "title body anchor",
+      "title body home bots",
+      "separate this from this",
+      "my title body home 1 2",
+      "my title",
+      "my title the bottom",
+      "my title Whitespace test whitespace test "
+          + "This is a whitespace test . Newlines should appear as space too. "
+          + "Tabs are spaces too. This is a break -> and the line after break . "
+          + "one two three space here space there no space "
+          + "one two two three three four put some text here and there. "
+          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+      "test1 test2", "title anchor1 anchor2 anchor3",
+      "title anchor1 anchor2 anchor3 anchor4 anchor5" };
 
   private static final String[] answerTitle = { "title", "title", "",
-    "my title", "my title", "my title", "my title", "", "", "", "title",
-  "title" };
+      "my title", "my title", "my title", "my title", "", "", "", "title",
+      "title" };
 
   // note: should be in page-order
   private static Outlink[][] answerOutlinks;
@@ -196,36 +196,36 @@ public class TestDOMContentUtils {
     answerOutlinks = new Outlink[][] {
         { new Outlink("http://www.nutch.org", "anchor"), },
         { new Outlink("http://www.nutch.org/", "home"),
-          new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
-          { new Outlink("http://www.nutch.org/", "separate this"),
+            new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+        { new Outlink("http://www.nutch.org/", "separate this"),
             new Outlink("http://www.nutch.org/docs/ok", "from this"), },
-            { new Outlink("http://www.nutch.org/", "home"),
-              new Outlink("http://www.nutch.org/docs/1", "1"),
-              new Outlink("http://www.nutch.org/docs/2", "2"), },
-              { new Outlink("http://www.nutch.org/frames/top.html", ""),
-                new Outlink("http://www.nutch.org/frames/left.html", ""),
-                new Outlink("http://www.nutch.org/frames/invalid.html", ""),
-                new Outlink("http://www.nutch.org/frames/right.html", ""), },
-                { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
-                  new Outlink("http://www.nutch.org/index.html", ""),
-                  new Outlink("http://www.nutch.org/maps/#bottom", ""),
-                  new Outlink("http://www.nutch.org/bot.html", ""),
-                  new Outlink("http://www.nutch.org/docs/index.html", ""), },
-                  { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
-                  {},
-                  { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
-                  {},
-                  { new Outlink("http://www.nutch.org/;x", "anchor1"),
-                    new Outlink("http://www.nutch.org/g;x", "anchor2"),
-                    new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
-                    {
-                      // this is tricky - see RFC3986 section 5.4.1 example 7
-                      new Outlink("http://www.nutch.org/g", "anchor1"),
-                      new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
-                      new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
-                      new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
-                      new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
-                          "anchor5") } };
+        { new Outlink("http://www.nutch.org/", "home"),
+            new Outlink("http://www.nutch.org/docs/1", "1"),
+            new Outlink("http://www.nutch.org/docs/2", "2"), },
+        { new Outlink("http://www.nutch.org/frames/top.html", ""),
+            new Outlink("http://www.nutch.org/frames/left.html", ""),
+            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+            new Outlink("http://www.nutch.org/frames/right.html", ""), },
+        { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+            new Outlink("http://www.nutch.org/index.html", ""),
+            new Outlink("http://www.nutch.org/maps/#bottom", ""),
+            new Outlink("http://www.nutch.org/bot.html", ""),
+            new Outlink("http://www.nutch.org/docs/index.html", ""), },
+        { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+        {},
+        { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+        {},
+        { new Outlink("http://www.nutch.org/;x", "anchor1"),
+            new Outlink("http://www.nutch.org/g;x", "anchor2"),
+            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+        {
+            // this is tricky - see RFC3986 section 5.4.1 example 7
+            new Outlink("http://www.nutch.org/g", "anchor1"),
+            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+            new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+                "anchor5") } };
 
   }
 
@@ -256,7 +256,7 @@ public class TestDOMContentUtils {
           "expecting text: " + answerText[i]
               + System.getProperty("line.separator")
               + System.getProperty("line.separator") + "got text: " + text,
-              equalsIgnoreWhitespace(answerText[i], text));
+          equalsIgnoreWhitespace(answerText[i], text));
     }
   }
 
@@ -272,7 +272,7 @@ public class TestDOMContentUtils {
           "expecting text: " + answerText[i]
               + System.getProperty("line.separator")
               + System.getProperty("line.separator") + "got text: " + text,
-              equalsIgnoreWhitespace(answerTitle[i], text));
+          equalsIgnoreWhitespace(answerTitle[i], text));
     }
   }
 
@@ -318,19 +318,19 @@ public class TestDOMContentUtils {
               + outlinksString(o1) + System.getProperty("line.separator")
               + "got: " + System.getProperty("line.separator")
               + outlinksString(o2) + System.getProperty("line.separator"),
-              false);
+          false);
     }
 
     for (int i = 0; i < o1.length; i++) {
       if (!o1[i].equals(o2[i])) {
         Assert.assertTrue(
             "got wrong outlinks at position " + i
-            + System.getProperty("line.separator") + "answer: "
-            + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
-            + "', anchor: '" + o1[i].getAnchor() + "'"
-            + System.getProperty("line.separator") + "got: "
-            + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
-            + "', anchor: '" + o2[i].getAnchor() + "'", false);
+                + System.getProperty("line.separator") + "answer: "
+                + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
+                + "', anchor: '" + o1[i].getAnchor() + "'"
+                + System.getProperty("line.separator") + "got: "
+                + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
+                + "', anchor: '" + o2[i].getAnchor() + "'", false);
       }
     }
   }


Modified: 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
 Thu Jan 29 05:38:59 2015
@@ -80,8 +80,8 @@ public class TestFeedParser {
       protocol = new ProtocolFactory(conf).getProtocol(urlString);
       content = protocol.getProtocolOutput(new Text(urlString),
           new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
-          content).get(content.getUrl());
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
 
       // check that there are 2 outlinks:
       // unlike the original parse-rss

Modified: 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
 Thu Jan 29 05:38:59 2015
@@ -31,18 +31,16 @@ import org.apache.nutch.crawl.CrawlDatum
 import org.junit.Assert;
 import org.junit.Test;
 
-/** 
+/**
  * Test extraction of image metadata
  */
 public class TestImageMetadata {
 
   private String fileSeparator = System.getProperty("file.separator");
   // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data",".");
+  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
-  private String[] sampleFiles = {
-      "nutch_logo_tm.gif",
-  };
+  private String[] sampleFiles = { "nutch_logo_tm.gif", };
 
   @Test
   public void testIt() throws ProtocolException, ParseException {
@@ -56,8 +54,10 @@ public class TestImageMetadata {
 
       Configuration conf = NutchConfiguration.create();
       protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
 
       Assert.assertEquals("121", parse.getData().getMeta("width"));
       Assert.assertEquals("48", parse.getData().getMeta("height"));

Modified: 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
 Thu Jan 29 05:38:59 2015
@@ -34,20 +34,20 @@ import org.junit.Test;
 
 import java.io.File;
 
-/** 
+/**
  * Unit tests for MSWordParser.
- *
+ * 
  * @author John Xing
  */
 public class TestMSWordParser {
 
   private String fileSeparator = System.getProperty("file.separator");
   // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data",".");
+  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-msword/build.xml during plugin compilation.
   // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
-  private String[] sampleFiles = {"word97.doc"};
+  private String[] sampleFiles = { "word97.doc" };
 
   private String expectedText = "This is a sample doc file prepared for nutch.";
 
@@ -59,19 +59,23 @@ public class TestMSWordParser {
     conf.set("file.content.limit", "-1");
   }
 
-  public String getTextContent(String fileName) throws ProtocolException, ParseException {
+  public String getTextContent(String fileName) throws ProtocolException,
+      ParseException {
     String urlString = "file:" + sampleDir + fileSeparator + fileName;
     Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
-    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
+    Content content = protocol.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+        .get(content.getUrl());
     return parse.getText();
   }
 
   @Test
   public void testIt() throws ProtocolException, ParseException {
-    for (int i=0; i<sampleFiles.length; i++) {
+    for (int i = 0; i < sampleFiles.length; i++) {
       String found = getTextContent(sampleFiles[i]);
-      Assert.assertTrue("text found : '"+found+"'",found.startsWith(expectedText));
+      Assert.assertTrue("text found : '" + found + "'",
+          found.startsWith(expectedText));
     }
   }
 
@@ -79,8 +83,10 @@ public class TestMSWordParser {
   public void testOpeningDocs() throws ProtocolException, ParseException {
     String[] filenames = new File(sampleDir).list();
     for (int i = 0; i < filenames.length; i++) {
-      if (filenames[i].endsWith(".doc")==false) continue;
-      Assert.assertTrue("cann't read content of " + filenames[i], getTextContent(filenames[i]).length() > 0);
-    }      
+      if (filenames[i].endsWith(".doc") == false)
+        continue;
+      Assert.assertTrue("cann't read content of " + filenames[i],
+          getTextContent(filenames[i]).length() > 0);
+    }
   }
 }

Modified: 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
 Thu Jan 29 05:38:59 2015
@@ -31,22 +31,22 @@ import org.apache.nutch.util.NutchConfig
 import org.junit.Assert;
 import org.junit.Test;
 
-/** 
+/**
  * Unit tests for OOParser.
- *
+ * 
  * @author Andrzej Bialecki
  */
 public class TestOOParser {
 
   private String fileSeparator = System.getProperty("file.separator");
   // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data",".");
+  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-oo/build.xml during plugin compilation.
-  private String[] sampleFiles = {"ootest.odt", "ootest.sxw"};
+  private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
 
   private String expectedText;
-  
+
   private String sampleText = "ootest.txt";
 
   @Test
@@ -58,31 +58,36 @@ public class TestOOParser {
     Protocol protocol;
     ProtocolFactory factory = new ProtocolFactory(conf);
 
-    System.out.println("Expected : "+expectedText);
+    System.out.println("Expected : " + expectedText);
 
-    for (int i=0; i<sampleFiles.length; i++) {
+    for (int i = 0; i < sampleFiles.length; i++) {
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
-      if (sampleFiles[i].startsWith("ootest")==false) continue;
+      if (sampleFiles[i].startsWith("ootest") == false)
+        continue;
 
       protocol = factory.getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
 
       String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
 
-      // simply test for the presence of a text - the ordering of the elements may differ from what was expected
+      // simply test for the presence of a text - the ordering of the elements
+      // may differ from what was expected
       // in the previous tests
-      Assert.assertTrue(text!=null && text.length() > 0);
+      Assert.assertTrue(text != null && text.length() > 0);
 
-      System.out.println("Found "+sampleFiles[i]+": "+text);
+      System.out.println("Found " + sampleFiles[i] + ": " + text);
     }
   }
 
-  public TestOOParser() { 
+  public TestOOParser() {
     try {
       // read the test string
-      FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + sampleText);
+      FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
+          + sampleText);
       StringBuffer sb = new StringBuffer();
       int len = 0;
       InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
@@ -99,5 +104,4 @@ public class TestOOParser {
     }
   }
 
-
 }

Modified: 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
 Thu Jan 29 05:38:59 2015
@@ -31,23 +31,20 @@ import org.apache.nutch.crawl.CrawlDatum
 import org.junit.Assert;
 import org.junit.Test;
 
-/** 
+/**
  * Unit tests for PdfParser.
- *
+ * 
  * @author John Xing
  */
 public class TestPdfParser {
 
   private String fileSeparator = System.getProperty("file.separator");
   // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data",".");
+  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-pdf/build.xml during plugin compilation.
   // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
-  private String[] sampleFiles = {
-      "pdftest.pdf",
-      "encrypted.pdf"
-  };
+  private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
 
   private String expectedText = "A VERY SMALL PDF FILE";
 
@@ -63,8 +60,10 @@ public class TestPdfParser {
 
       Configuration conf = NutchConfiguration.create();
       protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
 
       int index = parse.getText().indexOf(expectedText);
       Assert.assertTrue(index > 0);

Modified: 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
 Thu Jan 29 05:38:59 2015
@@ -63,12 +63,13 @@ public class TestRTFParser {
     Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + rtfFile;
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    content = protocol.getProtocolOutput(new Text(urlString),
-        new CrawlDatum()).getContent();
-    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-        .get(content.getUrl());
+    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
+        .getContent();
+    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(
+        content.getUrl());
     String text = parse.getText();
-    Assert.assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
+    Assert.assertEquals("The quick brown fox jumps over the lazy dog",
+        text.trim());
 
     String title = parse.getData().getTitle();
     Metadata meta = parse.getData().getParseMeta();

Modified: 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
 Thu Jan 29 05:38:59 2015
@@ -34,120 +34,96 @@ import org.junit.Test;
 public class TestRobotsMetaProcessor {
 
   /*
-
-  some sample tags:
-
-  <meta name="robots" content="index,follow">
-  <meta name="robots" content="noindex,follow">
-  <meta name="robots" content="index,nofollow">
-  <meta name="robots" content="noindex,nofollow">
-
-  <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
-
+   * 
+   * some sample tags:
+   * 
+   * <meta name="robots" content="index,follow"> <meta name="robots"
+   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
+   * <meta name="robots" content="noindex,nofollow">
+   * 
+   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
    */
 
+  public static String[] tests = {
+      "<html><head><title>test page</title>"
+          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"all\"> "
+          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,follow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,follow\"> "
+          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+  };
 
-  public static String[] tests= 
-    {
-    "<html><head><title>test page</title>"
-        + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
-        + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
-        + "</head><body>"
-        + " some text"
-        + "</body></html>",
-
-        "<html><head><title>test page</title>"
-            + "<meta name=\"robots\" content=\"all\"> "
-            + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
-            + "</head><body>"
-            + " some text"
-            + "</body></html>",
-
-            "<html><head><title>test page</title>"
-                + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
-                + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
-                + "</head><body>"
-                + " some text"
-                + "</body></html>",
-
-                "<html><head><title>test page</title>"
-                    + "<meta name=\"robots\" content=\"none\"> "
-                    + "</head><body>"
-                    + " some text"
-                    + "</body></html>",
-
-                    "<html><head><title>test page</title>"
-                        + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
-                        + "</head><body>"
-                        + " some text"
-                        + "</body></html>",
-
-                        "<html><head><title>test page</title>"
-                            + "<meta name=\"robots\" content=\"noindex,follow\"> "
-                            + "</head><body>"
-                            + " some text"
-                            + "</body></html>",
-
-                            "<html><head><title>test page</title>"
-                                + "<meta name=\"robots\" content=\"index,nofollow\"> "
-                                + "</head><body>"
-                                + " some text"
-                                + "</body></html>",
-
-                                "<html><head><title>test page</title>"
-                                    + "<meta name=\"robots\" content=\"index,follow\"> "
-                                    + "<base href=\"http://www.nutch.org/\">"
-                                    + "</head><body>"
-                                    + " some text"
-                                    + "</body></html>",
-
-                                    "<html><head><title>test page</title>"
-                                        + "<meta name=\"robots\"> "
-                                        + "<base href=\"http://www.nutch.org/base/\">"
-                                        + "</head><body>"
-                                        + " some text"
-                                        + "</body></html>",
-
-    };
-
-  public static final boolean[][] answers= {
-    {true, true, true},     // NONE
-    {false, false, true},   // all
-    {true, true, true},     // nOnE
-    {true, true, false},    // none
-    {true, true, false},    // noindex,nofollow
-    {true, false, false},   // noindex,follow
-    {false, true, false},   // index,nofollow
-    {false, false, false},  // index,follow
-    {false, false, false},  // missing!
+  public static final boolean[][] answers = { { true, true, true }, // NONE
+      { false, false, true }, // all
+      { true, true, true }, // nOnE
+      { true, true, false }, // none
+      { true, true, false }, // noindex,nofollow
+      { true, false, false }, // noindex,follow
+      { false, true, false }, // index,nofollow
+      { false, false, false }, // index,follow
+      { false, false, false }, // missing!
   };
 
   private URL[][] currURLsAndAnswers;
 
   @Test
   public void testRobotsMetaProcessor() {
-    DOMFragmentParser parser= new DOMFragmentParser();;
+    DOMFragmentParser parser = new DOMFragmentParser();
+    ;
 
-    try { 
-      currURLsAndAnswers= new URL[][] {
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org/foo/"), 
-            new URL("http://www.nutch.org/")},
-            {new URL("http://www.nutch.org"), 
-              new URL("http://www.nutch.org/base/")}
-      };
+    try {
+      currURLsAndAnswers = new URL[][] {
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org/foo/"),
+              new URL("http://www.nutch.org/") },
+          { new URL("http://www.nutch.org"),
+              new URL("http://www.nutch.org/base/") } };
     } catch (Exception e) {
       Assert.assertTrue("couldn't make test URLs!", false);
     }
 
-    for (int i= 0; i < tests.length; i++) {
-      byte[] bytes= tests[i].getBytes();
+    for (int i = 0; i < tests.length; i++) {
+      byte[] bytes = tests[i].getBytes();
 
       DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
 
@@ -157,9 +133,8 @@ public class TestRobotsMetaProcessor {
         e.printStackTrace();
       }
 
-      HTMLMetaTags robotsMeta= new HTMLMetaTags();
-      HTMLMetaProcessor.getMetaTags(robotsMeta, node, 
-          currURLsAndAnswers[i][0]);
+      HTMLMetaTags robotsMeta = new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
 
       Assert.assertTrue("got index wrong on test " + i,
           robotsMeta.getNoIndex() == answers[i][0]);
@@ -167,13 +142,13 @@ public class TestRobotsMetaProcessor {
           robotsMeta.getNoFollow() == answers[i][1]);
       Assert.assertTrue("got cache wrong on test " + i,
           robotsMeta.getNoCache() == answers[i][2]);
-      Assert.assertTrue("got base href wrong on test " + i + " (got "
-          + robotsMeta.getBaseHref() + ")",
-          ( (robotsMeta.getBaseHref() == null)
-              && (currURLsAndAnswers[i][1] == null) )
-              || ( (robotsMeta.getBaseHref() != null)
-                  && robotsMeta.getBaseHref().equals(
-                      currURLsAndAnswers[i][1]) ) );
+      Assert
+          .assertTrue(
+              "got base href wrong on test " + i + " (got "
+                  + robotsMeta.getBaseHref() + ")",
+              ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
+                  || ((robotsMeta.getBaseHref() != null) && robotsMeta
+                      .getBaseHref().equals(currURLsAndAnswers[i][1])));
 
     }
   }

Modified: 
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
 Thu Jan 29 05:38:59 2015
@@ -55,9 +55,12 @@ public class ZipParser implements Parser
     List<Outlink> outLinksList = new ArrayList<Outlink>();
 
     try {
-      final String contentLen = content.getMetadata().get(Response.CONTENT_LENGTH);
+      final String contentLen = content.getMetadata().get(
+          Response.CONTENT_LENGTH);
       final int len = Integer.parseInt(contentLen);
-      if (LOG.isDebugEnabled()) { LOG.debug("ziplen: " + len); }
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("ziplen: " + len);
+      }
       final byte[] contentInBytes = content.getContent();
 
       if (contentLen != null && contentInBytes.length != len) {
@@ -76,7 +79,8 @@ public class ZipParser implements Parser
 
     } catch (Exception e) {
       return new ParseStatus(ParseStatus.FAILED,
-          "Can't be handled as Zip document. " + e).getEmptyParseResult(content.getUrl(), getConf());
+          "Can't be handled as Zip document. " + e).getEmptyParseResult(
+          content.getUrl(), getConf());
     }
 
     if (resultText == null) {
@@ -89,11 +93,13 @@ public class ZipParser implements Parser
 
     outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]);
     final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
-                                              resultTitle, outlinks,
-                                              content.getMetadata());
+        resultTitle, outlinks, content.getMetadata());
 
-    if (LOG.isTraceEnabled()) { LOG.trace("Zip file parsed sucessfully !!"); }
-    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData));
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("Zip file parsed sucessfully !!");
+    }
+    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(
+        resultText, parseData));
   }
 
   public void setConf(Configuration conf) {

Modified: 
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
 Thu Jan 29 05:38:59 2015
@@ -44,12 +44,13 @@ import org.apache.nutch.protocol.Content
 import org.apache.tika.Tika;
 
 /**
- *
+ * 
  * @author Rohit Kulkarni & Ashish Vaidya
  */
 public class ZipTextExtractor {
-  
-  public static final Logger LOG = LoggerFactory.getLogger(ZipTextExtractor.class);
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ZipTextExtractor.class);
 
   private Configuration conf;
 
@@ -57,21 +58,22 @@ public class ZipTextExtractor {
   public ZipTextExtractor(Configuration conf) {
     this.conf = conf;
   }
-  
-  public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
+
+  public String extractText(InputStream input, String url,
+      List<Outlink> outLinksList) throws IOException {
     String resultText = "";
     ZipInputStream zin = new ZipInputStream(input);
     ZipEntry entry;
-    
+
     while ((entry = zin.getNextEntry()) != null) {
-      
+
       if (!entry.isDirectory()) {
         int size = (int) entry.getSize();
         byte[] b = new byte[size];
-        for(int x = 0; x < size; x++) {
+        for (int x = 0; x < size; x++) {
           int err = zin.read();
-          if(err != -1) {
-            b[x] = (byte)err;
+          if (err != -1) {
+            b[x] = (byte) err;
           }
         }
         String newurl = url + "/";
@@ -86,29 +88,33 @@ public class ZipTextExtractor {
           String contentType = tika.detect(fname);
           try {
             Metadata metadata = new Metadata();
-            metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
+            metadata.set(Response.CONTENT_LENGTH,
+                Long.toString(entry.getSize()));
             metadata.set(Response.CONTENT_TYPE, contentType);
-            Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
-            Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
+            Content content = new Content(newurl, base, b, contentType,
+                metadata, this.conf);
+            Parse parse = new ParseUtil(this.conf).parse(content).get(
+                content.getUrl());
             ParseData theParseData = parse.getData();
             Outlink[] theOutlinks = theParseData.getOutlinks();
-            
-            for(int count = 0; count < theOutlinks.length; count++) {
-              outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
+
+            for (int count = 0; count < theOutlinks.length; count++) {
+              outLinksList.add(new Outlink(theOutlinks[count].getToUrl(),
+                  theOutlinks[count].getAnchor()));
             }
-            
+
             resultText += entry.getName() + " " + parse.getText() + " ";
           } catch (ParseException e) {
-            if (LOG.isInfoEnabled()) { 
-              LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
+            if (LOG.isInfoEnabled()) {
+              LOG.info("fetch okay, but can't parse " + fname + ", reason: "
+                  + e.getMessage());
             }
           }
         }
       }
     }
-    
+
     return resultText;
   }
-  
-}
 
+}

Modified: 
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java
 Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
  * Parse ZIP files: embedded files are recursively passed to appropriate parsers.
  */
 package org.apache.nutch.parse.zip;
+

Modified: 
nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
 (original)
+++ 
nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
 Thu Jan 29 05:38:59 2015
@@ -31,20 +31,20 @@ import org.apache.nutch.crawl.CrawlDatum
 import org.junit.Assert;
 import org.junit.Test;
 
-/** 
+/**
  * Based on Unit tests for MSWordParser by John Xing
- *
+ * 
  * @author Rohit Kulkarni & Ashish Vaidya
  */
 public class TestZipParser {
 
   private String fileSeparator = System.getProperty("file.separator");
   // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data",".");
-  
+  private String sampleDir = System.getProperty("test.data", ".");
+
   // Make sure sample files are copied to "test.data"
-  
-  private String[] sampleFiles = {"test.zip"};
+
+  private String[] sampleFiles = { "test.zip" };
 
   private String expectedText = "textfile.txt This is text file number 1 ";
 
@@ -60,8 +60,10 @@ public class TestZipParser {
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
       protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl());
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get(
+          content.getUrl());
       Assert.assertTrue(parse.getText().equals(expectedText));
     }
   }

Modified: 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 Thu Jan 29 05:38:59 2015
@@ -37,11 +37,12 @@ import org.apache.nutch.util.NutchConfig
 import crawlercommons.robots.BaseRobotRules;
 
 /**
- * This class is a protocol plugin used for file: scheme.
- * It creates {@link FileResponse} object and gets the content of the url from it.
- * Configurable parameters are {@code file.content.limit} and {@code file.crawl.parent} 
- * in nutch-default.xml defined under "file properties" section.
- *
+ * This class is a protocol plugin used for file: scheme. It creates
+ * {@link FileResponse} object and gets the content of the url from it.
+ * Configurable parameters are {@code file.content.limit} and
+ * {@code file.crawl.parent} in nutch-default.xml defined under
+ * "file properties" section.
+ * 
  * @author John Xing
  */
 public class File implements Protocol {
@@ -61,7 +62,8 @@ public class File implements Protocol {
 
   private Configuration conf;
 
-  public File() {}
+  public File() {
+  }
 
   /**
    * Set the {@link Configuration} object
@@ -80,52 +82,59 @@ public class File implements Protocol {
   public Configuration getConf() {
     return this.conf;
   }
-  
-  /** 
-   * Set the length after at which content is truncated. 
+
+  /**
+   * Set the length after at which content is truncated.
    */
   public void setMaxContentLength(int maxContentLength) {
     this.maxContentLength = maxContentLength;
   }
 
-  /** 
-   * Creates a {@link FileResponse} object corresponding to the url and 
-   * return a {@link ProtocolOutput} object as per the content received
+  /**
+   * Creates a {@link FileResponse} object corresponding to the url and return a
+   * {@link ProtocolOutput} object as per the content received
    * 
-   * @param url Text containing the url
-   * @param datum The CrawlDatum object corresponding to the url
+   * @param url
+   *          Text containing the url
+   * @param datum
+   *          The CrawlDatum object corresponding to the url
    * 
-   * @return {@link ProtocolOutput} object for the content of the file indicated by url
+   * @return {@link ProtocolOutput} object for the content of the file indicated
+   *         by url
    */
   public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
     String urlString = url.toString();
     try {
       URL u = new URL(urlString);
-  
+
       int redirects = 0;
-  
+
       while (true) {
         FileResponse response;
-        response = new FileResponse(u, datum, this, getConf());   // make a request
-  
+        response = new FileResponse(u, datum, this, getConf()); // make a
+                                                                // request
+
         int code = response.getCode();
-  
-        if (code == 200) {                          // got a good response
-          return new ProtocolOutput(response.toContent());              // return it
-  
-        } else if (code == 304) {                   // got not modified
-          return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTMODIFIED);
 
-        } else if (code == 401) {                   // access denied / no read permissions
-          return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.ACCESS_DENIED));
+        if (code == 200) { // got a good response
+          return new ProtocolOutput(response.toContent()); // return it
 
-        } else if (code == 404) {                   // no such file
-          return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTFOUND);
+        } else if (code == 304) { // got not modified
+          return new ProtocolOutput(response.toContent(),
+              ProtocolStatus.STATUS_NOTMODIFIED);
+
+        } else if (code == 401) { // access denied / no read permissions
+          return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+              ProtocolStatus.ACCESS_DENIED));
+
+        } else if (code == 404) { // no such file
+          return new ProtocolOutput(response.toContent(),
+              ProtocolStatus.STATUS_NOTFOUND);
 
-        } else if (code >= 300 && code < 400) {     // handle redirect
+        } else if (code >= 300 && code < 400) { // handle redirect
           u = new URL(response.getHeader("Location"));
           if (LOG.isTraceEnabled()) {
-            LOG.trace("redirect to " + u); 
+            LOG.trace("redirect to " + u);
           }
           if (symlinksAsRedirects) {
             return new ProtocolOutput(response.toContent(), new ProtocolStatus(
@@ -136,18 +145,18 @@ public class File implements Protocol {
                 ProtocolStatus.REDIR_EXCEEDED, u));
           }
           redirects++;
-  
-        } else {                                    // convert to exception
+
+        } else { // convert to exception
           throw new FileError(code);
         }
-      } 
+      }
     } catch (Exception e) {
       e.printStackTrace();
       return new ProtocolOutput(null, new ProtocolStatus(e));
     }
   }
 
-  /** 
+  /**
    * Quick way for running this class. Useful for debugging.
    */
   public static void main(String[] args) throws Exception {
@@ -162,7 +171,7 @@ public class File implements Protocol {
       System.err.println(usage);
       System.exit(-1);
     }
-      
+
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-logLevel")) {
         logLevel = args[++i];
@@ -170,7 +179,7 @@ public class File implements Protocol {
         maxContentLength = Integer.parseInt(args[++i]);
       } else if (args[i].equals("-dumpContent")) {
         dumpContent = true;
-      } else if (i != args.length-1) {
+      } else if (i != args.length - 1) {
         System.err.println(usage);
         System.exit(-1);
       } else
@@ -184,18 +193,19 @@ public class File implements Protocol {
       file.setMaxContentLength(maxContentLength);
 
     // set log level
-    //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
+    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
 
-    ProtocolOutput output = file.getProtocolOutput(new Text(urlString), new CrawlDatum());
+    ProtocolOutput output = file.getProtocolOutput(new Text(urlString),
+        new CrawlDatum());
     Content content = output.getContent();
 
     System.err.println("URL: " + content.getUrl());
     System.err.println("Status: " + output.getStatus());
     System.err.println("Content-Type: " + content.getContentType());
-    System.err.println("Content-Length: " +
-                       content.getMetadata().get(Response.CONTENT_LENGTH));
-    System.err.println("Last-Modified: " +
-                       content.getMetadata().get(Response.LAST_MODIFIED));
+    System.err.println("Content-Length: "
+        + content.getMetadata().get(Response.CONTENT_LENGTH));
+    System.err.println("Last-Modified: "
+        + content.getMetadata().get(Response.LAST_MODIFIED));
     String redirectLocation = content.getMetadata().get("Location");
     if (redirectLocation != null) {
       System.err.println("Location: " + redirectLocation);
@@ -208,12 +218,11 @@ public class File implements Protocol {
     file = null;
   }
 
-  /** 
-   * No robots parsing is done for file protocol. 
-   * So this returns a set of empty rules which will allow every url.
+  /**
+   * No robots parsing is done for file protocol. So this returns a set of empty
+   * rules which will allow every url.
    */
   public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
     return RobotRulesParser.EMPTY_RULES;
   }
 }
-

Modified: 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
 Thu Jan 29 05:38:59 2015
@@ -17,13 +17,16 @@
 
 package org.apache.nutch.protocol.file;
 
-/** Thrown for File error codes.
+/**
+ * Thrown for File error codes.
  */
 public class FileError extends FileException {
 
   private int code;
-  
-  public int getCode(int code) { return code; }
+
+  public int getCode(int code) {
+    return code;
+  }
 
   public FileError(int code) {
     super("File Error: " + code);

Modified: 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 Thu Jan 29 05:38:59 2015
@@ -95,6 +95,7 @@ public class FileResponse {
 
   /**
    * Default public constructor
+   * 
    * @param url
    * @param datum
    * @param file
@@ -103,13 +104,13 @@ public class FileResponse {
    * @throws IOException
    */
   public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
-    throws FileException, IOException {
+      throws FileException, IOException {
 
     this.orig = url.toString();
     this.base = url.toString();
     this.file = file;
     this.conf = conf;
-    
+
     MIME = new MimeUtil(conf);
     tika = new Tika();
 
@@ -139,16 +140,16 @@ public class FileResponse {
       this.content = null;
 
       // url.toURI() is only in j2se 1.5.0
-      //java.io.File f = new java.io.File(url.toURI());
+      // java.io.File f = new java.io.File(url.toURI());
       java.io.File f = new java.io.File(path);
 
       if (!f.exists()) {
-        this.code = 404;  // http Not Found
+        this.code = 404; // http Not Found
         return;
       }
 
       if (!f.canRead()) {
-        this.code = 401;  // http Unauthorized
+        this.code = 401; // http Unauthorized
         return;
       }
 
@@ -157,20 +158,23 @@ public class FileResponse {
       // where case is insensitive
       if (!f.equals(f.getCanonicalFile())) {
         // set headers
-        //hdrs.put("Location", f.getCanonicalFile().toURI());
+        // hdrs.put("Location", f.getCanonicalFile().toURI());
         //
-        // we want to automatically escape characters that are illegal in URLs. 
-        // It is recommended that new code convert an abstract pathname into a URL 
-        // by first converting it into a URI, via the toURI method, and then 
+        // we want to automatically escape characters that are illegal in URLs.
+        // It is recommended that new code convert an abstract pathname into a
+        // URL
+        // by first converting it into a URI, via the toURI method, and then
         // converting the URI into a URL via the URI.toURL method.
-        headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());
+        headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL()
+            .toString());
 
-        this.code = 300;  // http redirect
+        this.code = 300; // http redirect
         return;
       }
       if (f.lastModified() <= datum.getModifiedTime()) {
         this.code = 304;
-        this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
+        this.headers.set("Last-Modified",
+            HttpDateFormat.toString(f.lastModified()));
         return;
       }
 
@@ -240,6 +244,7 @@ public class FileResponse {
 
   /**
    * get dir list as http response
+   * 
    * @param f
    * @throws IOException
    */
@@ -265,6 +270,7 @@ public class FileResponse {
 
   /**
    * generate html page from dir list
+   * 
    * @param list
    * @param path
    * @param includeDotDot

Modified: 
nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
 Thu Jan 29 05:38:59 2015
@@ -49,7 +49,8 @@ public class TestProtocolFile {
   private String sampleDir = System.getProperty("test.data", ".");
 
   private static final String[] testTextFiles = new String[] {
-    "testprotocolfile.txt", "testprotocolfile_(encoded).txt", "testprotocolfile_%28encoded%29.txt" };
+      "testprotocolfile.txt", "testprotocolfile_(encoded).txt",
+      "testprotocolfile_%28encoded%29.txt" };
 
   private static final CrawlDatum datum = new CrawlDatum();
 
@@ -90,8 +91,8 @@ public class TestProtocolFile {
     Assert.assertNotNull(output.getContent().getContentType());
     Assert.assertEquals(expectedMimeType, output.getContent().getContentType());
     Assert.assertNotNull(output.getContent().getMetadata());
-    Assert.assertEquals(expectedMimeType,
-        output.getContent().getMetadata().get(Response.CONTENT_TYPE));
+    Assert.assertEquals(expectedMimeType, output.getContent().getMetadata()
+        .get(Response.CONTENT_TYPE));
 
   }

svn commit: r1655526 [19/26] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...

Reply via email to