s...

lewismc Thu, 08 Jan 2015 22:35:31 -0800

Modified: 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
 Fri Jan  9 06:34:33 2015
@@ -36,326 +36,253 @@ import org.junit.Before;
 import org.junit.Test;
 import static org.junit.Assert.*;
 
-/** 
+/**
  * Unit tests for DOMContentUtils.
  */
 public class TestDOMContentUtils {
 
-  private static final String[] testPages= { 
-    new String("<html><head><title> title </title><script> script </script>"
-               + "</head><body> body <a href=\"http://www.nutch.org\";>"
-               + " anchor </a><!--comment-->"
-               + "</body></html>"),
-    new String("<html><head><title> title </title><script> script </script>"
-               + "</head><body> body <a href=\"/\">"
-               + " home </a><!--comment-->"
-               + "<style> style </style>"
-               + " <a href=\"bot.html\">"
-               + " bots </a>"
-               + "</body></html>"),
-    new String("<html><head><title> </title>"
-               + "</head><body> "
-               + "<a href=\"/\"> separate this "
-               + "<a href=\"ok\"> from this"
-               + "</a></a>"
-               + "</body></html>"),
-    // this one relies on certain neko fixup behavior, possibly
-    // distributing the anchors into the LI's-but not the other
-    // anchors (outside of them, instead)!  So you get a tree that
-    // looks like:
-    // ... <li> <a href=/> home </a> </li>
-    //     <li> <a href=/> <a href="1"> 1 </a> </a> </li>
-    //     <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
-    new String("<html><head><title> my title </title>"
-               + "</head><body> body "
-               + "<ul>"
-               + "<li> <a href=\"/\"> home"
-               + "<li> <a href=\"1\"> 1"
-               + "<li> <a href=\"2\"> 2"
-               + "</ul>"
-               + "</body></html>"),
-    // test frameset link extraction. The invalid frame in the middle will be
-    // fixed to a third standalone frame.
-    new String("<html><head><title> my title </title>"
-               + "</head><frameset rows=\"20,*\"> "
-               + "<frame src=\"top.html\">"
-               + "</frame>"
-               + "<frameset cols=\"20,*\">"
-               + "<frame src=\"left.html\">"
-               + "<frame src=\"invalid.html\"/>"
-               + "</frame>"
-               + "<frame src=\"right.html\">"
-               + "</frame>"
-               + "</frameset>"
-               + "</frameset>"
-               + "</body></html>"),
-    // test <area> and <iframe> link extraction + url normalization
-    new String("<html><head><title> my title </title>"
-               + "</head><body>"
-               + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
-                          + "<map name=\"green\">"
-                          + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" 
href=\"../index.html\">"
-                          + "<area shape=\"rect\" coords=\"128,132,241,179\" 
href=\"#bottom\">"
-                          + "<area shape=\"circle\" coords=\"68,211,35\" 
href=\"../bot.html\">"
-                          + "</map>"
-               + "<a name=\"bottom\"/><h1> the bottom </h1> "
-               + "<iframe src=\"../docs/index.html\"/>"
-               + "</body></html>"),
-    // test whitespace processing for plain text extraction
-    new String("<html><head>\n <title> my\t\n  title\r\n </title>\n"
-               + " </head>\n"
-               + " <body>\n"
-               + "    <h1> Whitespace\ttest  </h1> \n"
-               + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a> 
 \t\n"
-               + "    <p> This is<span> a whitespace<span></span> test</span>. 
Newlines\n"
-               + "should appear as space too.</p><p>Tabs\tare spaces 
too.\n</p>"
-               + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> 
break</i>.<br>\n"
-               + "<table>"
-               + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
-               + "    <tr><td>space here </td><td> space there</td><td>no 
space</td></tr>"
-               + 
"\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
-               + "</table>put some text here<Br>and there."
-               + "<h2>End\tthis\rmadness\n!</h2>\r\n"
-               + "         .        .        .         ."
-               + "</body>  </html>"),
-
-    // test that <a rel=nofollow> links are not returned
-    new String("<html><head></head><body>"
-               + "<a href=\"http://www.nutch.org\"; rel=\"nofollow\"> ignore 
</a>"
-               + "<a rel=\"nofollow\" href=\"http://www.nutch.org\";> ignore 
</a>"
-               + "</body></html>"),
-    // test that POST form actions are skipped
-    new String("<html><head></head><body>"
-            + "<form method='POST' action='/search.jsp'><input type=text>"
-            + "<input type=submit><p>test1</p></form>"
-            + "<form method='GET' action='/dummy.jsp'><input type=text>"
-            + "<input type=submit><p>test2</p></form></body></html>"),
-    // test that all form actions are skipped
-    new String("<html><head></head><body>"
-            + "<form method='POST' action='/search.jsp'><input type=text>"
-            + "<input type=submit><p>test1</p></form>"
-            + "<form method='GET' action='/dummy.jsp'><input type=text>"
-            + "<input type=submit><p>test2</p></form></body></html>"),
-    new String("<html><head><title> title </title>"
-      + "</head><body>"
-      + "<a href=\";x\">anchor1</a>"
-      + "<a href=\"g;x\">anchor2</a>"
-      + "<a href=\"g;x?y#s\">anchor3</a>"
-      + "</body></html>"),  
-    new String("<html><head><title> title </title>"
-        + "</head><body>"
-        + "<a href=\"g\">anchor1</a>"
-        + "<a href=\"g?y#s\">anchor2</a>"
-        + "<a href=\"?y=1\">anchor3</a>"
-        + "<a href=\"?y=1#s\">anchor4</a>"
-        + "<a href=\"?y=1;somethingelse\">anchor5</a>"
-        + "</body></html>"), 
-  };
-  
+  private static final String[] testPages = {
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"http://www.nutch.org\">"
+          + " anchor </a><!--comment-->" + "</body></html>"),
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+          + "</body></html>"),
+      new String("<html><head><title> </title>" + "</head><body> "
+          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+          + "</a></a>" + "</body></html>"),
+      // this one relies on certain neko fixup behavior, possibly
+      // distributing the anchors into the LI's-but not the other
+      // anchors (outside of them, instead)! So you get a tree that
+      // looks like:
+      // ... <li> <a href=/> home </a> </li>
+      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+      new String("<html><head><title> my title </title>"
+          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+          + "</body></html>"),
+      // test frameset link extraction. The invalid frame in the middle will be
+      // fixed to a third standalone frame.
+      new String("<html><head><title> my title </title>"
+          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+          + "</frame>" + "<frameset cols=\"20,*\">"
+          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+          + "</frameset>" + "</frameset>" + "</body></html>"),
+      // test <area> and <iframe> link extraction + url normalization
+      new String(
+          "<html><head><title> my title </title>"
+              + "</head><body>"
+              + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+              + "<map name=\"green\">"
+              + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" 
href=\"../index.html\">"
+              + "<area shape=\"rect\" coords=\"128,132,241,179\" 
href=\"#bottom\">"
+              + "<area shape=\"circle\" coords=\"68,211,35\" 
href=\"../bot.html\">"
+              + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+              + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+      // test whitespace processing for plain text extraction
+      new String(
+          "<html><head>\n <title> my\t\n  title\r\n </title>\n"
+              + " </head>\n"
+              + " <body>\n"
+              + "    <h1> Whitespace\ttest  </h1> \n"
+              + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  
\t\n"
+              + "    <p> This is<span> a whitespace<span></span> test</span>. 
Newlines\n"
+              + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+              + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> 
break</i>.<br>\n"
+              + "<table>"
+              + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+              + "    <tr><td>space here </td><td> space there</td><td>no 
space</td></tr>"
+              + 
"\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+              + "</table>put some text here<Br>and there."
+              + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+              + "         .        .        .         ." + "</body>  </html>"),
+
+      // test that <a rel=nofollow> links are not returned
+      new String("<html><head></head><body>"
+          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+          + "</body></html>"),
+      // test that POST form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      // test that all form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), 
};
+
   private static int SKIP = 9;
 
-  private static String[] testBaseHrefs= {
-    "http://www.nutch.org";,     
-    "http://www.nutch.org/docs/foo.html";,     
-    "http://www.nutch.org/docs/";,     
-    "http://www.nutch.org/docs/";,
-    "http://www.nutch.org/frames/";,     
-    "http://www.nutch.org/maps/";,
-    "http://www.nutch.org/whitespace/";,
-    "http://www.nutch.org//";,
-    "http://www.nutch.org/";,
-    "http://www.nutch.org/";,
-    "http://www.nutch.org/";,
-    "http://www.nutch.org/;something";
-  };
-    
-  private static final DocumentFragment testDOMs[]=
-    new DocumentFragment[testPages.length];
-
-  private static URL[] testBaseHrefURLs= 
-    new URL[testPages.length];
-
-
-  private static final String[] answerText= {
-    "title body anchor",
-    "title body home bots",
-    "separate this from this",
-    "my title body home 1 2",
-    "my title",
-    "my title the bottom",
-    "my title Whitespace test whitespace test "
-        + "This is a whitespace test . Newlines should appear as space too. "
-        + "Tabs are spaces too. This is a break -> and the line after break . "
-        + "one two three space here space there no space "
-        + "one two two three three four put some text here and there. "
-        + "End this madness ! . . . .",
-    "ignore ignore",
-    "test1 test2",
-    "test1 test2",
-    "title anchor1 anchor2 anchor3",
-    "title anchor1 anchor2 anchor3 anchor4 anchor5"
-  };
-
-  private static final String[] answerTitle= {
-    "title",
-    "title",
-    "",
-    "my title",
-    "my title",
-    "my title",
-    "my title",
-    "",
-    "",
-    "",
-    "title",
-    "title"
-  };
+  private static String[] testBaseHrefs = { "http://www.nutch.org",
+      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+      "http://www.nutch.org//", "http://www.nutch.org/",
+      "http://www.nutch.org/", "http://www.nutch.org/",
+      "http://www.nutch.org/;something" };
+
+  private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+  private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+  private static final String[] answerText = {
+      "title body anchor",
+      "title body home bots",
+      "separate this from this",
+      "my title body home 1 2",
+      "my title",
+      "my title the bottom",
+      "my title Whitespace test whitespace test "
+          + "This is a whitespace test . Newlines should appear as space too. "
+          + "Tabs are spaces too. This is a break -> and the line after break 
. "
+          + "one two three space here space there no space "
+          + "one two two three three four put some text here and there. "
+          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+      "test1 test2", "title anchor1 anchor2 anchor3",
+      "title anchor1 anchor2 anchor3 anchor4 anchor5" };
+
+  private static final String[] answerTitle = { "title", "title", "",
+      "my title", "my title", "my title", "my title", "", "", "", "title",
+      "title" };
 
   // note: should be in page-order
   private static Outlink[][] answerOutlinks;
-  
+
   private static Configuration conf;
   private static DOMContentUtils utils = null;
-  
+
   @Before
   public void setup() {
     conf = NutchConfiguration.create();
     conf.setBoolean("parser.html.form.use_action", true);
     utils = new DOMContentUtils(conf);
-    DOMFragmentParser parser= new DOMFragmentParser();
+    DOMFragmentParser parser = new DOMFragmentParser();
     try {
-      parser.setFeature(
-          
"http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe";,
-          true);
-    } catch (SAXException e) {}
-    for (int i= 0; i < testPages.length; i++) {
-        DocumentFragment node= 
-          new HTMLDocumentImpl().createDocumentFragment();
-        try {
-          parser.parse(
-            new InputSource( 
-              new ByteArrayInputStream(testPages[i].getBytes()) ),
+      parser
+          .setFeature(
+              "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+              true);
+    } catch (SAXException e) {
+    }
+    for (int i = 0; i < testPages.length; i++) {
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+      try {
+        parser.parse(
+            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
             node);
-          testBaseHrefURLs[i]= new URL(testBaseHrefs[i]);
-        } catch (Exception e) {
-          assertTrue("caught exception: " + e, false);
-        } 
-      testDOMs[i]= node;
+        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+      } catch (Exception e) {
+        assertTrue("caught exception: " + e, false);
+      }
+      testDOMs[i] = node;
     }
     try {
-    answerOutlinks = new Outlink[][]{ 
-        {
-          new Outlink("http://www.nutch.org";, "anchor"),
-        },
-        {
-          new Outlink("http://www.nutch.org/";, "home"),
-          new Outlink("http://www.nutch.org/docs/bot.html";, "bots"),
-        },
-        {
-          new Outlink("http://www.nutch.org/";, "separate this"),
-          new Outlink("http://www.nutch.org/docs/ok";, "from this"),
-        },
-        {
-          new Outlink("http://www.nutch.org/";, "home"),
-          new Outlink("http://www.nutch.org/docs/1";, "1"),
-          new Outlink("http://www.nutch.org/docs/2";, "2"),
-        },
-        {
-          new Outlink("http://www.nutch.org/frames/top.html";, ""),
-          new Outlink("http://www.nutch.org/frames/left.html";, ""),
-          new Outlink("http://www.nutch.org/frames/invalid.html";, ""),
-          new Outlink("http://www.nutch.org/frames/right.html";, ""),
-        },
-        {
-          new Outlink("http://www.nutch.org/maps/logo.gif";, ""),
-          new Outlink("http://www.nutch.org/index.html";, ""),
-          new Outlink("http://www.nutch.org/maps/#bottom";, ""),
-          new Outlink("http://www.nutch.org/bot.html";, ""),
-          new Outlink("http://www.nutch.org/docs/index.html";, ""),
-        },
-        {
-          new Outlink("http://www.nutch.org/index.html";, "whitespace test"),
-        },
-        {
-        },
-        {
-          new Outlink("http://www.nutch.org/dummy.jsp";, "test2"),
-        },
-        {
-        },
-        {
-          new Outlink("http://www.nutch.org/;x";, "anchor1"),
-          new Outlink("http://www.nutch.org/g;x";, "anchor2"),
-          new Outlink("http://www.nutch.org/g;x?y#s";, "anchor3")
-        },
-        {
-          // this is tricky - see RFC3986 section 5.4.1 example 7
-          new Outlink("http://www.nutch.org/g";, "anchor1"),
-          new Outlink("http://www.nutch.org/g?y#s";, "anchor2"),
-          new Outlink("http://www.nutch.org/;something?y=1";, "anchor3"),
-          new Outlink("http://www.nutch.org/;something?y=1#s";, "anchor4"),
-          new Outlink("http://www.nutch.org/;something?y=1;somethingelse";, 
"anchor5")
-        }
-    };
+      answerOutlinks = new Outlink[][] {
+          { new Outlink("http://www.nutch.org", "anchor"), },
+          { new Outlink("http://www.nutch.org/", "home"),
+              new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+          { new Outlink("http://www.nutch.org/", "separate this"),
+              new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+          { new Outlink("http://www.nutch.org/", "home"),
+              new Outlink("http://www.nutch.org/docs/1", "1"),
+              new Outlink("http://www.nutch.org/docs/2", "2"), },
+          { new Outlink("http://www.nutch.org/frames/top.html", ""),
+              new Outlink("http://www.nutch.org/frames/left.html", ""),
+              new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+              new Outlink("http://www.nutch.org/frames/right.html", ""), },
+          { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+              new Outlink("http://www.nutch.org/index.html", ""),
+              new Outlink("http://www.nutch.org/maps/#bottom", ""),
+              new Outlink("http://www.nutch.org/bot.html", ""),
+              new Outlink("http://www.nutch.org/docs/index.html", ""), },
+          { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+          {},
+          { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+          {},
+          { new Outlink("http://www.nutch.org/;x", "anchor1"),
+              new Outlink("http://www.nutch.org/g;x", "anchor2"),
+              new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+          {
+              // this is tricky - see RFC3986 section 5.4.1 example 7
+              new Outlink("http://www.nutch.org/g", "anchor1"),
+              new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+              new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+              new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+              new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+                  "anchor5") } };
 
     } catch (MalformedURLException e) {
-        
-  }
+
+    }
   }
 
   private static boolean equalsIgnoreWhitespace(String s1, String s2) {
-    StringTokenizer st1= new StringTokenizer(s1);
-    StringTokenizer st2= new StringTokenizer(s2);
+    StringTokenizer st1 = new StringTokenizer(s1);
+    StringTokenizer st2 = new StringTokenizer(s2);
 
     while (st1.hasMoreTokens()) {
-      if (!st2.hasMoreTokens()) 
+      if (!st2.hasMoreTokens())
         return false;
-      if ( ! st1.nextToken().equals(st2.nextToken()) )
+      if (!st1.nextToken().equals(st2.nextToken()))
         return false;
     }
-    if (st2.hasMoreTokens()) 
+    if (st2.hasMoreTokens())
       return false;
     return true;
   }
 
   @Test
   public void testGetText() {
-    if (testDOMs[0] == null) 
+    if (testDOMs[0] == null)
       setup();
-    for (int i= 0; i < testPages.length; i++) {
-      StringBuilder sb= new StringBuilder();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuilder sb = new StringBuilder();
       utils.getText(sb, testDOMs[i]);
-      String text= sb.toString();
-      assertTrue("expecting text: " + answerText[i] 
-                 + System.getProperty("line.separator") 
-                 + System.getProperty("line.separator") 
-                 + "got text: "+ text, 
-                 equalsIgnoreWhitespace(answerText[i], text));
+      String text = sb.toString();
+      assertTrue(
+          "expecting text: " + answerText[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got text: " + text,
+          equalsIgnoreWhitespace(answerText[i], text));
     }
   }
 
   @Test
   public void testGetTitle() {
-    if (testDOMs[0] == null) 
+    if (testDOMs[0] == null)
       setup();
-    for (int i= 0; i < testPages.length; i++) {
-      StringBuilder sb= new StringBuilder();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuilder sb = new StringBuilder();
       utils.getTitle(sb, testDOMs[i]);
-      String text= sb.toString();
-      assertTrue("expecting text: " + answerText[i] 
-                 + System.getProperty("line.separator") 
-                 + System.getProperty("line.separator") 
-                 + "got text: "+ text, 
-                 equalsIgnoreWhitespace(answerTitle[i], text));
+      String text = sb.toString();
+      assertTrue(
+          "expecting text: " + answerText[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got text: " + text,
+          equalsIgnoreWhitespace(answerTitle[i], text));
     }
   }
 
   @Test
   public void testGetOutlinks() {
-    if (testDOMs[0] == null) 
+    if (testDOMs[0] == null)
       setup();
-    for (int i= 0; i < testPages.length; i++) {
-      ArrayList<Outlink> outlinks= new ArrayList<Outlink>();
+    for (int i = 0; i < testPages.length; i++) {
+      ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
       if (i == SKIP) {
         conf.setBoolean("parser.html.form.use_action", false);
         utils.setConf(conf);
@@ -364,52 +291,48 @@ public class TestDOMContentUtils {
         utils.setConf(conf);
       }
       utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
-      Outlink[] outlinkArr= new Outlink[outlinks.size()];
-      outlinkArr= outlinks.toArray(outlinkArr);
+      Outlink[] outlinkArr = new Outlink[outlinks.size()];
+      outlinkArr = outlinks.toArray(outlinkArr);
       compareOutlinks(answerOutlinks[i], outlinkArr);
     }
   }
 
   private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
-    for (int i= 0; i < o.length; i++) {
+    for (int i = 0; i < o.length; i++) {
       sb.append(o[i].toString());
       sb.append(System.getProperty("line.separator"));
     }
   }
 
   private static final String outlinksString(Outlink[] o) {
-    StringBuffer sb= new StringBuffer();
+    StringBuffer sb = new StringBuffer();
     appendOutlinks(sb, o);
     return sb.toString();
   }
 
   private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
     if (o1.length != o2.length) {
-      assertTrue("got wrong number of outlinks (expecting " + o1.length 
-                 + ", got " + o2.length + ")" 
-                 + System.getProperty("line.separator") 
-                 + "answer: " + System.getProperty("line.separator") 
-                 + outlinksString(o1) 
-                 + System.getProperty("line.separator") 
-                 + "got: " + System.getProperty("line.separator") 
-                 + outlinksString(o2)
-                 + System.getProperty("line.separator"),
-                 false
-        );
+      assertTrue(
+          "got wrong number of outlinks (expecting " + o1.length + ", got "
+              + o2.length + ")" + System.getProperty("line.separator")
+              + "answer: " + System.getProperty("line.separator")
+              + outlinksString(o1) + System.getProperty("line.separator")
+              + "got: " + System.getProperty("line.separator")
+              + outlinksString(o2) + System.getProperty("line.separator"),
+          false);
     }
 
-    for (int i= 0; i < o1.length; i++) {
+    for (int i = 0; i < o1.length; i++) {
       if (!o1[i].equals(o2[i])) {
-        assertTrue("got wrong outlinks at position " + i
-                   + System.getProperty("line.separator") 
-                   + "answer: " + System.getProperty("line.separator") 
-                   + "'" + o1[i].getToUrl() + "', anchor: '" + 
o1[i].getAnchor() + "'"
-                   + System.getProperty("line.separator") 
-                   + "got: " + System.getProperty("line.separator") 
-                   + "'" + o2[i].getToUrl() + "', anchor: '" + 
o2[i].getAnchor() + "'",
-                   false
-          );
-        
+        assertTrue(
+            "got wrong outlinks at position " + i
+                + System.getProperty("line.separator") + "answer: "
+                + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
+                + "', anchor: '" + o1[i].getAnchor() + "'"
+                + System.getProperty("line.separator") + "got: "
+                + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
+                + "', anchor: '" + o2[i].getAnchor() + "'", false);
+
       }
     }
   }


Modified: 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
 Fri Jan  9 06:34:33 2015
@@ -36,71 +36,55 @@ import static org.junit.Assert.*;
 
 public class TestHtmlParser {
 
-  public static final Logger LOG = 
LoggerFactory.getLogger(TestHtmlParser.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(TestHtmlParser.class);
+
+  private static final String encodingTestKeywords = "français, español, русский язык, čeština, ελληνικά";
+  private static final String encodingTestBody = "<ul>\n  <li>français\n  <li>español\n  <li>русский язык\n  <li>čeština\n  <li>ελληνικά\n</ul>";
+  private static final String encodingTestContent = "<title>"
+      + encodingTestKeywords + "</title>\n"
+      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords
+      + "</meta>\n" + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+
+  private static String[][] encodingTestPages = {
+      {
+          "HTML4, utf-8, meta http-equiv, no quotes",
+          "utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
+              + encodingTestContent },
+      {
+          "HTML4, utf-8, meta http-equiv, single quotes",
+          "utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
+              + encodingTestContent },
+      {
+          "XHTML, utf-8, meta http-equiv, double quotes",
+          "utf-8",
+          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+              + encodingTestContent },
+      {
+          "HTML5, utf-8, meta charset",
+          "utf-8",
+          "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
+              + encodingTestContent },
+      { "HTML5, utf-8, BOM", "utf-8",
+          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
+      { "HTML5, utf-16, BOM", "utf-16",
+          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
 
-  private static final String encodingTestKeywords = 
-      "français, español, русский язык, čeština, ελληνικά";
-  private static final String encodingTestBody =
-      "<ul>\n  <li>français\n  <li>español\n  <li>русский язык\n  <li>čeština\n  <li>ελληνικά\n</ul>";
-  private static final String encodingTestContent =
-      "<title>" + encodingTestKeywords + "</title>\n"
-          + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + 
"</meta>\n"
-          + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
-
-  private static String[][] encodingTestPages= {
-    { 
-      "HTML4, utf-8, meta http-equiv, no quotes",
-      "utf-8",
-      "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
-          + "\"http://www.w3.org/TR/html4/loose.dtd\";>\n"
-          + "<html>\n<head>\n"
-          + "<meta http-equiv=Content-Type content=\"text/html; 
charset=utf-8\" />"
-          + encodingTestContent
-    },
-    { 
-      "HTML4, utf-8, meta http-equiv, single quotes",
-      "utf-8",
-      "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
-          + "\"http://www.w3.org/TR/html4/loose.dtd\";>\n"
-          + "<html>\n<head>\n"
-          + "<meta http-equiv='Content-Type' content='text/html; 
charset=utf-8' />"
-          + encodingTestContent
-    },
-    { 
-      "XHTML, utf-8, meta http-equiv, double quotes",
-      "utf-8",
-      "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\";>"
-          + "<html>\n<head>\n"
-          + "<meta http-equiv=\"Content-Type\" content=\"text/html; 
charset=utf-8\" />"
-          + encodingTestContent
-    },
-    { 
-      "HTML5, utf-8, meta charset",
-      "utf-8",
-      "<!DOCTYPE html>\n<html>\n<head>\n"
-          + "<meta charset=\"utf-8\">"
-          + encodingTestContent
-    },
-    { 
-      "HTML5, utf-8, BOM",
-      "utf-8",
-      "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
-          + encodingTestContent
-    },
-    { 
-      "HTML5, utf-16, BOM",
-      "utf-16",
-      "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
-          + encodingTestContent
-    }
-  };
-  
   private Configuration conf;
   private Parser parser;
-  
+
   private static final String dummyUrl = "http://dummy.url/";
 
-  
   @Before
   public void setup() {
     conf = NutchConfiguration.create();
@@ -115,25 +99,25 @@ public class TestHtmlParser {
     page.setContentType(new Utf8("text/html"));
     return page;
   }
-  
+
   protected Parse parse(WebPage page) {
     return parser.getParse(dummyUrl, page);
   }
 
-
   @Test
   public void testEncodingDetection() {
     for (String[] testPage : encodingTestPages) {
       String name = testPage[0];
       Charset charset = Charset.forName(testPage[1]);
       byte[] contentBytes = testPage[2].getBytes(charset);
-      //Parse parse = parse(contentBytes);
+      // Parse parse = parse(contentBytes);
       WebPage page = page(contentBytes);
       Parse parse = parse(page);
       String text = parse.getText();
       String title = parse.getTitle();
-      //String keywords = parse.getMeta("keywords");
-      String keywords = Bytes.toString(page.getMetadata().get(new Utf8("keywords")));
+      // String keywords = parse.getMeta("keywords");
+      String keywords = Bytes.toString(page.getMetadata().get(
+          new Utf8("keywords")));
       LOG.info(name);
       LOG.info("title:\t" + title);
       LOG.info("keywords:\t" + keywords);

Modified: 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
 Fri Jan  9 06:34:33 2015
@@ -34,120 +34,96 @@ import static org.junit.Assert.*;
 public class TestRobotsMetaProcessor {
 
   /*
-
-  some sample tags:
-
-  <meta name="robots" content="index,follow">
-  <meta name="robots" content="noindex,follow">
-  <meta name="robots" content="index,nofollow">
-  <meta name="robots" content="noindex,nofollow">
-
-  <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
-
-  */
-
-
-  public static String[] tests= 
-  {
-    "<html><head><title>test page</title>"
-    + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
-    + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
-    + "</head><body>"
-    + " some text"
-    + "</body></html>",
-
-    "<html><head><title>test page</title>"
-    + "<meta name=\"robots\" content=\"all\"> "
-    + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
-    + "</head><body>"
-    + " some text"
-    + "</body></html>",
-
-    "<html><head><title>test page</title>"
-    + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
-    + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
-    + "</head><body>"
-    + " some text"
-    + "</body></html>",
-
-    "<html><head><title>test page</title>"
-    + "<meta name=\"robots\" content=\"none\"> "
-    + "</head><body>"
-    + " some text"
-    + "</body></html>",
-
-    "<html><head><title>test page</title>"
-    + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
-    + "</head><body>"
-    + " some text"
-    + "</body></html>",
-
-    "<html><head><title>test page</title>"
-    + "<meta name=\"robots\" content=\"noindex,follow\"> "
-    + "</head><body>"
-    + " some text"
-    + "</body></html>",
-
-    "<html><head><title>test page</title>"
-    + "<meta name=\"robots\" content=\"index,nofollow\"> "
-    + "</head><body>"
-    + " some text"
-    + "</body></html>",
-
-    "<html><head><title>test page</title>"
-    + "<meta name=\"robots\" content=\"index,follow\"> "
-    + "<base href=\"http://www.nutch.org/\">"
-    + "</head><body>"
-    + " some text"
-    + "</body></html>",
-
-    "<html><head><title>test page</title>"
-    + "<meta name=\"robots\"> "
-    + "<base href=\"http://www.nutch.org/base/\">"
-    + "</head><body>"
-    + " some text"
-    + "</body></html>",
+   * 
+   * some sample tags:
+   * 
+   * <meta name="robots" content="index,follow"> <meta name="robots"
+   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
+   * <meta name="robots" content="noindex,nofollow">
+   * 
+   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
+   */
+
+  public static String[] tests = {
+      "<html><head><title>test page</title>"
+          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"all\"> "
+          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,follow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,follow\"> "
+          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+          + " some text" + "</body></html>",
 
   };
 
-  public static final boolean[][] answers= {
-    {true, true, true},     // NONE
-    {false, false, true},   // all
-    {true, true, true},     // nOnE
-    {true, true, false},    // none
-    {true, true, false},    // noindex,nofollow
-    {true, false, false},   // noindex,follow
-    {false, true, false},   // index,nofollow
-    {false, false, false},  // index,follow
-    {false, false, false},  // missing!
+  public static final boolean[][] answers = { { true, true, true }, // NONE
+      { false, false, true }, // all
+      { true, true, true }, // nOnE
+      { true, true, false }, // none
+      { true, true, false }, // noindex,nofollow
+      { true, false, false }, // noindex,follow
+      { false, true, false }, // index,nofollow
+      { false, false, false }, // index,follow
+      { false, false, false }, // missing!
   };
 
   private URL[][] currURLsAndAnswers;
 
   @Test
   public void testRobotsMetaProcessor() {
-    DOMFragmentParser parser= new DOMFragmentParser();;
+    DOMFragmentParser parser = new DOMFragmentParser();
+    ;
 
-    try { 
-      currURLsAndAnswers= new URL[][] {
-        {new URL("http://www.nutch.org"), null},
-        {new URL("http://www.nutch.org"), null},
-        {new URL("http://www.nutch.org"), null},
-        {new URL("http://www.nutch.org"), null},
-        {new URL("http://www.nutch.org"), null},
-        {new URL("http://www.nutch.org"), null},
-        {new URL("http://www.nutch.org"), null},
-        {new URL("http://www.nutch.org/foo/"), 
-         new URL("http://www.nutch.org/")},
-        {new URL("http://www.nutch.org"), 
-         new URL("http://www.nutch.org/base/")}
-      };
+    try {
+      currURLsAndAnswers = new URL[][] {
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org/foo/"),
+              new URL("http://www.nutch.org/") },
+          { new URL("http://www.nutch.org"),
+              new URL("http://www.nutch.org/base/") } };
     } catch (Exception e) {
       assertTrue("couldn't make test URLs!", false);
     }
 
-    for (int i= 0; i < tests.length; i++) {
-      byte[] bytes= tests[i].getBytes();
+    for (int i = 0; i < tests.length; i++) {
+      byte[] bytes = tests[i].getBytes();
 
       DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
 
@@ -157,24 +133,22 @@ public class TestRobotsMetaProcessor {
         e.printStackTrace();
       }
 
-      HTMLMetaTags robotsMeta= new HTMLMetaTags();
-      HTMLMetaProcessor.getMetaTags(robotsMeta, node, 
-                                                  currURLsAndAnswers[i][0]);
+      HTMLMetaTags robotsMeta = new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
 
       assertTrue("got index wrong on test " + i,
-                 robotsMeta.getNoIndex() == answers[i][0]);
+          robotsMeta.getNoIndex() == answers[i][0]);
       assertTrue("got follow wrong on test " + i,
-                 robotsMeta.getNoFollow() == answers[i][1]);
+          robotsMeta.getNoFollow() == answers[i][1]);
       assertTrue("got cache wrong on test " + i,
-                 robotsMeta.getNoCache() == answers[i][2]);
-      assertTrue("got base href wrong on test " + i + " (got "
-                 + robotsMeta.getBaseHref() + ")",
-                 ( (robotsMeta.getBaseHref() == null)
-                    && (currURLsAndAnswers[i][1] == null) )
-                 || ( (robotsMeta.getBaseHref() != null)
-                      && robotsMeta.getBaseHref().equals(
-                        currURLsAndAnswers[i][1]) ) );
-      
+          robotsMeta.getNoCache() == answers[i][2]);
+      assertTrue(
+          "got base href wrong on test " + i + " (got "
+              + robotsMeta.getBaseHref() + ")",
+          ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
+              || ((robotsMeta.getBaseHref() != null) && robotsMeta
+                  .getBaseHref().equals(currURLsAndAnswers[i][1])));
+
     }
   }
 

Modified: 
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
 Fri Jan  9 06:34:33 2015
@@ -56,11 +56,10 @@ import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 
 /**
- * This class is a heuristic link extractor for JavaScript files and
- * code snippets. The general idea of a two-pass regex matching comes from
- * Heritrix. Parts of the code come from OutlinkExtractor.java
- * by Stephan Strittmatter.
- *
+ * This class is a heuristic link extractor for JavaScript files and code
+ * snippets. The general idea of a two-pass regex matching comes from Heritrix.
+ * Parts of the code come from OutlinkExtractor.java by Stephan Strittmatter.
+ * 
  * @author Andrzej Bialecki &lt;a...@getopt.org&gt;
  */
 public class JSParseFilter implements ParseFilter, Parser {
@@ -72,11 +71,17 @@ public class JSParseFilter implements Pa
 
   /**
    * Scan the JavaScript looking for possible {@link Outlink}'s
-   * @param url URL of the {@link WebPage} to be parsed 
-   * @param page {@link WebPage} object relative to the URL
-   * @param parse {@link Parse} object holding parse status
-   * @param metatags within the {@link NutchDocument}
-   * @param doc The {@link NutchDocument} object
+   * 
+   * @param url
+   *          URL of the {@link WebPage} to be parsed
+   * @param page
+   *          {@link WebPage} object relative to the URL
+   * @param parse
+   *          {@link Parse} object holding parse status
+   * @param metatags
+   *          within the {@link NutchDocument}
+   * @param doc
+   *          The {@link NutchDocument} object
    * @return parse the actual {@link Parse} object
    */
   @Override
@@ -98,28 +103,34 @@ public class JSParseFilter implements Pa
     return parse;
   }
 
-  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List<Outlink> outlinks) {
+  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base,
+      List<Outlink> outlinks) {
     if (n instanceof Element) {
       String name = n.getNodeName();
       if (name.equalsIgnoreCase("script")) {
         @SuppressWarnings("unused")
         String lang = null;
         Node lNode = n.getAttributes().getNamedItem("language");
-        if (lNode == null) lang = "javascript";
-        else lang = lNode.getNodeValue();
+        if (lNode == null)
+          lang = "javascript";
+        else
+          lang = lNode.getNodeValue();
         StringBuffer script = new StringBuffer();
         NodeList nn = n.getChildNodes();
         if (nn.getLength() > 0) {
           for (int i = 0; i < nn.getLength(); i++) {
-            if (i > 0) script.append('\n');
+            if (i > 0)
+              script.append('\n');
             script.append(nn.item(i).getNodeValue());
           }
           // This logging makes the output very messy.
-          //if (LOG.isInfoEnabled()) {
-          //  LOG.info("script: language=" + lang + ", text: " + script.toString());
-          //}
+          // if (LOG.isInfoEnabled()) {
+          // LOG.info("script: language=" + lang + ", text: " +
+          // script.toString());
+          // }
           Outlink[] links = getJSLinks(script.toString(), "", base);
-          if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
+          if (links != null && links.length > 0)
+            outlinks.addAll(Arrays.asList(links));
           // no other children of interest here, go one level up.
           return;
         }
@@ -131,7 +142,8 @@ public class JSParseFilter implements Pa
           // Window: onload,onunload
           // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
           // Keyboard: onkeydown,onkeypress,onkeyup
-          // Mouse: onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
+          // Mouse:
+          // onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
           Node anode = attrs.item(i);
           Outlink[] links = null;
           if (anode.getNodeName().startsWith("on")) {
@@ -142,7 +154,8 @@ public class JSParseFilter implements Pa
               links = getJSLinks(val, "", base);
             }
           }
-          if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
+          if (links != null && links.length > 0)
+            outlinks.addAll(Arrays.asList(links));
         }
       }
     }
@@ -154,42 +167,51 @@ public class JSParseFilter implements Pa
 
   /**
    * Set the {@link Configuration} object
-   * @param url URL of the {@link WebPage} which is parsed
-   * @param page {@link WebPage} object relative to the URL
+   * 
+   * @param url
+   *          URL of the {@link WebPage} which is parsed
+   * @param page
+   *          {@link WebPage} object relative to the URL
    * @return parse the actual {@link Parse} object
    */
   @Override
   public Parse getParse(String url, WebPage page) {
     String type = TableUtil.toString(page.getContentType());
-    if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
-      return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED_INVALID_FORMAT,
-          "Content not JavaScript: '" + type + "'", getConf());
+    if (type != null && !type.trim().equals("")
+        && !type.toLowerCase().startsWith("application/x-javascript"))
+      return ParseStatusUtils.getEmptyParse(
+          ParseStatusCodes.FAILED_INVALID_FORMAT, "Content not JavaScript: '"
+              + type + "'", getConf());
     String script = Bytes.toString(page.getContent());
     Outlink[] outlinks = getJSLinks(script, "", url);
-    if (outlinks == null) outlinks = new Outlink[0];
+    if (outlinks == null)
+      outlinks = new Outlink[0];
     // Title? use the first line of the script...
     String title;
     int idx = script.indexOf('\n');
     if (idx != -1) {
-      if (idx > MAX_TITLE_LEN) idx = MAX_TITLE_LEN;
+      if (idx > MAX_TITLE_LEN)
+        idx = MAX_TITLE_LEN;
       title = script.substring(0, idx);
     } else {
       idx = Math.min(MAX_TITLE_LEN, script.length());
       title = script.substring(0, idx);
     }
-    Parse parse =
-      new Parse(script, title, outlinks, ParseStatusUtils.STATUS_SUCCESS);
+    Parse parse = new Parse(script, title, outlinks,
+        ParseStatusUtils.STATUS_SUCCESS);
     return parse;
   }
 
   private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
   // A simple pattern. This allows also invalid URL characters.
   private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
+
   // Alternative pattern, which limits valid url characters.
-  //private static final String URI_PATTERN = "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
+  // private static final String URI_PATTERN =
+  // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
 
   /**
-   *  This method extracts URLs from literals embedded in JavaScript.
+   * This method extracts URLs from literals embedded in JavaScript.
    */
   private Outlink[] getJSLinks(String plainText, String anchor, String base) {
 
@@ -199,8 +221,8 @@ public class JSParseFilter implements Pa
     try {
       baseURL = new URL(base);
     } catch (Exception e) {
-      if (LOG.isErrorEnabled()) { 
-        LOG.error("error assigning base URL", e); 
+      if (LOG.isErrorEnabled()) {
+        LOG.error("error assigning base URL", e);
       }
     }
 
@@ -208,10 +230,10 @@ public class JSParseFilter implements Pa
       final PatternCompiler cp = new Perl5Compiler();
       final Pattern pattern = cp.compile(STRING_PATTERN,
           Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
-          | Perl5Compiler.MULTILINE_MASK);
+              | Perl5Compiler.MULTILINE_MASK);
       final Pattern pattern1 = cp.compile(URI_PATTERN,
           Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
-          | Perl5Compiler.MULTILINE_MASK);
+              | Perl5Compiler.MULTILINE_MASK);
       final PatternMatcher matcher = new Perl5Matcher();
 
       final PatternMatcher matcher1 = new Perl5Matcher();
@@ -220,28 +242,28 @@ public class JSParseFilter implements Pa
       MatchResult result;
       String url;
 
-      //loop the matches
+      // loop the matches
       while (matcher.contains(input, pattern)) {
         result = matcher.getMatch();
         url = result.group(2);
         PatternMatcherInput input1 = new PatternMatcherInput(url);
         if (!matcher1.matches(input1, pattern1)) {
-          if (LOG.isTraceEnabled()) { 
-               LOG.trace(" - invalid '" + url + "'"); 
+          if (LOG.isTraceEnabled()) {
+            LOG.trace(" - invalid '" + url + "'");
           }
           continue;
         }
         if (url.startsWith("www.")) {
           url = "http://" + url;
         } else {
-          // See if candidate URL is parseable.  If not, pass and move on to
+          // See if candidate URL is parseable. If not, pass and move on to
           // the next match.
           try {
             url = new URL(baseURL, url).toString();
           } catch (MalformedURLException ex) {
             if (LOG.isTraceEnabled()) {
-              LOG.trace(" - failed URL parse '" + url + "' and baseURL '" +
-                  baseURL + "'", ex);
+              LOG.trace(" - failed URL parse '" + url + "' and baseURL '"
+                  + baseURL + "'", ex);
             }
             continue;
           }
@@ -255,14 +277,14 @@ public class JSParseFilter implements Pa
     } catch (Exception ex) {
       // if it is a malformed URL we just throw it away and continue with
       // extraction.
-      if (LOG.isErrorEnabled()) { 
-        LOG.error(" - invalid or malformed URL", ex); 
+      if (LOG.isErrorEnabled()) {
+        LOG.error(" - invalid or malformed URL", ex);
       }
     }
 
     final Outlink[] retval;
 
-    //create array of the Outlinks
+    // create array of the Outlinks
     if (outlinks != null && outlinks.size() > 0) {
       retval = outlinks.toArray(new Outlink[0]);
     } else {
@@ -273,8 +295,10 @@ public class JSParseFilter implements Pa
   }
 
   /**
-   * Main method which can be run from command line with the plugin option.
-   * The method takes two arguments e.g. o.a.n.parse.js.JSParseFilter file.js baseURL  
+   * Main method which can be run from command line with the plugin option. The
+   * method takes two arguments e.g. o.a.n.parse.js.JSParseFilter file.js
+   * baseURL
+   * 
    * @param args
    * @throws Exception
    */
@@ -287,7 +311,8 @@ public class JSParseFilter implements Pa
     BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
     StringBuffer sb = new StringBuffer();
     String line = null;
-    while ((line = br.readLine()) != null) sb.append(line + "\n");
+    while ((line = br.readLine()) != null)
+      sb.append(line + "\n");
     JSParseFilter parseFilter = new JSParseFilter();
     parseFilter.setConf(NutchConfiguration.create());
     Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
@@ -311,10 +336,9 @@ public class JSParseFilter implements Pa
   }
 
   /**
-   * Gets all the fields for a given {@link WebPage}
-   * Many datastores need to setup the mapreduce job by specifying the fields
-   * needed. All extensions that work on WebPage are able to specify what fields
-   * they need.
+   * Gets all the fields for a given {@link WebPage} Many datastores need to
+   * setup the mapreduce job by specifying the fields needed. All extensions
+   * that work on WebPage are able to specify what fields they need.
    */
   @Override
   public Collection<WebPage.Field> getFields() {

Modified: 
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
 Fri Jan  9 06:34:33 2015
@@ -20,3 +20,4 @@
  * from JavaScript files and embedded JavaScript code snippets.
  */
 package org.apache.nutch.parse.js;
+

Modified: 
nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java
 Fri Jan  9 06:34:33 2015
@@ -38,9 +38,9 @@ import java.nio.ByteBuffer;
 import static org.junit.Assert.assertEquals;
 
 /**
- * JUnit test case for {@link JSParseFilter} which tests 
- * 1. That 5 outlinks are extracted from JavaScript snippets embedded in HTML
- * 2. That X outlinks are extracted from a pure JavaScript file (this is temporarily disabled)
+ * JUnit test case for {@link JSParseFilter} which tests 1. That 5 outlinks are
+ * extracted from JavaScript snippets embedded in HTML 2. That X outlinks are
+ * extracted from a pure JavaScript file (this is temporarily disabled)
  * 
  * @author lewismc
  */
@@ -54,47 +54,53 @@ public class TestJSParseFilter {
 
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-js/build.xml during plugin compilation.
-  private String[] sampleFiles = { "parse_pure_js_test.js", "parse_embedded_js_test.html" };
-         
+  private String[] sampleFiles = { "parse_pure_js_test.js",
+      "parse_embedded_js_test.html" };
+
   private Configuration conf;
-         
+
   @Before
   public void setUp() {
     conf = NutchConfiguration.create();
     conf.set("file.content.limit", "-1");
   }
 
-  public Outlink[] getOutlinks(String[] sampleFiles) throws ProtocolException, ParseException, IOException {
+  public Outlink[] getOutlinks(String[] sampleFiles) throws ProtocolException,
+      ParseException, IOException {
     String urlString;
     Parse parse;
-       
+
     urlString = "file:" + sampleDir + fileSeparator + sampleFiles;
     File file = new File(urlString);
     byte[] bytes = new byte[(int) file.length()];
     DataInputStream dip = new DataInputStream(new FileInputStream(file));
     dip.readFully(bytes);
     dip.close();
-    
+
     WebPage page = WebPage.newBuilder().build();
     page.setBaseUrl(new Utf8(urlString));
     page.setContent(ByteBuffer.wrap(bytes));
     MimeUtil mutil = new MimeUtil(conf);
     String mime = mutil.getMimeType(file);
     page.setContentType(new Utf8(mime));
-       
+
     parse = new ParseUtil(conf).parse(urlString, page);
     return parse.getOutlinks();
   }
-  
+
   @Test
-  public void testOutlinkExtraction() throws ProtocolException, ParseException, IOException {
+  public void testOutlinkExtraction() throws ProtocolException, ParseException,
+      IOException {
     String[] filenames = new File(sampleDir).list();
     for (int i = 0; i < filenames.length; i++) {
       if (filenames[i].endsWith(".js") == true) {
-        assertEquals("number of outlinks in .js test file should be 5", 5, getOutlinks(sampleFiles));
-        // temporarily disabled as a suitable pure JS file could not be be found.
-        //} else {
-        //assertEquals("number of outlinks in .html file should be X", 5, getOutlinks(sampleFiles));
+        assertEquals("number of outlinks in .js test file should be 5", 5,
+            getOutlinks(sampleFiles));
+        // temporarily disabled as a suitable pure JS file could not be be
+        // found.
+        // } else {
+        // assertEquals("number of outlinks in .html file should be X", 5,
+        // getOutlinks(sampleFiles));
       }
     }
   }

Modified: 
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
 Fri Jan  9 06:34:33 2015
@@ -21,3 +21,4 @@
  * (see {@link org.apache.nutch.indexer.metadata}).
  */
 package org.apache.nutch.parse.metatags;
+

Modified: 
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
 Fri Jan  9 06:34:33 2015
@@ -59,14 +59,15 @@ public class TestMetaTagsParser {
 
   /**
    * 
-   *
+   * 
    * @param fileName
    *          This variable set test file.
    * @param useUtil
    *          If value is True method use ParseUtil
    * @return If successfully document parsed, it return metatags
    */
-  public Map<CharSequence, ByteBuffer> parseMetaTags(String fileName, boolean useUtil) {
+  public Map<CharSequence, ByteBuffer> parseMetaTags(String fileName,
+      boolean useUtil) {
     try {
       Configuration conf = NutchConfiguration.create();
       String urlString = "file:" + sampleDir + fileSeparator + fileName;

svn commit: r1650447 [17/25] - in /nutch/branches/2.x: ./ src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/ src/java/org/apache/nutch/api/impl/db/ src/java/org/apache/nutch/api/model/response/ src/java/org/apache/nutch/api/resources/ s...

Reply via email to