Author: siren
Date: Sat Mar 10 13:39:04 2007
New Revision: 516788

URL: http://svn.apache.org/viewvc?view=rev&rev=516788
Log:
change parse-js to use regular expressions from jre, add junit test, moved 
package.html to proper place

Added:
    
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html
      - copied unchanged from r516662, 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
    lucene/nutch/trunk/src/plugin/parse-js/src/test/
    lucene/nutch/trunk/src/plugin/parse-js/src/test/org/
    lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/
    
lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/JSParseFilterTest.java
Removed:
    
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
Modified:
    lucene/nutch/trunk/src/plugin/build.xml
    
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=516788&r1=516787&r2=516788
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Sat Mar 10 13:39:04 2007
@@ -89,6 +89,7 @@
      <ant dir="ontology" target="test"/>
      <!--ant dir="parse-ext" target="test"/-->
      <ant dir="parse-html" target="test"/>
+     <ant dir="parse-js" target="test"/>
      <!-- <ant dir="parse-mp3" target="test"/> -->
      <ant dir="parse-msexcel" target="test"/>
      <ant dir="parse-mspowerpoint" target="test"/>

Modified: 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=516788&r1=516787&r2=516788
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
 Sat Mar 10 13:39:04 2007
@@ -25,6 +25,8 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -40,13 +42,6 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.Pattern;
-import org.apache.oro.text.regex.PatternCompiler;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.PatternMatcherInput;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
 import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
@@ -54,11 +49,24 @@
 import org.w3c.dom.NodeList;
 
 /**
- * This class is a heuristic link extractor for JavaScript files and
- * code snippets. The general idea of a two-pass regex matching comes from
- * Heritrix. Parts of the code come from OutlinkExtractor.java
- * by Stephan Strittmatter.
- *
+ * <p>
+ * This class is a heuristic link extractor for JavaScript files and code
+ * snippets. The general idea of a two-pass regex matching comes from Heritrix.
+ * Parts of the code come from OutlinkExtractor.java by Stephan Strittmatter.
+ * </p>
+ * 
+ * <p>
+ * This Filter extracts javascript from following locations:
+ * </p>
+ * <li>from inside &lt;script> tags</li>
+ * <li>from html 4.0 events like Window: onload,onunload, Form:
+ * onchange,onsubmit,onreset,onselect,onblur,onfocus Keyboard:
+ * onkeydown,onkeypress,onkeyup Mouse:
+ * onclick,ondblclick,onmousedown,onmouseout,onmouseover,onmouseup
+ * </li>
+ * <li>a href starting with literal "javascript"</li>
+ * 
+ * 
  * @author Andrzej Bialecki &lt;[EMAIL PROTECTED]&gt;
  */
 public class JSParseFilter implements HtmlParseFilter, Parser {
@@ -97,6 +105,7 @@
         Node lNode = n.getAttributes().getNamedItem("language");
         if (lNode == null) lang = "javascript";
         else lang = lNode.getNodeValue();
+        //XXX lang is not checked??
         StringBuffer script = new StringBuffer();
         NodeList nn = n.getChildNodes();
         if (nn.getLength() > 0) {
@@ -104,9 +113,9 @@
             if (i > 0) script.append('\n');
             script.append(nn.item(i).getNodeValue());
           }
-          // if (LOG.isInfoEnabled()) {
-          //   LOG.info("script: language=" + lang + ", text: " + 
script.toString());
-          // }
+          if (LOG.isDebugEnabled()) {
+            LOG.info("script: language=" + lang + ", text: " + 
script.toString());
+          }
           Outlink[] links = getJSLinks(script.toString(), "", base);
           if (links != null && links.length > 0) 
outlinks.addAll(Arrays.asList(links));
           // no other children of interest here, go one level up.
@@ -175,7 +184,7 @@
   /**
    *  This method extracts URLs from literals embedded in JavaScript.
    */
-  private Outlink[] getJSLinks(String plainText, String anchor, String base) {
+  Outlink[] getJSLinks(String plainText, String anchor, String base) {
 
     final List outlinks = new ArrayList();
     URL baseURL = null;
@@ -187,30 +196,27 @@
     }
 
     try {
-      final PatternCompiler cp = new Perl5Compiler();
-      final Pattern pattern = cp.compile(STRING_PATTERN,
-          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
-              | Perl5Compiler.MULTILINE_MASK);
-      final Pattern pattern1 = cp.compile(URI_PATTERN,
-              Perl5Compiler.CASE_INSENSITIVE_MASK | 
Perl5Compiler.READ_ONLY_MASK
-                  | Perl5Compiler.MULTILINE_MASK);
-      final PatternMatcher matcher = new Perl5Matcher();
-
-      final PatternMatcher matcher1 = new Perl5Matcher();
-      final PatternMatcherInput input = new PatternMatcherInput(plainText);
+      final Pattern stringPattern = Pattern.compile(STRING_PATTERN,
+          Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
+      final Pattern urlPattern = Pattern.compile(URI_PATTERN,
+              Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
+      
+      final Matcher quoted = stringPattern.matcher(plainText);
 
-      MatchResult result;
       String url;
 
       //loop the matches
-      while (matcher.contains(input, pattern)) {
-        result = matcher.getMatch();
-        url = result.group(2);
-        PatternMatcherInput input1 = new PatternMatcherInput(url);
-        if (!matcher1.matches(input1, pattern1)) {
+      while (quoted.find()) {
+        String quotedString = quoted.group(2);
+        Matcher urls = urlPattern.matcher(quotedString);
+        
+        if (!urls.find()) {
           //if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); 
}
           continue;
         }
+
+        url = urls.group();
+        
         if (url.startsWith("www.")) {
             url = "http://" + url;
         } else {

Added: 
lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/JSParseFilterTest.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/JSParseFilterTest.java?view=auto&rev=516788
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/JSParseFilterTest.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/JSParseFilterTest.java
 Sat Mar 10 13:39:04 2007
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.js;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class JSParseFilterTest extends TestCase {
+
+  static Configuration conf = NutchConfiguration.create();
+
+  public void testExtractUrlFromEmbeddedHTML() {
+    String js = "// embedded html\n"
+        + "html = \"<a href=\\\"http://www.example.com\\\">example</a>";
+    assertFound(js, "http://www.example.com", "http://www.example.com/");
+  }
+
+  public void testExtractUrlFromParameterValue() {
+    String js = "// parameter\n" + "url = \"http://www.example.com\"";
+    assertFound(js, "http://www.example.com", "http://www.example.com/");
+  }
+
+  public void testExtractUrlFromFunctionParameter() {
+    String js = "load('http://www.example.com/');return false;";
+    assertFound(js, "http://www.example.com", "http://www.example.com/");
+
+    js = "load(\"http://www.example.com/\");";
+    assertFound(js, "http://www.example.com", "http://www.example.com/");
+
+    js = "load(\"http://www.example.com/\");";
+    assertFound(js, "http://www.example.com", "http://www.example.com/");
+  }
+
+  public void testExtractUrlMissingProtocol() {
+    String js = "// parameter\n" + "url = \"www.example.com\"";
+    assertFound(js, "http://www.example.com", "http://www.example.com/");
+  }
+
+  public void testExtractUrlServerRelative() {
+    String js = "// parameter\n" + "url = \"/page.html\"";
+    assertFound(js, "http://www.example.com/foo/bar/",
+        "http://www.example.com/page.html");
+  }
+
+  public void testExtractUrlRelative() {
+    String js = "// parameter\n" + "url = \"page.html\"";
+    assertFound(js, "http://www.example.com",
+        "http://www.example.com/page.html");
+  }
+
+  private void assertFound(String js, String base, String expected) {
+    System.out.println("jS:" + js);
+    JSParseFilter filter = new JSParseFilter();
+    filter.setConf(conf);
+    Outlink[] links = filter.getJSLinks(js, "anchor", base);
+    assertSame(1, links.length);
+    assertEquals(expected + "!='" + links[0].getToUrl() + "'", expected,
+        links[0].getToUrl());
+  }
+
+}



-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to