Author: siren Date: Sat Mar 10 13:39:04 2007 New Revision: 516788 URL: http://svn.apache.org/viewvc?view=rev&rev=516788 Log: Change parse-js to use regular expressions from the JRE, add a JUnit test, and move package.html to its proper place.
Added: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html - copied unchanged from r516662, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html lucene/nutch/trunk/src/plugin/parse-js/src/test/ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/JSParseFilterTest.java Removed: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html Modified: lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=516788&r1=516787&r2=516788 ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Sat Mar 10 13:39:04 2007 @@ -89,6 +89,7 @@ <ant dir="ontology" target="test"/> <!--ant dir="parse-ext" target="test"/--> <ant dir="parse-html" target="test"/> + <ant dir="parse-js" target="test"/> <!-- <ant dir="parse-mp3" target="test"/> --> <ant dir="parse-msexcel" target="test"/> <ant dir="parse-mspowerpoint" target="test"/> Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=516788&r1=516787&r2=516788 ============================================================================== --- 
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sat Mar 10 13:39:04 2007 @@ -25,6 +25,8 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -40,13 +42,6 @@ import org.apache.nutch.protocol.Content; import org.apache.nutch.util.NutchConfiguration; import org.apache.hadoop.conf.Configuration; -import org.apache.oro.text.regex.MatchResult; -import org.apache.oro.text.regex.Pattern; -import org.apache.oro.text.regex.PatternCompiler; -import org.apache.oro.text.regex.PatternMatcher; -import org.apache.oro.text.regex.PatternMatcherInput; -import org.apache.oro.text.regex.Perl5Compiler; -import org.apache.oro.text.regex.Perl5Matcher; import org.w3c.dom.DocumentFragment; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; @@ -54,11 +49,24 @@ import org.w3c.dom.NodeList; /** - * This class is a heuristic link extractor for JavaScript files and - * code snippets. The general idea of a two-pass regex matching comes from - * Heritrix. Parts of the code come from OutlinkExtractor.java - * by Stephan Strittmatter. - * + * <p> + * This class is a heuristic link extractor for JavaScript files and code + * snippets. The general idea of a two-pass regex matching comes from Heritrix. + * Parts of the code come from OutlinkExtractor.java by Stephan Strittmatter. 
+ * </p> + * + * <p> + * This Filter extracts javascript from the following locations: + * </p> + * <li>from inside &lt;script&gt; tags</li> + * <li>from html 4.0 events like Window: onload,onunload, Form: + * onchange,onsubmit,onreset,onselect,onblur,onfocus Keyboard: + * onkeydown,onkeypress,onkeyup Mouse: + * onclick,ondblclick,onmousedown,onmouseout,onmouseover,onmouseup + * </li> + * <li>a href starting with literal "javascript"</li> + * + * + * @author Andrzej Bialecki <[EMAIL PROTECTED]> */ public class JSParseFilter implements HtmlParseFilter, Parser { @@ -97,6 +105,7 @@ Node lNode = n.getAttributes().getNamedItem("language"); if (lNode == null) lang = "javascript"; else lang = lNode.getNodeValue(); + //XXX lang is not checked?? StringBuffer script = new StringBuffer(); NodeList nn = n.getChildNodes(); if (nn.getLength() > 0) { @@ -104,9 +113,9 @@ if (i > 0) script.append('\n'); script.append(nn.item(i).getNodeValue()); } - // if (LOG.isInfoEnabled()) { - // LOG.info("script: language=" + lang + ", text: " + script.toString()); - // } + if (LOG.isDebugEnabled()) { + LOG.info("script: language=" + lang + ", text: " + script.toString()); + } Outlink[] links = getJSLinks(script.toString(), "", base); if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links)); // no other children of interest here, go one level up. @@ -175,7 +184,7 @@ /** * This method extracts URLs from literals embedded in JavaScript. 
*/ - private Outlink[] getJSLinks(String plainText, String anchor, String base) { + Outlink[] getJSLinks(String plainText, String anchor, String base) { final List outlinks = new ArrayList(); URL baseURL = null; @@ -187,30 +196,27 @@ } try { - final PatternCompiler cp = new Perl5Compiler(); - final Pattern pattern = cp.compile(STRING_PATTERN, - Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK - | Perl5Compiler.MULTILINE_MASK); - final Pattern pattern1 = cp.compile(URI_PATTERN, - Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK - | Perl5Compiler.MULTILINE_MASK); - final PatternMatcher matcher = new Perl5Matcher(); - - final PatternMatcher matcher1 = new Perl5Matcher(); - final PatternMatcherInput input = new PatternMatcherInput(plainText); + final Pattern stringPattern = Pattern.compile(STRING_PATTERN, + Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); + final Pattern urlPattern = Pattern.compile(URI_PATTERN, + Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); + + final Matcher quoted = stringPattern.matcher(plainText); - MatchResult result; String url; //loop the matches - while (matcher.contains(input, pattern)) { - result = matcher.getMatch(); - url = result.group(2); - PatternMatcherInput input1 = new PatternMatcherInput(url); - if (!matcher1.matches(input1, pattern1)) { + while (quoted.find()) { + String quotedString = quoted.group(2); + Matcher urls = urlPattern.matcher(quotedString); + + if (!urls.find()) { //if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); } continue; } + + url = urls.group(); + if (url.startsWith("www.")) { url = "http://" + url; } else { Added: lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/JSParseFilterTest.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/JSParseFilterTest.java?view=auto&rev=516788 ============================================================================== --- 
lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/JSParseFilterTest.java (added) +++ lucene/nutch/trunk/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/JSParseFilterTest.java Sat Mar 10 13:39:04 2007 @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse.js; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class JSParseFilterTest extends TestCase { + + static Configuration conf = NutchConfiguration.create(); + + public void testExtractUrlFromEmbeddedHTML() { + String js = "// embedded html\n" + + "html = \"<a href=\\\"http://www.example.com\\\">example</a>"; + assertFound(js, "http://www.example.com", "http://www.example.com/"); + } + + public void testExtractUrlFromParameterValue() { + String js = "// parameter\n" + "url = \"http://www.example.com\""; + assertFound(js, "http://www.example.com", "http://www.example.com/"); + } + + public void testExtractUrlFromFunctionParameter() { + String js = "load('http://www.example.com/');return false;"; + assertFound(js, "http://www.example.com", "http://www.example.com/"); + + js = "load(\"http://www.example.com/\");"; + assertFound(js, "http://www.example.com", "http://www.example.com/"); + + js = "load(\"http://www.example.com/\");"; + assertFound(js, "http://www.example.com", "http://www.example.com/"); + } + + public void testExtractUrlMissingProtocol() { + String js = "// parameter\n" + "url = \"www.example.com\""; + assertFound(js, "http://www.example.com", "http://www.example.com/"); + } + + public void testExtractUrlServerRelative() { + String js = "// parameter\n" + "url = \"/page.html\""; + assertFound(js, "http://www.example.com/foo/bar/", + "http://www.example.com/page.html"); + } + + public void testExtractUrlRelative() { + String js = "// parameter\n" + "url = \"page.html\""; + assertFound(js, "http://www.example.com", + "http://www.example.com/page.html"); + } + + private void assertFound(String js, String base, String expected) { + System.out.println("jS:" + js); + JSParseFilter filter = new JSParseFilter(); + filter.setConf(conf); + Outlink[] links = filter.getJSLinks(js, "anchor", base); 
+ assertSame(1, links.length); + assertEquals(expected + "!='" + links[0].getToUrl() + "'", expected, + links[0].getToUrl()); + } + +} ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs