Author: jukka Date: Fri Dec 21 01:12:20 2007 New Revision: 606140 URL: http://svn.apache.org/viewvc?rev=606140&view=rev Log: TIKA-106 - Remove dependency on Jakarta ORO - use JDK 1.4 Regex - Patch from Niall Pemberton
Added: incubator/tika/trunk/src/test/java/org/apache/tika/utils/ incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java Modified: incubator/tika/trunk/CHANGES.txt incubator/tika/trunk/pom.xml incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java Modified: incubator/tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=606140&r1=606139&r2=606140&view=diff ============================================================================== --- incubator/tika/trunk/CHANGES.txt (original) +++ incubator/tika/trunk/CHANGES.txt Fri Dec 21 01:12:20 2007 @@ -137,3 +137,6 @@ 62. TIKA-104 - Add utility methods to throw IOException with the caused intialized (jukka & Niall Pemberton) + +63. TIKA-106 - Remove dependency on Jakarta ORO - use JDK 1.4 Regex + (Niall Pemberton) Modified: incubator/tika/trunk/pom.xml URL: http://svn.apache.org/viewvc/incubator/tika/trunk/pom.xml?rev=606140&r1=606139&r2=606140&view=diff ============================================================================== --- incubator/tika/trunk/pom.xml (original) +++ incubator/tika/trunk/pom.xml Fri Dec 21 01:12:20 2007 @@ -195,11 +195,6 @@ <version>1.1.1</version> </dependency> <dependency> - <groupId>oro</groupId> - <artifactId>oro</artifactId> - <version>2.0.8</version> - </dependency> - <dependency> <groupId>nekohtml</groupId> <artifactId>nekohtml</artifactId> <version>0.9.5</version> Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java?rev=606140&r1=606139&r2=606140&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java (original) +++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java Fri Dec 21 01:12:20 2007 @@ -20,7 +20,6 @@ import java.io.InputStream; import java.io.StringWriter; -import org.apache.oro.text.regex.MalformedPatternException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.sax.TeeContentHandler; @@ -38,11 +37,6 @@ */ public class ParserPostProcessor extends ParserDecorator { - private static final String LINK_PATTERN = - "([A-Za-z][A-Za-z0-9+.-]{1,120}:" - + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}" - + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; - /** * Creates a post-processing decorator for the given parser. * @@ -70,12 +64,8 @@ int length = Math.min(content.length(), 500); metadata.set("summary", content.substring(0, length)); - try { - for (String link : RegexUtils.extract(content, LINK_PATTERN)) { - metadata.add("outlinks", link); - } - } catch (MalformedPatternException e) { - throw new TikaException("Malformed URL pattern", e); + for (String link : RegexUtils.extractLinks(content)) { + metadata.add("outlinks", link); } } Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java?rev=606140&r1=606139&r2=606140&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/RegexUtils.java Fri Dec 21 01:12:20 2007 @@ -17,17 +17,10 @@ package org.apache.tika.utils; import java.util.ArrayList; +import java.util.Collections; import java.util.List; - -import org.apache.log4j.Logger; -import org.apache.oro.text.regex.MalformedPatternException; -import org.apache.oro.text.regex.MatchResult; -import org.apache.oro.text.regex.Pattern; -import 
org.apache.oro.text.regex.PatternCompiler; -import org.apache.oro.text.regex.PatternMatcher; -import org.apache.oro.text.regex.PatternMatcherInput; -import org.apache.oro.text.regex.Perl5Compiler; -import org.apache.oro.text.regex.Perl5Matcher; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * Inspired from Nutch code class OutlinkExtractor. Apply regex to extract @@ -37,32 +30,37 @@ */ public class RegexUtils { - static Logger logger = Logger.getRootLogger(); - - public static List<String> extract(String content, String regex) - throws MalformedPatternException { + /** + * Regex pattern to get URLs within a plain text. + * + * @see <a + * href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html + * </a> + */ + private static final String LINKS_REGEX = + "([A-Za-z][A-Za-z0-9+.-]{1,120}:" + + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}" + + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; + + private static final Pattern LINKS_PATTERN = Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE + Pattern.MULTILINE); + + /** + * Extract urls from plain text. 
+ * + * @param content The plain text content to examine + * @return List of urls within found in the plain text + */ + public static List<String> extractLinks(String content) { + if (content == null || content.length() == 0) { + return Collections.emptyList(); + } List<String> extractions = new ArrayList<String>(); - final PatternCompiler cp = new Perl5Compiler(); - final Pattern pattern = cp.compile(regex, - Perl5Compiler.CASE_INSENSITIVE_MASK - | Perl5Compiler.READ_ONLY_MASK - | Perl5Compiler.MULTILINE_MASK); - final PatternMatcher matcher = new Perl5Matcher(); - - final PatternMatcherInput input = new PatternMatcherInput(content); - - MatchResult result; - String extractedContent; - - while (matcher.contains(input, pattern)) { - result = matcher.getMatch(); - extractedContent = result.group(0); - extractions.add(extractedContent); + final Matcher matcher = LINKS_PATTERN.matcher(content); + while (matcher.find()) { + extractions.add(matcher.group()); } - return extractions; } - } Added: incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java?rev=606140&view=auto ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java (added) +++ incubator/tika/trunk/src/test/java/org/apache/tika/utils/RegexUtilsTest.java Fri Dec 21 01:12:20 2007 @@ -0,0 +1,79 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.utils; + +import java.util.List; +import junit.framework.TestCase; + +/** + * Test case for {@link RegexUtils}. + * + * @version $Revision$ $Date$ + */ +public class RegexUtilsTest extends TestCase { + + /** + * Test {@link RegexUtils#extractLinks(String)} with no links. + */ + + public void testExtractLinksNone() { + List<String> links = null; + + links = RegexUtils.extractLinks(null); + assertNotNull(links); + assertEquals(0, links.size()); + + links = RegexUtils.extractLinks(""); + assertNotNull(links); + assertEquals(0, links.size()); + + links = RegexUtils.extractLinks( + "Test with no links " + + "What about www.google.com"); + assertNotNull(links); + assertEquals(0, links.size()); + } + + + /** + * Test {@link RegexUtils#extractLinks(String)} for http. + */ + public void testExtractLinksHttp() { + List<String> links = RegexUtils.extractLinks( + "Test with http://www.nutch.org/index.html is it found? " + + "What about www.google.com at http://www.google.de " + + "A longer URL could be http://www.sybit.com/solutions/portals.html"); + + assertTrue("Url not found!", links.size() == 3); + assertEquals("Wrong URL", "http://www.nutch.org/index.html", links.get(0)); + assertEquals("Wrong URL", "http://www.google.de", links.get(1)); + assertEquals("Wrong URL", "http://www.sybit.com/solutions/portals.html", links.get(2)); + } + + /** + * Test {@link RegexUtils#extractLinks(String)} for ftp. 
+ */ + public void testExtractLinksFtp() { + List<String> links = RegexUtils.extractLinks( + "Test with ftp://www.nutch.org is it found? " + + "What about www.google.com at ftp://www.google.de"); + + assertTrue("Url not found!", links.size() == 2); + assertEquals("Wrong URL", "ftp://www.nutch.org", links.get(0)); + assertEquals("Wrong URL", "ftp://www.google.de", links.get(1)); + } +}