Added Regex Utility to aid in parsing of content
Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/124e01e9 Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/124e01e9 Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/124e01e9 Branch: refs/heads/master Commit: 124e01e9c3b89c500a877f31fc90ff168f7b52fa Parents: 65822f2 Author: mfranklin <[email protected]> Authored: Tue May 13 12:23:38 2014 -0400 Committer: mfranklin <[email protected]> Committed: Wed May 14 11:31:09 2014 -0400 ---------------------------------------------------------------------- .../org/apache/streams/regex/RegexUtils.java | 83 ++++++++++++++++++++ .../apache/streams/regex/RegexUtilsTest.java | 76 ++++++++++++++++++ 2 files changed, 159 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/124e01e9/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java new file mode 100644 index 0000000..bf5c03a --- /dev/null +++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.streams.regex; + +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Provides utilities for extracting matches from content + */ +public class RegexUtils { + + private static final Map<String, Pattern> patternCache = new ConcurrentHashMap<String, Pattern>(); + + private RegexUtils() {} + + /** + * Extracts matches of the given pattern in the content and returns them as a list. + * @param pattern the pattern for the substring to match. For example, [0-9]* matches 911 in Emergency number is 911. + * @param content the complete content to find matches in. + * @return a non-null list of matches. + */ + public static List<String> extractMatches(String pattern, String content) { + return getMatches(pattern, content, -1); + } + + /** + * Extracts matches of the given pattern that are bounded by separation characters and returns them as a list. + * @param pattern the pattern for the substring to match. For example, [0-9]* matches 911 in Emergency number is 911. + * @param content the complete content to find matches in. + * @return a non-null list of matches. + */ + public static List<String> extractWordMatches(String pattern, String content) { + pattern = "(^|\\s)(" + pattern + ")([\\s!\\.;,]|$)"; + return getMatches(pattern, content, 2); + } + + protected static List<String> getMatches(String pattern, String content, int capture) { + Matcher m = getPattern(pattern).matcher(content); + List<String> result = new LinkedList<String>(); + while(m.find()) { + String group = capture > 0 ? m.group(capture) : m.group(); + if(group != null && !group.equals("")) { + result.add(group); + } + } + return result; + } + + private static Pattern getPattern(String pattern) { + Pattern p; + if (patternCache.containsKey(pattern)) { + p = patternCache.get(pattern); + } else { + p = Pattern.compile(pattern); + patternCache.put(pattern, p); + } + return p; + } + + +} http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/124e01e9/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java new file mode 100644 index 0000000..7dad4ca --- /dev/null +++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.streams.regex; + + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import static org.hamcrest.CoreMatchers.equalTo; +import static org.hamcrest.CoreMatchers.is; +import static org.junit.Assert.assertThat; + + +@RunWith(Parameterized.class) +public class RegexUtilsTest { + + private final String pattern; + private final String content; + private final int wordMatchCount; + private final int regularMatchCount; + + public RegexUtilsTest(String pattern, String content, int regularMatchCount, int wordMatchCount) { + this.pattern = pattern; + this.content = content; + this.wordMatchCount = wordMatchCount; + this.regularMatchCount = regularMatchCount; + } + + @Parameterized.Parameters + public static Collection<Object[]> parameters() { + return Arrays.asList(new Object[][]{ + {"[0-9]*", "The number for emergencies is 911.", 1, 1}, + {"#\\w+", "This is#freakingcrazydude.", 1, 0}, + {"#\\w+", "This is #freakingcrazydude.", 1, 1}, + {"#\\w+", "This is #freakingcrazydude!", 1, 1}, + {"#\\w+", "This is #freakingcrazydude I went to the moon", 1, 1}, + {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude party", 2, 2}, + {"#\\w+", "This is#freakingcrazydude I went to the #freakingcrazydude party", 2, 1}, + {"#\\w+", "#what does the fox say?", 1, 1}, + {"#\\w+", "#what does the fox #say", 2, 2} + }); + } + + + @Test + public void testMatches_simple() { + List<String> wordResults = RegexUtils.extractWordMatches(this.pattern, this.content); + assertThat(wordResults.size(), is(equalTo(wordMatchCount))); + + List<String> regularResults = RegexUtils.extractMatches(this.pattern, this.content); + assertThat(regularResults.size(), is(equalTo(regularMatchCount))); + } + +}
