Added Regex Utility to aid in parsing of content

Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/124e01e9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/124e01e9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/124e01e9

Branch: refs/heads/master
Commit: 124e01e9c3b89c500a877f31fc90ff168f7b52fa
Parents: 65822f2
Author: mfranklin <[email protected]>
Authored: Tue May 13 12:23:38 2014 -0400
Committer: mfranklin <[email protected]>
Committed: Wed May 14 11:31:09 2014 -0400

----------------------------------------------------------------------
 .../org/apache/streams/regex/RegexUtils.java    | 83 ++++++++++++++++++++
 .../apache/streams/regex/RegexUtilsTest.java    | 76 ++++++++++++++++++
 2 files changed, 159 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/124e01e9/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
new file mode 100644
index 0000000..bf5c03a
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
/**
 * Static helpers for extracting regular-expression matches from text content.
 *
 * <p>Compiled patterns are kept in a {@link ConcurrentHashMap} keyed by the pattern string so
 * that repeated calls with the same expression do not recompile it. Entries are never evicted,
 * so the cache holds one entry per distinct pattern string seen.
 */
public class RegexUtils {

    private static final Map<String, Pattern> patternCache = new ConcurrentHashMap<String, Pattern>();

    private RegexUtils() {}

    /**
     * Extracts every non-empty match of the given pattern in the content.
     *
     * @param pattern the pattern for the substring to match. For example, {@code [0-9]*}
     *                matches {@code 911} in "Emergency number is 911."
     * @param content the complete content to find matches in; {@code null} yields an empty list.
     * @return a non-null list of matches, in the order they occur in the content.
     * @throws NullPointerException if {@code pattern} is null
     * @throws java.util.regex.PatternSyntaxException if {@code pattern} is not a valid expression
     */
    public static List<String> extractMatches(String pattern, String content) {
        return getMatches(pattern, content, -1);
    }

    /**
     * Extracts every non-empty match of the given pattern that is bounded by separators:
     * preceded by the start of the content or whitespace, and followed by whitespace, one of
     * {@code ! . ; ,}, or the end of the content.
     *
     * @param pattern the pattern for the substring to match. For example, {@code [0-9]*}
     *                matches {@code 911} in "Emergency number is 911."
     * @param content the complete content to find matches in; {@code null} yields an empty list.
     * @return a non-null list of matches, in the order they occur in the content.
     * @throws NullPointerException if {@code pattern} is null
     * @throws java.util.regex.PatternSyntaxException if the bounded expression is not valid
     */
    public static List<String> extractWordMatches(String pattern, String content) {
        if (pattern == null) {
            // String concatenation would otherwise turn null into the literal text "null".
            throw new NullPointerException("pattern must not be null");
        }
        // The trailing separator is a lookahead so that it is not consumed: a single space
        // between two adjacent words must remain available as the leading separator of the
        // next match (e.g. both tags in "#a #b"). Group 2 is still the caller's pattern.
        String bounded = "(^|\\s)(" + pattern + ")(?=[\\s!\\.;,]|$)";
        return getMatches(bounded, content, 2);
    }

    /**
     * Runs the pattern over the content and collects the non-empty matches.
     *
     * @param pattern the expression to apply.
     * @param content the text to search; {@code null} yields an empty list.
     * @param capture the capture group to collect when greater than zero; otherwise the
     *                whole match is collected.
     * @return a non-null list of the non-null, non-empty matched strings.
     */
    protected static List<String> getMatches(String pattern, String content, int capture) {
        // Resolve the pattern first so that a null or invalid pattern is still reported
        // even when there is no content to search.
        Pattern compiled = getPattern(pattern);
        List<String> result = new LinkedList<String>();
        if (content == null) {
            return result;
        }
        Matcher m = compiled.matcher(content);
        while (m.find()) {
            String group = capture > 0 ? m.group(capture) : m.group();
            // Patterns such as [0-9]* match the empty string at most positions; skip those.
            if (group != null && !group.equals("")) {
                result.add(group);
            }
        }
        return result;
    }

    /**
     * Returns the compiled form of the pattern, compiling and caching it on first use.
     * Two threads racing on the same new pattern may both compile it; either result is
     * equivalent, so the last write simply wins.
     */
    private static Pattern getPattern(String pattern) {
        Pattern p = patternCache.get(pattern);
        if (p == null) {
            p = Pattern.compile(pattern);
            patternCache.put(pattern, p);
        }
        return p;
    }

}

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/124e01e9/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
new file mode 100644
index 0000000..7dad4ca
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+
+@RunWith(Parameterized.class)
+public class RegexUtilsTest {
+
+    private final String pattern;
+    private final String content;
+    private final int wordMatchCount;
+    private final int regularMatchCount;
+
+    public RegexUtilsTest(String pattern, String content, int 
regularMatchCount, int wordMatchCount) {
+        this.pattern = pattern;
+        this.content = content;
+        this.wordMatchCount = wordMatchCount;
+        this.regularMatchCount = regularMatchCount;
+    }
+
+    @Parameterized.Parameters
+    public static Collection<Object[]> parameters() {
+        return Arrays.asList(new Object[][]{
+                {"[0-9]*", "The number for emergencies is 911.", 1, 1},
+                {"#\\w+", "This is#freakingcrazydude.", 1, 0},
+                {"#\\w+", "This is #freakingcrazydude.", 1, 1},
+                {"#\\w+", "This is #freakingcrazydude!", 1, 1},
+                {"#\\w+", "This is #freakingcrazydude I went to the moon", 1, 
1},
+                {"#\\w+", "This is #freakingcrazydude I went to the 
#freakingcrazydude party", 2, 2},
+                {"#\\w+", "This is#freakingcrazydude I went to the 
#freakingcrazydude party", 2, 1},
+                {"#\\w+", "#what does the fox say?", 1, 1},
+                {"#\\w+", "#what does the fox #say", 2, 2}
+        });
+    }
+
+
+    @Test
+    public void testMatches_simple() {
+        List<String> wordResults = RegexUtils.extractWordMatches(this.pattern, 
this.content);
+        assertThat(wordResults.size(), is(equalTo(wordMatchCount)));
+
+        List<String> regularResults = RegexUtils.extractMatches(this.pattern, 
this.content);
+        assertThat(regularResults.size(), is(equalTo(regularMatchCount)));
+    }
+
+}

Reply via email to