Added hashtag processor
Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/204977ec Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/204977ec Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/204977ec Branch: refs/heads/master Commit: 204977ec87d039b12e89e51098d7137f19f5f7ab Parents: 4ae10fd Author: mfranklin <[email protected]> Authored: Tue May 13 16:10:55 2014 -0400 Committer: mfranklin <[email protected]> Committed: Wed May 14 11:31:10 2014 -0400 ---------------------------------------------------------------------- streams-contrib/streams-processor-regex/pom.xml | 11 ++- .../streams/regex/RegexHashtagExtractor.java | 97 ++++++++++++++++++++ .../org/apache/streams/regex/RegexUtils.java | 26 ++++-- .../regex/RegexHashtagExtractorTest.java | 72 +++++++++++++++ .../apache/streams/regex/RegexUtilsTest.java | 14 +-- 5 files changed, 204 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/pom.xml ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/pom.xml b/streams-contrib/streams-processor-regex/pom.xml index 094013d..57f661a 100644 --- a/streams-contrib/streams-processor-regex/pom.xml +++ b/streams-contrib/streams-processor-regex/pom.xml @@ -30,5 +30,14 @@ <artifactId>streams-processor-regex</artifactId> - + <dependencies> + <dependency> + <groupId>org.apache.streams</groupId> + <artifactId>streams-pojo</artifactId> + </dependency> + <dependency> + <groupId>org.apache.streams</groupId> + <artifactId>streams-core</artifactId> + </dependency> + </dependencies> </project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java new file mode 100644 index 0000000..fe392b7 --- /dev/null +++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.streams.regex; + +import com.google.common.base.Strings; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import org.apache.streams.core.StreamsDatum; +import org.apache.streams.core.StreamsProcessor; +import org.apache.streams.pojo.json.Activity; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.apache.streams.data.util.ActivityUtil.ensureExtensions; + +/** + * Processes the content of an {@link org.apache.streams.pojo.json.Activity} object to extract the Hashtags and add + * them to the appropriate extensions object + */ +public class RegexHashtagExtractor implements StreamsProcessor{ + + public final static String DEFAULT_PATTERN = "#\\w+"; + public final static String PATTERN_CONFIG_KEY = "HashtagPattern"; + public final static String EXTENSION_KEY = "hashtags"; + + private String hashPattern; + + public String getHashPattern() { + return hashPattern; + } + + @Override + public List<StreamsDatum> process(StreamsDatum entry) { + if(!(entry.getDocument() instanceof Activity)) { + return Lists.newArrayList(); + } + if(Strings.isNullOrEmpty(hashPattern)) { + prepare(null); + } + Activity activity = (Activity)entry.getDocument(); + Map<String, List<Integer>> matches = RegexUtils.extractMatches(hashPattern, activity.getContent()); + Set<String> hashtags = ensureHashtagsExtension(activity); + for(String key : matches.keySet()) { + hashtags.add(key.substring(1)); + } + return Lists.newArrayList(entry); + } + + @Override + public void prepare(Object configurationObject) { + if(configurationObject instanceof Map) { + if(((Map)configurationObject).containsKey(PATTERN_CONFIG_KEY)) { + hashPattern = (String)((Map)configurationObject).get(PATTERN_CONFIG_KEY); + } + } else if(configurationObject instanceof String) { + hashPattern = (String)configurationObject; + } else { + hashPattern = DEFAULT_PATTERN; + } + } + + @Override + public void cleanUp() { + //NOP + } + + protected Set<String> ensureHashtagsExtension(Activity activity) { + Map<String, Object> extensions = ensureExtensions(activity); + Set<String> hashtags; + if(extensions.containsKey(EXTENSION_KEY)) { + hashtags = Sets.newHashSet((Iterable<String>) extensions.get(EXTENSION_KEY)); + } else { + hashtags = Sets.newHashSet(); + extensions.put(EXTENSION_KEY, hashtags); + } + return hashtags; + } +} http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java index 41c3ee5..662fc98 100644 --- a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java +++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUtils.java @@ -19,10 +19,11 @@ package org.apache.streams.regex; -import java.util.LinkedList; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + import java.util.List; import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -31,7 +32,7 @@ import java.util.regex.Pattern; */ public class RegexUtils { - private static final Map<String, Pattern> patternCache = new ConcurrentHashMap<String, Pattern>(); + private static final Map<String, Pattern> patternCache = Maps.newConcurrentMap(); private RegexUtils() {} @@ -41,7 +42,7 @@ public class RegexUtils { * @param content the complete content to find matches in. * @return a non-null list of matches. */ - public static List<String> extractMatches(String pattern, String content) { + public static Map<String, List<Integer>> extractMatches(String pattern, String content) { return getMatches(pattern, content, -1); } @@ -51,21 +52,28 @@ public class RegexUtils { * @param content the complete content to find matches in. * @return a non-null list of matches. */ - public static List<String> extractWordMatches(String pattern, String content) { + public static Map<String, List<Integer>> extractWordMatches(String pattern, String content) { pattern = "(^|\\s)(" + pattern + ")([\\s!\\.;,?]|$)"; return getMatches(pattern, content, 2); } - protected static List<String> getMatches(String pattern, String content, int capture) { + protected static Map<String, List<Integer>> getMatches(String pattern, String content, int capture) { Matcher m = getPattern(pattern).matcher(content); - List<String> result = new LinkedList<String>(); + Map<String, List<Integer>> matches = Maps.newHashMap(); while(m.find()) { String group = capture > 0 ? m.group(capture) : m.group(); if(group != null && !group.equals("")) { - result.add(group); + List<Integer> indices; + if(matches.containsKey(group)) { + indices = matches.get(group); + } else { + indices = Lists.newArrayList(); + matches.put(group, indices); + } + indices.add(m.start()); } } - return result; + return matches; } private static Pattern getPattern(String pattern) { http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java new file mode 100644 index 0000000..d2912b7 --- /dev/null +++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.streams.regex; + + +import com.google.common.collect.Sets; +import org.apache.streams.core.StreamsDatum; +import org.apache.streams.pojo.json.Activity; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Set; + +import static org.apache.streams.data.util.ActivityUtil.ensureExtensions; +import static org.hamcrest.CoreMatchers.equalTo; +import static org.hamcrest.CoreMatchers.is; +import static org.junit.Assert.assertThat; + +@RunWith(Parameterized.class) +public class RegexHashtagExtractorTest { + + private Activity activity; + private Set<String> hashtags; + + public RegexHashtagExtractorTest(String activityContent, Set<String> hashtags) { + this.activity = new Activity(); + this.activity.setContent(activityContent); + this.hashtags = hashtags; + } + + @Parameterized.Parameters + public static Collection<Object[]> params() { + return Arrays.asList(new Object[][]{ + {"This is the #content of a standard tweet", Sets.newHashSet("content")}, + {"This is the content of a standard tweet", Sets.newHashSet()}, + {"This is the #content of a standard #tweet", Sets.newHashSet("content", "tweet")}, + {"This is the body of a #fbpost. It can have multiple lines of #content, as well as much more detailed and flowery #language.", Sets.newHashSet("content", "fbpost", "language")} + }); + } + + @Test + public void testExtraction() { + StreamsDatum datum = new StreamsDatum(activity, "Test"); + List<StreamsDatum> result = new RegexHashtagExtractor().process(datum); + assertThat(result.size(), is(equalTo(1))); + Activity output = (Activity)result.get(0).getDocument(); + Set<String> extracted = (Set) ensureExtensions(output).get(RegexHashtagExtractor.EXTENSION_KEY); + Sets.SetView<String> diff = Sets.difference(extracted, hashtags); + assertThat(diff.size(), is(equalTo(0))); + } +} http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/204977ec/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java index eed1327..fc2b9f6 100644 --- a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java +++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUtilsTest.java @@ -27,6 +27,7 @@ import org.junit.runners.Parameterized; import java.util.Arrays; import java.util.Collection; import java.util.List; +import java.util.Map; import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.CoreMatchers.is; @@ -56,10 +57,11 @@ public class RegexUtilsTest { {"#\\w+", "This is #freakingcrazydude.", 1, 1}, {"#\\w+", "This is #freakingcrazydude!", 1, 1}, {"#\\w+", "This is #freakingcrazydude I went to the moon", 1, 1}, - {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude? party", 2, 2}, - {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude; party", 2, 2}, - {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude, party", 2, 2}, - {"#\\w+", "This is#freakingcrazydude I went to the #freakingcrazydude party", 2, 1}, + {"#\\w+", "This is #freakingcrazydude I went to the #crazy? party", 2, 2}, + {"#\\w+", "This is #freakingcrazydude I went to the #crazy; party", 2, 2}, + {"#\\w+", "This is #freakingcrazydude I went to the #crazy, party", 2, 2}, + {"#\\w+", "This is#freakingcrazydude I went to the #crazy party", 2, 1}, + {"#\\w+", "This is #freakingcrazydude I went to the #freakingcrazydude party", 1, 1}, {"#\\w+", "#what does the fox say?", 1, 1}, {"#\\w+", "#what does the fox #say", 2, 2} }); @@ -68,10 +70,10 @@ public class RegexUtilsTest { @Test public void testMatches_simple() { - List<String> wordResults = RegexUtils.extractWordMatches(this.pattern, this.content); + Map<String, List<Integer>> wordResults = RegexUtils.extractWordMatches(this.pattern, this.content); assertThat(wordResults.size(), is(equalTo(wordMatchCount))); - List<String> regularResults = RegexUtils.extractMatches(this.pattern, this.content); + Map<String, List<Integer>> regularResults = RegexUtils.extractMatches(this.pattern, this.content); assertThat(regularResults.size(), is(equalTo(regularMatchCount))); }
