refactored hashtag processor to use new abstraction
Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/37d378de Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/37d378de Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/37d378de Branch: refs/heads/master Commit: 37d378de5d6acca0b6e906cc46322dcdb0ab0f08 Parents: 1bbfaca Author: mfranklin <[email protected]> Authored: Wed May 14 10:23:01 2014 -0400 Committer: mfranklin <[email protected]> Committed: Wed May 14 11:31:10 2014 -0400 ---------------------------------------------------------------------- .../streams/regex/RegexHashtagExtractor.java | 64 ++------------------ .../regex/RegexHashtagExtractorTest.java | 4 +- 2 files changed, 8 insertions(+), 60 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/37d378de/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java index fe392b7..1e565c8 100644 --- a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java +++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java @@ -19,79 +19,25 @@ package org.apache.streams.regex; -import com.google.common.base.Strings; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import org.apache.streams.core.StreamsDatum; import org.apache.streams.core.StreamsProcessor; -import org.apache.streams.pojo.json.Activity; - -import java.util.List; -import java.util.Map; -import java.util.Set; - -import 
static org.apache.streams.data.util.ActivityUtil.ensureExtensions; /** * Processes the content of an {@link org.apache.streams.pojo.json.Activity} object to extract the Hashtags and add * them to the appropriate extensions object */ -public class RegexHashtagExtractor implements StreamsProcessor{ +public class RegexHashtagExtractor extends AbstractRegexExtensionExtractor<String> implements StreamsProcessor{ public final static String DEFAULT_PATTERN = "#\\w+"; public final static String PATTERN_CONFIG_KEY = "HashtagPattern"; public final static String EXTENSION_KEY = "hashtags"; - private String hashPattern; - - public String getHashPattern() { - return hashPattern; - } - - @Override - public List<StreamsDatum> process(StreamsDatum entry) { - if(!(entry.getDocument() instanceof Activity)) { - return Lists.newArrayList(); - } - if(Strings.isNullOrEmpty(hashPattern)) { - prepare(null); - } - Activity activity = (Activity)entry.getDocument(); - Map<String, List<Integer>> matches = RegexUtils.extractMatches(hashPattern, activity.getContent()); - Set<String> hashtags = ensureHashtagsExtension(activity); - for(String key : matches.keySet()) { - hashtags.add(key.substring(1)); - } - return Lists.newArrayList(entry); + public RegexHashtagExtractor() { + super(PATTERN_CONFIG_KEY, EXTENSION_KEY, DEFAULT_PATTERN); } - @Override - public void prepare(Object configurationObject) { - if(configurationObject instanceof Map) { - if(((Map)configurationObject).containsKey(PATTERN_CONFIG_KEY)) { - hashPattern = (String)((Map)configurationObject).get(PATTERN_CONFIG_KEY); - } - } else if(configurationObject instanceof String) { - hashPattern = (String)configurationObject; - } else { - hashPattern = DEFAULT_PATTERN; - } - } @Override - public void cleanUp() { - //NOP - } - - protected Set<String> ensureHashtagsExtension(Activity activity) { - Map<String, Object> extensions = ensureExtensions(activity); - Set<String> hashtags; - if(extensions.containsKey(EXTENSION_KEY)) { - hashtags = 
Sets.newHashSet((Iterable<String>) extensions.get(EXTENSION_KEY)); - } else { - hashtags = Sets.newHashSet(); - extensions.put(EXTENSION_KEY, hashtags); - } - return hashtags; + protected String prepareObject(String extracted) { + return extracted.substring(1); } } http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/37d378de/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java index d2912b7..55e007e 100644 --- a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java +++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java @@ -55,11 +55,13 @@ public class RegexHashtagExtractorTest { {"This is the #content of a standard tweet", Sets.newHashSet("content")}, {"This is the content of a standard tweet", Sets.newHashSet()}, {"This is the #content of a standard #tweet", Sets.newHashSet("content", "tweet")}, - {"This is the body of a #fbpost. It can have multiple lines of #content, as well as much more detailed and flowery #language.", Sets.newHashSet("content", "fbpost", "language")} + {"UNIX 时间1400000000 秒…… （该睡觉了，各位夜猫子）#程序员#", Sets.newHashSet("程序员")}, + {"This is the body of a #fbpost. It can have multiple lines of #content, as well as much more detailed and flowery #language.", Sets.newHashSet("content", "fbpost", "language")} }); } @Test + @SuppressWarnings("unchecked") public void testExtraction() { StreamsDatum datum = new StreamsDatum(activity, "Test"); List<StreamsDatum> result = new RegexHashtagExtractor().process(datum);
