refactored hashtag processor to use new abstraction

Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/37d378de
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/37d378de
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/37d378de

Branch: refs/heads/master
Commit: 37d378de5d6acca0b6e906cc46322dcdb0ab0f08
Parents: 1bbfaca
Author: mfranklin <[email protected]>
Authored: Wed May 14 10:23:01 2014 -0400
Committer: mfranklin <[email protected]>
Committed: Wed May 14 11:31:10 2014 -0400

----------------------------------------------------------------------
 .../streams/regex/RegexHashtagExtractor.java    | 64 ++------------------
 .../regex/RegexHashtagExtractorTest.java        |  4 +-
 2 files changed, 8 insertions(+), 60 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/37d378de/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
index fe392b7..1e565c8 100644
--- a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexHashtagExtractor.java
@@ -19,79 +19,25 @@
 
 package org.apache.streams.regex;
 
-import com.google.common.base.Strings;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-import org.apache.streams.core.StreamsDatum;
 import org.apache.streams.core.StreamsProcessor;
-import org.apache.streams.pojo.json.Activity;
-
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import static org.apache.streams.data.util.ActivityUtil.ensureExtensions;
 
 /**
  * Processes the content of an {@link org.apache.streams.pojo.json.Activity} object to extract the Hashtags and add
  * them to the appropriate extensions object
  */
-public class RegexHashtagExtractor implements StreamsProcessor{
+public class RegexHashtagExtractor extends AbstractRegexExtensionExtractor<String> implements StreamsProcessor{
 
     public final static String DEFAULT_PATTERN = "#\\w+";
     public final static String PATTERN_CONFIG_KEY = "HashtagPattern";
     public final static String EXTENSION_KEY = "hashtags";
 
-    private String hashPattern;
-
-    public String getHashPattern() {
-        return hashPattern;
-    }
-
-    @Override
-    public List<StreamsDatum> process(StreamsDatum entry) {
-        if(!(entry.getDocument() instanceof Activity)) {
-            return Lists.newArrayList();
-        }
-        if(Strings.isNullOrEmpty(hashPattern)) {
-            prepare(null);
-        }
-        Activity activity = (Activity)entry.getDocument();
-        Map<String, List<Integer>> matches = RegexUtils.extractMatches(hashPattern, activity.getContent());
-        Set<String> hashtags = ensureHashtagsExtension(activity);
-        for(String key : matches.keySet()) {
-            hashtags.add(key.substring(1));
-        }
-        return Lists.newArrayList(entry);
+    public RegexHashtagExtractor() {
+        super(PATTERN_CONFIG_KEY, EXTENSION_KEY, DEFAULT_PATTERN);
     }
 
-    @Override
-    public void prepare(Object configurationObject) {
-        if(configurationObject instanceof Map) {
-            if(((Map)configurationObject).containsKey(PATTERN_CONFIG_KEY)) {
-                hashPattern = (String)((Map)configurationObject).get(PATTERN_CONFIG_KEY);
-            }
-        } else if(configurationObject instanceof String) {
-            hashPattern = (String)configurationObject;
-        } else {
-            hashPattern = DEFAULT_PATTERN;
-        }
-    }
 
     @Override
-    public void cleanUp() {
-        //NOP
-    }
-
-    protected Set<String> ensureHashtagsExtension(Activity activity) {
-        Map<String, Object> extensions = ensureExtensions(activity);
-        Set<String> hashtags;
-        if(extensions.containsKey(EXTENSION_KEY)) {
-            hashtags = Sets.newHashSet((Iterable<String>) extensions.get(EXTENSION_KEY));
-        } else {
-            hashtags = Sets.newHashSet();
-            extensions.put(EXTENSION_KEY, hashtags);
-        }
-        return hashtags;
+    protected String prepareObject(String extracted) {
+        return extracted.substring(1);
     }
 }

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/37d378de/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
index d2912b7..55e007e 100644
--- a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexHashtagExtractorTest.java
@@ -55,11 +55,13 @@ public class RegexHashtagExtractorTest {
                 {"This is the #content of a standard tweet", Sets.newHashSet("content")},
                 {"This is the content of a standard tweet", Sets.newHashSet()},
                 {"This is the #content of a standard #tweet", Sets.newHashSet("content", "tweet")},
-                {"This is the body of a #fbpost.  It can have multiple lines of #content, as well as much more detailed and flowery #language.", Sets.newHashSet("content", "fbpost", "language")}
+                {"UNIX 时间1400000000 秒…… 
(该睡觉了,各位夜猫子)#程序员#", Sets.newHashSet("程序员")},
+                {"This is the body of a #fbpost. It can have multiple lines of #content, as well as much more detailed and flowery #language.", Sets.newHashSet("content", "fbpost", "language")}
         });
     }
 
     @Test
+    @SuppressWarnings("unchecked")
     public void testExtraction() {
         StreamsDatum datum = new StreamsDatum(activity, "Test");
         List<StreamsDatum> result = new RegexHashtagExtractor().process(datum);

Reply via email to