added mentions extractor

Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/75578e99
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/75578e99
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/75578e99

Branch: refs/heads/master
Commit: 75578e994c2bfbeea4e69487138523348e99bedf
Parents: 37d378d
Author: mfranklin <[email protected]>
Authored: Wed May 14 10:23:36 2014 -0400
Committer: mfranklin <[email protected]>
Committed: Wed May 14 11:31:10 2014 -0400

----------------------------------------------------------------------
 .../streams/regex/RegexMentionsExtractor.java   | 48 +++++++++++
 .../regex/RegexMentionExtractorTest.java        | 83 ++++++++++++++++++++
 2 files changed, 131 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/75578e99/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexMentionsExtractor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexMentionsExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexMentionsExtractor.java
new file mode 100644
index 0000000..dbf4540
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexMentionsExtractor.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+import com.google.common.collect.Maps;
+import org.apache.streams.core.StreamsProcessor;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Processes the content of an {@link org.apache.streams.pojo.json.Activity} object to extract
+ * the {@code @user} mentions and add them to the appropriate extensions object.
+ *
+ * <p>Each mention is represented as a map holding the matched text, minus its first character,
+ * under {@link #DISPLAY_KEY}.
+ *
+ * <p>NOTE(review): in {@code java.util.regex} the {@code \w} class is ASCII-only unless the
+ * pattern is compiled Unicode-aware; confirm how the superclass compiles
+ * {@link #DEFAULT_PATTERN} if non-ASCII user names are expected to match.
+ */
+public class RegexMentionsExtractor extends AbstractRegexExtensionExtractor<Map<String, Object>>
+        implements StreamsProcessor {
+
+    /** Pattern handed to the superclass as the default: an {@code @} followed by word characters. */
+    public static final String DEFAULT_PATTERN = "@\\w+";
+
+    /** Name handed to the superclass as the pattern configuration key. */
+    public static final String PATTERN_CONFIG_KEY = "MentionPattern";
+
+    /** Name handed to the superclass as the extension key for the extracted mentions. */
+    public static final String EXTENSION_KEY = "user_mentions";
+
+    /** Key under which {@link #prepareObject(String)} stores the mentioned name. */
+    public static final String DISPLAY_KEY = "displayName";
+
+    /**
+     * Creates an extractor configured with the mention-specific keys and the default pattern.
+     *
+     * <p>The constructor is public so the processor can be instantiated from outside this
+     * package.
+     */
+    public RegexMentionsExtractor() {
+        super(PATTERN_CONFIG_KEY, EXTENSION_KEY, DEFAULT_PATTERN);
+    }
+
+    /**
+     * Converts one matched mention into the object stored for it.
+     *
+     * @param extracted the matched text; its first character is dropped, which assumes the match
+     *                  starts with a single marker character such as {@code @}
+     * @return a new mutable map containing the remaining text under {@link #DISPLAY_KEY}
+     */
+    @Override
+    protected Map<String, Object> prepareObject(String extracted) {
+        Map<String, Object> mention = Maps.newHashMap();
+        // Drop the leading marker so only the bare name is stored.
+        mention.put(DISPLAY_KEY, extracted.substring(1));
+        return mention;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/75578e99/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexMentionExtractorTest.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexMentionExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexMentionExtractorTest.java
new file mode 100644
index 0000000..0379c09
--- /dev/null
+++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexMentionExtractorTest.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.streams.regex;
+
+
+import com.google.common.collect.Sets;
+import org.apache.streams.core.StreamsDatum;
+import org.apache.streams.pojo.json.Activity;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.*;
+
+import static org.apache.streams.data.util.ActivityUtil.ensureExtensions;
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+/**
+ * Parameterized test checking that {@link RegexMentionsExtractor} stores exactly the expected
+ * mentions under {@link RegexMentionsExtractor#EXTENSION_KEY} for a given activity content.
+ */
+@RunWith(Parameterized.class)
+public class RegexMentionExtractorTest {
+
+    private final Activity activity;
+    private final Set<Map<String, Object>> mentions;
+
+    /**
+     * @param activityContent  text placed in the activity's content field
+     * @param expectedMentions the mention objects the extractor is expected to produce
+     */
+    public RegexMentionExtractorTest(String activityContent, Set<Map<String, Object>> expectedMentions) {
+        this.activity = new Activity();
+        this.activity.setContent(activityContent);
+        this.mentions = expectedMentions;
+    }
+
+    /**
+     * Supplies the (content, expected mentions) pairs.
+     *
+     * <p>NOTE(review): the non-ASCII case only passes if the extractor's pattern matches Unicode
+     * word characters; confirm against the pattern compilation in the superclass.
+     */
+    @Parameterized.Parameters
+    public static Collection<Object[]> params() {
+        return Arrays.asList(new Object[][]{
+                {"This is the @content of a standard tweet", mentionSet("content")},
+                // No mention in the content: nothing should be extracted.
+                {"This is the content of a standard tweet", mentionSet()},
+                {"This is the @content of a standard @tweet", mentionSet("content", "tweet")},
+                {"UNIX 时间1400000000 秒…… (该睡觉了,各位夜猫子)@程序员#", mentionSet("程序员")},
+                {"This is the body of a @fbpost. It can have multiple lines of #content, as well as much more detailed and flowery @language.",
+                        mentionSet("fbpost", "language")}
+        });
+    }
+
+    /** Builds the expected set of mention objects, one {@code displayName} map per name. */
+    private static Set<Map<String, Object>> mentionSet(String... displayNames) {
+        Set<Map<String, Object>> expected = Sets.newHashSet();
+        for (String displayName : displayNames) {
+            Map<String, Object> mention = new HashMap<String, Object>();
+            mention.put("displayName", displayName);
+            expected.add(mention);
+        }
+        return expected;
+    }
+
+    @Test
+    @SuppressWarnings("unchecked")
+    public void testExtraction() {
+        StreamsDatum datum = new StreamsDatum(activity, "Test");
+        List<StreamsDatum> result = new RegexMentionsExtractor().process(datum);
+        assertThat(result.size(), is(equalTo(1)));
+        Activity output = (Activity) result.get(0).getDocument();
+        Set<Map<String, Object>> extracted =
+                (Set<Map<String, Object>>) ensureExtensions(output).get(RegexMentionsExtractor.EXTENSION_KEY);
+        // Compare both directions: a one-sided difference would pass when nothing is extracted.
+        assertThat(extracted, is(equalTo(mentions)));
+    }
+}

Reply via email to