added mentions extractor
Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/75578e99 Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/75578e99 Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/75578e99 Branch: refs/heads/master Commit: 75578e994c2bfbeea4e69487138523348e99bedf Parents: 37d378d Author: mfranklin <[email protected]> Authored: Wed May 14 10:23:36 2014 -0400 Committer: mfranklin <[email protected]> Committed: Wed May 14 11:31:10 2014 -0400 ---------------------------------------------------------------------- .../streams/regex/RegexMentionsExtractor.java | 48 +++++++++++ .../regex/RegexMentionExtractorTest.java | 83 ++++++++++++++++++++ 2 files changed, 131 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/75578e99/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexMentionsExtractor.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexMentionsExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexMentionsExtractor.java new file mode 100644 index 0000000..dbf4540 --- /dev/null +++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexMentionsExtractor.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.streams.regex; +import com.google.common.collect.Maps; +import org.apache.streams.core.StreamsProcessor; + +import java.util.HashMap; +import java.util.Map; + +/** + * Processes the content of an {@link org.apache.streams.pojo.json.Activity} object to extract the @user mentions and add + * them to the appropriate extensions object + */ +public class RegexMentionsExtractor extends AbstractRegexExtensionExtractor<Map<String, Object>> implements StreamsProcessor { + public static final String DEFAULT_PATTERN = "@\\w+"; + public static final String PATTERN_CONFIG_KEY = "MentionPattern"; + public static final String EXTENSION_KEY = "user_mentions"; + public static final String DISPLAY_KEY = "displayName"; + + protected RegexMentionsExtractor() { + super(PATTERN_CONFIG_KEY, EXTENSION_KEY, DEFAULT_PATTERN); + } + + @Override + protected Map<String, Object> prepareObject(String extracted) { + HashMap<String, Object> mention = Maps.newHashMap(); + mention.put(DISPLAY_KEY, extracted.substring(1)); + return mention; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/75578e99/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexMentionExtractorTest.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexMentionExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexMentionExtractorTest.java new file mode 100644 index 
0000000..0379c09 --- /dev/null +++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexMentionExtractorTest.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.streams.regex; + + +import com.google.common.collect.Sets; +import org.apache.streams.core.StreamsDatum; +import org.apache.streams.pojo.json.Activity; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.*; + +import static org.apache.streams.data.util.ActivityUtil.ensureExtensions; +import static org.hamcrest.CoreMatchers.equalTo; +import static org.hamcrest.CoreMatchers.is; +import static org.junit.Assert.assertThat; + +@RunWith(Parameterized.class) +public class RegexMentionExtractorTest { + + private Activity activity; + private Set<Map<String, Object>> mentions; + + public RegexMentionExtractorTest(String activityContent, Set<Map<String, Object>> hashtags) { + this.activity = new Activity(); + this.activity.setContent(activityContent); + this.mentions = hashtags; + } + + @Parameterized.Parameters + public static Collection<Object[]> params() { + return Arrays.asList(new Object[][]{ + {"This is the @content 
of a standard tweet", Sets.newHashSet(new HashMap<String, Object>() {{ + put("displayName", "content"); + }})}, + {"This is the content of a standard tweet", Sets.newHashSet(new HashMap<String, Object>())}, + {"This is the @content of a standard @tweet", Sets.newHashSet(new HashMap<String, Object>() {{ + put("displayName", "content"); + }},new HashMap<String, Object>() {{ + put("displayName", "tweet"); + }})}, + {"UNIX 时间1400000000 秒…… （该睡觉了，各位夜猫子）@程序员#", Sets.newHashSet(new HashMap<String, Object>() {{ + put("displayName", "程序员"); + }})}, + {"This is the body of a @fbpost. It can have multiple lines of #content, as well as much more detailed and flowery @language.", Sets.newHashSet(new HashMap<String, Object>() {{ + put("displayName", "fbpost"); + }},new HashMap<String, Object>() {{ + put("displayName", "language"); + }})} + }); + } + + @Test + @SuppressWarnings("unchecked") + public void testExtraction() { + StreamsDatum datum = new StreamsDatum(activity, "Test"); + List<StreamsDatum> result = new RegexMentionsExtractor().process(datum); + assertThat(result.size(), is(equalTo(1))); + Activity output = (Activity)result.get(0).getDocument(); + Set<String> extracted = (Set) ensureExtensions(output).get(RegexMentionsExtractor.EXTENSION_KEY); + Sets.SetView<String> diff = Sets.difference(extracted, mentions); + assertThat(diff.size(), is(equalTo(0))); + } +}
