Added URL extractor
Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/a1b02094 Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/a1b02094 Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/a1b02094 Branch: refs/heads/master Commit: a1b02094ebd61233888635d6bcbe0ce383a6c009 Parents: 75578e9 Author: mfranklin <[email protected]> Authored: Wed May 14 11:18:06 2014 -0400 Committer: mfranklin <[email protected]> Committed: Wed May 14 11:31:10 2014 -0400 ---------------------------------------------------------------------- .../regex/AbstractRegexExtensionExtractor.java | 5 +- .../apache/streams/regex/RegexUrlExtractor.java | 68 +++++++++++++++++++ .../streams/regex/RegexUrlExtractorTest.java | 70 ++++++++++++++++++++ 3 files changed, 141 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/a1b02094/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java index 6774962..23d1ad5 100644 --- a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java +++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/AbstractRegexExtensionExtractor.java @@ -26,6 +26,7 @@ import org.apache.streams.core.StreamsDatum; import org.apache.streams.core.StreamsProcessor; import org.apache.streams.pojo.json.Activity; +import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Set; @@ -63,7 +64,7 
@@ public abstract class AbstractRegexExtensionExtractor<T> implements StreamsProce } Activity activity = (Activity)entry.getDocument(); Map<String, List<Integer>> matches = RegexUtils.extractMatches(pattern, activity.getContent()); - Set<T> entities = ensureMentionExtension(activity); + Collection<T> entities = ensureTargetObject(activity); for(String key : matches.keySet()) { entities.add(prepareObject(key)); } @@ -96,7 +97,7 @@ public abstract class AbstractRegexExtensionExtractor<T> implements StreamsProce protected abstract T prepareObject(String extracted); @SuppressWarnings("unchecked") - protected Set<T> ensureMentionExtension(Activity activity) { + protected Collection<T> ensureTargetObject(Activity activity) { Map<String, Object> extensions = ensureExtensions(activity); Set<T> hashtags; if(extensions.containsKey(extensionKey)) { http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/a1b02094/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java new file mode 100644 index 0000000..5d37b3a --- /dev/null +++ b/streams-contrib/streams-processor-regex/src/main/java/org/apache/streams/regex/RegexUrlExtractor.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.streams.regex; + +import org.apache.streams.core.StreamsProcessor; +import org.apache.streams.pojo.json.Activity; + +import java.util.Collection; + +/** + * Processes the content of an {@link org.apache.streams.pojo.json.Activity} object to extract the URLs and add + * them to the appropriate extensions object + */ +public class RegexUrlExtractor extends AbstractRegexExtensionExtractor<String> implements StreamsProcessor { + + //Temporarily copied from streams-processor-urls so as not to force a dependency on that provider. This should + //be moved to a common utility package + public final static String DEFAULT_PATTERN = + "(?:(?:https?|ftp)://)" + + "(?:\\S+(?::\\S*)?@)?" + + "(?:" + + "(?!(?:10|127)(?:\\.\\d{1,3}){3})" + + "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" + + "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" + + "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" + + "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" + + "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" + + "|" + + "(?:(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)" + + "(?:\\.(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)*" + + "(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))" + + ")" + + "(?::\\d{2,5})?" 
+ + "(?:/[^\\s]*)?"; + + public final static String PATTERN_CONFIG_KEY = "URLPattern"; + + public RegexUrlExtractor() { + super(PATTERN_CONFIG_KEY, null, DEFAULT_PATTERN); + } + + @Override + protected String prepareObject(String extracted) { + return extracted; + } + + @Override + protected Collection<String> ensureTargetObject(Activity activity) { + return activity.getLinks(); + } +} http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/a1b02094/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java new file mode 100644 index 0000000..38b8dab --- /dev/null +++ b/streams-contrib/streams-processor-regex/src/test/java/org/apache/streams/regex/RegexUrlExtractorTest.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.streams.regex; + + +import com.google.common.collect.Sets; +import org.apache.streams.core.StreamsDatum; +import org.apache.streams.pojo.json.Activity; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.*; + +import static org.hamcrest.CoreMatchers.equalTo; +import static org.hamcrest.CoreMatchers.is; +import static org.junit.Assert.assertThat; + +@RunWith(Parameterized.class) +public class RegexUrlExtractorTest { + + private Activity activity; + private Set<String> links; + + public RegexUrlExtractorTest(String activityContent, Set<String> links) { + this.activity = new Activity(); + this.activity.setContent(activityContent); + this.links = links; + } + + @Parameterized.Parameters + public static Collection<Object[]> params() { + return Arrays.asList(new Object[][]{ + {"This is the http://t.co/foo of a standard tweet", Sets.newHashSet("http://t.co/foo")}, + {"This is the https://t.co/foo of a standard tweet", Sets.newHashSet("https://t.co/foo")}, + {"This is the http://amd.com/test of a standard tweet", Sets.newHashSet("http://amd.com/test")}, + {"This is the content of a standard tweet", Sets.newHashSet()}, + {"This is the http://www.google.com/articles/awesome?with=query&params=true of a standard @tweet", Sets.newHashSet("http://www.google.com/articles/awesome?with=query&params=true")} + }); + } + + @Test + @SuppressWarnings("unchecked") + public void testExtraction() { + StreamsDatum datum = new StreamsDatum(activity, "Test"); + List<StreamsDatum> result = new RegexUrlExtractor().process(datum); + assertThat(result.size(), is(equalTo(1))); + Activity output = (Activity)result.get(0).getDocument(); + Set<String> extracted = Sets.newHashSet(output.getLinks()); + Sets.SetView<String> diff = Sets.difference(links, extracted); + assertThat(diff.size(), is(equalTo(0))); + } +}
