Initial version of BoilerPipe processor (originally authored by @smashew)
Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/da2d80c7 Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/da2d80c7 Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/da2d80c7 Branch: refs/heads/master Commit: da2d80c74991dd86d45aed50edd2252a1697cb12 Parents: f1518b3 Author: sblackmon <[email protected]> Authored: Wed Apr 2 12:23:49 2014 -0500 Committer: sblackmon <[email protected]> Committed: Wed Apr 2 12:23:49 2014 -0500 ---------------------------------------------------------------------- streams-contrib/streams-processor-tika/pom.xml | 139 ++++++++++ .../org/apache/streams/tika/CategoryParser.java | 95 +++++++ .../org/apache/streams/tika/LinkExpander.java | 251 +++++++++++++++++++ .../org/apache/streams/tika/TikaProcessor.java | 104 ++++++++ .../apache/streams/tika/BoilerPipeArticle.json | 80 ++++++ .../java/org/apache/streams/util/DateUtil.java | 174 +++++++++++++ 6 files changed, 843 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/pom.xml ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-tika/pom.xml b/streams-contrib/streams-processor-tika/pom.xml new file mode 100644 index 0000000..b320d38 --- /dev/null +++ b/streams-contrib/streams-processor-tika/pom.xml @@ -0,0 +1,139 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + + <modelVersion>4.0.0</modelVersion> + <artifactId>streams-processor-tika</artifactId> + <version>0.1-SNAPSHOT</version> + + <parent> + <groupId>org.apache.streams</groupId> + 
<artifactId>streams-contrib</artifactId> + <version>0.1-SNAPSHOT</version> + </parent> + + <properties> + <tika.version>1.5</tika.version> + </properties> + + <dependencies> + <dependency> + <groupId>org.apache.streams</groupId> + <artifactId>streams-config</artifactId> + </dependency> + <dependency> + <groupId>org.apache.streams</groupId> + <artifactId>streams-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.streams</groupId> + <artifactId>streams-pojo</artifactId> + </dependency> + <dependency> + <groupId>org.apache.streams</groupId> + <artifactId>streams-processor-urls</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-annotations</artifactId> + </dependency> + <dependency> + <groupId>org.jsonschema2pojo</groupId> + <artifactId>jsonschema2pojo-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-core</artifactId> + <version>${tika.version}</version> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + <version>${tika.version}</version> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + </dependency> + </dependencies> + + <build> + <sourceDirectory>src/main/java</sourceDirectory> + <testSourceDirectory>src/test/java</testSourceDirectory> + <resources> + <resource> + <directory>src/main/resources</directory> + </resource> + </resources> + <testResources> + <testResource> + <directory>src/test/resources</directory> + </testResource> + </testResources> + <plugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + 
<version>1.8</version> + <executions> + <execution> + <id>add-source</id> + <phase>generate-sources</phase> + <goals> + <goal>add-source</goal> + </goals> + <configuration> + <sources> + <source>target/generated-sources/jsonschema2pojo/**/*.java</source> + </sources> + </configuration> + </execution> + <execution> + <id>add-source-jaxb2</id> + <phase>generate-sources</phase> + <goals> + <goal>add-source</goal> + </goals> + <configuration> + <sources> + <source>target/generated-sources/jaxb2</source> + </sources> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.jsonschema2pojo</groupId> + <artifactId>jsonschema2pojo-maven-plugin</artifactId> + <configuration> + <addCompileSourceRoot>true</addCompileSourceRoot> + <generateBuilders>true</generateBuilders> + <sourcePaths> + <sourcePath>src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json</sourcePath> + </sourcePaths> + <outputDirectory>target/generated-sources/jsonschema2pojo</outputDirectory> + <targetPackage>org.apache.streams.tika</targetPackage> + <useLongIntegers>true</useLongIntegers> + <useJodaDates>true</useJodaDates> + </configuration> + <executions> + <execution> + <goals> + <goal>generate</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/CategoryParser.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/CategoryParser.java b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/CategoryParser.java new file mode 100644 index 0000000..36ca2de --- /dev/null +++ b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/CategoryParser.java @@ -0,0 +1,95 @@ +package org.apache.streams.tika; + +import 
java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.util.List; + +public class CategoryParser +{ + /** + * This method takes a URL and from that text alone determines what categories that URL belongs in. + * @param url - String URL to categorize + * @return categories - A List<String&rt; of categories the URL seemingly belongs in + */ + public static List<String> getCategoriesFromUrl(String url) { + + // Clean the URL to remove useless bits and encoding artifacts + String normalizedUrl = normalizeURL(url); + + // Break the url apart and get the good stuff + String[] keywords = tokenizeURL(normalizedUrl); + + return null; + } + + /** + * Removes the protocol, if it exists, from the front and + * removes any random encoding characters + * Extend this to do other url cleaning/pre-processing + * @param url - The String URL to normalize + * @return normalizedUrl - The String URL that has no junk or surprises + */ + private static String normalizeURL(String url) + { + // Decode URL to remove any %20 type stuff + String normalizedUrl = url; + try { + // I've used a URLDecoder that's part of Java here, + // but this functionality exists in most modern languages + // and is universally called url decoding + normalizedUrl = URLDecoder.decode(url, "UTF-8"); + } + catch(UnsupportedEncodingException uee) + { + System.err.println("Unable to Decode URL. Decoding skipped."); + uee.printStackTrace(); + } + + // Remove the protocol, http:// ftp:// or similar from the front + if (normalizedUrl.contains("://")) + normalizedUrl = normalizedUrl.split(":\\/\\/")[1]; + + // Room here to do more pre-processing + + return normalizedUrl; + } + + /** + * Takes apart the url into the pieces that make at least some sense + * This doesn't guarantee that each token is a potentially valid keyword, however + * because that would require actually iterating over them again, which might be + * seen as a waste. 
+ * @param url - Url to be tokenized + * @return tokens - A String array of all the tokens + */ + private static String[] tokenizeURL(String url) + { + // I assume that we're going to use the whole URL to find tokens in + // If you want to just look in the GET parameters, or you want to ignore the domain + // or you want to use the domain as a token itself, that would have to be + // processed above the next line, and only the remaining parts split + String[] tokens = url.split("\\b|_"); + + // One could alternatively use a more complex regex to remove more invalid matches + // but this is subject to your (?:in)?ability to actually write the regex you want + + // These next two get rid of tokens that are too short, also. + + // Destroys anything that's not alphanumeric and things that are + // alphanumeric but only 1 character long + //String[] tokens = url.split("(?:[\\W_]+\\w)*[\\W_]+"); + + // Destroys anything that's not alphanumeric and things that are + // alphanumeric but only 1 or 2 characters long + //String[] tokens = url.split("(?:[\\W_]+\\w{1,2})*[\\W_]+"); + + return tokens; + } + + // How this would be used + public static void main(String[] args) + { + List<String> soQuestionUrlClassifications = getCategoriesFromUrl("http://stackoverflow.com/questions/10046178/pattern-matching-for-url-classification"); + List<String> googleQueryURLClassifications = getCategoriesFromUrl("https://www.google.com/search?sugexp=chrome,mod=18&sourceid=chrome&ie=UTF-8&q=spring+is+a+new+service+instance+created#hl=en&sugexp=ciatsh&gs_nf=1&gs_mss=spring%20is%20a%20new%20bean%20instance%20created&tok=lnAt2g0iy8CWkY65Te75sg&pq=spring%20is%20a%20new%20bean%20instance%20created&cp=6&gs_id=1l&xhr=t&q=urlencode&pf=p&safe=off&sclient=psy-ab&oq=url+en&gs_l=&pbx=1&bav=on.2,or.r_gc.r_pw.r_cp.r_qf.,cf.osb&fp=2176d1af1be1f17d&biw=1680&bih=965"); + } +} 
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/LinkExpander.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/LinkExpander.java b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/LinkExpander.java new file mode 100644 index 0000000..fe0e898 --- /dev/null +++ b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/LinkExpander.java @@ -0,0 +1,251 @@ +package org.apache.streams.tika; + +import org.apache.streams.urls.LinkUnwinder; +import org.apache.streams.util.DateUtil; +import org.apache.streams.tika.BoilerPipeArticle; +import org.apache.streams.tika.LanguageDetected; +import org.apache.tika.exception.TikaException; +import org.apache.tika.language.LanguageIdentifier; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.html.BoilerpipeContentHandler; +import org.apache.tika.parser.html.HtmlParser; +import org.apache.tika.sax.BodyContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.SAXException; + +import de.l3s.boilerpipe.document.TextBlock; + +import java.io.IOException; +import java.io.InputStream; +import java.io.StringWriter; +import java.net.URL; +import java.net.URLConnection; +import java.text.ParseException; +import java.util.*; + + +/** + * Helpful resources for this class: + * + * // TODO: This needs to be rethought. 
+ * + * URL: + * Tika UI: http://www.apache.org/dyn/closer.cgi/tika/tika-app-1.4.jar + * Tika: http://tika.apache.org/ + * Dublin Core: http://dublincore.org/documents/dces/ + */ + +public class LinkExpander extends LinkUnwinder +{ + private final static Logger LOGGER = LoggerFactory.getLogger(LinkExpander.class); + + private static final AutoDetectParser AUTO_DETECT_PARSER = new AutoDetectParser(); + + private final Map<String, String> metaData = new HashMap<String, String>(); + + private final Set<String> keywords = new HashSet<String>(); + + private BoilerPipeArticle article = new BoilerPipeArticle(); + + // sblackmon: I put this here so I wouldn't get NullPointerExceptions when serializing results + public TextBlock getContentTextBlock() { + for(TextBlock textBlock : article.getTextBlocks()) + if(textBlock.isContent()) + return textBlock; + return null; + } + + private static final Collection<String> AUTHOR_SEARCH = new ArrayList<String>() {{ + add("og:author"); + add("dc:author"); + add("author"); + }}; + + private static final Collection<String> DESCRIPTION_SEARCH = new ArrayList<String>() {{ + add("og:description"); + add("dc:description"); + add("description"); + }}; + + private static final Collection<String> MEDIUM_SEARCH = new ArrayList<String>() {{ + add("og:medium"); + add("dc:medium"); + add("medium"); + }}; + + private static final Collection<String> IMAGE_SEARCH = new ArrayList<String>() {{ + add("og:image"); + add("twitter:image"); + add("image"); + }}; + + private static final Collection<String> KEYWORDS_SEARCH = new ArrayList<String>() {{ + add("keywords"); + add("news_keywords"); + }}; + + private static final Collection<String> PUB_DATE_SEARCH = new ArrayList<String>() {{ + add("pubdate"); + add("os:pubdate"); + add("dc:pubdate"); + }}; + + private static final Collection<String> MODIFIED_DATE_SEARCH = new ArrayList<String>() {{ + add("lastmod"); + add("last-modified"); + }}; + + private static final Collection<String> LOCALE_SEARCH = new 
ArrayList<String>() {{ + add("locale"); + add("os:locale"); + add("dc:local"); + }}; + + // Social Searchers + private static final Collection<String> FACEBOOK_PAGE_SEARCH = new ArrayList<String>() {{ + add("fb:page_id"); + }}; + + private static final Collection<String> FACEBOOK_APP_SEARCH = new ArrayList<String>() {{ + add("fb:app_id"); + }}; + + private static final Collection<String> TWITTER_SITE_SEARCH = new ArrayList<String>() {{ + add("twitter:site:id"); + add("twitter:site"); + }}; + + private static final Collection<String> TWITTER_CREATOR_SEARCH = new ArrayList<String>() {{ + add("twitter:creator:id"); + add("twitter:creator"); + }}; + + + public LinkExpander(String url) { + super(url); + } + + public void run() { + super.run(); + expandLink(); + } + + + private void expandLink() + { + InputStream is = null; + + try + { + URL url = new URL(this.getFinalURL()); + URLConnection con = url.openConnection(); + con.setConnectTimeout(10000); + is = con.getInputStream(); + + parseMainContent(is); + parsePlainText(is); + detectLanguage(article.getPlainText()); + + } + // Handle all Exceptions by just reporting that the site status was an error. 
+ catch (IOException e) { + article.setSiteStatus(BoilerPipeArticle.SiteStatus.ERROR); + } + catch (TikaException e) { + article.setSiteStatus(BoilerPipeArticle.SiteStatus.ERROR); + } + catch (SAXException e) { + article.setSiteStatus(BoilerPipeArticle.SiteStatus.ERROR); + } + catch (Exception e) { + article.setSiteStatus(BoilerPipeArticle.SiteStatus.ERROR); + } + finally { + if (!(is == null)) { + try { + is.close(); + } + catch(IOException e) { + LOGGER.warn("Problem closing the input stream: {}", e.getMessage()); + } + } + } + } + + private void parseMainContent(InputStream is) throws IOException, SAXException, TikaException, ParseException + { + Metadata rawMetaData = new Metadata(); + StringWriter stringWriter = new StringWriter(); + + BoilerpipeContentHandler boilerpipeContentHandler = new BoilerpipeContentHandler(stringWriter); + + AUTO_DETECT_PARSER.parse(is, + boilerpipeContentHandler, + rawMetaData); + + article.setTextBlocks(boilerpipeContentHandler.getTextDocument().getTextBlocks()); + article.setBody(boilerpipeContentHandler.getTextDocument().getContent()); + article.setTitle(boilerpipeContentHandler.getTextDocument().getTitle()); + + // this map is for ourselves so we convert it to lower-case to make it easier to search. + // the meta data that is going to be returned will be unmodified meta data. 
+ for(String name : rawMetaData.names()) + if(rawMetaData.get(name) != null) { + this.metaData.put(name.toLowerCase(), rawMetaData.get(name)); + article.setAdditionalProperty(name.toLowerCase(), rawMetaData.get(name)); + } + + article.setAuthor(metaDataSearcher(LinkExpander.AUTHOR_SEARCH)); + article.setDescription(metaDataSearcher(LinkExpander.DESCRIPTION_SEARCH)); + article.setMedium(metaDataSearcher(LinkExpander.MEDIUM_SEARCH)); + article.setImageURL(metaDataSearcher(LinkExpander.IMAGE_SEARCH)); + article.setLocale(metaDataSearcher(LinkExpander.LOCALE_SEARCH)); + + article.setFacebookApp(metaDataSearcher(LinkExpander.FACEBOOK_APP_SEARCH)); + article.setFacebookPage(metaDataSearcher(LinkExpander.FACEBOOK_PAGE_SEARCH)); + + article.setTwitterCreator(metaDataSearcher(LinkExpander.TWITTER_CREATOR_SEARCH)); + article.setTwitterSite(metaDataSearcher(LinkExpander.TWITTER_SITE_SEARCH)); + + mergeSet(LinkExpander.KEYWORDS_SEARCH, this.keywords); + + article.setPublishedDate(DateUtil.determineDate(metaDataSearcher(LinkExpander.PUB_DATE_SEARCH))); + article.setLastModifiedDate(DateUtil.determineDate(metaDataSearcher(LinkExpander.MODIFIED_DATE_SEARCH))); + + if(article.getBody().length() > 50) + article.setSiteStatus(BoilerPipeArticle.SiteStatus.SUCCESS); + } + + private void parsePlainText(InputStream is) throws Exception { + BodyContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + new HtmlParser().parse(is, handler, metadata, new ParseContext()); + article.setPlainText(handler.toString()); + } + + private void detectLanguage(String plainText) throws Exception { + LanguageDetected languageDetected = new LanguageDetected(); + LanguageIdentifier languageIdentifier = new LanguageIdentifier(plainText); + languageDetected.setLanguageCode(languageIdentifier.getLanguage()); + languageDetected.setIsLanguageReasonablyCertain(languageIdentifier.isReasonablyCertain()); + article.setLanguageDetected(languageDetected); + } + + private String 
metaDataSearcher(Collection<String> itemsToSearch) { + for(String s : itemsToSearch) + if(this.metaData.containsKey(s)) + return this.metaData.get(s); + + // the meta searcher returned nothing. + return null; + } + + private void mergeSet(Collection<String> itemsToSearch, Set<String> set) { + for(String s : itemsToSearch) + Collections.addAll(set, s == null || s.equals("") ? new String[]{} : s.split(",")); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/TikaProcessor.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/TikaProcessor.java b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/TikaProcessor.java new file mode 100644 index 0000000..b2f337d --- /dev/null +++ b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/TikaProcessor.java @@ -0,0 +1,104 @@ +package org.apache.streams.tika; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.datatype.jsonorg.JsonOrgModule; +import com.google.common.collect.Lists; +import org.apache.commons.lang.NotImplementedException; +import org.apache.streams.core.StreamsDatum; +import org.apache.streams.core.StreamsProcessor; +import org.apache.streams.jackson.StreamsJacksonMapper; +import org.apache.streams.pojo.json.Activity; +import org.json.JSONObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; + +/** + * References: + * Some helpful references to help + * Purpose URL + * ------------- ---------------------------------------------------------------- + * [Status Codes] http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html + * [Test Cases] http://greenbytes.de/tech/tc/httpredirects/ + * [t.co behavior] https://dev.twitter.com/docs/tco-redirection-behavior + */ + +public class 
TikaProcessor implements StreamsProcessor +{ + private final static String STREAMS_ID = "LinkExpanderProcessor"; + + private final static Logger LOGGER = LoggerFactory.getLogger(TikaProcessor.class); + + private ObjectMapper mapper; + + @Override + public List<StreamsDatum> process(StreamsDatum entry) { + + List<StreamsDatum> result = Lists.newArrayList(); + + LOGGER.debug("{} processing {}", STREAMS_ID, entry.getDocument().getClass()); + + // get list of shared urls + if( entry.getDocument() instanceof Activity) { + + Activity input = (Activity) entry.getDocument(); + + List<String> outputLinks = input.getLinks(); + // for each + for( String link : outputLinks ) { + if( link instanceof String ) { + // expand + try { + StreamsDatum outputDatum = expandLink((String) link, entry); + result.add(outputDatum); + } catch (Exception e) { + //drop unexpandable links + LOGGER.debug("Failed to expand link : {}", link); + LOGGER.debug("Excpetion expanding link : {}", e); + } + } + else { + LOGGER.warn("Expected Links to be of type java.lang.String, but received {}", link.getClass().toString()); + } + } + + + } + else if(entry.getDocument() instanceof String) { + StreamsDatum outputDatum = expandLink((String) entry.getDocument(), entry); + result.add(outputDatum); + } + else throw new NotImplementedException(); + + return result; + } + + private StreamsDatum expandLink(String link, StreamsDatum input) { + + LinkExpander expander = new LinkExpander((String)link); + expander.run(); + StreamsDatum datum = null; + if(input.getId() == null) + datum = new StreamsDatum(this.mapper.convertValue(expander, JSONObject.class).toString(), expander.getFinalURL()); + else + datum = new StreamsDatum(this.mapper.convertValue(expander, JSONObject.class).toString(), input.getId()); + datum.setSequenceid(input.getSequenceid()); + datum.setMetadata(input.getMetadata()); + datum.setTimestamp(input.getTimestamp()); + return datum; + + } + + @Override + public void prepare(Object o) { + this.mapper = 
StreamsJacksonMapper.getInstance(); + this.mapper.registerModule(new JsonOrgModule()); + } + + @Override + public void cleanUp() { + + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-tika/src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json b/streams-contrib/streams-processor-tika/src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json new file mode 100644 index 0000000..a23b13e --- /dev/null +++ b/streams-contrib/streams-processor-tika/src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json @@ -0,0 +1,80 @@ +{ + "type": "object", + "$schema": "http://json-schema.org/draft-03/schema", + "id": "#", + "properties": { + "siteStatus" : { + "type" : "string", + "enum" : ["SUCCESS", "ERROR"] + }, + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "body": { + "type": "string" + }, + "plainText": { + "type": "string" + }, + "medium": { + "type": "string" + }, + "author": { + "type": "string" + }, + "locale": { + "type": "string" + }, + "publishedDate": { + "type": "string", + "format" : "date-time" + }, + "lastModifiedDate": { + "type": "string", + "format" : "date-time" + }, + "imageURL": { + "type": "string" + }, + "languageDetected": { + "type": "object", + "properties": { + "languageCode": { + "type": "string" + }, + "isLanguageReasonablyCertain": { + "type": "boolean" + } + } + }, + "textBlocks": { + "type": "array", + "items": { + "javaType": "de.l3s.boilerpipe.document.TextBlock", + "type": "object" + } + }, + "keywords": { + "type": "array", + "uniqueItems": true, + "items": { + "type": "string" + } + }, + "twitterCreator": { + "type": "string" + }, + "twitterSite": { + "type": "string" + }, + "facebookPage": { + "type": 
"string" + }, + "facebookApp": { + "type": "string" + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-util/src/main/java/org/apache/streams/util/DateUtil.java ---------------------------------------------------------------------- diff --git a/streams-util/src/main/java/org/apache/streams/util/DateUtil.java b/streams-util/src/main/java/org/apache/streams/util/DateUtil.java new file mode 100644 index 0000000..e3201bc --- /dev/null +++ b/streams-util/src/main/java/org/apache/streams/util/DateUtil.java @@ -0,0 +1,174 @@ +package org.apache.streams.util; + +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormatter; +import org.joda.time.format.ISODateTimeFormat; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.*; + + +/* + * + * If you can think of a better way, feel free to implement. This was a great class that I found that + * solves the majority of the issue I was dealing with. 
+ * + * smashew 11=13=2012 + * + * Site: + * http://stackoverflow.com/questions/3389348/parse-any-date-in-java + */ + +public class DateUtil +{ + + private static final String REGEX_ONLY_NUMBERS = "[0-9]+"; + + private static final Map<String, String> DATE_FORMAT_REGEXPS = new HashMap<String, String>() + { + private static final long serialVersionUID = 1L; + { + put("^\\d{8}$", "yyyyMMdd"); + put("^\\d{1,2}-\\d{1,2}-\\d{4}$", "dd-MM-yyyy"); + put("^\\d{4}-\\d{1,2}-\\d{1,2}$", "yyyy-MM-dd"); + put("^\\d{1,2}/\\d{1,2}/\\d{4}$", "MM/dd/yyyy"); + put("^\\d{4}/\\d{1,2}/\\d{1,2}$", "yyyy/MM/dd"); + put("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}$", "dd MMM yyyy"); + put("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}$", "dd MMMM yyyy"); + put("^\\d{12}$", "yyyyMMddHHmm"); + put("^\\d{8}\\s\\d{4}$", "yyyyMMdd HHmm"); + put("^\\d{1,2}-\\d{1,2}-\\d{4}\\s\\d{1,2}:\\d{2}$", "dd-MM-yyyy HH:mm"); + put("^\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{1,2}:\\d{2}$", "yyyy-MM-dd HH:mm"); + put("^\\d{1,2}/\\d{1,2}/\\d{4}\\s\\d{1,2}:\\d{2}$", "MM/dd/yyyy HH:mm"); + put("^\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{1,2}:\\d{2}$", "yyyy/MM/dd HH:mm"); + put("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}\\s\\d{1,2}:\\d{2}$", "dd MMM yyyy HH:mm"); + put("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}\\s\\d{1,2}:\\d{2}$", "dd MMMM yyyy HH:mm"); + put("^\\d{14}$", "yyyyMMddHHmmss"); + put("^\\d{8}\\s\\d{6}$", "yyyyMMdd HHmmss"); + put("^\\d{1,2}-\\d{1,2}-\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$", "dd-MM-yyyy HH:mm:ss"); + put("^\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{1,2}:\\d{2}:\\d{2}$", "yyyy-MM-dd HH:mm:ss"); + put("^\\d{1,2}/\\d{1,2}/\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$", "MM/dd/yyyy HH:mm:ss"); + put("^\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{1,2}:\\d{2}:\\d{2}$", "yyyy/MM/dd HH:mm:ss"); + put("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$", "dd MMM yyyy HH:mm:ss"); + put("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$", "dd MMMM yyyy HH:mm:ss"); + } + }; + + /** + * Determine SimpleDateFormat pattern matching with the given date string. Returns null if format is unknown. 
You + * can simply extend DateUtil with more formats if needed. + * + * @param dateString + * The date string to determine the SimpleDateFormat pattern for. + * @return The matching SimpleDateFormat pattern, or null if format is unknown. + * @see java.text.SimpleDateFormat + */ + public static String determineDateFormat(String dateString) + throws ParseException + { + for (String regexp : DATE_FORMAT_REGEXPS.keySet()) + if (dateString.toLowerCase().matches(regexp)) + return DATE_FORMAT_REGEXPS.get(regexp); + + throw new ParseException("unable to parse date",0); + } + + public static DateTime determineDate(String dateString) + throws ParseException + { + // Trim the string just in case it is dirty. + dateString = dateString.trim(); + + // check to see if it looks like it is millis. If so, parse as millis and return. + if(dateString.matches(REGEX_ONLY_NUMBERS)) + return new DateTime(new Date(Long.parseLong(dateString))); + + try + { + // try to parse the string into a java.date object, if possible. 
+ SimpleDateFormat dateFormat = new SimpleDateFormat(determineDateFormat(dateString)); + dateFormat.setLenient(false); + return new DateTime(dateFormat.parse(dateString)); + } + catch(Exception e) + { + + } + + return new DateTime(DateTime.parse(dateString)); + } + + public static DateTime determineDateTime(String dateString) + throws ParseException + { + return new DateTime(determineDate(dateString)); + } + + public static DateTime determineDateTime(String dateString, DateTimeZone theTimeZone) + throws ParseException + { + DateTime beforeTimeZone = determineDateTime(dateString); + return new DateTime(beforeTimeZone.getYear(),beforeTimeZone.getMonthOfYear(), beforeTimeZone.getDayOfMonth(), beforeTimeZone.getHourOfDay(), beforeTimeZone.getMinuteOfHour(), beforeTimeZone.getSecondOfMinute(), beforeTimeZone.getMillisOfSecond(), theTimeZone); + } + + + public static String getAliasForDate(String date, String prefix) throws ParseException { + return getAliasesForDateRange(date, null, prefix).iterator().next(); + } + + public static String getAliasForDate(DateTime date, String prefix) throws ParseException { + return getAliasesForDateRange(date, null, prefix).iterator().next(); + } + + public static Set<String> getAliasesForDateRange(String starDate, String endDate, String prefix) + throws ParseException + { + DateTime start = null; + DateTime end = null; + DateTimeFormatter df = ISODateTimeFormat.dateTimeNoMillis(); + try { + start = df.parseDateTime(starDate); + } catch (Exception e) { + //do nothing. try to parse with other parsers + } + if(start == null) { + start = determineDateTime(starDate); + } + if(endDate != null) { + try { + end = df.parseDateTime(endDate); + } catch (Exception e) { + //do nothing. 
try to parse with other parsers + } + if( end == null) + end = determineDateTime(endDate); + } + return getAliasesForDateRange(start, end, prefix); + } + + public static Set<String> getAliasesForDateRange(DateTime startDate, DateTime endDate, String prefix) { + Set<String> aliases = new HashSet<String>(); + aliases.add(prefix+"_"+getDateAbbreviation(startDate.getYear(), startDate.getMonthOfYear())); + if(endDate == null) { + return aliases; + } + while(endDate.isAfter(startDate)) { + aliases.add(prefix+"_"+getDateAbbreviation(endDate.getYear(), endDate.getMonthOfYear())); + endDate = endDate.minusMonths(1); + } + return aliases; + } + + private static String getDateAbbreviation(int year, int month) { + if(month > 9) { + return Integer.toString(year)+Integer.toString(month); + } + else { + return Integer.toString(year)+"0"+Integer.toString(month); + } + } + + +}
