Initial version of BoilerPipe processor (originally authored by @smashew)
Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/da2d80c7 Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/da2d80c7 Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/da2d80c7 Branch: refs/heads/master Commit: da2d80c74991dd86d45aed50edd2252a1697cb12 Parents: f1518b3 Author: sblackmon <[email protected]> Authored: Wed Apr 2 12:23:49 2014 -0500 Committer: sblackmon <[email protected]> Committed: Wed Apr 2 12:23:49 2014 -0500 ---------------------------------------------------------------------- streams-contrib/streams-processor-tika/pom.xml | 139 ++++++++++ .../org/apache/streams/tika/CategoryParser.java | 95 +++++++ .../org/apache/streams/tika/LinkExpander.java | 251 +++++++++++++++++++ .../org/apache/streams/tika/TikaProcessor.java | 104 ++++++++ .../apache/streams/tika/BoilerPipeArticle.json | 80 ++++++ .../java/org/apache/streams/util/DateUtil.java | 174 +++++++++++++ 6 files changed, 843 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/pom.xml ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-tika/pom.xml b/streams-contrib/streams-processor-tika/pom.xml new file mode 100644 index 0000000..b320d38 --- /dev/null +++ b/streams-contrib/streams-processor-tika/pom.xml @@ -0,0 +1,139 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + + <modelVersion>4.0.0</modelVersion> + <artifactId>streams-processor-tika</artifactId> + <version>0.1-SNAPSHOT</version> + + <parent> + <groupId>org.apache.streams</groupId> + 
<artifactId>streams-contrib</artifactId> + <version>0.1-SNAPSHOT</version> + </parent> + + <properties> + <tika.version>1.5</tika.version> + </properties> + + <dependencies> + <dependency> + <groupId>org.apache.streams</groupId> + <artifactId>streams-config</artifactId> + </dependency> + <dependency> + <groupId>org.apache.streams</groupId> + <artifactId>streams-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.streams</groupId> + <artifactId>streams-pojo</artifactId> + </dependency> + <dependency> + <groupId>org.apache.streams</groupId> + <artifactId>streams-processor-urls</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-annotations</artifactId> + </dependency> + <dependency> + <groupId>org.jsonschema2pojo</groupId> + <artifactId>jsonschema2pojo-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-core</artifactId> + <version>${tika.version}</version> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + <version>${tika.version}</version> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + </dependency> + </dependencies> + + <build> + <sourceDirectory>src/main/java</sourceDirectory> + <testSourceDirectory>src/test/java</testSourceDirectory> + <resources> + <resource> + <directory>src/main/resources</directory> + </resource> + </resources> + <testResources> + <testResource> + <directory>src/test/resources</directory> + </testResource> + </testResources> + <plugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + 
<version>1.8</version> + <executions> + <execution> + <id>add-source</id> + <phase>generate-sources</phase> + <goals> + <goal>add-source</goal> + </goals> + <configuration> + <sources> + <source>target/generated-sources/jsonschema2pojo/**/*.java</source> + </sources> + </configuration> + </execution> + <execution> + <id>add-source-jaxb2</id> + <phase>generate-sources</phase> + <goals> + <goal>add-source</goal> + </goals> + <configuration> + <sources> + <source>target/generated-sources/jaxb2</source> + </sources> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.jsonschema2pojo</groupId> + <artifactId>jsonschema2pojo-maven-plugin</artifactId> + <configuration> + <addCompileSourceRoot>true</addCompileSourceRoot> + <generateBuilders>true</generateBuilders> + <sourcePaths> + <sourcePath>src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json</sourcePath> + </sourcePaths> + <outputDirectory>target/generated-sources/jsonschema2pojo</outputDirectory> + <targetPackage>org.apache.streams.tika</targetPackage> + <useLongIntegers>true</useLongIntegers> + <useJodaDates>true</useJodaDates> + </configuration> + <executions> + <execution> + <goals> + <goal>generate</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/CategoryParser.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/CategoryParser.java b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/CategoryParser.java new file mode 100644 index 0000000..36ca2de --- /dev/null +++ b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/CategoryParser.java @@ -0,0 +1,95 @@ +package org.apache.streams.tika; + +import 
java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.util.List; + +public class CategoryParser +{ + /** + * This method takes a URL and from that text alone determines what categories that URL belongs in. + * @param url - String URL to categorize + * @return categories - A List<String&rt; of categories the URL seemingly belongs in + */ + public static List<String> getCategoriesFromUrl(String url) { + + // Clean the URL to remove useless bits and encoding artifacts + String normalizedUrl = normalizeURL(url); + + // Break the url apart and get the good stuff + String[] keywords = tokenizeURL(normalizedUrl); + + return null; + } + + /** + * Removes the protocol, if it exists, from the front and + * removes any random encoding characters + * Extend this to do other url cleaning/pre-processing + * @param url - The String URL to normalize + * @return normalizedUrl - The String URL that has no junk or surprises + */ + private static String normalizeURL(String url) + { + // Decode URL to remove any %20 type stuff + String normalizedUrl = url; + try { + // I've used a URLDecoder that's part of Java here, + // but this functionality exists in most modern languages + // and is universally called url decoding + normalizedUrl = URLDecoder.decode(url, "UTF-8"); + } + catch(UnsupportedEncodingException uee) + { + System.err.println("Unable to Decode URL. Decoding skipped."); + uee.printStackTrace(); + } + + // Remove the protocol, http:// ftp:// or similar from the front + if (normalizedUrl.contains("://")) + normalizedUrl = normalizedUrl.split(":\\/\\/")[1]; + + // Room here to do more pre-processing + + return normalizedUrl; + } + + /** + * Takes apart the url into the pieces that make at least some sense + * This doesn't guarantee that each token is a potentially valid keyword, however + * because that would require actually iterating over them again, which might be + * seen as a waste. 
+ * @param url - Url to be tokenized + * @return tokens - A String array of all the tokens + */ + private static String[] tokenizeURL(String url) + { + // I assume that we're going to use the whole URL to find tokens in + // If you want to just look in the GET parameters, or you want to ignore the domain + // or you want to use the domain as a token itself, that would have to be + // processed above the next line, and only the remaining parts split + String[] tokens = url.split("\\b|_"); + + // One could alternatively use a more complex regex to remove more invalid matches + // but this is subject to your (?:in)?ability to actually write the regex you want + + // These next two get rid of tokens that are too short, also. + + // Destroys anything that's not alphanumeric and things that are + // alphanumeric but only 1 character long + //String[] tokens = url.split("(?:[\\W_]+\\w)*[\\W_]+"); + + // Destroys anything that's not alphanumeric and things that are + // alphanumeric but only 1 or 2 characters long + //String[] tokens = url.split("(?:[\\W_]+\\w{1,2})*[\\W_]+"); + + return tokens; + } + + // How this would be used + public static void main(String[] args) + { + List<String> soQuestionUrlClassifications = getCategoriesFromUrl("http://stackoverflow.com/questions/10046178/pattern-matching-for-url-classification"); + List<String> googleQueryURLClassifications = getCategoriesFromUrl("https://www.google.com/search?sugexp=chrome,mod=18&sourceid=chrome&ie=UTF-8&q=spring+is+a+new+service+instance+created#hl=en&sugexp=ciatsh&gs_nf=1&gs_mss=spring%20is%20a%20new%20bean%20instance%20created&tok=lnAt2g0iy8CWkY65Te75sg&pq=spring%20is%20a%20new%20bean%20instance%20created&cp=6&gs_id=1l&xhr=t&q=urlencode&pf=p&safe=off&sclient=psy-ab&oq=url+en&gs_l=&pbx=1&bav=on.2,or.r_gc.r_pw.r_cp.r_qf.,cf.osb&fp=2176d1af1be1f17d&biw=1680&bih=965"); + } +} 
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/LinkExpander.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/LinkExpander.java b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/LinkExpander.java new file mode 100644 index 0000000..fe0e898 --- /dev/null +++ b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/LinkExpander.java @@ -0,0 +1,251 @@ +package org.apache.streams.tika; + +import org.apache.streams.urls.LinkUnwinder; +import org.apache.streams.util.DateUtil; +import org.apache.streams.tika.BoilerPipeArticle; +import org.apache.streams.tika.LanguageDetected; +import org.apache.tika.exception.TikaException; +import org.apache.tika.language.LanguageIdentifier; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.html.BoilerpipeContentHandler; +import org.apache.tika.parser.html.HtmlParser; +import org.apache.tika.sax.BodyContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.SAXException; + +import de.l3s.boilerpipe.document.TextBlock; + +import java.io.IOException; +import java.io.InputStream; +import java.io.StringWriter; +import java.net.URL; +import java.net.URLConnection; +import java.text.ParseException; +import java.util.*; + + +/** + * Helpful resources for this class: + * + * // TODO: This needs to be rethought. 
+ * + * URL: + * Tika UI: http://www.apache.org/dyn/closer.cgi/tika/tika-app-1.4.jar + * Tika: http://tika.apache.org/ + * Dublin Core: http://dublincore.org/documents/dces/ + */ + +public class LinkExpander extends LinkUnwinder +{ + private final static Logger LOGGER = LoggerFactory.getLogger(LinkExpander.class); + + private static final AutoDetectParser AUTO_DETECT_PARSER = new AutoDetectParser(); + + private final Map<String, String> metaData = new HashMap<String, String>(); + + private final Set<String> keywords = new HashSet<String>(); + + private BoilerPipeArticle article = new BoilerPipeArticle(); + + // sblackmon: I put this here so I wouldn't get NullPointerExceptions when serializing results + public TextBlock getContentTextBlock() { + for(TextBlock textBlock : article.getTextBlocks()) + if(textBlock.isContent()) + return textBlock; + return null; + } + + private static final Collection<String> AUTHOR_SEARCH = new ArrayList<String>() {{ + add("og:author"); + add("dc:author"); + add("author"); + }}; + + private static final Collection<String> DESCRIPTION_SEARCH = new ArrayList<String>() {{ + add("og:description"); + add("dc:description"); + add("description"); + }}; + + private static final Collection<String> MEDIUM_SEARCH = new ArrayList<String>() {{ + add("og:medium"); + add("dc:medium"); + add("medium"); + }}; + + private static final Collection<String> IMAGE_SEARCH = new ArrayList<String>() {{ + add("og:image"); + add("twitter:image"); + add("image"); + }}; + + private static final Collection<String> KEYWORDS_SEARCH = new ArrayList<String>() {{ + add("keywords"); + add("news_keywords"); + }}; + + private static final Collection<String> PUB_DATE_SEARCH = new ArrayList<String>() {{ + add("pubdate"); + add("os:pubdate"); + add("dc:pubdate"); + }}; + + private static final Collection<String> MODIFIED_DATE_SEARCH = new ArrayList<String>() {{ + add("lastmod"); + add("last-modified"); + }}; + + private static final Collection<String> LOCALE_SEARCH = new 
ArrayList<String>() {{ + add("locale"); + add("os:locale"); + add("dc:local"); + }}; + + // Social Searchers + private static final Collection<String> FACEBOOK_PAGE_SEARCH = new ArrayList<String>() {{ + add("fb:page_id"); + }}; + + private static final Collection<String> FACEBOOK_APP_SEARCH = new ArrayList<String>() {{ + add("fb:app_id"); + }}; + + private static final Collection<String> TWITTER_SITE_SEARCH = new ArrayList<String>() {{ + add("twitter:site:id"); + add("twitter:site"); + }}; + + private static final Collection<String> TWITTER_CREATOR_SEARCH = new ArrayList<String>() {{ + add("twitter:creator:id"); + add("twitter:creator"); + }}; + + + public LinkExpander(String url) { + super(url); + } + + public void run() { + super.run(); + expandLink(); + } + + + private void expandLink() + { + InputStream is = null; + + try + { + URL url = new URL(this.getFinalURL()); + URLConnection con = url.openConnection(); + con.setConnectTimeout(10000); + is = con.getInputStream(); + + parseMainContent(is); + parsePlainText(is); + detectLanguage(article.getPlainText()); + + } + // Handle all Exceptions by just reporting that the site status was an error. 
+ catch (IOException e) { + article.setSiteStatus(BoilerPipeArticle.SiteStatus.ERROR); + } + catch (TikaException e) { + article.setSiteStatus(BoilerPipeArticle.SiteStatus.ERROR); + } + catch (SAXException e) { + article.setSiteStatus(BoilerPipeArticle.SiteStatus.ERROR); + } + catch (Exception e) { + article.setSiteStatus(BoilerPipeArticle.SiteStatus.ERROR); + } + finally { + if (!(is == null)) { + try { + is.close(); + } + catch(IOException e) { + LOGGER.warn("Problem closing the input stream: {}", e.getMessage()); + } + } + } + } + + private void parseMainContent(InputStream is) throws IOException, SAXException, TikaException, ParseException + { + Metadata rawMetaData = new Metadata(); + StringWriter stringWriter = new StringWriter(); + + BoilerpipeContentHandler boilerpipeContentHandler = new BoilerpipeContentHandler(stringWriter); + + AUTO_DETECT_PARSER.parse(is, + boilerpipeContentHandler, + rawMetaData); + + article.setTextBlocks(boilerpipeContentHandler.getTextDocument().getTextBlocks()); + article.setBody(boilerpipeContentHandler.getTextDocument().getContent()); + article.setTitle(boilerpipeContentHandler.getTextDocument().getTitle()); + + // this map is for ourselves so we convert it to lower-case to make it easier to search. + // the meta data that is going to be returned will be unmodified meta data. 
+ for(String name : rawMetaData.names()) + if(rawMetaData.get(name) != null) { + this.metaData.put(name.toLowerCase(), rawMetaData.get(name)); + article.setAdditionalProperty(name.toLowerCase(), rawMetaData.get(name)); + } + + article.setAuthor(metaDataSearcher(LinkExpander.AUTHOR_SEARCH)); + article.setDescription(metaDataSearcher(LinkExpander.DESCRIPTION_SEARCH)); + article.setMedium(metaDataSearcher(LinkExpander.MEDIUM_SEARCH)); + article.setImageURL(metaDataSearcher(LinkExpander.IMAGE_SEARCH)); + article.setLocale(metaDataSearcher(LinkExpander.LOCALE_SEARCH)); + + article.setFacebookApp(metaDataSearcher(LinkExpander.FACEBOOK_APP_SEARCH)); + article.setFacebookPage(metaDataSearcher(LinkExpander.FACEBOOK_PAGE_SEARCH)); + + article.setTwitterCreator(metaDataSearcher(LinkExpander.TWITTER_CREATOR_SEARCH)); + article.setTwitterSite(metaDataSearcher(LinkExpander.TWITTER_SITE_SEARCH)); + + mergeSet(LinkExpander.KEYWORDS_SEARCH, this.keywords); + + article.setPublishedDate(DateUtil.determineDate(metaDataSearcher(LinkExpander.PUB_DATE_SEARCH))); + article.setLastModifiedDate(DateUtil.determineDate(metaDataSearcher(LinkExpander.MODIFIED_DATE_SEARCH))); + + if(article.getBody().length() > 50) + article.setSiteStatus(BoilerPipeArticle.SiteStatus.SUCCESS); + } + + private void parsePlainText(InputStream is) throws Exception { + BodyContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + new HtmlParser().parse(is, handler, metadata, new ParseContext()); + article.setPlainText(handler.toString()); + } + + private void detectLanguage(String plainText) throws Exception { + LanguageDetected languageDetected = new LanguageDetected(); + LanguageIdentifier languageIdentifier = new LanguageIdentifier(plainText); + languageDetected.setLanguageCode(languageIdentifier.getLanguage()); + languageDetected.setIsLanguageReasonablyCertain(languageIdentifier.isReasonablyCertain()); + article.setLanguageDetected(languageDetected); + } + + private String 
metaDataSearcher(Collection<String> itemsToSearch) { + for(String s : itemsToSearch) + if(this.metaData.containsKey(s)) + return this.metaData.get(s); + + // the meta searcher returned nothing. + return null; + } + + private void mergeSet(Collection<String> itemsToSearch, Set<String> set) { + for(String s : itemsToSearch) + Collections.addAll(set, s == null || s.equals("") ? new String[]{} : s.split(",")); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/TikaProcessor.java ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/TikaProcessor.java b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/TikaProcessor.java new file mode 100644 index 0000000..b2f337d --- /dev/null +++ b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/TikaProcessor.java @@ -0,0 +1,104 @@ +package org.apache.streams.tika; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.datatype.jsonorg.JsonOrgModule; +import com.google.common.collect.Lists; +import org.apache.commons.lang.NotImplementedException; +import org.apache.streams.core.StreamsDatum; +import org.apache.streams.core.StreamsProcessor; +import org.apache.streams.jackson.StreamsJacksonMapper; +import org.apache.streams.pojo.json.Activity; +import org.json.JSONObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; + +/** + * References: + * Some helpful references to help + * Purpose URL + * ------------- ---------------------------------------------------------------- + * [Status Codes] http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html + * [Test Cases] http://greenbytes.de/tech/tc/httpredirects/ + * [t.co behavior] https://dev.twitter.com/docs/tco-redirection-behavior + */ + +public class 
TikaProcessor implements StreamsProcessor +{ + private final static String STREAMS_ID = "LinkExpanderProcessor"; + + private final static Logger LOGGER = LoggerFactory.getLogger(TikaProcessor.class); + + private ObjectMapper mapper; + + @Override + public List<StreamsDatum> process(StreamsDatum entry) { + + List<StreamsDatum> result = Lists.newArrayList(); + + LOGGER.debug("{} processing {}", STREAMS_ID, entry.getDocument().getClass()); + + // get list of shared urls + if( entry.getDocument() instanceof Activity) { + + Activity input = (Activity) entry.getDocument(); + + List<String> outputLinks = input.getLinks(); + // for each + for( String link : outputLinks ) { + if( link instanceof String ) { + // expand + try { + StreamsDatum outputDatum = expandLink((String) link, entry); + result.add(outputDatum); + } catch (Exception e) { + //drop unexpandable links + LOGGER.debug("Failed to expand link : {}", link); + LOGGER.debug("Excpetion expanding link : {}", e); + } + } + else { + LOGGER.warn("Expected Links to be of type java.lang.String, but received {}", link.getClass().toString()); + } + } + + + } + else if(entry.getDocument() instanceof String) { + StreamsDatum outputDatum = expandLink((String) entry.getDocument(), entry); + result.add(outputDatum); + } + else throw new NotImplementedException(); + + return result; + } + + private StreamsDatum expandLink(String link, StreamsDatum input) { + + LinkExpander expander = new LinkExpander((String)link); + expander.run(); + StreamsDatum datum = null; + if(input.getId() == null) + datum = new StreamsDatum(this.mapper.convertValue(expander, JSONObject.class).toString(), expander.getFinalURL()); + else + datum = new StreamsDatum(this.mapper.convertValue(expander, JSONObject.class).toString(), input.getId()); + datum.setSequenceid(input.getSequenceid()); + datum.setMetadata(input.getMetadata()); + datum.setTimestamp(input.getTimestamp()); + return datum; + + } + + @Override + public void prepare(Object o) { + this.mapper = 
StreamsJacksonMapper.getInstance(); + this.mapper.registerModule(new JsonOrgModule()); + } + + @Override + public void cleanUp() { + + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json ---------------------------------------------------------------------- diff --git a/streams-contrib/streams-processor-tika/src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json b/streams-contrib/streams-processor-tika/src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json new file mode 100644 index 0000000..a23b13e --- /dev/null +++ b/streams-contrib/streams-processor-tika/src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json @@ -0,0 +1,80 @@ +{ + "type": "object", + "$schema": "http://json-schema.org/draft-03/schema", + "id": "#", + "properties": { + "siteStatus" : { + "type" : "string", + "enum" : ["SUCCESS", "ERROR"] + }, + "title": { + "type": "string" + }, + "description": { + "type": "string" + }, + "body": { + "type": "string" + }, + "plainText": { + "type": "string" + }, + "medium": { + "type": "string" + }, + "author": { + "type": "string" + }, + "locale": { + "type": "string" + }, + "publishedDate": { + "type": "string", + "format" : "date-time" + }, + "lastModifiedDate": { + "type": "string", + "format" : "date-time" + }, + "imageURL": { + "type": "string" + }, + "languageDetected": { + "type": "object", + "properties": { + "languageCode": { + "type": "string" + }, + "isLanguageReasonablyCertain": { + "type": "boolean" + } + } + }, + "textBlocks": { + "type": "array", + "items": { + "javaType": "de.l3s.boilerpipe.document.TextBlock", + "type": "object" + } + }, + "keywords": { + "type": "array", + "uniqueItems": true, + "items": { + "type": "string" + } + }, + "twitterCreator": { + "type": "string" + }, + "twitterSite": { + "type": "string" + }, + "facebookPage": { + "type": 
"string" + }, + "facebookApp": { + "type": "string" + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-util/src/main/java/org/apache/streams/util/DateUtil.java ---------------------------------------------------------------------- diff --git a/streams-util/src/main/java/org/apache/streams/util/DateUtil.java b/streams-util/src/main/java/org/apache/streams/util/DateUtil.java new file mode 100644 index 0000000..e3201bc --- /dev/null +++ b/streams-util/src/main/java/org/apache/streams/util/DateUtil.java @@ -0,0 +1,174 @@ +package org.apache.streams.util; + +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormatter; +import org.joda.time.format.ISODateTimeFormat; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.*; + + +/* + * + * If you can think of a better way, feel free to implement. This was a great class that I found that + * solves the majority of the issue I was dealing with. 
+ * + * smashew 11=13=2012 + * + * Site: + * http://stackoverflow.com/questions/3389348/parse-any-date-in-java + */ + +public class DateUtil +{ + + private static final String REGEX_ONLY_NUMBERS = "[0-9]+"; + + private static final Map<String, String> DATE_FORMAT_REGEXPS = new HashMap<String, String>() + { + private static final long serialVersionUID = 1L; + { + put("^\\d{8}$", "yyyyMMdd"); + put("^\\d{1,2}-\\d{1,2}-\\d{4}$", "dd-MM-yyyy"); + put("^\\d{4}-\\d{1,2}-\\d{1,2}$", "yyyy-MM-dd"); + put("^\\d{1,2}/\\d{1,2}/\\d{4}$", "MM/dd/yyyy"); + put("^\\d{4}/\\d{1,2}/\\d{1,2}$", "yyyy/MM/dd"); + put("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}$", "dd MMM yyyy"); + put("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}$", "dd MMMM yyyy"); + put("^\\d{12}$", "yyyyMMddHHmm"); + put("^\\d{8}\\s\\d{4}$", "yyyyMMdd HHmm"); + put("^\\d{1,2}-\\d{1,2}-\\d{4}\\s\\d{1,2}:\\d{2}$", "dd-MM-yyyy HH:mm"); + put("^\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{1,2}:\\d{2}$", "yyyy-MM-dd HH:mm"); + put("^\\d{1,2}/\\d{1,2}/\\d{4}\\s\\d{1,2}:\\d{2}$", "MM/dd/yyyy HH:mm"); + put("^\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{1,2}:\\d{2}$", "yyyy/MM/dd HH:mm"); + put("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}\\s\\d{1,2}:\\d{2}$", "dd MMM yyyy HH:mm"); + put("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}\\s\\d{1,2}:\\d{2}$", "dd MMMM yyyy HH:mm"); + put("^\\d{14}$", "yyyyMMddHHmmss"); + put("^\\d{8}\\s\\d{6}$", "yyyyMMdd HHmmss"); + put("^\\d{1,2}-\\d{1,2}-\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$", "dd-MM-yyyy HH:mm:ss"); + put("^\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{1,2}:\\d{2}:\\d{2}$", "yyyy-MM-dd HH:mm:ss"); + put("^\\d{1,2}/\\d{1,2}/\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$", "MM/dd/yyyy HH:mm:ss"); + put("^\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{1,2}:\\d{2}:\\d{2}$", "yyyy/MM/dd HH:mm:ss"); + put("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$", "dd MMM yyyy HH:mm:ss"); + put("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$", "dd MMMM yyyy HH:mm:ss"); + } + }; + + /** + * Determine SimpleDateFormat pattern matching with the given date string. Returns null if format is unknown. 
You + * can simply extend DateUtil with more formats if needed. + * + * @param dateString + * The date string to determine the SimpleDateFormat pattern for. + * @return The matching SimpleDateFormat pattern, or null if format is unknown. + * @see java.text.SimpleDateFormat + */ + public static String determineDateFormat(String dateString) + throws ParseException + { + for (String regexp : DATE_FORMAT_REGEXPS.keySet()) + if (dateString.toLowerCase().matches(regexp)) + return DATE_FORMAT_REGEXPS.get(regexp); + + throw new ParseException("unable to parse date",0); + } + + public static DateTime determineDate(String dateString) + throws ParseException + { + // Trim the string just in case it is dirty. + dateString = dateString.trim(); + + // check to see if it looks like it is millis. If so, parse as millis and return. + if(dateString.matches(REGEX_ONLY_NUMBERS)) + return new DateTime(new Date(Long.parseLong(dateString))); + + try + { + // try to parse the string into a java.date object, if possible. 
+ SimpleDateFormat dateFormat = new SimpleDateFormat(determineDateFormat(dateString)); + dateFormat.setLenient(false); + return new DateTime(dateFormat.parse(dateString)); + } + catch(Exception e) + { + + } + + return new DateTime(DateTime.parse(dateString)); + } + + public static DateTime determineDateTime(String dateString) + throws ParseException + { + return new DateTime(determineDate(dateString)); + } + + public static DateTime determineDateTime(String dateString, DateTimeZone theTimeZone) + throws ParseException + { + DateTime beforeTimeZone = determineDateTime(dateString); + return new DateTime(beforeTimeZone.getYear(),beforeTimeZone.getMonthOfYear(), beforeTimeZone.getDayOfMonth(), beforeTimeZone.getHourOfDay(), beforeTimeZone.getMinuteOfHour(), beforeTimeZone.getSecondOfMinute(), beforeTimeZone.getMillisOfSecond(), theTimeZone); + } + + + public static String getAliasForDate(String date, String prefix) throws ParseException { + return getAliasesForDateRange(date, null, prefix).iterator().next(); + } + + public static String getAliasForDate(DateTime date, String prefix) throws ParseException { + return getAliasesForDateRange(date, null, prefix).iterator().next(); + } + + public static Set<String> getAliasesForDateRange(String starDate, String endDate, String prefix) + throws ParseException + { + DateTime start = null; + DateTime end = null; + DateTimeFormatter df = ISODateTimeFormat.dateTimeNoMillis(); + try { + start = df.parseDateTime(starDate); + } catch (Exception e) { + //do nothing. try to parse with other parsers + } + if(start == null) { + start = determineDateTime(starDate); + } + if(endDate != null) { + try { + end = df.parseDateTime(endDate); + } catch (Exception e) { + //do nothing. 
try to parse with other parsers + } + if( end == null) + end = determineDateTime(endDate); + } + return getAliasesForDateRange(start, end, prefix); + } + + public static Set<String> getAliasesForDateRange(DateTime startDate, DateTime endDate, String prefix) { + Set<String> aliases = new HashSet<String>(); + aliases.add(prefix+"_"+getDateAbbreviation(startDate.getYear(), startDate.getMonthOfYear())); + if(endDate == null) { + return aliases; + } + while(endDate.isAfter(startDate)) { + aliases.add(prefix+"_"+getDateAbbreviation(endDate.getYear(), endDate.getMonthOfYear())); + endDate = endDate.minusMonths(1); + } + return aliases; + } + + private static String getDateAbbreviation(int year, int month) { + if(month > 9) { + return Integer.toString(year)+Integer.toString(month); + } + else { + return Integer.toString(year)+"0"+Integer.toString(month); + } + } + + +}
