Author: sinaci
Date: Mon Mar 5 16:41:31 2012
New Revision: 1297121
URL: http://svn.apache.org/viewvc?rev=1297121&view=rev
Log:
Moving CNN Crawler from contenthub into demos
Added:
incubator/stanbol/trunk/demos/crawler/pom.xml
incubator/stanbol/trunk/demos/crawler/src/
incubator/stanbol/trunk/demos/crawler/src/license/
incubator/stanbol/trunk/demos/crawler/src/license/THIRD-PARTY.properties
incubator/stanbol/trunk/demos/crawler/src/main/
incubator/stanbol/trunk/demos/crawler/src/main/java/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/contenthub/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/contenthub/crawler/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/contenthub/crawler/cnn/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/contenthub/crawler/cnn/impl/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/cnn/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/cnn/CNNCrawler.java
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/fragment/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/fragment/CrawlerWebFragment.java
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/NewsSummary.java
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/TopicNews.java
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/resources/
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource.java
incubator/stanbol/trunk/demos/crawler/src/main/resources/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/jit.js
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/jquery-1.5.1.min.js
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/jquery-ui-1.8.11.custom.min.js
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/prettify/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/prettify/prettify.css
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/prettify/prettify.js
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/style/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/style/contenthub.css
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/style/jquery-ui-1.8.11.custom.css
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/demos/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/demos/crawler/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/demos/crawler/web/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/demos/crawler/web/resources/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource/
incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource/index.ftl
Added: incubator/stanbol/trunk/demos/crawler/pom.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/pom.xml?rev=1297121&view=auto
==============================================================================
--- incubator/stanbol/trunk/demos/crawler/pom.xml (added)
+++ incubator/stanbol/trunk/demos/crawler/pom.xml Mon Mar 5 16:41:31 2012
@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>stanbol-parent</artifactId>
+ <version>0.9.0-incubating-SNAPSHOT</version>
+ <relativePath>../../parent</relativePath>
+ </parent>
+
+ <artifactId>org.apache.stanbol.demos.crawler</artifactId>
+ <packaging>bundle</packaging>
+
+ <name>Apache Stanbol Crawlers</name>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Embed-Dependency>jtidy|htmlcleaner</Embed-Dependency>
+ <Import-Package>
+ !org.jdom,
+ !org.apache.tools.ant,
+ *
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-scr-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.osgi</groupId>
+ <artifactId>org.osgi.core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.osgi</groupId>
+ <artifactId>org.osgi.compendium</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.contenthub.servicesapi</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>net.sourceforge.htmlcleaner</groupId>
+ <artifactId>htmlcleaner</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>jtidy</groupId>
+ <artifactId>jtidy</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.web.base</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.freemarker</groupId>
+ <artifactId>freemarker</artifactId>
+ </dependency>
+ </dependencies>
+</project>
Added: incubator/stanbol/trunk/demos/crawler/src/license/THIRD-PARTY.properties
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/src/license/THIRD-PARTY.properties?rev=1297121&view=auto
==============================================================================
--- incubator/stanbol/trunk/demos/crawler/src/license/THIRD-PARTY.properties
(added)
+++ incubator/stanbol/trunk/demos/crawler/src/license/THIRD-PARTY.properties
Mon Mar 5 16:41:31 2012
@@ -0,0 +1,25 @@
+# Generated by org.codehaus.mojo.license.AddThirdPartyMojo
+#-------------------------------------------------------------------------------
+# Already used licenses in project :
+# - Apache 2
+# - Apache License
+# - BSD
+# - BSD License
+# - Common Development and Distribution License (CDDL) v1.0
+# - Common Public License Version 1.0
+# - ICU License
+# - Java HTML Tidy License
+# - MIT License
+# - The Apache Software License, Version 2.0
+#-------------------------------------------------------------------------------
+# Please fill the missing licenses for dependencies :
+#
+#
+#Wed Feb 15 19:03:24 CET 2012
+javax.servlet--servlet-api--2.4=Common Development And Distribution License
(CDDL), Version 1.0
+org.apache.ant--ant--1.7.0=The Apache Software License, Version 2.0
+org.apache.ant--ant-launcher--1.7.0=The Apache Software License, Version 2.0
+org.apache.zookeeper--zookeeper--3.3.1=The Apache Software License, Version 2.0
+org.jdom--jdom--1.1=BSD License
+org.osgi--org.osgi.compendium--4.1.0=The Apache Software License, Version 2.0
+org.osgi--org.osgi.core--4.1.0=The Apache Software License, Version 2.0
Added:
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/cnn/CNNCrawler.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/cnn/CNNCrawler.java?rev=1297121&view=auto
==============================================================================
---
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/cnn/CNNCrawler.java
(added)
+++
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/cnn/CNNCrawler.java
Mon Mar 5 16:41:31 2012
@@ -0,0 +1,237 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.demos.crawler.cnn;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.contenthub.servicesapi.ldpath.SemanticIndexManager;
+import org.apache.stanbol.contenthub.servicesapi.store.solr.SolrContentItem;
+import org.apache.stanbol.contenthub.servicesapi.store.solr.SolrStore;
+import org.apache.stanbol.demos.crawler.web.model.NewsSummary;
+import org.htmlcleaner.HtmlCleaner;
+import org.htmlcleaner.TagNode;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.tidy.Tidy;
+
+/**
+ *
+ * @author cihan
+ * @author anil.sinaci
+ *
+ */
+@Component(metatype = true)
+@Service(value=CNNCrawler.class)
+public class CNNCrawler {
+
+ private static final Logger logger = LoggerFactory
+ .getLogger(CNNCrawler.class);
+
+ private static final String CNN_URL = "http://topics.cnn.com/topics/";
+ private static final String TEXT_CLASS = "cnn_strycntntlft";
+
+ @Reference
+ private SolrStore solrStore;
+
+ @Activate
+ public void activate(ComponentContext cc) {
+ if (solrStore == null) {
+ logger.error("Cannot activate CNNImporter. There is no
SolrStore to be binded.");
+ }
+ }
+
+ /**
+ *
+ * @param topic
+ * The topic which will be crawled.
+ * @param maxNumber
+ * Max number of news to be retrieved from CNN about the
+ * {@link topic}
+ * @param fullNews
+ * If {@code true}, the topic will be crawled in detail to
+ * retrieve all information from CNN about the {@link
topic}. If
+ * {@code false}, only summary of the news will be crawled
and
+ * imported.
+ * @return A map which includes the URI of the related topic and the
news
+ * content. If {@link fullNews} is {@code true}, the news
content is
+ * the full news; if not, it is the summary of the news.
+ */
+ public Map<URI, String> importCNNNews(String topic, int maxNumber,
+ boolean fullNews) {
+ return importCNNNews(topic, maxNumber, fullNews, null);
+ }
+
+ /**
+ *
+ * @param topic
+ * The topic which will be crawled.
+ * @param maxNumber
+ * Max number of news to be retrieved from CNN about the
+ * {@link topic}
+ * @param fullNews
+ * If {@code true}, the topic will be crawled in detail to
+ * retrieve all information from CNN about the {@link
topic}. If
+ * {@code false}, only summary of the news will be crawled
and
+ * imported.
+ *
+ * @param indexName
+ * Name of the LDPath program (name of the Solr core/index)
to be
+ * used while storing this content item. LDPath programs can
be
+ * managed through {@link SemanticIndexManagerResource} or
+ * {@link SemanticIndexManager}
+ *
+ * @return A map which includes the URI of the related topic and the
news
+ * content. If {@link fullNews} is {@code true}, the news
content is
+ * the full news; if not, it is the summary of the news.
+ */
+ public Map<URI, String> importCNNNews(String topic, int maxNumber,
+ boolean fullNews, String indexName) {
+ List<NewsSummary> summaries = getRelatedNews(topic, maxNumber);
+ Map<URI, String> newsInfo = new HashMap<URI, String>();
+ if (fullNews) {
+ for (NewsSummary summary : summaries) {
+ String realContent =
getNewsContent(summary.getNewsURI());
+ if (realContent != null &&
!realContent.isEmpty()) {
+ summary.setContent(realContent);
+ }
+ }
+ }
+
+ for (NewsSummary summary : summaries) {
+ try {
+ SolrContentItem sci =
solrStore.create(summary.getContent()
+ .getBytes(), null,
summary.getTitle(), "text/plain",
+ null);
+ URI uri = new URI(solrStore.enhanceAndPut(sci,
indexName));
+ String title = summary.getTitle();
+ if (uri != null) {
+ newsInfo.put(uri, title);
+ }
+ } catch (Exception e) {
+ logger.error("Error storing content {}.
Skipping ...",
+ summary.getContent(), e);
+ }
+ }
+ return newsInfo;
+ }
+
+ private String getNewsContent(URI newsURI) {
+ try {
+ URL newsURL = newsURI.toURL();
+ HtmlCleaner cleaner = new HtmlCleaner();
+ TagNode root = cleaner.clean(newsURL);
+ Object[] text = root.evaluateXPath("//div[@class='" +
TEXT_CLASS
+ + "']");
+ StringBuilder realContent = new StringBuilder();
+ for (Object storyPart : text) {
+
+ try {
+ TagNode storyFragment = (TagNode)
storyPart;
+ for (TagNode child :
storyFragment.getChildTags()) {
+ if
(child.getName().equals("p")) {
+
realContent.append(child.getText().toString());
+ }
+ }
+ } catch (ClassCastException e) {
+ logger.debug("Can not cast {} to
TagNode",
+ storyPart.getClass());
+ }
+ }
+ return realContent.toString();
+ } catch (Exception e) {
+ logger.warn("Unable to get real content of the news {}",
+ newsURI.toString());
+ }
+ return null;
+
+ }
+
+ private List<NewsSummary> getRelatedNews(String topic, int maxNumber) {
+ List<NewsSummary> summaries = new ArrayList<NewsSummary>();
+ try {
+ URL topicURL = new URL(CNN_URL
+ + topic.toLowerCase().replaceAll(" ",
"_"));
+ Tidy tidy = new Tidy();
+ Document doc = tidy.parseDOM(topicURL.openStream(),
+ new ByteArrayOutputStream());
+ NodeList nodes =
doc.getDocumentElement().getElementsByTagName(
+ "div");
+ for (int i = 0; i < nodes.getLength(); i++) {
+ Node current = nodes.item(i);
+ NamedNodeMap atts = current.getAttributes();
+ Node classAtt = atts.getNamedItem("class");
+ if (classAtt != null
+ && classAtt
+ .getNodeValue()
+
.equals("cnnRelatedArticle archive-item story cnn_skn_spccovstrylst")) {
+ NewsSummary summary =
createSummary((Element) current);
+ if (summary != null) {
+ summaries.add(summary);
+ }
+ }
+ if (summaries.size() >= maxNumber) {
+ break;
+ }
+ }
+ } catch (MalformedURLException e) {
+ logger.warn("Topic {} results in malformed url.",
topic);
+ } catch (IOException e) {
+ logger.warn("Can get content of topic {}.", topic);
+ }
+ return summaries;
+ }
+
+ private NewsSummary createSummary(Element current) {
+ NewsSummary newsSummary = null;
+ try {
+ String summary =
current.getElementsByTagName("p").item(0)
+ .getFirstChild().getNodeValue();
+ String uri = ((Element)
current.getElementsByTagName("a").item(0))
+ .getAttribute("href");
+ String title = current.getElementsByTagName("a").item(0)
+ .getFirstChild().getNodeValue();
+ newsSummary = new NewsSummary();
+ newsSummary.setNewsURI(new URI(uri));
+ newsSummary.setTitle(title);
+ newsSummary.setContent(summary);
+ } catch (Exception e) {
+ newsSummary = null;
+ logger.warn("Error creating summary from node {}",
current);
+ }
+ return newsSummary;
+ }
+
+}
Added:
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/fragment/CrawlerWebFragment.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/fragment/CrawlerWebFragment.java?rev=1297121&view=auto
==============================================================================
---
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/fragment/CrawlerWebFragment.java
(added)
+++
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/fragment/CrawlerWebFragment.java
Mon Mar 5 16:41:31 2012
@@ -0,0 +1,107 @@
+/**
+ *
+ */
+package org.apache.stanbol.demos.crawler.web.fragment;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.commons.web.base.LinkResource;
+import org.apache.stanbol.commons.web.base.NavigationLink;
+import org.apache.stanbol.commons.web.base.ScriptResource;
+import org.apache.stanbol.commons.web.base.WebFragment;
+import org.apache.stanbol.demos.crawler.web.resources.CNNCrawlerResource;
+import org.osgi.framework.BundleContext;
+import org.osgi.service.component.ComponentContext;
+
+import freemarker.cache.ClassTemplateLoader;
+import freemarker.cache.TemplateLoader;
+
+/**
+ * @author anil.sinaci
+ *
+ */
+@Component(immediate = true, metatype = true)
+@Service
+public class CrawlerWebFragment implements WebFragment {
+
+ private static final String NAME = "crawler";
+
+ private static final String TEMPLATE_PATH =
"/org/apache/stanbol/demos/crawler/web/templates";
+ private static final String STATIC_RESOURCE_PATH =
"/org/apache/stanbol/demos/crawler/web/static";
+
+ private BundleContext bundleContext;
+
+ @Activate
+ protected void activate(ComponentContext ctx) {
+ this.bundleContext = ctx.getBundleContext();
+ }
+
+ @Override
+ public String getName() {
+ return NAME;
+ }
+
+ @Override
+ public String getStaticResourceClassPath() {
+ return STATIC_RESOURCE_PATH;
+ }
+
+ @Override
+ public Set<Class<?>> getJaxrsResourceClasses() {
+ Set<Class<?>> classes = new HashSet<Class<?>>();
+ classes.add(CNNCrawlerResource.class);
+ return classes;
+ }
+
+ @Override
+ public Set<Object> getJaxrsResourceSingletons() {
+ return Collections.emptySet();
+ }
+
+ @Override
+ public TemplateLoader getTemplateLoader() {
+ return new ClassTemplateLoader(getClass(), TEMPLATE_PATH);
+ }
+
+ @Override
+ public List<LinkResource> getLinkResources() {
+ List<LinkResource> resources = new ArrayList<LinkResource>();
+ resources.add(new LinkResource("stylesheet",
"style/contenthub.css",
+ this, 0));
+ resources.add(new LinkResource("stylesheet",
+ "style/jquery-ui-1.8.11.custom.css", this, 1));
+ return resources;
+ }
+
+ @Override
+ public List<ScriptResource> getScriptResources() {
+ List<ScriptResource> resources = new
ArrayList<ScriptResource>();
+ resources.add(new ScriptResource("text/javascript",
+ "scripts/prettify/prettify.js", this, 0));
+ resources.add(new ScriptResource("text/javascript",
"scripts/jit.js",
+ this, 1));
+ resources.add(new ScriptResource("text/javascript",
+ "scripts/jquery-1.5.1.min.js", this, 2));
+ resources.add(new ScriptResource("text/javascript",
+ "scripts/jquery-ui-1.8.11.custom.min.js", this,
3));
+ return resources;
+ }
+
+ @Override
+ public List<NavigationLink> getNavigationLinks() {
+ return Collections.emptyList();
+ }
+
+ @Override
+ public BundleContext getBundleContext() {
+ return bundleContext;
+ }
+
+}
Added:
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/NewsSummary.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/NewsSummary.java?rev=1297121&view=auto
==============================================================================
---
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/NewsSummary.java
(added)
+++
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/NewsSummary.java
Mon Mar 5 16:41:31 2012
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.demos.crawler.web.model;
+
+import java.net.URI;
+
+/**
+ * Plain bean describing a single news item obtained from a news source: the
+ * location of the item, its title and its textual content.
+ *
+ * @author cihan
+ */
+public class NewsSummary {
+
+    private URI newsURI;
+    private String title;
+    private String content;
+
+    /** @return the URI of the news item, or {@code null} if not set */
+    public URI getNewsURI() {
+        return this.newsURI;
+    }
+
+    /** @param newsURI the URI of the news item */
+    public void setNewsURI(URI newsURI) {
+        this.newsURI = newsURI;
+    }
+
+    /** @return the title of the news item, or {@code null} if not set */
+    public String getTitle() {
+        return this.title;
+    }
+
+    /** @param title the title of the news item */
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    /** @return the text of the news item, or {@code null} if not set */
+    public String getContent() {
+        return this.content;
+    }
+
+    /** @param content the text of the news item */
+    public void setContent(String content) {
+        this.content = content;
+    }
+}
Added:
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/TopicNews.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/TopicNews.java?rev=1297121&view=auto
==============================================================================
---
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/TopicNews.java
(added)
+++
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/TopicNews.java
Mon Mar 5 16:41:31 2012
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.demos.crawler.web.model;
+
+import java.net.URI;
+import java.util.List;
+
+/**
+ * Plain bean grouping the result of a crawl for one topic: the topic name
+ * and two lists holding the URIs and the titles of the news.
+ *
+ * @author cihan
+ */
+public class TopicNews {
+
+    private String topic;
+    private List<URI> uris;
+    private List<String> titles;
+
+    /** @return the topic name, or {@code null} if not set */
+    public String getTopic() {
+        return this.topic;
+    }
+
+    /** @param topic the topic name */
+    public void setTopic(String topic) {
+        this.topic = topic;
+    }
+
+    /** @return the list of URIs exactly as it was set, or {@code null} */
+    public List<URI> getUris() {
+        return this.uris;
+    }
+
+    /** @param uris the list of URIs; stored by reference */
+    public void setUris(List<URI> uris) {
+        this.uris = uris;
+    }
+
+    /** @return the list of titles exactly as it was set, or {@code null} */
+    public List<String> getTitles() {
+        return this.titles;
+    }
+
+    /** @param titles the list of titles; stored by reference */
+    public void setTitles(List<String> titles) {
+        this.titles = titles;
+    }
+}
Added:
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource.java?rev=1297121&view=auto
==============================================================================
---
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource.java
(added)
+++
incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource.java
Mon Mar 5 16:41:31 2012
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.demos.crawler.web.resources;
+
+import static javax.ws.rs.core.MediaType.TEXT_HTML;
+
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Map;
+
+import javax.servlet.ServletContext;
+import javax.ws.rs.FormParam;
+import javax.ws.rs.GET;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
+import javax.ws.rs.Produces;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.Response;
+
+import org.apache.stanbol.commons.web.base.ContextHelper;
+import org.apache.stanbol.commons.web.base.resource.BaseStanbolResource;
+import org.apache.stanbol.demos.crawler.cnn.CNNCrawler;
+import org.apache.stanbol.demos.crawler.web.model.TopicNews;
+
+import com.sun.jersey.api.view.Viewable;
+
+/**
+ * This is the web resource for CNN Crawler.
+ *
+ * @author cihan
+ *
+ */
+@Path("/crawler/cnn/{index}")
+public class CNNCrawlerResource extends BaseStanbolResource {
+
+ private CNNCrawler cnnCrawler;
+ private Object templateData = null;
+ private String indexName;
+
+ public CNNCrawlerResource(@Context ServletContext context,
+ @PathParam(value = "index") String indexName) {
+ this.indexName = indexName;
+ this.cnnCrawler =
ContextHelper.getServiceFromContext(CNNCrawler.class, context);
+ }
+
+ private TopicNews importCNNNews(String topic, Integer max, Boolean
full) {
+ if (topic == null || topic.isEmpty()) {
+ return null;
+ }
+ if (max == null) {
+ max = 10;
+ }
+ if (full == null) {
+ full = false;
+ }
+
+ Map<URI, String> newsInfo = cnnCrawler.importCNNNews(topic,
max, full,
+ indexName);
+ TopicNews tn = new TopicNews();
+ tn.setTopic(topic);
+ tn.setUris(new ArrayList<URI>(newsInfo.keySet()));
+ tn.setTitles(new ArrayList<String>(newsInfo.values()));
+ return tn;
+ }
+
+ /**
+ * For HTML view only.
+ *
+ * @return Returns the HTML view for CNN News Crawler.
+ */
+ @GET
+ @Produces(TEXT_HTML)
+ public Response importCNNNewsHTML() {
+ return Response.ok(new Viewable("index", this),
TEXT_HTML).build();
+ }
+
+ /**
+ *
+ * @param topic
+ * The topic which will be crawled.
+ * @param max
+ * Maximum number of news to be retrieved from CNN about the
+ * {@link topic}
+ * @param full
+ * If {@code yes}, the topic will be crawled in detail to
+ * retrieve all information from CNN about the {@link
topic}. If
+ * {@code no}, only summary of the news will be crawled and
+ * imported.
+ * @return Returns the HTML view as the result of importing news from
CNN.
+ */
+ @POST
+ @Produces(TEXT_HTML)
+ public Response importCNNNewsHTMLPOST(@FormParam("topic") String topic,
+ @FormParam("max") Integer max, @FormParam("full")
Boolean full) {
+ this.templateData = importCNNNews(topic, max, full);
+ return Response.ok(new Viewable("index", this),
TEXT_HTML).build();
+ }
+
+ public Object getTemplateData() {
+ return templateData;
+ }
+
+ public String getIndexName() {
+ return this.indexName;
+ }
+}