Author: sinaci
Date: Mon Mar  5 16:41:31 2012
New Revision: 1297121

URL: http://svn.apache.org/viewvc?rev=1297121&view=rev
Log:
Moving CNN Crawler from contenthub into demos

Added:
    incubator/stanbol/trunk/demos/crawler/pom.xml
    incubator/stanbol/trunk/demos/crawler/src/
    incubator/stanbol/trunk/demos/crawler/src/license/
    incubator/stanbol/trunk/demos/crawler/src/license/THIRD-PARTY.properties
    incubator/stanbol/trunk/demos/crawler/src/main/
    incubator/stanbol/trunk/demos/crawler/src/main/java/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/contenthub/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/contenthub/crawler/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/contenthub/crawler/cnn/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/contenthub/crawler/cnn/impl/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/cnn/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/cnn/CNNCrawler.java
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/fragment/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/fragment/CrawlerWebFragment.java
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/NewsSummary.java
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/TopicNews.java
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/resources/
    incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource.java
    incubator/stanbol/trunk/demos/crawler/src/main/resources/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/jit.js
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/jquery-1.5.1.min.js
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/jquery-ui-1.8.11.custom.min.js
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/prettify/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/prettify/prettify.css
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/scripts/prettify/prettify.js
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/style/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/style/contenthub.css
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/static/style/jquery-ui-1.8.11.custom.css
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/demos/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/demos/crawler/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/demos/crawler/web/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/demos/crawler/web/resources/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource/
    incubator/stanbol/trunk/demos/crawler/src/main/resources/org/apache/stanbol/demos/crawler/web/templates/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource/index.ftl

Added: incubator/stanbol/trunk/demos/crawler/pom.xml
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/pom.xml?rev=1297121&view=auto
==============================================================================
--- incubator/stanbol/trunk/demos/crawler/pom.xml (added)
+++ incubator/stanbol/trunk/demos/crawler/pom.xml Mon Mar  5 16:41:31 2012
@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.stanbol</groupId>
+    <artifactId>stanbol-parent</artifactId>
+    <version>0.9.0-incubating-SNAPSHOT</version>
+    <relativePath>../../parent</relativePath>
+  </parent>
+
+  <artifactId>org.apache.stanbol.demos.crawler</artifactId>
+  <packaging>bundle</packaging>
+
+  <name>Apache Stanbol Crawlers</name>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Embed-Dependency>jtidy|htmlcleaner</Embed-Dependency>
+            <Import-Package>
+              !org.jdom,
+              !org.apache.tools.ant,
+              *
+            </Import-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-scr-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.felix</groupId>
+      <artifactId>org.apache.felix.scr.annotations</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.osgi</groupId>
+      <artifactId>org.osgi.core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.osgi</groupId>
+      <artifactId>org.osgi.compendium</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.contenthub.servicesapi</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>net.sourceforge.htmlcleaner</groupId>
+      <artifactId>htmlcleaner</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>jtidy</groupId>
+      <artifactId>jtidy</artifactId>
+    </dependency>
+    
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.commons.web.base</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.freemarker</groupId>
+      <artifactId>freemarker</artifactId>
+    </dependency>
+  </dependencies>
+</project>

Added: incubator/stanbol/trunk/demos/crawler/src/license/THIRD-PARTY.properties
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/src/license/THIRD-PARTY.properties?rev=1297121&view=auto
==============================================================================
--- incubator/stanbol/trunk/demos/crawler/src/license/THIRD-PARTY.properties (added)
+++ incubator/stanbol/trunk/demos/crawler/src/license/THIRD-PARTY.properties Mon Mar  5 16:41:31 2012
@@ -0,0 +1,25 @@
+# Generated by org.codehaus.mojo.license.AddThirdPartyMojo
+#-------------------------------------------------------------------------------
+# Already used licenses in project :
+# - Apache 2
+# - Apache License
+# - BSD
+# - BSD License
+# - Common Development and Distribution License (CDDL) v1.0
+# - Common Public License Version 1.0
+# - ICU License
+# - Java HTML Tidy License
+# - MIT License
+# - The Apache Software License, Version 2.0
+#-------------------------------------------------------------------------------
+# Please fill the missing licenses for dependencies :
+#
+#
+#Wed Feb 15 19:03:24 CET 2012
+javax.servlet--servlet-api--2.4=Common Development And Distribution License (CDDL), Version 1.0
+org.apache.ant--ant--1.7.0=The Apache Software License, Version 2.0
+org.apache.ant--ant-launcher--1.7.0=The Apache Software License, Version 2.0
+org.apache.zookeeper--zookeeper--3.3.1=The Apache Software License, Version 2.0
+org.jdom--jdom--1.1=BSD License
+org.osgi--org.osgi.compendium--4.1.0=The Apache Software License, Version 2.0
+org.osgi--org.osgi.core--4.1.0=The Apache Software License, Version 2.0

Added: incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/cnn/CNNCrawler.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/cnn/CNNCrawler.java?rev=1297121&view=auto
==============================================================================
--- incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/cnn/CNNCrawler.java (added)
+++ incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/cnn/CNNCrawler.java Mon Mar  5 16:41:31 2012
@@ -0,0 +1,237 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.demos.crawler.cnn;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.contenthub.servicesapi.ldpath.SemanticIndexManager;
+import org.apache.stanbol.contenthub.servicesapi.store.solr.SolrContentItem;
+import org.apache.stanbol.contenthub.servicesapi.store.solr.SolrStore;
+import org.apache.stanbol.demos.crawler.web.model.NewsSummary;
+import org.htmlcleaner.HtmlCleaner;
+import org.htmlcleaner.TagNode;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.tidy.Tidy;
+
+/**
+ * OSGi component that looks up news about a topic on the CNN topic pages and submits every
+ * retrieved item to the {@link SolrStore} through {@code create} and {@code enhanceAndPut}.
+ * <p>
+ * The crawler is best-effort: pages that cannot be fetched or parsed and items that cannot be
+ * stored are logged and skipped, they never abort the whole import.
+ * 
+ * @author cihan
+ * @author anil.sinaci
+ * 
+ */
+@Component(metatype = true)
+@Service(value = CNNCrawler.class)
+public class CNNCrawler {
+
+    private static final Logger logger = LoggerFactory.getLogger(CNNCrawler.class);
+
+    /** Base URL of the CNN topic pages; the normalized topic name is appended to it. */
+    private static final String CNN_URL = "http://topics.cnn.com/topics/";
+
+    /** {@code class} attribute of the {@code div} elements holding the text of a story. */
+    private static final String TEXT_CLASS = "cnn_strycntntlft";
+
+    /** {@code class} attribute of the {@code div} elements listing the articles of a topic. */
+    private static final String RELATED_ARTICLE_CLASS =
+            "cnnRelatedArticle archive-item story cnn_skn_spccovstrylst";
+
+    /** Charset used to encode the news text, so the result is independent of the platform. */
+    private static final String CONTENT_CHARSET = "UTF-8";
+
+    @Reference
+    private SolrStore solrStore;
+
+    /**
+     * Logs an error if the component is activated without a {@link SolrStore}.
+     * 
+     * @param cc
+     *            the component context (not used)
+     */
+    @Activate
+    public void activate(ComponentContext cc) {
+        if (solrStore == null) {
+            logger.error("Cannot activate CNNCrawler. There is no SolrStore to be bound.");
+        }
+    }
+
+    /**
+     * Imports news about the given topic without naming an index; equivalent to calling
+     * {@link #importCNNNews(String, int, boolean, String)} with a {@code null} index name.
+     * 
+     * @param topic
+     *            The topic which will be crawled.
+     * @param maxNumber
+     *            Max number of news to be retrieved from CNN about the {@code topic}
+     * @param fullNews
+     *            If {@code true}, the page of every news item is fetched and its full text is
+     *            imported. If {@code false}, only the summary shown on the topic page is
+     *            imported.
+     * @return A map from the URI built from the identifier returned by the store to the title
+     *         of the corresponding news item. Items that could not be stored are left out.
+     */
+    public Map<URI, String> importCNNNews(String topic, int maxNumber, boolean fullNews) {
+        return importCNNNews(topic, maxNumber, fullNews, null);
+    }
+
+    /**
+     * Imports news about the given topic and stores them through the {@link SolrStore}.
+     * 
+     * @param topic
+     *            The topic which will be crawled.
+     * @param maxNumber
+     *            Max number of news to be retrieved from CNN about the {@code topic}
+     * @param fullNews
+     *            If {@code true}, the page of every news item is fetched and its full text is
+     *            imported. If {@code false}, only the summary shown on the topic page is
+     *            imported.
+     * 
+     * @param indexName
+     *            Name of the LDPath program (name of the Solr core/index) to be used while
+     *            storing this content item. LDPath programs can be managed through
+     *            {@code SemanticIndexManagerResource} or {@link SemanticIndexManager}
+     * 
+     * @return A map from the URI built from the identifier returned by the store to the title
+     *         of the corresponding news item. Items that could not be stored are left out.
+     */
+    public Map<URI, String> importCNNNews(String topic, int maxNumber, boolean fullNews,
+            String indexName) {
+        List<NewsSummary> summaries = getRelatedNews(topic, maxNumber);
+        Map<URI, String> newsInfo = new HashMap<URI, String>();
+        if (fullNews) {
+            for (NewsSummary summary : summaries) {
+                String realContent = getNewsContent(summary.getNewsURI());
+                // keep the summary when the full text could not be retrieved
+                if (realContent != null && !realContent.isEmpty()) {
+                    summary.setContent(realContent);
+                }
+            }
+        }
+
+        for (NewsSummary summary : summaries) {
+            try {
+                // explicit charset: String.getBytes() would use the platform default
+                byte[] content = summary.getContent().getBytes(CONTENT_CHARSET);
+                SolrContentItem sci = solrStore.create(content, null, summary.getTitle(),
+                    "text/plain", null);
+                URI uri = new URI(solrStore.enhanceAndPut(sci, indexName));
+                newsInfo.put(uri, summary.getTitle());
+            } catch (Exception e) {
+                // best effort: a failing item must not prevent the others from being stored
+                logger.error("Error storing content {}. Skipping ...", summary.getContent(), e);
+            }
+        }
+        return newsInfo;
+    }
+
+    /**
+     * Fetches the page of a news item and concatenates the text of the {@code p} children of
+     * every {@code div} whose class is {@link #TEXT_CLASS}.
+     * 
+     * @param newsURI
+     *            URI of the news page
+     * @return the extracted text (possibly empty), or {@code null} if the page could not be
+     *         fetched or parsed
+     */
+    private String getNewsContent(URI newsURI) {
+        try {
+            URL newsURL = newsURI.toURL();
+            HtmlCleaner cleaner = new HtmlCleaner();
+            TagNode root = cleaner.clean(newsURL);
+            Object[] text = root.evaluateXPath("//div[@class='" + TEXT_CLASS + "']");
+            StringBuilder realContent = new StringBuilder();
+            for (Object storyPart : text) {
+                if (!(storyPart instanceof TagNode)) {
+                    logger.debug("Can not cast {} to TagNode",
+                        storyPart == null ? null : storyPart.getClass());
+                    continue;
+                }
+                for (TagNode child : ((TagNode) storyPart).getChildTags()) {
+                    if ("p".equals(child.getName())) {
+                        realContent.append(child.getText().toString());
+                    }
+                }
+            }
+            return realContent.toString();
+        } catch (Exception e) {
+            // newsURI is passed as is: it may be the null value that caused the failure
+            logger.warn("Unable to get real content of the news {}", newsURI, e);
+        }
+        return null;
+    }
+
+    /**
+     * Downloads the CNN page of the topic and builds a summary for each related-article
+     * {@code div} found on it, up to {@code maxNumber} summaries.
+     * 
+     * @param topic
+     *            topic name; it is lower-cased and its spaces are replaced with underscores to
+     *            build the page URL
+     * @param maxNumber
+     *            maximum number of summaries to return
+     * @return the summaries found; empty if {@code topic} is {@code null}, {@code maxNumber}
+     *         is not positive or the page could not be retrieved
+     */
+    private List<NewsSummary> getRelatedNews(String topic, int maxNumber) {
+        List<NewsSummary> summaries = new ArrayList<NewsSummary>();
+        if (topic == null || maxNumber <= 0) {
+            return summaries;
+        }
+        InputStream in = null;
+        try {
+            // Locale.ENGLISH keeps the URL independent of the default locale
+            URL topicURL = new URL(CNN_URL + topic.toLowerCase(Locale.ENGLISH).replace(' ', '_'));
+            in = topicURL.openStream();
+            Tidy tidy = new Tidy();
+            Document doc = tidy.parseDOM(in, new ByteArrayOutputStream());
+            NodeList nodes = doc.getDocumentElement().getElementsByTagName("div");
+            for (int i = 0; i < nodes.getLength() && summaries.size() < maxNumber; i++) {
+                Node current = nodes.item(i);
+                NamedNodeMap atts = current.getAttributes();
+                Node classAtt = atts.getNamedItem("class");
+                if (classAtt != null && RELATED_ARTICLE_CLASS.equals(classAtt.getNodeValue())) {
+                    NewsSummary summary = createSummary((Element) current);
+                    if (summary != null) {
+                        summaries.add(summary);
+                    }
+                }
+            }
+        } catch (MalformedURLException e) {
+            logger.warn("Topic {} results in malformed url.", topic, e);
+        } catch (IOException e) {
+            logger.warn("Cannot get content of topic {}.", topic, e);
+        } finally {
+            closeQuietly(in);
+        }
+        return summaries;
+    }
+
+    /**
+     * Builds a summary from a related-article element: the text of its first {@code p} is the
+     * content, the first {@code a} supplies the URI ({@code href}) and the title (its text).
+     * 
+     * @param current
+     *            the related-article element
+     * @return the summary, or {@code null} if the element does not have the expected structure
+     */
+    private NewsSummary createSummary(Element current) {
+        try {
+            String summary = current.getElementsByTagName("p").item(0).getFirstChild()
+                    .getNodeValue();
+            Element link = (Element) current.getElementsByTagName("a").item(0);
+            String uri = link.getAttribute("href");
+            String title = link.getFirstChild().getNodeValue();
+
+            NewsSummary newsSummary = new NewsSummary();
+            newsSummary.setNewsURI(new URI(uri));
+            newsSummary.setTitle(title);
+            newsSummary.setContent(summary);
+            return newsSummary;
+        } catch (Exception e) {
+            // missing elements surface as runtime exceptions; such entries are skipped
+            logger.warn("Error creating summary from node {}", current, e);
+            return null;
+        }
+    }
+
+    /** Closes the stream if it was opened; a failure to close is only logged. */
+    private static void closeQuietly(InputStream in) {
+        if (in == null) {
+            return;
+        }
+        try {
+            in.close();
+        } catch (IOException e) {
+            logger.debug("Unable to close the stream of the topic page", e);
+        }
+    }
+
+}

Added: incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/fragment/CrawlerWebFragment.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/fragment/CrawlerWebFragment.java?rev=1297121&view=auto
==============================================================================
--- incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/fragment/CrawlerWebFragment.java (added)
+++ incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/fragment/CrawlerWebFragment.java Mon Mar  5 16:41:31 2012
@@ -0,0 +1,107 @@
+/**
+ * 
+ */
+package org.apache.stanbol.demos.crawler.web.fragment;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.commons.web.base.LinkResource;
+import org.apache.stanbol.commons.web.base.NavigationLink;
+import org.apache.stanbol.commons.web.base.ScriptResource;
+import org.apache.stanbol.commons.web.base.WebFragment;
+import org.apache.stanbol.demos.crawler.web.resources.CNNCrawlerResource;
+import org.osgi.framework.BundleContext;
+import org.osgi.service.component.ComponentContext;
+
+import freemarker.cache.ClassTemplateLoader;
+import freemarker.cache.TemplateLoader;
+
+/**
+ * {@link WebFragment} of the crawler demo. It contributes the {@link CNNCrawlerResource}
+ * JAX-RS class together with the class-path locations of the templates, style sheets and
+ * scripts bundled with this module.
+ * 
+ * @author anil.sinaci
+ * 
+ */
+@Component(immediate = true, metatype = true)
+@Service
+public class CrawlerWebFragment implements WebFragment {
+
+    private static final String NAME = "crawler";
+
+    private static final String TEMPLATE_PATH = "/org/apache/stanbol/demos/crawler/web/templates";
+    private static final String STATIC_RESOURCE_PATH = "/org/apache/stanbol/demos/crawler/web/static";
+
+    /** Style sheets in registration order; the array index is passed as the last argument. */
+    private static final String[] STYLESHEETS = {
+        "style/contenthub.css",
+        "style/jquery-ui-1.8.11.custom.css"
+    };
+
+    /** Scripts in registration order; the array index is passed as the last argument. */
+    private static final String[] SCRIPTS = {
+        "scripts/prettify/prettify.js",
+        "scripts/jit.js",
+        "scripts/jquery-1.5.1.min.js",
+        "scripts/jquery-ui-1.8.11.custom.min.js"
+    };
+
+    /** Bundle context captured on activation. */
+    private BundleContext bundleContext;
+
+    /** Remembers the bundle context of the activating component context. */
+    @Activate
+    protected void activate(ComponentContext ctx) {
+        bundleContext = ctx.getBundleContext();
+    }
+
+    /** @return the bundle context captured on activation */
+    @Override
+    public BundleContext getBundleContext() {
+        return bundleContext;
+    }
+
+    /** @return the fixed fragment name {@code "crawler"} */
+    @Override
+    public String getName() {
+        return NAME;
+    }
+
+    /** @return the class-path root of the static resources of this fragment */
+    @Override
+    public String getStaticResourceClassPath() {
+        return STATIC_RESOURCE_PATH;
+    }
+
+    /** @return a loader resolving templates relative to this class and the template root */
+    @Override
+    public TemplateLoader getTemplateLoader() {
+        return new ClassTemplateLoader(getClass(), TEMPLATE_PATH);
+    }
+
+    /** @return a new mutable set holding only {@link CNNCrawlerResource} */
+    @Override
+    public Set<Class<?>> getJaxrsResourceClasses() {
+        Set<Class<?>> resourceClasses = new HashSet<Class<?>>();
+        resourceClasses.add(CNNCrawlerResource.class);
+        return resourceClasses;
+    }
+
+    /** @return an empty set; this fragment registers no singletons */
+    @Override
+    public Set<Object> getJaxrsResourceSingletons() {
+        return Collections.emptySet();
+    }
+
+    /** @return a new list with one {@code stylesheet} link per entry of the style sheet table */
+    @Override
+    public List<LinkResource> getLinkResources() {
+        List<LinkResource> links = new ArrayList<LinkResource>();
+        for (int order = 0; order < STYLESHEETS.length; order++) {
+            links.add(new LinkResource("stylesheet", STYLESHEETS[order], this, order));
+        }
+        return links;
+    }
+
+    /** @return a new list with one {@code text/javascript} resource per entry of the script table */
+    @Override
+    public List<ScriptResource> getScriptResources() {
+        List<ScriptResource> scripts = new ArrayList<ScriptResource>();
+        for (int order = 0; order < SCRIPTS.length; order++) {
+            scripts.add(new ScriptResource("text/javascript", SCRIPTS[order], this, order));
+        }
+        return scripts;
+    }
+
+    /** @return an empty list; this fragment adds no navigation links */
+    @Override
+    public List<NavigationLink> getNavigationLinks() {
+        return Collections.emptyList();
+    }
+
+}

Added: incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/NewsSummary.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/NewsSummary.java?rev=1297121&view=auto
==============================================================================
--- incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/NewsSummary.java (added)
+++ incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/NewsSummary.java Mon Mar  5 16:41:31 2012
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.demos.crawler.web.model;
+
+import java.net.URI;
+
+/**
+ * Bean class to keep information about news that are obtained from news sources: the URI of
+ * the news item, its title and its content. All properties are {@code null} until set.
+ * 
+ * @author cihan
+ * 
+ */
+public class NewsSummary {
+
+    /** URI of the news item. */
+    private URI newsURI;
+    /** Title of the news item. */
+    private String title;
+    /** Text of the news item. */
+    private String content;
+
+    /** @return the URI of the news item, or {@code null} if it was never set */
+    public URI getNewsURI() {
+        return this.newsURI;
+    }
+
+    /** @param newsURI the URI of the news item */
+    public void setNewsURI(URI newsURI) {
+        this.newsURI = newsURI;
+    }
+
+    /** @return the title of the news item, or {@code null} if it was never set */
+    public String getTitle() {
+        return this.title;
+    }
+
+    /** @param title the title of the news item */
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    /** @return the text of the news item, or {@code null} if it was never set */
+    public String getContent() {
+        return this.content;
+    }
+
+    /** @param content the text of the news item */
+    public void setContent(String content) {
+        this.content = content;
+    }
+}

Added: incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/TopicNews.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/TopicNews.java?rev=1297121&view=auto
==============================================================================
--- incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/TopicNews.java (added)
+++ incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/model/TopicNews.java Mon Mar  5 16:41:31 2012
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.demos.crawler.web.model;
+
+import java.net.URI;
+import java.util.List;
+
+/**
+ * Bean holding a topic together with a list of URIs and a list of titles. The lists are
+ * stored and returned by reference; no copy is made.
+ * 
+ * @author cihan
+ * 
+ */
+public class TopicNews {
+
+    /** The topic name. */
+    private String topic;
+    /** URIs associated with the topic. */
+    private List<URI> uris;
+    /** Titles associated with the topic. */
+    private List<String> titles;
+
+    /** @return the topic, or {@code null} if it was never set */
+    public String getTopic() {
+        return this.topic;
+    }
+
+    /** @param topic the topic name */
+    public void setTopic(String topic) {
+        this.topic = topic;
+    }
+
+    /** @return the list of URIs as it was set, or {@code null} if it was never set */
+    public List<URI> getUris() {
+        return this.uris;
+    }
+
+    /** @param uris the list of URIs; kept by reference */
+    public void setUris(List<URI> uris) {
+        this.uris = uris;
+    }
+
+    /** @return the list of titles as it was set, or {@code null} if it was never set */
+    public List<String> getTitles() {
+        return this.titles;
+    }
+
+    /** @param titles the list of titles; kept by reference */
+    public void setTitles(List<String> titles) {
+        this.titles = titles;
+    }
+}

Added: incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource.java?rev=1297121&view=auto
==============================================================================
--- incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource.java (added)
+++ incubator/stanbol/trunk/demos/crawler/src/main/java/org/apache/stanbol/demos/crawler/web/resources/CNNCrawlerResource.java Mon Mar  5 16:41:31 2012
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.demos.crawler.web.resources;
+
+import static javax.ws.rs.core.MediaType.TEXT_HTML;
+
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Map;
+
+import javax.servlet.ServletContext;
+import javax.ws.rs.FormParam;
+import javax.ws.rs.GET;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
+import javax.ws.rs.Produces;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.Response;
+
+import org.apache.stanbol.commons.web.base.ContextHelper;
+import org.apache.stanbol.commons.web.base.resource.BaseStanbolResource;
+import org.apache.stanbol.demos.crawler.cnn.CNNCrawler;
+import org.apache.stanbol.demos.crawler.web.model.TopicNews;
+
+import com.sun.jersey.api.view.Viewable;
+
+/**
+ * This is the web resource for CNN Crawler, bound to the path
+ * {@code /crawler/cnn/{index}}.
+ * <p>
+ * The constructor stores the {@code index} path segment and looks up the
+ * {@link CNNCrawler} service from the servlet context through
+ * {@link ContextHelper}. The stored index name is passed unchanged as the
+ * last argument of the {@code CNNCrawler.importCNNNews} call.
+ * <p>
+ * Both HTTP methods answer with the {@code "index"} view and hand this
+ * resource to it as the model; presumably the template then reads
+ * {@link #getTemplateData()} and {@link #getIndexName()} (the template is
+ * not part of this file).
+ * <p>
+ * The private {@code importCNNNews} helper returns {@code null} when the
+ * topic is {@code null} or empty, substitutes 10 for a {@code null}
+ * {@code max} and {@code false} for a {@code null} {@code full}, and
+ * otherwise copies the key set and the values of the map returned by the
+ * crawler into the URI list and the title list of a new {@link TopicNews}.
+ * NOTE(review): a {@code null} map from the crawler would cause a
+ * {@code NullPointerException} there; confirm the crawler never returns one.
+ * 
+ * @author cihan
+ * 
+ */
+@Path("/crawler/cnn/{index}")
+public class CNNCrawlerResource extends BaseStanbolResource {
+
+       private CNNCrawler cnnCrawler;
+       private Object templateData = null;
+       private String indexName;
+
+       public CNNCrawlerResource(@Context ServletContext context,
+                       @PathParam(value = "index") String indexName) {
+               this.indexName = indexName;
+               this.cnnCrawler = 
ContextHelper.getServiceFromContext(CNNCrawler.class, context);
+       }
+
+       private TopicNews importCNNNews(String topic, Integer max, Boolean 
full) {
+               if (topic == null || topic.isEmpty()) {
+                       return null;
+               }
+               if (max == null) {
+                       max = 10;
+               }
+               if (full == null) {
+                       full = false;
+               }
+
+               Map<URI, String> newsInfo = cnnCrawler.importCNNNews(topic, 
max, full,
+                               indexName);
+               TopicNews tn = new TopicNews();
+               tn.setTopic(topic);
+               tn.setUris(new ArrayList<URI>(newsInfo.keySet()));
+               tn.setTitles(new ArrayList<String>(newsInfo.values()));
+               return tn;
+       }
+
+       /**
+        * For HTML view only. Renders the {@code "index"} view without
+        * calling the crawler; {@code templateData} is not modified by this
+        * method.
+        * 
+        * @return Returns the HTML view for CNN News Crawler.
+        */
+       @GET
+       @Produces(TEXT_HTML)
+       public Response importCNNNewsHTML() {
+               return Response.ok(new Viewable("index", this), 
TEXT_HTML).build();
+       }
+
+       /**
+        * Imports news for the submitted form values, stores the outcome as
+        * the template data of this resource and renders the {@code "index"}
+        * view. When {@code topic} is {@code null} or empty nothing is
+        * imported and the template data is {@code null}.
+        * 
+        * @param topic
+        *            The topic which will be crawled.
+        * @param max
+        *            Maximum number of news to be retrieved from CNN about the
+        *            {@code topic}. Defaults to 10 when absent.
+        * @param full
+        *            If {@code true}, the topic will be crawled in detail to
+        *            retrieve all information from CNN about the
+        *            {@code topic}. If {@code false} or absent, only summary
+        *            of the news will be crawled and imported.
+        * @return Returns the HTML view as the result of importing news from
+        *         CNN.
+        */
+       @POST
+       @Produces(TEXT_HTML)
+       public Response importCNNNewsHTMLPOST(@FormParam("topic") String topic,
+                       @FormParam("max") Integer max, @FormParam("full") 
Boolean full) {
+               this.templateData = importCNNNews(topic, max, full);
+               return Response.ok(new Viewable("index", this), 
TEXT_HTML).build();
+       }
+
+       public Object getTemplateData() {
+               return templateData;
+       }
+
+       public String getIndexName() {
+               return this.indexName;
+       }
+}


Reply via email to