Author: thorsten
Date: Thu Sep 10 11:53:44 2009
New Revision: 813386

URL: http://svn.apache.org/viewvc?rev=813386&view=rev
Log:
Droids-62 Customizable solr handler. 
due-to Bertil Chapuis.
thanks Bertil Chapuis

Added:
    
incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
    
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
    
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java
Modified:
    incubator/droids/trunk/droids-solr/example/conf/schema.xml
    incubator/droids/trunk/droids-solr/pom.xml

Modified: incubator/droids/trunk/droids-solr/example/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/example/conf/schema.xml?rev=813386&r1=813385&r2=813386&view=diff
==============================================================================
--- incubator/droids/trunk/droids-solr/example/conf/schema.xml (original)
+++ incubator/droids/trunk/droids-solr/example/conf/schema.xml Thu Sep 10 
11:53:44 2009
@@ -62,7 +62,10 @@
    <field name="id"      type="string" indexed="true" stored="true" 
required="true" /> 
    <field name="name"    type="string" indexed="true" stored="true" />
    <field name="host"    type="string" indexed="true" stored="true" />
-   <field name="content" type="text"   indexed="true" stored="false" />   
+   <field name="mime"    type="string" indexed="true" stored="true" />
+   <field name="content" type="text"   indexed="true" stored="true" />
+   <field name="selector" type="text"   indexed="true" stored="true" />
+
  </fields>
 
  <!-- Field to use to determine and enforce document uniqueness. 

Modified: incubator/droids/trunk/droids-solr/pom.xml
URL: 
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/pom.xml?rev=813386&r1=813385&r2=813386&view=diff
==============================================================================
--- incubator/droids/trunk/droids-solr/pom.xml (original)
+++ incubator/droids/trunk/droids-solr/pom.xml Thu Sep 10 11:53:44 2009
@@ -41,6 +41,17 @@
       <version>${pom.version}</version>
     </dependency>
     <dependency>
+        <groupId>nekohtml</groupId>
+        <artifactId>nekohtml</artifactId>
+        <version>${nekohtml.version}</version>
+        <exclusions>
+          <exclusion>
+            <groupId>xml-apis</groupId>
+            <artifactId>xml-apis</artifactId>
+          </exclusion>
+        </exclusions>
+      </dependency>
+    <dependency>
       <groupId>org.apache.solr</groupId>
       <artifactId>solr-solrj</artifactId>
       <version>1.3.0</version>

Added: 
incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
URL: 
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java?rev=813386&view=auto
==============================================================================
--- 
incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
 (added)
+++ 
incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
 Thu Sep 10 11:53:44 2009
@@ -0,0 +1,431 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.solr;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.Stack;
+import java.util.Map.Entry;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.droids.api.Handler;
+import org.apache.droids.exception.DroidsException;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.common.SolrInputDocument;
+import org.cyberneko.html.parsers.SAXParser;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.SAXNotSupportedException;
+
+/**
+ * A Droids Handler which indexes documents in Solr and allows to specify
+ * selectors to store parts of a document in dedicated Solr fields.
+ * 
+ * A selector is an entry made of a key, which names the Solr field, and
+ * of a value, which is a path selector.
+ * 
+ * Path selectors are always absolute and support zero-based indexes. The
+ * index counts the siblings with the same element name below the same parent.
+ * An element given without an index matches every index.
+ * 
+ * Here are some examples:
+ *     - /html[0]/div[0]
+ *     - /html[0]/div[0]/p[0]
+ *     - /html[0]/div[1]/p[2]
+ */
+public class AdvancedSolrHandler implements Handler {
+
+       /**
+        * A solr server
+        */
+       private SolrServer server;
+
+       /**
+        * The selectors allow to save specific parts of the document in the index.
+        * The HashMap's key matches the Solr field.
+        * The HashMap's value is an absolute path corresponding to an element.
+        * Stays null until setSelectors is called.
+        */
+       private HashMap<String, String> selectors;
+
+       /**
+        * A content handler, reused for every parsed document
+        */
+       private SolrContentHandler contentHandler = new SolrContentHandler(selectors);
+
+       /**
+        * An HTML parser, created the first time selectors have to be evaluated
+        */
+       private SAXParser parser;
+
+       /**
+        * @return the current solr server
+        */
+       public SolrServer getServer() {
+               return server;
+       }
+
+       /**
+        * @param solr a solr server 
+        */
+       public void setServer(SolrServer solr) {
+               this.server = solr;
+       }
+
+       /**
+        * @return the current path selectors, or null if none have been set
+        */
+       public HashMap<String, String> getSelectors() {
+               return selectors;
+       }
+
+       /**
+        * Replaces the path selectors. Passing null (or an empty map) removes
+        * all selectors.
+        * 
+        * @param selectors an hash map containing path selectors
+        * @throws IllegalArgumentException if a selector cannot be parsed; the
+        *         previously configured selectors are kept in that case
+        */
+       public void setSelectors(HashMap<String, String> selectors) {
+               // compile first: if this throws, the handler keeps its old state
+               contentHandler.initPatterns(selectors);
+               this.selectors = selectors;
+       }
+
+       /* 
+        * @see org.apache.droids.api.Handler#handle(java.net.URI, org.apache.droids.api.ContentEntity)
+        */
+       public void handle(URI uri, ContentEntity entity) throws IOException, DroidsException {
+               SolrInputDocument doc = createSolrInputDocument(uri, entity);
+               try {
+                       server.add(doc);
+               } catch (SolrServerException e) {
+                       throw new DroidsException(e);
+               }
+       }
+
+       /**
+        * Generates a SolrInputDocument from an URI and a ContentEntity 
+        * which correspond to the document which needs to be saved in the index.
+        * 
+        * The fields id, name, host, mime and content are always set. When
+        * selectors are configured, the entity content is additionally parsed
+        * and every selected part is added to the field named by its selector.
+        * 
+        * @param uri an uri
+        * @param entity an entity
+        * @return the document to index
+        * @throws IOException if the entity content cannot be read
+        * @throws DroidsException if the entity content cannot be parsed
+        */
+       private SolrInputDocument createSolrInputDocument(URI uri, ContentEntity entity)
+                       throws IOException, DroidsException {
+               SolrInputDocument doc = new SolrInputDocument();
+
+               doc.setField("id", uri.getPath());
+               doc.setField("name", uri.toASCIIString());
+               doc.setField("host", uri.getHost());
+               doc.setField("mime", entity.getMimeType());
+               doc.setField("content", entity.getParse().getText());
+
+               // selectors is null as long as setSelectors has not been called
+               if (selectors != null && !selectors.isEmpty()) {
+                       if (parser == null) {
+                               initParser();
+                       }
+                       contentHandler.initDocument(doc);
+
+                       java.io.InputStream content = entity.obtainContent();
+                       try {
+                               parser.setContentHandler(contentHandler);
+                               parser.parse(new InputSource(content));
+                       } catch (SAXException e) {
+                               // report the failure instead of silently indexing a partial document
+                               throw new DroidsException(e);
+                       } finally {
+                               if (content != null) {
+                                       content.close();
+                               }
+                       }
+               }
+
+               return doc;
+       }
+
+       /**
+        * Initializes a CyberNeko HTML parser configured to return lower case 
+        * element names.
+        * 
+        * @return the parser, which is also stored in the parser field
+        * @throws IllegalStateException if the parser rejects one of the settings
+        */
+       private SAXParser initParser() {
+               parser = new SAXParser();
+               try {
+                       parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
+                       parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false);
+                       parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
+                       parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
+               } catch (SAXNotRecognizedException ex) {
+                       throw new IllegalStateException(ex);
+               } catch (SAXNotSupportedException ex) {
+                       throw new IllegalStateException(ex);
+               }
+               return parser;
+       }
+
+       /**
+        * A class that implements a SAX ContentHandler and uses patterns to 
+        * record document elements in a SolrInputDocument.
+        */
+       private static class SolrContentHandler implements ContentHandler {
+
+               /**
+                * One step of a selector: an element name optionally followed
+                * by an index in square brackets. Group 1 is the name, group 3
+                * the index digits.
+                */
+               private static final Pattern SELECTOR_ELEMENT =
+                       Pattern.compile("([a-zA-Z_:][a-zA-Z0-9_:.\\-]*)(\\[([0-9]*)\\])?");
+
+               private SolrInputDocument doc;
+
+               /**
+                * the patterns which match element's path, keyed by Solr field
+                */
+               private HashMap<String, Pattern> patterns = new HashMap<String, Pattern>();
+
+               /**
+                * stores the values which match the patterns
+                */
+               private HashMap<String, String> valueRecorders = new HashMap<String, String>();
+
+               /**
+                * A two dimensional stack used to store the current path:
+                * one inner stack per open level, holding the names of the
+                * elements started so far at that level below the current parent
+                */
+               private Stack<Stack<String>> path = new Stack<Stack<String>>();
+
+               /** depth of the element currently being processed */
+               private int level = 0;
+
+               /** depth of the element that was closed last */
+               private int lastLevel = 0;
+
+               /**
+                * Constructor
+                * 
+                * @param selectors an HashMap which contains selectors, may be null
+                */
+               public SolrContentHandler(HashMap<String, String> selectors) {
+                       initPatterns(selectors);
+               }
+
+               /**
+                * Compiles the selectors into the patterns used while parsing.
+                * The current patterns are replaced; null removes all patterns.
+                * 
+                * @param selectors Solr field name mapped to an absolute path selector
+                * @throws IllegalArgumentException if a selector is null or contains 
+                *         an element that is not of the form name or name[index]
+                */
+               public void initPatterns(HashMap<String, String> selectors) {
+                       // build into a fresh map so a bad selector leaves the old patterns intact
+                       HashMap<String, Pattern> compiled = new HashMap<String, Pattern>();
+
+                       if (selectors != null) {
+                               for (Entry<String, String> selector : selectors.entrySet()) {
+                                       String field = selector.getKey();
+                                       String selectorPath = selector.getValue();
+                                       if (selectorPath == null) {
+                                               throw new IllegalArgumentException(
+                                                       "Missing selector for field '" + field + "'");
+                                       }
+
+                                       StringBuilder regex = new StringBuilder();
+                                       for (String element : selectorPath.split("/")) {
+                                               // the leading slash yields an empty first element
+                                               if (element.length() == 0) {
+                                                       continue;
+                                               }
+                                               Matcher m = SELECTOR_ELEMENT.matcher(element);
+                                               if (!m.matches()) {
+                                                       throw new IllegalArgumentException("Invalid element '" + element
+                                                               + "' in selector '" + selectorPath
+                                                               + "' for field '" + field + "'");
+                                               }
+                                               String elementName = m.group(1);
+                                               String elementIndex = m.group(3);
+                                               // quote the name: '.' is legal in a name but a wildcard in a regex
+                                               regex.append('/').append(Pattern.quote(elementName));
+                                               if (elementIndex == null || elementIndex.length() == 0) {
+                                                       // no index given: match any index
+                                                       regex.append("\\[[0-9]+\\]");
+                                               } else {
+                                                       regex.append("\\[").append(elementIndex).append("\\]");
+                                               }
+                                       }
+
+                                       compiled.put(field, Pattern.compile(regex.toString()));
+                               }
+                       }
+
+                       patterns = compiled;
+               }
+
+               /**
+                * Initialization of the document used for indexation
+                * 
+                * @param doc a solr document
+                */
+               public void initDocument(SolrInputDocument doc) {
+                       this.doc = doc;
+               }
+
+               /* 
+                * @see org.xml.sax.ContentHandler#startDocument()
+                */
+               @Override
+               public void startDocument() throws SAXException {
+                       // the handler is reused: drop everything left over from the
+                       // previous document, otherwise its root would be counted as
+                       // a sibling of the new root
+                       level = 0;
+                       lastLevel = 0;
+                       path.clear();
+                       valueRecorders.clear();
+               }
+
+               /* 
+                * @see org.xml.sax.ContentHandler#endDocument()
+                */
+               @Override
+               public void endDocument() throws SAXException {
+                       level = 0;
+               }
+
+               /* 
+                * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
+                */
+               @Override
+               public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+                       // set the level properties.
+                       level++;
+
+                       if (level == lastLevel && !path.isEmpty()) {
+                               // a sibling of the element closed last
+                               path.peek().push(localName);
+                       } else if (level > lastLevel) {
+                               // go down in the hierarchy of elements.
+                               Stack<String> siblings = new Stack<String>();
+                               siblings.push(localName);
+                               path.push(siblings);
+                       }
+
+                       // if the path matches a pattern, starts recording the matching content.
+                       String currentPath = getCurrentPath();
+                       for (Entry<String, Pattern> entry : patterns.entrySet()) {
+                               if (entry.getValue().matcher(currentPath).matches()) {
+                                       valueRecorders.put(entry.getKey(), "");
+                               }
+                       }
+               }
+
+               /* 
+                * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
+                */
+               @Override
+               public void endElement(String uri, String localName, String qName) throws SAXException {
+                       // check if we climb in the hierarchy: forget the children's level.
+                       if (level < lastLevel && !path.isEmpty()) {
+                               path.pop();
+                       }
+
+                       // set the level properties.
+                       lastLevel = level;
+                       level--;
+
+                       // if the path matches a selector, stores the matching content.
+                       String currentPath = getCurrentPath();
+                       for (Entry<String, Pattern> entry : patterns.entrySet()) {
+                               if (entry.getValue().matcher(currentPath).matches()) {
+                                       // add the matching content to the solr document.
+                                       String value = valueRecorders.remove(entry.getKey());
+                                       if (value != null) {
+                                               doc.addField(entry.getKey(), value);
+                                       }
+                               }
+                       }
+               }
+
+               /* 
+                * @see org.xml.sax.ContentHandler#characters(char[], int, int)
+                */
+               @Override
+               public void characters(char[] ch, int start, int length) throws SAXException {
+                       if (valueRecorders.isEmpty()) {
+                               return;
+                       }
+                       // store the content in each recorder
+                       String text = new String(ch, start, length);
+                       for (Entry<String, String> recorder : valueRecorders.entrySet()) {
+                               recorder.setValue(recorder.getValue() + text);
+                       }
+               }
+
+               /* 
+                * @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int)
+                */
+               @Override
+               public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+                       characters(ch, start, length);
+               }
+
+               /* 
+                * @see org.xml.sax.ContentHandler#startPrefixMapping(java.lang.String, java.lang.String)
+                */
+               @Override
+               public void startPrefixMapping(String prefix, String uri) throws SAXException {
+
+               }
+
+               /* 
+                * @see org.xml.sax.ContentHandler#endPrefixMapping(java.lang.String)
+                */
+               @Override
+               public void endPrefixMapping(String prefix) throws SAXException {
+
+               }
+
+               /* 
+                * @see org.xml.sax.ContentHandler#processingInstruction(java.lang.String, java.lang.String)
+                */
+               @Override
+               public void processingInstruction(String target, String data) throws SAXException {
+
+               }
+
+               /* 
+                * @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator)
+                */
+               @Override
+               public void setDocumentLocator(Locator locator) {
+
+               }
+
+               /* 
+                * @see org.xml.sax.ContentHandler#skippedEntity(java.lang.String)
+                */
+               @Override
+               public void skippedEntity(String name) throws SAXException {
+
+               }
+
+               /**
+                * Computes the current path by crossing the path stack.
+                * 
+                * @return a path such as /html[0]/body[0]/div[1], or an empty 
+                *         string if no element is open
+                */
+               private String getCurrentPath() {
+                       StringBuilder p = new StringBuilder();
+
+                       // find the element at each level
+                       for (Stack<String> siblings : path) {
+                               String element = siblings.peek();
+                               // the index is the number of earlier siblings with the same name
+                               int index = -1;
+                               for (String sibling : siblings) {
+                                       if (sibling.equals(element)) {
+                                               index++;
+                                       }
+                               }
+                               // path with the index at each level
+                               p.append('/').append(element).append('[').append(index).append(']');
+                       }
+                       return p.toString();
+               }
+
+       }
+}

Added: 
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
URL: 
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java?rev=813386&view=auto
==============================================================================
--- 
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
 (added)
+++ 
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
 Thu Sep 10 11:53:44 2009
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.solr;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashMap;
+
+import junit.framework.TestCase;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.droids.api.Parse;
+import org.apache.droids.exception.DroidsException;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.CoreDescriptor;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
+
+/**
+ * Checks that AdvancedSolrHandler stores the text selected by a path
+ * selector in the Solr field named by the selector. The tests run against
+ * an embedded Solr server using the configuration below the solr home
+ * returned by getSolrHome().
+ */
+public class AdvancedSolrHandleTest extends TestCase {
+
+       SolrServer solr;
+
+       String simpleHtmlPage = "" +
+               "<html>" +
+                       "<body>" +
+                               "<div>" +
+                                       "<p>p0</p>" +
+                                       "<p>p1</p>" +
+                                       "<p>p2</p>" +
+                               "</div>" +
+                               "<div>" +
+                                       "<p>p3</p>" +
+                                       "<p>p4</p>" +
+                                       "<p>p5</p>" +
+                               "</div>" +
+                       "</body>" +
+               "</html>";
+
+       /**
+        * @return the solr home directory holding the test configuration
+        */
+       protected String getSolrHome() {
+               return "example";
+       }
+
+       @Override
+       public void setUp() throws Exception {
+               super.setUp();
+
+               SolrResourceLoader loader = new SolrResourceLoader(getSolrHome());
+               CoreContainer container = new CoreContainer(loader);
+               CoreDescriptor descriptor = new CoreDescriptor(container, "cname", ".");
+               SolrCore core = container.create(descriptor);
+               container.register(core.getName(), core, false);
+
+               solr = new EmbeddedSolrServer(container, core.getName());
+       }
+
+       @Override
+       public void tearDown() throws Exception {
+               try {
+                       // remove everything....
+                       solr.deleteByQuery("*:*");
+                       solr.commit();
+               } finally {
+                       super.tearDown();
+               }
+       }
+
+       /**
+        * Indexes the given page with a single selector and checks the value
+        * stored in the selector's field.
+        * 
+        * @param html the page to index
+        * @param field the Solr field the selected content is stored in
+        * @param selector the path selector
+        * @param expectedValue the text expected in the field
+        */
+       public void performSelection(String html, String field, String selector, String expectedValue) throws IOException, DroidsException, URISyntaxException, SolrServerException {
+               AdvancedSolrHandler handler = new AdvancedSolrHandler();
+               handler.setServer(solr);
+
+               HashMap<String, String> selectors = new HashMap<String, String>();
+               selectors.put(field, selector);
+               handler.setSelectors(selectors);
+
+               MockContentEntity contentEntity = new MockContentEntity();
+               contentEntity.setCharset("UTF-8");
+               contentEntity.setMimeType("text/html");
+               contentEntity.setText(html);
+
+               handler.handle(new URI("http://localhost/"), contentEntity);
+               solr.commit();
+
+               SolrQuery query = new SolrQuery();
+               query.setQuery("*:*");
+               query.setFields(field);
+               QueryResponse response = solr.query(query);
+
+               // fail with a readable message instead of a NoSuchElementException
+               assertFalse("no document was indexed", response.getResults().isEmpty());
+               SolrDocument doc = response.getResults().iterator().next();
+               String value = (String)doc.getFieldValue(field);
+
+               assertEquals(expectedValue, value);
+       }
+
+       public void testSelectorA() throws Exception {
+               performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[0]/p[0]", "p0");
+       }
+
+       public void testSelectorB() throws Exception {
+               performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[1]/p[1]", "p4");
+       }
+
+       public void testSelectorC() throws Exception {
+               performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[1]", "p3p4p5");
+       }
+
+}
\ No newline at end of file

Added: 
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java
URL: 
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java?rev=813386&view=auto
==============================================================================
--- 
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java
 (added)
+++ 
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java
 Thu Sep 10 11:53:44 2009
@@ -0,0 +1,68 @@
+package org.apache.droids.solr;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.droids.api.Link;
+import org.apache.droids.api.Parse;
+import org.apache.droids.parse.ParseImpl;
+
+/**
+ * A ContentEntity for tests which serves a text held in memory.
+ */
+public class MockContentEntity implements ContentEntity {
+
+       private String text;
+
+       private String charset = "UTF-8";
+
+       private String mimeType = "text/html";
+
+       private Collection<Link> outlinks = new ArrayList<Link>();
+
+       public Collection<Link> getOutlinks() {
+               return outlinks;
+       }
+
+       public void setOutlinks(Collection<Link> outlinks) {
+               this.outlinks = outlinks;
+       }
+
+       public String getText() {
+               return text;
+       }
+
+       public void setText(String text) {
+               this.text = text;
+       }
+
+       @Override
+       public String getCharset() {
+               return charset;
+       }
+
+       public void setCharset(String charset) {
+               this.charset = charset;
+       }
+
+       @Override
+       public String getMimeType() {
+               return mimeType;
+       }
+
+       public void setMimeType(String mimeType) {
+               this.mimeType = mimeType;
+       }
+
+       /**
+        * @return a new parse made of the current text and outlinks
+        */
+       @Override
+       public Parse getParse() {
+               return new ParseImpl(text, outlinks);
+       }
+
+       /**
+        * @return a stream over the text, encoded with the charset of this
+        *         entity (platform default if the charset is null)
+        * @throws IOException if the charset is not supported
+        */
+       @Override
+       public InputStream obtainContent() throws IOException {
+               // encode with the declared charset so the bytes agree with getCharset()
+               byte[] bytes = (charset == null) ? text.getBytes() : text.getBytes(charset);
+               return new ByteArrayInputStream(bytes);
+       }
+
+}


Reply via email to