Author: thorsten
Date: Thu Sep 10 11:53:44 2009
New Revision: 813386
URL: http://svn.apache.org/viewvc?rev=813386&view=rev
Log:
Droids-62 Customizable solr handler.
due-to Bertil Chapuis.
thanks Bertil Chapuis
Added:
incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java
Modified:
incubator/droids/trunk/droids-solr/example/conf/schema.xml
incubator/droids/trunk/droids-solr/pom.xml
Modified: incubator/droids/trunk/droids-solr/example/conf/schema.xml
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/example/conf/schema.xml?rev=813386&r1=813385&r2=813386&view=diff
==============================================================================
--- incubator/droids/trunk/droids-solr/example/conf/schema.xml (original)
+++ incubator/droids/trunk/droids-solr/example/conf/schema.xml Thu Sep 10
11:53:44 2009
@@ -62,7 +62,10 @@
<field name="id" type="string" indexed="true" stored="true"
required="true" />
<field name="name" type="string" indexed="true" stored="true" />
<field name="host" type="string" indexed="true" stored="true" />
- <field name="content" type="text" indexed="true" stored="false" />
+ <field name="mime" type="string" indexed="true" stored="true" />
+ <field name="content" type="text" indexed="true" stored="true" />
+ <field name="selector" type="text" indexed="true" stored="true" />
+
</fields>
<!-- Field to use to determine and enforce document uniqueness.
Modified: incubator/droids/trunk/droids-solr/pom.xml
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/pom.xml?rev=813386&r1=813385&r2=813386&view=diff
==============================================================================
--- incubator/droids/trunk/droids-solr/pom.xml (original)
+++ incubator/droids/trunk/droids-solr/pom.xml Thu Sep 10 11:53:44 2009
@@ -41,6 +41,17 @@
<version>${pom.version}</version>
</dependency>
<dependency>
+ <groupId>nekohtml</groupId>
+ <artifactId>nekohtml</artifactId>
+ <version>${nekohtml.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>xml-apis</groupId>
+ <artifactId>xml-apis</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
<version>1.3.0</version>
Added:
incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java?rev=813386&view=auto
==============================================================================
---
incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
(added)
+++
incubator/droids/trunk/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
Thu Sep 10 11:53:44 2009
@@ -0,0 +1,431 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.solr;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.Stack;
+import java.util.Map.Entry;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.droids.api.Handler;
+import org.apache.droids.exception.DroidsException;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.common.SolrInputDocument;
+import org.cyberneko.html.parsers.SAXParser;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.SAXNotSupportedException;
+
+/**
+ * A Droids {@link Handler} that stores selected parts of a document in a
+ * Solr index.
+ *
+ * A selector is an entry whose key is the name of a Solr field and whose
+ * value is a path selector.
+ *
+ * Path selectors are always absolute and support indexes. An element
+ * written without an index (or with an empty one, e.g. <code>p[]</code>)
+ * matches every sibling carrying that name.
+ *
+ * Here are some examples:
+ * - /html[0]/div[0]
+ * - /html[0]/div[0]/p[0]
+ * - /html[0]/div[1]/p[2]
+ *
+ * The parser and the content handler are kept in fields and carry
+ * per-document state without any synchronization, so one instance must not
+ * be used by several threads at the same time.
+ */
+public class AdvancedSolrHandler implements Handler {
+
+    /**
+     * A solr server
+     */
+    private SolrServer server;
+
+    /**
+     * The selectors allow to save specific parts of the document in the index.
+     * The HashMap's key matches the Solr field.
+     * The HashMap's value is an absolute path corresponding to an element.
+     * May be <code>null</code> as long as {@link #setSelectors(HashMap)} was
+     * not called.
+     */
+    private HashMap<String, String> selectors;
+
+    /**
+     * The SAX content handler which extracts the selected parts. It starts
+     * without any pattern; patterns are installed by
+     * {@link #setSelectors(HashMap)}.
+     */
+    private final SolrContentHandler contentHandler = new SolrContentHandler();
+
+    /**
+     * An HTML parser, created lazily the first time a selector has to be
+     * evaluated.
+     */
+    private SAXParser parser;
+
+    /**
+     * @return the current solr server
+     */
+    public SolrServer getServer() {
+        return server;
+    }
+
+    /**
+     * @param solr a solr server
+     */
+    public void setServer(SolrServer solr) {
+        this.server = solr;
+    }
+
+    /**
+     * @return the current path selectors
+     */
+    public HashMap<String, String> getSelectors() {
+        return selectors;
+    }
+
+    /**
+     * Installs the path selectors. Passing <code>null</code> or an empty map
+     * removes every selector.
+     *
+     * @param selectors an hash map containing path selectors
+     * @throws IllegalArgumentException if a selector contains an element
+     *         which cannot be parsed; the previous selectors are kept
+     */
+    public void setSelectors(HashMap<String, String> selectors) {
+        // compile first: when a selector is invalid nothing is modified
+        contentHandler.initPatterns(selectors);
+        this.selectors = selectors;
+    }
+
+    /*
+     * @see org.apache.droids.api.Handler#handle(java.net.URI, org.apache.droids.api.ContentEntity)
+     */
+    public void handle(URI uri, ContentEntity entity) throws IOException, DroidsException {
+        SolrInputDocument doc = createSolrInputDocument(uri, entity);
+        try {
+            server.add(doc);
+        } catch (SolrServerException e) {
+            throw new DroidsException(e);
+        }
+    }
+
+    /**
+     * Generates a SolrInputDocument from an URI and a ContentEntity
+     * which correspond to the document which need to be saved in the index.
+     *
+     * The fields id, name, host, mime and content are always set. When
+     * selectors are defined the entity content is additionally parsed and
+     * every matching element is added to the field named by its selector.
+     *
+     * @param uri an uri
+     * @param entity an entity
+     * @return the document to index
+     * @throws IOException if the content of the entity cannot be read
+     * @throws DroidsException if the content of the entity cannot be parsed
+     */
+    private SolrInputDocument createSolrInputDocument(URI uri, ContentEntity entity)
+            throws IOException, DroidsException {
+        SolrInputDocument doc = new SolrInputDocument();
+
+        doc.setField("id", uri.getPath());
+        doc.setField("name", uri.toASCIIString());
+        doc.setField("host", uri.getHost());
+        doc.setField("mime", entity.getMimeType());
+        doc.setField("content", entity.getParse().getText());
+
+        // nothing to extract: selectors were never set or are empty
+        if (selectors == null || selectors.isEmpty()) {
+            return doc;
+        }
+
+        if (parser == null) {
+            initParser();
+        }
+
+        contentHandler.initDocument(doc);
+        parser.setContentHandler(contentHandler);
+
+        InputSource source = new InputSource(entity.obtainContent());
+        try {
+            parser.parse(source);
+        } catch (SAXException e) {
+            // report the failure to the caller instead of indexing a
+            // silently incomplete document
+            throw new DroidsException(e);
+        } finally {
+            // the stream was opened for this call only, release it
+            if (source.getByteStream() != null) {
+                source.getByteStream().close();
+            }
+        }
+
+        return doc;
+    }
+
+    /**
+     * Initialize a CyberNeko parser configured to return lower case
+     * element's names and stores it in the parser field.
+     *
+     * @return the configured parser
+     * @throws IllegalStateException if the parser rejects the configuration
+     */
+    private SAXParser initParser() {
+        parser = new SAXParser();
+        try {
+            parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
+            parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false);
+            parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
+            parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
+        } catch (SAXNotRecognizedException ex) {
+            throw new IllegalStateException(ex);
+        } catch (SAXNotSupportedException ex) {
+            throw new IllegalStateException(ex);
+        }
+        return parser;
+    }
+
+    /**
+     * A class that implements a SAX ContentHandler and uses patterns to
+     * record documents elements in a SolrInputDocument.
+     *
+     * While a document is parsed the handler maintains the indexed path of
+     * the current element (e.g. /html[0]/body[0]/div[1]). The text found
+     * between the start and the end of an element whose path matches a
+     * pattern is added to the Solr field associated with that pattern.
+     */
+    private static class SolrContentHandler implements ContentHandler {
+
+        /**
+         * Matches one element of a selector: a name (group 1) optionally
+         * followed by an index between square brackets (group 3).
+         */
+        private static final Pattern ELEMENT_PATTERN =
+            Pattern.compile("^([a-zA-Z0-9:_.\\-]+)(\\[([0-9]*)\\])?$");
+
+        /**
+         * Regular expression used for an element without an explicit index
+         */
+        private static final String ANY_INDEX = "\\[[0-9]*\\]";
+
+        private SolrInputDocument doc;
+
+        /**
+         * the patterns which match element's path, keyed by Solr field
+         */
+        private final HashMap<String, Pattern> patterns = new HashMap<String, Pattern>();
+
+        /**
+         * stores the values which match the patterns, keyed by Solr field;
+         * an entry exists only while the matching element is open
+         */
+        private final HashMap<String, StringBuilder> valueRecorders =
+            new HashMap<String, StringBuilder>();
+
+        /**
+         * A two dimensional stack used to store the current path: one inner
+         * stack per depth, holding the names of the elements met so far at
+         * that depth below the current parent.
+         */
+        private final Stack<Stack<String>> path = new Stack<Stack<String>>();
+
+        /**
+         * depth of the current element; a primitive so that comparisons are
+         * made by value
+         */
+        private int level = 0;
+
+        /**
+         * depth of the element which was closed last
+         */
+        private int lastLevel = 0;
+
+        /**
+         * Replaces the current patterns by the ones described by the given
+         * selectors. A <code>null</code> map removes every pattern.
+         *
+         * @param selectors an HashMap which contains selectors
+         * @throws IllegalArgumentException if a selector cannot be parsed;
+         *         in that case the current patterns are left untouched
+         */
+        public void initPatterns(HashMap<String, String> selectors) {
+            // build in a scratch map so that a failure changes nothing
+            HashMap<String, Pattern> compiled = new HashMap<String, Pattern>();
+            if (selectors != null) {
+                for (Entry<String, String> selector : selectors.entrySet()) {
+                    String field = selector.getKey();
+                    compiled.put(field, compileSelector(field, selector.getValue()));
+                }
+            }
+            patterns.clear();
+            patterns.putAll(compiled);
+        }
+
+        /**
+         * Translates a path selector into a regular expression matching
+         * the paths produced by {@link #getCurrentPath()}.
+         *
+         * @param field the Solr field, only used in error messages
+         * @param selector the path selector
+         * @return the compiled pattern
+         * @throws IllegalArgumentException if the selector is null or one
+         *         of its elements is not of the form name or name[index]
+         */
+        private Pattern compileSelector(String field, String selector) {
+            if (selector == null) {
+                throw new IllegalArgumentException(
+                    "No selector defined for field '" + field + "'");
+            }
+
+            StringBuilder regex = new StringBuilder("^");
+            for (String rawElement : selector.split("/")) {
+                String element = rawElement.trim();
+                if (element.length() == 0) {
+                    // produced by the leading slash of an absolute path
+                    continue;
+                }
+                Matcher m = ELEMENT_PATTERN.matcher(element);
+                if (!m.matches()) {
+                    // skipping the element would silently select a
+                    // different part of the document
+                    throw new IllegalArgumentException("Invalid element '" + element
+                        + "' in selector '" + selector + "' of field '" + field + "'");
+                }
+                String elementIndex = m.group(3);
+                // the name is quoted: '.' is legal in a name but is a
+                // wildcard in a regular expression
+                regex.append('/').append(Pattern.quote(m.group(1)));
+                if (elementIndex == null || elementIndex.length() == 0) {
+                    regex.append(ANY_INDEX);
+                } else {
+                    regex.append("\\[").append(elementIndex).append("\\]");
+                }
+            }
+            regex.append('$');
+
+            return Pattern.compile(regex.toString());
+        }
+
+        /**
+         * Initialization of the document used for indexation
+         *
+         * @param doc a solr document
+         */
+        public void initDocument(SolrInputDocument doc) {
+            this.doc = doc;
+        }
+
+        /*
+         * @see org.xml.sax.ContentHandler#startDocument()
+         */
+        @Override
+        public void startDocument() throws SAXException {
+            // the handler is reused for every document: forget everything
+            // recorded for the previous one, otherwise the root of this
+            // document would be seen as a sibling of the previous root
+            level = 0;
+            lastLevel = 0;
+            path.clear();
+            valueRecorders.clear();
+        }
+
+        /*
+         * @see org.xml.sax.ContentHandler#endDocument()
+         */
+        @Override
+        public void endDocument() throws SAXException {
+            level = 0;
+        }
+
+        /*
+         * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String,
+         * java.lang.String, org.xml.sax.Attributes)
+         */
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts)
+                throws SAXException {
+            // set the level properties.
+            level++;
+
+            if (level == lastLevel && !path.isEmpty()) {
+                // a sibling of the element closed last: same depth
+                path.peek().push(localName);
+            } else if (level > lastLevel) {
+                // go down in the hierarchy of elements.
+                Stack<String> siblings = new Stack<String>();
+                siblings.push(localName);
+                path.push(siblings);
+            }
+
+            // if the path matches a pattern, starts recording the matching content.
+            String currentPath = getCurrentPath();
+            for (Entry<String, Pattern> entry : patterns.entrySet()) {
+                if (entry.getValue().matcher(currentPath).matches()) {
+                    valueRecorders.put(entry.getKey(), new StringBuilder());
+                }
+            }
+        }
+
+        /*
+         * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String,
+         * java.lang.String)
+         */
+        @Override
+        public void endElement(String uri, String localName, String qName) throws SAXException {
+            // check if we climb in the hierarchy: the closed element had
+            // children, their level is not part of the path anymore.
+            if (level < lastLevel && !path.isEmpty()) {
+                path.pop();
+            }
+
+            // set the level properties.
+            lastLevel = level;
+            level--;
+
+            // if the path matches a selector, stores the matching content.
+            String currentPath = getCurrentPath();
+            for (Entry<String, Pattern> entry : patterns.entrySet()) {
+                if (entry.getValue().matcher(currentPath).matches()) {
+                    StringBuilder recorded = valueRecorders.remove(entry.getKey());
+                    if (recorded != null) {
+                        // add the matching content to the solr document.
+                        doc.addField(entry.getKey(), recorded.toString());
+                    }
+                }
+            }
+        }
+
+        /*
+         * @see org.xml.sax.ContentHandler#characters(char[], int, int)
+         */
+        @Override
+        public void characters(char[] ch, int start, int length) throws SAXException {
+            // store the content in each recorder
+            for (StringBuilder recorder : valueRecorders.values()) {
+                recorder.append(ch, start, length);
+            }
+        }
+
+        /*
+         * @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int)
+         */
+        @Override
+        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+            characters(ch, start, length);
+        }
+
+        /*
+         * @see org.xml.sax.ContentHandler#startPrefixMapping(java.lang.String, java.lang.String)
+         */
+        @Override
+        public void startPrefixMapping(String prefix, String uri) throws SAXException {
+            // not needed to compute the path
+        }
+
+        /*
+         * @see org.xml.sax.ContentHandler#endPrefixMapping(java.lang.String)
+         */
+        @Override
+        public void endPrefixMapping(String prefix) throws SAXException {
+            // not needed to compute the path
+        }
+
+        /*
+         * @see org.xml.sax.ContentHandler#processingInstruction(java.lang.String, java.lang.String)
+         */
+        @Override
+        public void processingInstruction(String target, String data) throws SAXException {
+            // not needed to compute the path
+        }
+
+        /*
+         * @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator)
+         */
+        @Override
+        public void setDocumentLocator(Locator locator) {
+            // not needed to compute the path
+        }
+
+        /*
+         * @see org.xml.sax.ContentHandler#skippedEntity(java.lang.String)
+         */
+        @Override
+        public void skippedEntity(String name) throws SAXException {
+            // not needed to compute the path
+        }
+
+        /**
+         * Computes the current path by crossing the path stack.
+         *
+         * @return a path such as /html[0]/body[0]/div[1]
+         */
+        private String getCurrentPath() {
+            StringBuilder current = new StringBuilder();
+
+            // find the element at each level
+            for (Stack<String> siblings : path) {
+                String element = siblings.peek();
+                // the index is the number of previous siblings with the same name
+                int index = -1;
+                for (String sibling : siblings) {
+                    if (sibling.equals(element)) {
+                        index++;
+                    }
+                }
+                // path with the index at each level
+                current.append('/').append(element).append('[').append(index).append(']');
+            }
+            return current.toString();
+        }
+
+    }
+}
Added:
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java?rev=813386&view=auto
==============================================================================
---
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
(added)
+++
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
Thu Sep 10 11:53:44 2009
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.solr;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashMap;
+
+import junit.framework.TestCase;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.droids.api.Parse;
+import org.apache.droids.exception.DroidsException;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.CoreDescriptor;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
+
+/**
+ * Tests the path selectors of the AdvancedSolrHandler against an embedded
+ * Solr server created from the configuration found in the Solr home
+ * returned by {@link #getSolrHome()}.
+ */
+public class AdvancedSolrHandleTest extends TestCase {
+
+    /**
+     * The container owning the core used by the embedded server. It is kept
+     * so that it can be shut down once the test is over.
+     */
+    private CoreContainer container;
+
+    SolrServer solr;
+
+    String simpleHtmlPage = "" +
+        "<html>" +
+        "<body>" +
+        "<div>" +
+        "<p>p0</p>" +
+        "<p>p1</p>" +
+        "<p>p2</p>" +
+        "</div>" +
+        "<div>" +
+        "<p>p3</p>" +
+        "<p>p4</p>" +
+        "<p>p5</p>" +
+        "</div>" +
+        "</body>" +
+        "</html>";
+
+    /**
+     * @return the directory which contains the Solr configuration
+     */
+    protected String getSolrHome() {
+        return "example";
+    }
+
+    /**
+     * Starts an embedded Solr server backed by a freshly created core.
+     */
+    @Override
+    public void setUp() throws Exception {
+        super.setUp();
+
+        SolrResourceLoader loader = new SolrResourceLoader(getSolrHome());
+        container = new CoreContainer(loader);
+        CoreDescriptor descriptor = new CoreDescriptor(container, "cname", ".");
+        SolrCore core = container.create(descriptor);
+        container.register(core.getName(), core, false);
+
+        solr = new EmbeddedSolrServer(container, core.getName());
+    }
+
+    /**
+     * Empties the index and releases the cores opened by setUp, so that the
+     * next test does not find a core still open on the same data.
+     */
+    @Override
+    public void tearDown() throws Exception {
+        try {
+            // remove everything....
+            solr.deleteByQuery("*:*");
+            solr.commit();
+        } finally {
+            try {
+                if (container != null) {
+                    container.shutdown();
+                }
+            } finally {
+                super.tearDown();
+            }
+        }
+    }
+
+    /**
+     * Indexes an HTML page with a single selector and checks the value
+     * stored in the field associated with this selector.
+     *
+     * @param html the page to index
+     * @param field the Solr field which receives the selected content
+     * @param selector the path selector
+     * @param expectedValue the value expected in the field
+     */
+    public void performSelection(String html, String field, String selector, String expectedValue)
+            throws IOException, DroidsException, URISyntaxException, SolrServerException {
+        AdvancedSolrHandler handler = new AdvancedSolrHandler();
+        handler.setServer(solr);
+
+        HashMap<String, String> selectors = new HashMap<String, String>();
+        selectors.put(field, selector);
+        handler.setSelectors(selectors);
+
+        MockContentEntity contentEntity = new MockContentEntity();
+        contentEntity.setCharset("UTF-8");
+        contentEntity.setMimeType("text/html");
+        contentEntity.setText(html);
+
+        handler.handle(new URI("http://localhost/"), contentEntity);
+        solr.commit();
+
+        SolrQuery query = new SolrQuery();
+        query.setQuery("*:*");
+        query.setFields(field);
+        QueryResponse response = solr.query(query);
+
+        // fail with a readable message instead of a NoSuchElementException
+        assertFalse("the document has not been indexed", response.getResults().isEmpty());
+
+        SolrDocument doc = response.getResults().iterator().next();
+        String value = (String) doc.getFieldValue(field);
+
+        assertEquals(expectedValue, value);
+    }
+
+    public void testSelectorA() throws Exception {
+        performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[0]/p[0]", "p0");
+    }
+
+    public void testSelectorB() throws Exception {
+        performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[1]/p[1]", "p4");
+    }
+
+    public void testSelectorC() throws Exception {
+        performSelection(simpleHtmlPage, "selector", "/html[0]/body[0]/div[1]", "p3p4p5");
+    }
+
+}
\ No newline at end of file
Added:
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java?rev=813386&view=auto
==============================================================================
---
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java
(added)
+++
incubator/droids/trunk/droids-solr/src/test/java/org/apache/droids/solr/MockContentEntity.java
Thu Sep 10 11:53:44 2009
@@ -0,0 +1,68 @@
+package org.apache.droids.solr;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+
+import org.apache.droids.api.ContentEntity;
+import org.apache.droids.api.Link;
+import org.apache.droids.api.Parse;
+import org.apache.droids.parse.ParseImpl;
+
+public class MockContentEntity implements ContentEntity {
+
+ private String text;
+
+ private String charset = "UTF-8";
+
+ private String mimeType = "text/html";
+
+ private Collection<Link> outlinks = new ArrayList<Link>();
+
+ public Collection<Link> getOutlinks() {
+ return outlinks;
+ }
+
+ public void setOutlinks(Collection<Link> outlinks) {
+ this.outlinks = outlinks;
+ }
+
+ public String getText() {
+ return text;
+ }
+
+ public void setText(String text) {
+ this.text = text;
+ }
+
+ @Override
+ public String getCharset() {
+ return charset;
+ }
+
+ public void setCharset(String charset) {
+ this.charset = charset;
+ }
+
+ @Override
+ public String getMimeType() {
+ return mimeType;
+ }
+
+ public void setMimeType(String mimeType) {
+ this.mimeType = mimeType;
+ }
+
+ @Override
+ public Parse getParse() {
+ return new ParseImpl(text, outlinks);
+ }
+
+ @Override
+ public InputStream obtainContent() throws IOException {
+ return new ByteArrayInputStream(text.getBytes());
+ }
+
+}