vgritsenko 02/01/24 20:34:19
Modified: src/java/org/apache/cocoon/components/search
IndexHelperField.java
LuceneIndexContentHandler.java
LuceneXMLIndexer.java
SimpleLuceneCocoonIndexerImpl.java
SimpleLuceneXMLIndexerImpl.java
Log:
Improve lucene searching:
- When indexing, create one Lucene document per resource rather than one
document per element
- Allow attribute values to be added to the body text if the element is marked
with the lucene:text-attr attribute
Result:
- Important attributes (chosen by the document author) can now be indexed as well
- AND searches work as expected now. Example: person@name:Donald AND History
Revision Changes Path
1.2 +3 -3
xml-cocoon2/src/java/org/apache/cocoon/components/search/IndexHelperField.java
Index: IndexHelperField.java
===================================================================
RCS file:
/home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/IndexHelperField.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- IndexHelperField.java 3 Jan 2002 12:31:13 -0000 1.1
+++ IndexHelperField.java 25 Jan 2002 04:34:18 -0000 1.2
@@ -24,7 +24,7 @@
* A helper class for generating a lucene document in a SAX ContentHandler.
*
* @author <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a>
- * @version CVS $Id: IndexHelperField.java,v 1.1 2002/01/03 12:31:13 giacomo Exp
$
+ * @version CVS $Id: IndexHelperField.java,v 1.2 2002/01/25 04:34:18 vgritsenko
Exp $
*/
class IndexHelperField
{
@@ -89,8 +89,8 @@
* @return The text value
* @since
*/
- public String getText() {
- return text.toString();
+ public StringBuffer getText() {
+ return text;
}
1.3 +32 -27
xml-cocoon2/src/java/org/apache/cocoon/components/search/LuceneIndexContentHandler.java
Index: LuceneIndexContentHandler.java
===================================================================
RCS file:
/home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/LuceneIndexContentHandler.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- LuceneIndexContentHandler.java 23 Jan 2002 19:06:38 -0000 1.2
+++ LuceneIndexContentHandler.java 25 Jan 2002 04:34:18 -0000 1.3
@@ -27,17 +27,22 @@
* Parse XML and generate lucene document(s)
*
* @author <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a>
- * @version CVS $Id: LuceneIndexContentHandler.java,v 1.2 2002/01/23 19:06:38
vgritsenko Exp $
+ * @version CVS $Id: LuceneIndexContentHandler.java,v 1.3 2002/01/25 04:34:18
vgritsenko Exp $
*/
public class LuceneIndexContentHandler implements ContentHandler
{
+ public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
+
+ /** If this attribute is specified on an element, the values of all attributes
+ * of this element are added to the text of the element and to the document
+ * body text. */
+ public static final String LUCENE_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";
+
StringBuffer bodyText;
private List documents;
private Document bodyDocument;
-
private Stack elementStack;
-
/**
*Constructor for the LuceneIndexContentHandler object
*
@@ -48,10 +53,9 @@
this.bodyDocument = new Document();
this.documents = new ArrayList();
this.documents.add(this.bodyDocument);
- elementStack = new Stack();
+ this.elementStack = new Stack();
}
-
/**
*Sets the documentLocator attribute of the LuceneIndexContentHandler object
*
@@ -92,10 +96,13 @@
* @since
*/
public void characters(char[] ch, int start, int length) {
- IndexHelperField tos = (IndexHelperField) elementStack.peek();
- if (ch.length > 0 && start >= 0 && length > 1 && tos != null) {
+
+ if (ch.length > 0 && start >= 0 && length > 1) {
String text = new String(ch, start, length);
- tos.appendText(text);
+ if (elementStack.size() > 0) {
+ IndexHelperField tos = (IndexHelperField) elementStack.peek();
+ tos.appendText(text);
+ }
bodyText.append(text);
}
}
@@ -124,30 +131,28 @@
*/
public void endElement(String namespaceURI, String localName, String qName) {
IndexHelperField tos = (IndexHelperField) elementStack.pop();
- String text = tos.getText();
String lname = tos.getLocalFieldName();
- String qname = tos.getQualifiedFieldName();
-
- Document document = new Document();
- boolean add_document = false;
- if (text != null && text.length() > 0) {
- System.out.println("field qname " + qname);
- document.add(Field.UnStored(qName, text));
- add_document = true;
- }
+ StringBuffer text = tos.getText();
+ // (VG): Atts are never null, see startElement
Attributes atts = tos.getAttributes();
- if (atts != null && atts.getLength() > 0) {
- for (int i = 0; i < atts.getLength(); i++) {
- String atts_qname = atts.getQName(i);
- String atts_value = atts.getValue(i);
- System.out.println("attribute field " + qname + "@" + atts_qname +
": " + atts_value);
- document.add(Field.UnStored(qname + "@" + atts_qname, atts_value));
- add_document = true;
+ boolean attributesToText = atts.getIndex(LUCENE_URI,
LUCENE_ATTR_TO_TEXT_ATTRIBUTE) != -1;
+ for (int i = 0; i < atts.getLength(); i++) {
+ if (LUCENE_URI.equals(atts.getURI(i))) continue;
+
+ String atts_lname = atts.getLocalName(i);
+ String atts_value = atts.getValue(i);
+ bodyDocument.add(Field.UnStored(lname + "@" + atts_lname, atts_value));
+ if (attributesToText) {
+ text.append(atts_value);
+ text.append(' ');
+ bodyText.append(atts_value);
+ bodyText.append(' ');
}
}
- if (add_document) {
- documents.add(document);
+
+ if (text != null && text.length() > 0) {
+ bodyDocument.add(Field.UnStored(lname, text.toString()));
}
}
1.3 +3 -27
xml-cocoon2/src/java/org/apache/cocoon/components/search/LuceneXMLIndexer.java
Index: LuceneXMLIndexer.java
===================================================================
RCS file:
/home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/LuceneXMLIndexer.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- LuceneXMLIndexer.java 23 Jan 2002 19:06:38 -0000 1.2
+++ LuceneXMLIndexer.java 25 Jan 2002 04:34:18 -0000 1.3
@@ -14,6 +14,7 @@
import org.apache.avalon.framework.component.Component;
import org.apache.cocoon.ProcessingException;
+import org.apache.lucene.document.Document;
/**
* The avalon behavioural component interface of generating
@@ -33,7 +34,7 @@
* </p>
*
* @author <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a>
- * @version CVS $Id: LuceneXMLIndexer.java,v 1.2 2002/01/23 19:06:38 vgritsenko
Exp $
+ * @version CVS $Id: LuceneXMLIndexer.java,v 1.3 2002/01/25 04:34:18 vgritsenko
Exp $
*/
public interface LuceneXMLIndexer extends Component
{
@@ -92,31 +93,6 @@
*/
String UID_FIELD = "uid";
-
- /**
- * Return a list of all lucene documents generated by
- * the method build().
- *
- * @return List list of lucene documents
- * @since
- * @see java.util.List
- * @see #build( URL url )
- */
- List allDocuments();
-
-
- /**
- * return an iterator of all lucene documents generated by
- * the method build().
- *
- * @return Iterator iterator of lucene Documents
- * @since
- * @see java.util.Iterator
- * @see #build( URL url )
- */
- Iterator iterator();
-
-
/**
* Build lucene documents from a URL.
* <p>
@@ -129,5 +105,5 @@
* @exception ProcessingException Description of Exception
* @since
*/
- void build(URL url) throws ProcessingException;
+ List build(URL url) throws ProcessingException;
}
1.3 +3 -5
xml-cocoon2/src/java/org/apache/cocoon/components/search/SimpleLuceneCocoonIndexerImpl.java
Index: SimpleLuceneCocoonIndexerImpl.java
===================================================================
RCS file:
/home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/SimpleLuceneCocoonIndexerImpl.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- SimpleLuceneCocoonIndexerImpl.java 23 Jan 2002 19:06:38 -0000 1.2
+++ SimpleLuceneCocoonIndexerImpl.java 25 Jan 2002 04:34:18 -0000 1.3
@@ -46,7 +46,7 @@
* </p>
*
* @author <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a>
- * @version CVS $Revision: 1.2 $ $Date: 2002/01/23 19:06:38 $
+ * @version CVS $Revision: 1.3 $ $Date: 2002/01/25 04:34:18 $
*/
public class SimpleLuceneCocoonIndexerImpl extends AbstractLoggable
implements LuceneCocoonIndexer, Configurable, Composable, Disposable
@@ -206,13 +206,11 @@
}
// build lucene documents from the content of the crawl_url
- lxi.build(crawl_url);
- Iterator i = lxi.iterator();
+ Iterator i = lxi.build(crawl_url).iterator();
// add all built lucene documents
while (i.hasNext()) {
- Document document = (Document) i.next();
- writer.addDocument(document);
+ writer.addDocument((Document) i.next());
}
}
// optimize it
1.3 +7 -41
xml-cocoon2/src/java/org/apache/cocoon/components/search/SimpleLuceneXMLIndexerImpl.java
Index: SimpleLuceneXMLIndexerImpl.java
===================================================================
RCS file:
/home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/SimpleLuceneXMLIndexerImpl.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- SimpleLuceneXMLIndexerImpl.java 23 Jan 2002 19:06:38 -0000 1.2
+++ SimpleLuceneXMLIndexerImpl.java 25 Jan 2002 04:34:18 -0000 1.3
@@ -54,10 +54,10 @@
* A simple class building lucene documents from xml content.
*
* @author <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a>
- * @version CVS $Revision: 1.2 $ $Date: 2002/01/23 19:06:38 $
+ * @version CVS $Revision: 1.3 $ $Date: 2002/01/25 04:34:18 $
*/
public class SimpleLuceneXMLIndexerImpl extends AbstractLoggable
- implements LuceneXMLIndexer, Configurable, Composable
+ implements LuceneXMLIndexer, Configurable, Composable, ThreadSafe
{
/**
@@ -68,13 +68,6 @@
protected ComponentManager manager = null;
/**
- * list of lucene Document objects
- *
- * @since
- */
- List documents;
-
- /**
* append this string to the url in order to get the
* content view of the url
*
@@ -94,8 +87,6 @@
* @since
*/
public SimpleLuceneXMLIndexerImpl() {
- documents = null;
-
allowedContentType = new HashSet();
allowedContentType.add("text/xml");
allowedContentType.add("text/xhtml");
@@ -126,40 +117,13 @@
/**
- * return a list of all lucene documents generated by
- *
- * @return List list of lucene Documents
- * @since
- * @see build
- */
- public List allDocuments() {
- return documents;
- }
-
-
- /**
- * return an iterator of all lucene documents generated by
- *
- * @return Iterator iterator of lucene Documents
- * @since
- * @see build
- */
- public Iterator iterator() {
- if (documents == null) {
- return new ArrayList().iterator();
- }
- return documents.iterator();
- }
-
-
- /**
* Build lucenen documents from a URL
*
* @param url the content of this url gets indexed.
* @exception ProcessingException Description of Exception
* @since
*/
- public void build(URL url)
+ public List build(URL url)
throws ProcessingException {
try {
@@ -187,15 +151,17 @@
// store ... false, index ... true, token ... false
d.add(new Field(UID_FIELD, uid(contentURLConnection), false,
true, false));
}
- documents = luceneIndexContentHandler.allDocuments();
+
+ return luceneIndexContentHandler.allDocuments();
} else {
if (getLogger().isDebugEnabled()) {
getLogger().debug("Ignoring " + contentURL + " (" + contentType
+ ")");
}
+
+ return java.util.Collections.EMPTY_LIST;
}
} catch (IOException ioe) {
throw new ProcessingException("Cannot read URL " + url, ioe);
- } finally {
}
}
----------------------------------------------------------------------
In case of trouble, e-mail: [EMAIL PROTECTED]
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]