ehatcher 2002/07/10 18:12:31
Added: contributions/ant/lib Tidy.jar
contributions/ant/src/main/org/apache/lucene/ant
DocumentHandler.java DocumentHandlerException.java
FileExtensionDocumentHandler.java HtmlDocument.java
IndexTask.java TextDocument.java
contributions/ant/src/test/org/apache/lucene/ant
DocumentTestCase.java HtmlDocumentTest.java
IndexTaskTest.java test.html test.txt
TextDocumentTest.java
Removed: projects/ant build.xml
projects/ant/lib Tidy.jar
projects/ant/src/main/org/apache/lucene/ant
DocumentHandler.java DocumentHandlerException.java
FileExtensionDocumentHandler.java HtmlDocument.java
IndexTask.java TextDocument.java
projects/ant/src/test/org/apache/lucene/ant
DocumentTestCase.java HtmlDocumentTest.java
IndexTaskTest.java test.html test.txt
TextDocumentTest.java
Log:
Fixed the build file to work with Ant versions earlier than 1.5. Relocated to the contributions area.
Revision Changes Path
1.1 jakarta-lucene-sandbox/contributions/ant/lib/Tidy.jar
<<Binary file>>
1.1
jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/DocumentHandler.java
Index: DocumentHandler.java
===================================================================
package org.apache.lucene.ant;
import java.io.File;
import org.apache.lucene.document.Document;
/**
 * Contract for objects that turn a file into a Lucene
 * {@link Document}.
 *
 * @author Erik Hatcher
 * @created October 27, 2001
 */
public interface DocumentHandler {

    /**
     * Produces the Lucene document for the given file.
     *
     * @param file the file to build a document from
     * @return the document built for <code>file</code>
     * @throws DocumentHandlerException if the document cannot be built
     */
    Document getDocument(File file) throws DocumentHandlerException;
}
1.1
jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/DocumentHandlerException.java
Index: DocumentHandlerException.java
===================================================================
package org.apache.lucene.ant;
import java.io.PrintStream;
import java.io.PrintWriter;
/**
 * Checked exception signalling that a file could not be turned into a
 * Lucene document. It may wrap the underlying failure, which stays
 * reachable through {@link #getException()} and is appended to every
 * stack trace printed by this class.
 */
public class DocumentHandlerException extends Exception
{
    /** The wrapped failure, or <code>null</code> when there is none. */
    private Throwable cause;

    /**
     * Creates an exception with no message and no nested exception.
     */
    public DocumentHandlerException() {
        super();
    }

    /**
     * Creates an exception with a message and no nested exception.
     *
     * @param message description of the failure
     */
    public DocumentHandlerException(String message) {
        super(message);
    }

    /**
     * Creates an exception wrapping another throwable. The message is
     * the wrapped throwable's <code>toString()</code>.
     *
     * @param cause the failure to wrap; <code>null</code> is accepted
     *              and yields an exception with no message and no
     *              nested exception
     */
    public DocumentHandlerException(Throwable cause) {
        // Guard against null so wrapping never fails with a
        // NullPointerException of its own.
        super(cause == null ? null : cause.toString());
        this.cause = cause;
    }

    /**
     * Returns the wrapped failure.
     *
     * @return the nested exception, or <code>null</code> if none was
     *         supplied
     */
    public Throwable getException() {
        return cause;
    }

    // Override stack trace methods to show original cause:

    /**
     * Prints this exception's stack trace, followed by the nested
     * exception's, to <code>System.err</code>.
     */
    public void printStackTrace() {
        printStackTrace(System.err);
    }

    /**
     * Prints this exception's stack trace, followed by the nested
     * exception's (if any), to the given stream.
     *
     * @param ps the stream to print to
     */
    public void printStackTrace(PrintStream ps) {
        // Hold the stream's lock so both traces are written together.
        synchronized (ps) {
            super.printStackTrace(ps);
            if (cause != null) {
                ps.println("--- Nested Exception ---");
                cause.printStackTrace(ps);
            }
        }
    }

    /**
     * Prints this exception's stack trace, followed by the nested
     * exception's (if any), to the given writer.
     *
     * @param pw the writer to print to
     */
    public void printStackTrace(PrintWriter pw) {
        // Hold the writer's lock so both traces are written together.
        synchronized (pw) {
            super.printStackTrace(pw);
            if (cause != null) {
                pw.println("--- Nested Exception ---");
                cause.printStackTrace(pw);
            }
        }
    }
}
1.1
jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/FileExtensionDocumentHandler.java
Index: FileExtensionDocumentHandler.java
===================================================================
package org.apache.lucene.ant;
import java.io.File;
import org.apache.lucene.document.Document;
/**
 * Picks the class used to build the Lucene Document for a file by
 * looking at the file's extension.
 *
 * @author Erik Hatcher
 * @created October 28, 2001
 * @todo Add dynamic file extension/classname mappings for
 *       extensibility
 */
public class FileExtensionDocumentHandler
        implements DocumentHandler {

    /**
     * Builds the document for a file whose name ends in
     * <code>.txt</code> or <code>.html</code>.
     *
     * @param file the file to build a document from
     * @return the document, or <code>null</code> when the file name
     *         has neither extension
     * @exception DocumentHandlerException wraps any
     *            <code>IOException</code> raised while building the
     *            document
     */
    public Document getDocument(File file)
            throws DocumentHandlerException {
        String fileName = file.getName();
        try {
            // The two suffixes are mutually exclusive, so the order
            // of these checks does not matter.
            if (fileName.endsWith(".html")) {
                return HtmlDocument.Document(file);
            }
            if (fileName.endsWith(".txt")) {
                return TextDocument.Document(file);
            }
            return null;
        }
        catch (java.io.IOException ioe) {
            throw new DocumentHandlerException(ioe);
        }
    }
}
1.1
jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/HtmlDocument.java
Index: HtmlDocument.java
===================================================================
package org.apache.lucene.ant;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
// Imports commented out since there is a name clash and fully
// qualified class names will be used in the code. Imports are
// left for ease of maintenance.
import org.apache.lucene.document.Field;
//import org.apache.lucene.document.Document;
//import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.tidy.Tidy;
/**
 * The <code>HtmlDocument</code> class creates a Lucene {@link
 * org.apache.lucene.document.Document} from an HTML document. <P>
 *
 * It does this by using the JTidy package. It can take input
 * from a {@link java.io.File} or an {@link java.io.InputStream}.
 *
 * @author Erik Hatcher
 * @created October 27, 2001
 */
public class HtmlDocument {
    /** Root element of the parsed HTML. */
    private Element rawDoc;

    //-------------------------------------------------------------
    // Constructors
    //-------------------------------------------------------------

    /**
     * Constructs an <code>HtmlDocument</code> from a {@link
     * java.io.File}. The file is opened, parsed and closed again
     * before the constructor returns.
     *
     * @param file the <code>File</code> containing the HTML to parse
     * @exception IOException if an I/O exception occurs
     */
    public HtmlDocument(File file) throws IOException {
        InputStream in = new FileInputStream(file);
        try {
            rawDoc = parse(in);
        }
        finally {
            // This class opened the stream, so it must release it.
            in.close();
        }
    }

    /**
     * Constructs an <code>HtmlDocument</code> from an {@link
     * java.io.InputStream}. The stream is not closed by this
     * constructor.
     *
     * @param is the <code>InputStream</code> containing the HTML
     * @exception IOException if an I/O exception occurs
     */
    public HtmlDocument(InputStream is) throws IOException {
        rawDoc = parse(is);
    }

    //-------------------------------------------------------------
    // Public methods
    //-------------------------------------------------------------

    /**
     * Creates a Lucene <code>Document</code> from an {@link
     * java.io.InputStream}. The document carries the fields
     * <code>title</code> and <code>contents</code>.
     *
     * @param is the stream containing the HTML
     * @return the Lucene document built from the stream
     * @exception IOException if an I/O exception occurs
     */
    public static org.apache.lucene.document.Document
            getDocument(InputStream is) throws IOException {
        return toLuceneDocument(new HtmlDocument(is));
    }

    /**
     * Creates a Lucene <code>Document</code> from a {@link
     * java.io.File}. The document carries the fields
     * <code>title</code> and <code>contents</code>, plus an
     * unindexed <code>rawcontents</code> field holding the text of
     * the file itself.
     *
     * @param file the file containing the HTML
     * @return the Lucene document built from the file
     * @exception IOException if an I/O exception occurs
     */
    public static org.apache.lucene.document.Document
            Document(File file) throws IOException {
        org.apache.lucene.document.Document luceneDoc =
            toLuceneDocument(new HtmlDocument(file));
        luceneDoc.add(Field.UnIndexed("rawcontents",
                                      readRawContents(file)));
        return luceneDoc;
    }

    /**
     * Gets the title of the HTML document.
     *
     * @return the text of the first <code>title</code> element, an
     *         empty string if there is none or it does not start
     *         with text, or <code>null</code> if nothing was parsed
     */
    public String getTitle() {
        if (rawDoc == null) {
            return null;
        }

        String title = "";
        NodeList nl = rawDoc.getElementsByTagName("title");
        if (nl.getLength() > 0) {
            Element titleElement = ((Element) nl.item(0));
            Node first = titleElement.getFirstChild();
            // instanceof also covers the "no child" (null) case and
            // avoids a ClassCastException on non-text children.
            if (first instanceof Text) {
                title = ((Text) first).getData();
            }
        }
        return title;
    }

    /**
     * Gets the text of the HTML document's body.
     *
     * @return the text found below the first <code>body</code>
     *         element, an empty string if there is none, or
     *         <code>null</code> if nothing was parsed
     */
    public String getBody() {
        if (rawDoc == null) {
            return null;
        }

        String body = "";
        NodeList nl = rawDoc.getElementsByTagName("body");
        if (nl.getLength() > 0) {
            body = getBodyText(nl.item(0));
        }
        return body;
    }

    //-------------------------------------------------------------
    // Private methods
    //-------------------------------------------------------------

    /**
     * Runs <code>HtmlDocument</code> on the file named by the first
     * command line argument and prints its title and body.
     *
     * NOTE(review): this method is private, so the JVM launcher
     * cannot use it as an entry point - confirm whether that is
     * intentional.
     *
     * @param args command line arguments
     * @exception Exception if the file cannot be read or parsed
     */
    private static void main(String args[]) throws Exception {
        HtmlDocument doc = new HtmlDocument(new File(args[0]));
        System.out.println("Title = " + doc.getTitle());
        System.out.println("Body = " + doc.getBody());
    }

    /**
     * Parses HTML with JTidy, with its diagnostics switched off.
     *
     * @param is the stream containing the HTML
     * @return the document element of the parsed DOM
     * @exception IOException if an I/O exception occurs
     */
    private static Element parse(InputStream is) throws IOException {
        Tidy tidy = new Tidy();
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);
        org.w3c.dom.Document root = tidy.parseDOM(is, null);
        return root.getDocumentElement();
    }

    /**
     * Builds the Lucene document holding the title and body text of
     * an already parsed HTML document.
     *
     * @param htmlDoc the parsed HTML document
     * @return a new Lucene document with the <code>title</code> and
     *         <code>contents</code> fields
     */
    private static org.apache.lucene.document.Document
            toLuceneDocument(HtmlDocument htmlDoc) {
        org.apache.lucene.document.Document luceneDoc =
            new org.apache.lucene.document.Document();
        luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
        luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
        return luceneDoc;
    }

    /**
     * Reads the text of a file. Lines are joined with a single
     * newline so that text on adjacent lines does not run together;
     * no newline is added after the last line.
     *
     * @param file the file to read
     * @return the text of the file
     * @exception IOException if the file cannot be read
     */
    private static String readRawContents(File file)
            throws IOException {
        BufferedReader br =
            new BufferedReader(new FileReader(file));
        try {
            StringBuffer raw = new StringBuffer();
            String line = br.readLine();
            while (line != null) {
                raw.append(line);
                line = br.readLine();
                if (line != null) {
                    raw.append('\n');
                }
            }
            return raw.toString();
        }
        finally {
            // Release the file even when reading fails part-way.
            br.close();
        }
    }

    /**
     * Collects the text below a DOM node, recursing into child
     * elements. A single space is appended after the text of each
     * child element.
     *
     * @param node a DOM Node
     * @return the concatenated text found below <code>node</code>
     */
    private String getBodyText(Node node) {
        NodeList nl = node.getChildNodes();
        StringBuffer buffer = new StringBuffer();
        for (int i = 0; i < nl.getLength(); i++) {
            Node child = nl.item(i);
            switch (child.getNodeType()) {
                case Node.ELEMENT_NODE:
                    buffer.append(getBodyText(child));
                    buffer.append(" ");
                    break;
                case Node.TEXT_NODE:
                    buffer.append(((Text) child).getData());
                    break;
            }
        }
        return buffer.toString();
    }
}
1.1
jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/IndexTask.java
Index: IndexTask.java
===================================================================
package org.apache.lucene.ant;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Vector;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.tools.ant.BuildException;
import org.apache.tools.ant.DirectoryScanner;
import org.apache.tools.ant.Project;
import org.apache.tools.ant.Task;
import org.apache.tools.ant.types.FileSet;
/**
* Builds a Lucene index from a fileset.
*
* @author Erik Hatcher
*/
public class IndexTask extends Task {
/**
* file list
*/
private Vector filesets = new Vector();
/**
* overwrite index?
*/
private boolean overwrite = false;
/**
* index path
*/
private File indexPath;
/**
* document handler classname
*/
private String handlerClassName =
"org.apache.lucene.ant.FileExtensionDocumentHandler";
/**
* document handler instance
*/
private DocumentHandler handler;
/**
* Lucene merge factor
*/
private int mergeFactor = 20;
/**
* Specifies the directory where the index will be stored
*
* @param indexPath The new index value
*/
public void setIndex(File indexPath) {
this.indexPath = indexPath;
}
/**
* Sets the mergeFactor attribute of the IndexTask object
*
*@param mergeFactor The new mergeFactor value
*/
public void setMergeFactor(int mergeFactor) {
this.mergeFactor = mergeFactor;
}
/**
* If true, index will be overwritten.
*
* @param overwrite The new overwrite value
*/
public void setOverwrite(boolean overwrite) {
this.overwrite = overwrite;
}
/**
* Classname of document handler.
*
* @param classname The new documentHandler value
*/
public void setDocumentHandler(String classname) {
handlerClassName = classname;
}
/**
* Adds a set of files.
*
* @param set FileSet to be added
*/
public void addFileset(FileSet set) {
filesets.addElement(set);
}
/**
* Begins the indexing
*
* @exception BuildException If an error occurs indexing the
* fileset
* @todo add classpath handling so handler does not
* have to be in system classpath
*/
public void execute() throws BuildException {
try {
Class clazz = Class.forName(handlerClassName);
handler = (DocumentHandler) clazz.newInstance();
}
catch (ClassNotFoundException cnfe) {
throw new BuildException(cnfe);
}
catch (InstantiationException ie) {
throw new BuildException(ie);
}
catch (IllegalAccessException iae) {
throw new BuildException(iae);
}
try {
indexDocs();
}
catch (IOException e) {
throw new BuildException(e);
}
}
/**
* index the fileset
*
* @exception IOException Description of Exception
* @todo refactor - definitely lots of room for improvement here
*/
private void indexDocs() throws IOException {
Date start = new Date();
boolean create = overwrite;
// If the index directory doesn't exist,
// create it and force create mode
if (indexPath.mkdirs() && !overwrite) {
create = true;
}
Searcher searcher = null;
Analyzer analyzer = new StopAnalyzer();
boolean checkLastModified = false;
if (!create) {
try {
searcher = new IndexSearcher(indexPath.getAbsolutePath());
checkLastModified = true;
}
catch (IOException ioe) {
log("IOException: " + ioe.getMessage());
// Empty - ignore, which indicates to index all
// documents
}
}
log("checkLastModified = " + checkLastModified);
IndexWriter writer =
new IndexWriter(indexPath, analyzer, create);
int totalFiles = 0;
int totalIndexed = 0;
int totalIgnored = 0;
try {
writer.mergeFactor = mergeFactor;
for (int i = 0; i < filesets.size(); i++) {
FileSet fs = (FileSet) filesets.elementAt(i);
if (fs != null) {
DirectoryScanner ds =
fs.getDirectoryScanner(project);
String[] dsfiles = ds.getIncludedFiles();
File baseDir = ds.getBasedir();
for (int j = 0; j < dsfiles.length; j++) {
File file = new File(baseDir, dsfiles[j]);
totalFiles++;
if (!file.exists() || !file.canRead()) {
throw new BuildException("File \"" +
file.getAbsolutePath()
+ "\" does not exist or is not readable.");
}
boolean indexIt = true;
if (checkLastModified) {
Hits hits = null;
Term pathTerm =
new Term("path", file.getPath());
TermQuery query =
new TermQuery(pathTerm);
hits = searcher.search(query);
// if document is found, compare the
// indexed last modified time with the
// current file
// - don't index if up to date
if (hits.length() > 0) {
Document doc = hits.doc(0);
String indexModified =
doc.get("modified");
if (indexModified != null) {
if (DateField.stringToTime(indexModified)
== file.lastModified()) {
indexIt = false;
}
}
}
}
if (indexIt) {
try {
log("Indexing " + file.getPath(),
Project.MSG_VERBOSE);
Document doc =
handler.getDocument(file);
if (doc == null) {
totalIgnored++;
}
else {
// Add the path of the file as a field named
"path". Use a Text field, so
// that the index stores the path, and so that
the path is searchable
doc.add(Field.Keyword("path", file.getPath()));
// Add the last modified date of the file a
field named "modified". Use a
// Keyword field, so that it's searchable, but
so that no attempt is made
// to tokenize the field into words.
doc.add(Field.Keyword("modified",
DateField.timeToString(file.lastModified())));
writer.addDocument(doc);
totalIndexed++;
}
}
catch (DocumentHandlerException e) {
throw new BuildException(e);
}
}
}
// for j
}
// if (fs != null)
}
// for i
writer.optimize();
}
//try
finally {
// always make sure everything gets closed,
// no matter how we exit.
writer.close();
if (searcher != null) {
searcher.close();
}
}
Date end = new Date();
log(totalIndexed + " out of " + totalFiles + " indexed (" +
totalIgnored + " ignored) in " + (end.getTime() - start.getTime()) +
" milliseconds");
}
}
1.1
jakarta-lucene-sandbox/contributions/ant/src/main/org/apache/lucene/ant/TextDocument.java
Index: TextDocument.java
===================================================================
package org.apache.lucene.ant;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/**
 * A utility for making Lucene Documents from a text File.
 *
 * @author Erik Hatcher
 * @created December 6, 2001
 * @todo Fix JavaDoc comments here
 */
public class TextDocument {
    /** Text read from the file. */
    private String contents;

    /**
     * Reads the text of a file. Lines are joined with a single
     * newline so that the last word of one line does not run into
     * the first word of the next; no newline is added after the
     * last line.
     *
     * @param file the file to read
     * @exception IOException if the file cannot be read
     */
    public TextDocument(File file) throws IOException {
        BufferedReader br =
            new BufferedReader(new FileReader(file));
        try {
            StringBuffer text = new StringBuffer();
            String line = br.readLine();
            while (line != null) {
                text.append(line);
                line = br.readLine();
                if (line != null) {
                    text.append('\n');
                }
            }
            contents = text.toString();
        }
        finally {
            // Release the file even when reading fails part-way.
            br.close();
        }
    }

    /**
     * Makes a document for a File. <p>
     *
     * The document has two fields:
     * <ul>
     *   <li> <code>contents</code>--containing the full contents
     *   of the file, as a Text field;</li>
     *   <li> <code>rawcontents</code>--containing the same text,
     *   as an UnIndexed field.</li>
     * </ul>
     *
     * @param f the file to make a document for
     * @return the document built from the file
     * @exception IOException if the file cannot be read
     */
    public static Document Document(File f) throws IOException {
        TextDocument textDoc = new TextDocument(f);
        // make a new, empty document
        Document doc = new Document();

        doc.add(Field.Text("contents", textDoc.getContents()));
        doc.add(Field.UnIndexed("rawcontents",
                                textDoc.getContents()));

        // return the document
        return doc;
    }

    /**
     * Returns the text read from the file.
     *
     * @return The contents value
     * @todo finish this method
     */
    public String getContents() {
        return contents;
    }
}
1.1
jakarta-lucene-sandbox/contributions/ant/src/test/org/apache/lucene/ant/DocumentTestCase.java
Index: DocumentTestCase.java
===================================================================
package org.apache.lucene.ant;
import java.io.File;
import java.io.IOException;
import junit.framework.TestCase;
/**
 * Base class for document tests; locates test files that sit next
 * to the test class as class loader resources.
 */
public abstract class DocumentTestCase extends TestCase
{
    /**
     * Creates the test case.
     *
     * @param name name of the test
     */
    public DocumentTestCase(String name) {
        super(name);
    }

    /**
     * Resolves a resource of the concrete test class to a file.
     *
     * @param filename resource name, resolved against this class
     * @return the file the resource points at
     * @throws IOException if the resource cannot be found
     */
    protected File getFile(String filename) throws IOException {
        java.net.URL resource =
            this.getClass().getResource(filename);
        // getResource returns null for a missing resource; report
        // that by name instead of failing with a NullPointerException.
        if (resource == null) {
            throw new java.io.FileNotFoundException(
                "Test resource \"" + filename
                + "\" not found for " + this.getClass().getName());
        }
        return new File(resource.getFile());
    }
}
1.1
jakarta-lucene-sandbox/contributions/ant/src/test/org/apache/lucene/ant/HtmlDocumentTest.java
Index: HtmlDocumentTest.java
===================================================================
package org.apache.lucene.ant;
import java.io.IOException;
import org.apache.lucene.ant.DocumentTestCase;
import org.apache.lucene.ant.HtmlDocument;
/**
 * Checks that {@link HtmlDocument} extracts the title and body text
 * of the <code>test.html</code> fixture.
 */
public class HtmlDocumentTest extends DocumentTestCase
{
    /** Document under test; parsed anew for every test. */
    HtmlDocument doc;

    public HtmlDocumentTest (String name) {
        super(name);
    }

    /** Parses the fixture file. */
    public void setUp() throws IOException {
        java.io.File fixture = getFile("test.html");
        doc = new HtmlDocument(fixture);
    }

    /** Verifies the extracted title and the start of the body. */
    public void testDoc() {
        String title = doc.getTitle();
        assertEquals("Title", "Test Title", title);

        String body = doc.getBody();
        assertTrue("Body", body.startsWith("This is some test"));
    }

    /** Drops the parsed document. */
    public void tearDown() {
        doc = null;
    }
}
1.1
jakarta-lucene-sandbox/contributions/ant/src/test/org/apache/lucene/ant/IndexTaskTest.java
Index: IndexTaskTest.java
===================================================================
package org.apache.lucene.ant;
import java.io.File;
import java.io.IOException;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.ant.IndexTask;
import org.apache.tools.ant.Project;
import org.apache.tools.ant.types.FileSet;
/**
 * Test cases for the index task. The documents to index and the
 * index location come from the <code>docs.dir</code> and
 * <code>index.dir</code> system properties.
 *
 * @author Erik Hatcher
 */
public class IndexTaskTest extends TestCase {
    private final static String docHandler =
        "org.apache.lucene.ant.FileExtensionDocumentHandler";

    private String docsDir = System.getProperty("docs.dir");
    private String indexDir = System.getProperty("index.dir");

    private Searcher searcher;
    private Analyzer analyzer;

    /**
     * Creates the test case.
     *
     * @param name name of the test
     */
    public IndexTaskTest(String name) {
        super(name);
    }

    /**
     * Runs the index task over the documents directory, overwriting
     * any existing index, then opens a searcher on the result.
     *
     * @exception IOException if the index cannot be opened
     */
    public void setUp() throws IOException {
        FileSet docs = new FileSet();
        docs.setDir(new File(docsDir));

        IndexTask indexTask = new IndexTask();
        indexTask.setProject(new Project());
        indexTask.setIndex(new File(indexDir));
        indexTask.setDocumentHandler(docHandler);
        indexTask.setOverwrite(true);
        indexTask.addFileset(docs);
        indexTask.execute();

        searcher = new IndexSearcher(indexDir);
        analyzer = new StopAnalyzer();
    }

    /**
     * Searches the <code>contents</code> field for "test" and
     * expects two matching documents.
     */
    public void testSearch() throws IOException, ParseException {
        System.out.println("sysout");
        System.err.println("syserr");

        Query testQuery =
            QueryParser.parse("test", "contents", analyzer);
        Hits found = searcher.search(testQuery);

        assertEquals("Find document(s)", 2, found.length());
    }

    /**
     * Closes the searcher opened in <code>setUp</code>.
     *
     * @todo remove indexDir?
     */
    public void tearDown() throws IOException {
        searcher.close();
    }
}
1.1
jakarta-lucene-sandbox/contributions/ant/src/test/org/apache/lucene/ant/test.html
Index: test.html
===================================================================
<html>
<head>
<title>Test Title</title>
</head>
<body>
<i>This is <b>some</b>test</i>
</body>
1.1
jakarta-lucene-sandbox/contributions/ant/src/test/org/apache/lucene/ant/test.txt
Index: test.txt
===================================================================
Test Contents
1.1
jakarta-lucene-sandbox/contributions/ant/src/test/org/apache/lucene/ant/TextDocumentTest.java
Index: TextDocumentTest.java
===================================================================
package org.apache.lucene.ant;
import java.io.IOException;
import org.apache.lucene.ant.DocumentTestCase;
import org.apache.lucene.ant.TextDocument;
/**
 * Checks that {@link TextDocument} reads the text of the
 * <code>test.txt</code> fixture.
 */
public class TextDocumentTest extends DocumentTestCase
{
    /** Document under test; read anew for every test. */
    TextDocument doc;

    public TextDocumentTest (String name) {
        super(name);
    }

    /** Reads the fixture file. */
    public void setUp() throws IOException {
        java.io.File fixture = getFile("test.txt");
        doc = new TextDocument(fixture);
    }

    /** Verifies the text that was read. */
    public void testDoc() {
        String actual = doc.getContents();
        assertEquals("Contents", "Test Contents", actual);
    }

    /** Drops the document. */
    public void tearDown() {
        doc = null;
    }
}
--
To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]>
For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>