Author: lehmi Date: Tue Oct 8 17:15:09 2013 New Revision: 1530357 URL: http://svn.apache.org/r1530357 Log: PDFBOX-1356: moved the lucene stuff to the examples subproject, removed the lucene subproject
Added: pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/ pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java (with props) pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java - copied, changed from r1502161, pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/LucenePDFDocument.java pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html - copied, changed from r1502161, pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/package.html Removed: pdfbox/trunk/lucene/pom.xml pdfbox/trunk/lucene/src/ Modified: pdfbox/trunk/examples/pom.xml pdfbox/trunk/pom.xml Modified: pdfbox/trunk/examples/pom.xml URL: http://svn.apache.org/viewvc/pdfbox/trunk/examples/pom.xml?rev=1530357&r1=1530356&r2=1530357&view=diff ============================================================================== --- pdfbox/trunk/examples/pom.xml (original) +++ pdfbox/trunk/examples/pom.xml Tue Oct 8 17:15:09 2013 @@ -37,6 +37,10 @@ </description> <inceptionYear>2002</inceptionYear> + <properties> + <lucene.version>4.3.1</lucene.version> + </properties> + <dependencies> <dependency> <groupId>org.apache.pdfbox</groupId> @@ -49,6 +53,21 @@ <version>1.48</version> <optional>true</optional> </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>pdfbox</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-core</artifactId> + <version>${lucene.version}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + <version>${lucene.version}</version> + </dependency> </dependencies> <build> Added: pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java?rev=1530357&view=auto ============================================================================== --- pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java (added) +++ pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java Tue Oct 8 17:15:09 2013 @@ -0,0 +1,234 @@ +package org.apache.pdfbox.examples.lucene; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Date; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; + +/** + * Index all pdf files under a directory. 
+ * <p> + * This is a command-line application demonstrating simple Lucene indexing. Run it with no command-line arguments for + * usage information. + * <p> + * It's based on a demo provided by the lucene project. + */ +public class IndexPDFFiles +{ + + private IndexPDFFiles() + { + } + + /** + * Index all text files under a directory. + * + * @param args command line arguments + * + */ + public static void main(String[] args) + { + String usage = "java org.apache.pdfbox.lucene.IndexPDFFiles" + + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" + + "This indexes all PDF documents in DOCS_PATH, creating a Lucene index" + + "in INDEX_PATH that can be searched with SearchFiles"; + String indexPath = "index"; + String docsPath = null; + boolean create = true; + for (int i = 0; i < args.length; i++) + { + if ("-index".equals(args[i])) + { + indexPath = args[i + 1]; + i++; + } + else if ("-docs".equals(args[i])) + { + docsPath = args[i + 1]; + i++; + } + else if ("-update".equals(args[i])) + { + create = false; + } + } + + if (docsPath == null) + { + System.err.println("Usage: " + usage); + System.exit(1); + } + + final File docDir = new File(docsPath); + if (!docDir.exists() || !docDir.canRead()) + { + System.out.println("Document directory '" + docDir.getAbsolutePath() + + "' does not exist or is not readable, please check the path"); + System.exit(1); + } + + Date start = new Date(); + try + { + System.out.println("Indexing to directory '" + indexPath + "'..."); + + Directory dir = FSDirectory.open(new File(indexPath)); + Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); + IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer); + + if (create) + { + // Create a new index in the directory, removing any + // previously indexed documents: + iwc.setOpenMode(OpenMode.CREATE); + } + else + { + // Add new documents to an existing index: + iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); + } + + // Optional: for better indexing performance, if 
you + // are indexing many documents, increase the RAM + // buffer. But if you do this, increase the max heap + // size to the JVM (eg add -Xmx512m or -Xmx1g): + // + // iwc.setRAMBufferSizeMB(256.0); + + IndexWriter writer = new IndexWriter(dir, iwc); + indexDocs(writer, docDir); + + // NOTE: if you want to maximize search performance, + // you can optionally call forceMerge here. This can be + // a terribly costly operation, so generally it's only + // worth it when your index is relatively static (ie + // you're done adding documents to it): + // + // writer.forceMerge(1); + + writer.close(); + + Date end = new Date(); + System.out.println(end.getTime() - start.getTime() + " total milliseconds"); + + } + catch (IOException e) + { + System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); + } + } + + /** + * Indexes the given file using the given writer, or if a directory is given, recurses over files and directories + * found under the given directory. + * + * NOTE: This method indexes one document per input file. This is slow. For good throughput, put multiple documents + * into your input file(s). An example of this is in the benchmark module, which can create "line doc" files, one + * document per line, using the <a + * href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" + * >WriteLineDocTask</a>. 
+ * + * @param writer Writer to the index where the given file/dir info will be stored + * @param file The file to index, or the directory to recurse into to find files to index + * @throws IOException If there is a low-level I/O error + */ + static void indexDocs(IndexWriter writer, File file) throws IOException + { + // do not try to index files that cannot be read + if (file.canRead()) + { + if (file.isDirectory()) + { + String[] files = file.list(); + // an IO error could occur + if (files != null) + { + for (int i = 0; i < files.length; i++) + { + indexDocs(writer, new File(file, files[i])); + } + } + } + else + { + + FileInputStream fis; + try + { + fis = new FileInputStream(file); + } + catch (FileNotFoundException fnfe) + { + // at least on windows, some temporary files raise this exception with an "access denied" message + // checking if the file can be read doesn't help + return; + } + + try + { + + String path = file.getName().toUpperCase(); + Document doc = null; + if (path.endsWith(".PDF")) + { + System.out.println("Indexing PDF document: " + file); + doc = LucenePDFDocument.getDocument(file); + } + else + { + System.out.println("Skipping " + file); + return; + } + + if (writer.getConfig().getOpenMode() == OpenMode.CREATE) + { + // New index, so we just add the document (no old document can be there): + System.out.println("adding " + file); + writer.addDocument(doc); + } + else + { + // Existing index (an old copy of this document may have been indexed) so + // we use updateDocument instead to replace the old one matching the exact + // path, if present: + System.out.println("updating " + file); + writer.updateDocument(new Term("uid", LucenePDFDocument.createUID(file)), doc); + } + } + finally + { + fis.close(); + } + } + } + } +} Propchange: pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java ------------------------------------------------------------------------------ svn:eol-style = native Copied: 
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java (from r1502161, pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/LucenePDFDocument.java) URL: http://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java?p2=pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java&p1=pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/LucenePDFDocument.java&r1=1502161&r2=1530357&rev=1530357&view=diff ============================================================================== --- pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/LucenePDFDocument.java (original) +++ pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java Tue Oct 8 17:15:09 2013 @@ -14,272 +14,268 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.pdfbox.lucene; +package org.apache.pdfbox.examples.lucene; import java.io.File; import java.io.FileInputStream; -import java.io.InputStream; import java.io.IOException; +import java.io.InputStream; import java.io.Reader; import java.io.StringReader; import java.io.StringWriter; -import java.util.Calendar; - import java.net.URL; import java.net.URLConnection; - +import java.util.Calendar; import java.util.Date; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; - -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDDocumentInformation; - +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; import org.apache.pdfbox.exceptions.CryptographyException; import org.apache.pdfbox.exceptions.InvalidPasswordException; - +import org.apache.pdfbox.pdmodel.PDDocument; +import 
org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.util.PDFTextStripper; /** - * This class is used to create a document for the lucene search engine. - * This should easily plug into the IndexHTML or IndexFiles that comes with - * the lucene project. This class will populate the following fields. + * This class is used to create a document for the lucene search engine. This should easily plug into the IndexPDFFiles + * that comes with the lucene project. This class will populate the following fields. * <table> - * <tr> - * <th>Lucene Field Name</th> - * <th>Description</th> - * </tr> - * <tr> - * <td>path</td> - * <td>File system path if loaded from a file</td> - * </tr> - * <tr> - * <td>url</td> - * <td>URL to PDF document</td> - * </tr> - * <tr> - * <td>contents</td> - * <td>Entire contents of PDF document, indexed but not stored</td> - * </tr> - * <tr> - * <td>summary</td> - * <td>First 500 characters of content</td> - * </tr> - * <tr> - * <td>modified</td> - * <td>The modified date/time according to the url or path</td> - * </tr> - * <tr> - * <td>uid</td> - * <td>A unique identifier for the Lucene document.</td> - * </tr> - * <tr> - * <td>CreationDate</td> - * <td>From PDF meta-data if available</td> - * </tr> - * <tr> - * <td>Creator</td> - * <td>From PDF meta-data if available</td> - * </tr> - * <tr> - * <td>Keywords</td> - * <td>From PDF meta-data if available</td> - * </tr> - * <tr> - * <td>ModificationDate</td> - * <td>From PDF meta-data if available</td> - * </tr> - * <tr> - * <td>Producer</td> - * <td>From PDF meta-data if available</td> - * </tr> - * <tr> - * <td>Subject</td> - * <td>From PDF meta-data if available</td> - * </tr> - * <tr> - * <td>Trapped</td> - * <td>From PDF meta-data if available</td> - * </tr> + * <tr> + * <th>Lucene Field Name</th> + * <th>Description</th> + * </tr> + * <tr> + * <td>path</td> + * <td>File system path if loaded from a file</td> + * </tr> + * <tr> + * <td>url</td> + * <td>URL to PDF 
document</td> + * </tr> + * <tr> + * <td>contents</td> + * <td>Entire contents of PDF document, indexed but not stored</td> + * </tr> + * <tr> + * <td>summary</td> + * <td>First 500 characters of content</td> + * </tr> + * <tr> + * <td>modified</td> + * <td>The modified date/time according to the url or path</td> + * </tr> + * <tr> + * <td>uid</td> + * <td>A unique identifier for the Lucene document.</td> + * </tr> + * <tr> + * <td>CreationDate</td> + * <td>From PDF meta-data if available</td> + * </tr> + * <tr> + * <td>Creator</td> + * <td>From PDF meta-data if available</td> + * </tr> + * <tr> + * <td>Keywords</td> + * <td>From PDF meta-data if available</td> + * </tr> + * <tr> + * <td>ModificationDate</td> + * <td>From PDF meta-data if available</td> + * </tr> + * <tr> + * <td>Producer</td> + * <td>From PDF meta-data if available</td> + * </tr> + * <tr> + * <td>Subject</td> + * <td>From PDF meta-data if available</td> + * </tr> + * <tr> + * <td>Trapped</td> + * <td>From PDF meta-data if available</td> + * </tr> * </table> - * + * * @author <a href="mailto:b...@benlitchfield.com">Ben Litchfield</a> - * @version $Revision: 1.23 $ + * */ public class LucenePDFDocument { private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0); // given caveat of increased search times when using - //MICROSECOND, only use SECOND by default - private DateTools.Resolution dateTimeResolution = DateTools.Resolution.SECOND; + // MICROSECOND, only use SECOND by default + private static final DateTools.Resolution DATE_TIME_RES = DateTools.Resolution.SECOND; private PDFTextStripper stripper = null; - /** - * Constructor. - */ - public LucenePDFDocument() + private boolean useNonSeqParser; + + /** not Indexed, tokenized, stored. 
*/ + public static final FieldType TYPE_STORED_NOT_INDEXED = new FieldType(); + + static { + TYPE_STORED_NOT_INDEXED.setIndexed(false); + TYPE_STORED_NOT_INDEXED.setStored(true); + TYPE_STORED_NOT_INDEXED.setTokenized(true); + TYPE_STORED_NOT_INDEXED.freeze(); } /** - * Set the text stripper that will be used during extraction. - * - * @param aStripper The new pdf text stripper. + * Constructor. */ - public void setTextStripper( PDFTextStripper aStripper ) + public LucenePDFDocument() { - stripper = aStripper; + this(false); } /** - * Get the Lucene data time resolution. - * - * @return current date/time resolution + * Constructor. + * + * @param nonSequentialParser indicates if the non-sequential parser should be used + * */ - public DateTools.Resolution getDateTimeResolution() + public LucenePDFDocument(boolean nonSequentialParser) { - return dateTimeResolution; + useNonSeqParser = nonSequentialParser; } /** - * Set the Lucene data time resolution. - * - * @param resolution set new date/time resolution + * Set the text stripper that will be used during extraction. + * + * @param aStripper The new pdf text stripper. 
*/ - public void setDateTimeResolution( DateTools.Resolution resolution ) + public void setTextStripper(PDFTextStripper aStripper) { - dateTimeResolution = resolution; + stripper = aStripper; } - // - // compatibility methods for lucene-1.9+ - // - private String timeToString( long time ) + private static String timeToString(long time) { - return DateTools.timeToString( time, dateTimeResolution ); + return DateTools.timeToString(time, DATE_TIME_RES); } - private void addKeywordField( Document document, String name, String value ) + private void addKeywordField(Document document, String name, String value) { - if ( value != null ) + if (value != null) { - document.add( new Field( name, value, Field.Store.YES, Field.Index.NOT_ANALYZED ) ); + document.add(new StringField(name, value, Field.Store.YES)); } } - private void addTextField( Document document, String name, Reader value ) + private void addTextField(Document document, String name, Reader value) { - if ( value != null ) + if (value != null) { - document.add( new Field( name, value ) ); + document.add(new TextField(name, value)); } } - private void addTextField( Document document, String name, String value ) + private void addTextField(Document document, String name, String value) { - if ( value != null ) + if (value != null) { - document.add( new Field( name, value, Field.Store.YES, Field.Index.ANALYZED ) ); + document.add(new TextField(name, value, Field.Store.YES)); } } - private void addTextField( Document document, String name, Date value ) + private void addTextField(Document document, String name, Date value) { - if ( value != null ) + if (value != null) { - addTextField( document, name, DateTools.dateToString( value, dateTimeResolution ) ); + addTextField(document, name, DateTools.dateToString(value, DATE_TIME_RES)); } } - private void addTextField( Document document, String name, Calendar value ) + private void addTextField(Document document, String name, Calendar value) { - if ( value != null ) + if 
(value != null) { - addTextField( document, name, value.getTime() ); + addTextField(document, name, value.getTime()); } } - private static void addUnindexedField( Document document, String name, String value ) + private static void addUnindexedField(Document document, String name, String value) { - if ( value != null ) + if (value != null) { - document.add( new Field( name, value, Field.Store.YES, Field.Index.NO ) ); + document.add(new Field(name, value, TYPE_STORED_NOT_INDEXED)); } } - private void addUnstoredKeywordField( Document document, String name, String value ) + private void addUnstoredKeywordField(Document document, String name, String value) { - if ( value != null ) + if (value != null) { - document.add( new Field( name, value, Field.Store.NO, Field.Index.NOT_ANALYZED ) ); + document.add(new Field(name, value, TextField.TYPE_NOT_STORED)); } } /** * Convert the PDF stream to a lucene document. - * + * * @param is The input stream. * @return The input stream converted to a lucene document. * @throws IOException If there is an error converting the PDF. */ - public Document convertDocument( InputStream is ) throws IOException + public Document convertDocument(InputStream is) throws IOException { Document document = new Document(); - addContent( document, is, "<inputstream>" ); + addContent(document, is, "<inputstream>"); return document; } /** * This will take a reference to a PDF document and create a lucene document. - * + * * @param file A reference to a PDF document. * @return The converted lucene document. - * + * * @throws IOException If there is an exception while converting the document. */ - public Document convertDocument( File file ) throws IOException + public Document convertDocument(File file) throws IOException { Document document = new Document(); - // Add the url as a field named "url". Use an UnIndexed field, so + // Add the url as a field named "url". 
Use an UnIndexed field, so // that the url is just stored with the document, but is not searchable. - addUnindexedField( document, "path", file.getPath() ); - addUnindexedField( document, "url", file.getPath().replace(FILE_SEPARATOR, '/') ); + addUnindexedField(document, "path", file.getPath()); + addUnindexedField(document, "url", file.getPath().replace(FILE_SEPARATOR, '/')); - // Add the last modified date of the file a field named "modified". Use a + // Add the last modified date of the file a field named "modified". Use a // Keyword field, so that it's searchable, but so that no attempt is made // to tokenize the field into words. - addKeywordField( document, "modified", timeToString( file.lastModified() ) ); + addKeywordField(document, "modified", timeToString(file.lastModified())); - String uid = file.getPath().replace(FILE_SEPARATOR,'\u0000') - + "\u0000" - + timeToString( file.lastModified() ); + String uid = createUID(file); // Add the uid as a field, so that index can be incrementally maintained. // This field is not stored with document, it is indexed, but it is not // tokenized prior to indexing. - addUnstoredKeywordField( document, "uid", uid ); + addUnstoredKeywordField(document, "uid", uid); FileInputStream input = null; try { - input = new FileInputStream( file ); - addContent( document, input, file.getPath() ); + input = new FileInputStream(file); + addContent(document, input, file.getPath()); } finally { - if( input != null ) + if (input != null) { input.close(); } } - // return the document return document; @@ -287,43 +283,41 @@ public class LucenePDFDocument /** * Convert the document from a PDF to a lucene document. - * + * * @param url A url to a PDF document. * @return The PDF converted to a lucene document. * @throws IOException If there is an error while converting the document. 
*/ - public Document convertDocument( URL url ) throws IOException + public Document convertDocument(URL url) throws IOException { Document document = new Document(); URLConnection connection = url.openConnection(); connection.connect(); - // Add the url as a field named "url". Use an UnIndexed field, so + // Add the url as a field named "url". Use an UnIndexed field, so // that the url is just stored with the document, but is not searchable. - addUnindexedField( document, "url", url.toExternalForm() ); + addUnindexedField(document, "url", url.toExternalForm()); - // Add the last modified date of the file a field named "modified". Use a + // Add the last modified date of the file a field named "modified". Use a // Keyword field, so that it's searchable, but so that no attempt is made // to tokenize the field into words. - addKeywordField( document, "modified", timeToString(connection.getLastModified() ) ); + addKeywordField(document, "modified", timeToString(connection.getLastModified())); - String uid = url.toExternalForm().replace(FILE_SEPARATOR, '\u0000') - + "\u0000" - + timeToString( connection.getLastModified() ); + String uid = createUID(url, connection.getLastModified()); // Add the uid as a field, so that index can be incrementally maintained. // This field is not stored with document, it is indexed, but it is not // tokenized prior to indexing. - addUnstoredKeywordField( document, "uid", uid ); + addUnstoredKeywordField(document, "uid", uid); InputStream input = null; try { input = connection.getInputStream(); - addContent( document, input,url.toExternalForm() ); + addContent(document, input, url.toExternalForm()); } finally { - if( input != null ) + if (input != null) { input.close(); } @@ -335,74 +329,126 @@ public class LucenePDFDocument /** * This will get a lucene document from a PDF file. - * + * * @param is The stream to read the PDF from. - * + * * @return The lucene document. 
- * + * * @throws IOException If there is an error parsing or indexing the document. */ - public static Document getDocument( InputStream is ) throws IOException + public static Document getDocument(InputStream is) throws IOException { - LucenePDFDocument converter = new LucenePDFDocument(); - return converter.convertDocument( is ); + return getDocument(is, false); } /** * This will get a lucene document from a PDF file. - * + * + * @param is The stream to read the PDF from. + * @param nonSeqParser indicates if the non-sequential parser should be used + * + * @return The lucene document. + * + * @throws IOException If there is an error parsing or indexing the document. + */ + public static Document getDocument(InputStream is, boolean nonSeqParser) throws IOException + { + LucenePDFDocument converter = new LucenePDFDocument(nonSeqParser); + return converter.convertDocument(is); + } + + /** + * This will get a lucene document from a PDF file. + * + * @param file The file to get the document for. + * + * @return The lucene document. + * + * @throws IOException If there is an error parsing or indexing the document. + */ + public static Document getDocument(File file) throws IOException + { + return getDocument(file, false); + } + + /** + * This will get a lucene document from a PDF file. + * * @param file The file to get the document for. - * + * @param nonSeqParser indicates if the non-sequential parser should be used + * * @return The lucene document. - * + * * @throws IOException If there is an error parsing or indexing the document. */ - public static Document getDocument( File file ) throws IOException + public static Document getDocument(File file, boolean nonSeqParser) throws IOException { - LucenePDFDocument converter = new LucenePDFDocument(); - return converter.convertDocument( file ); + LucenePDFDocument converter = new LucenePDFDocument(nonSeqParser); + return converter.convertDocument(file); } /** * This will get a lucene document from a PDF file. 
- * + * * @param url The file to get the document for. - * + * * @return The lucene document. - * + * * @throws IOException If there is an error parsing or indexing the document. */ - public static Document getDocument( URL url ) throws IOException + public static Document getDocument(URL url) throws IOException { - LucenePDFDocument converter = new LucenePDFDocument(); - return converter.convertDocument( url ); + return getDocument(url, false); + } + + /** + * This will get a lucene document from a PDF file. + * + * @param url The file to get the document for. + * @param nonSeqParser indicates if the non-sequential parser should be used + * + * @return The lucene document. + * + * @throws IOException If there is an error parsing or indexing the document. + */ + public static Document getDocument(URL url, boolean nonSeqParser) throws IOException + { + LucenePDFDocument converter = new LucenePDFDocument(nonSeqParser); + return converter.convertDocument(url); } /** * This will add the contents to the lucene document. - * + * * @param document The document to add the contents to. * @param is The stream to get the contents from. * @param documentLocation The location of the document, used just for debug messages. - * + * * @throws IOException If there is an error parsing the document. 
*/ - private void addContent( Document document, InputStream is, String documentLocation ) throws IOException + private void addContent(Document document, InputStream is, String documentLocation) throws IOException { PDDocument pdfDocument = null; try { - pdfDocument = PDDocument.load( is ); - - if( pdfDocument.isEncrypted() ) + if (useNonSeqParser) { - //Just try using the default password and move on - pdfDocument.decrypt( "" ); + pdfDocument = PDDocument.loadNonSeq(is, null, ""); + } + else + { + pdfDocument = PDDocument.load(is); + + if (pdfDocument.isEncrypted()) + { + // Just try using the default password and move on + pdfDocument.decrypt(""); + } } - //create a writer where to append the text content. + // create a writer where to append the text content. StringWriter writer = new StringWriter(); - if( stripper == null ) + if (stripper == null) { stripper = new PDFTextStripper(); } @@ -410,7 +456,7 @@ public class LucenePDFDocument { stripper.resetEngine(); } - stripper.writeText( pdfDocument, writer ); + stripper.writeText(pdfDocument, writer); // Note: the buffer to string operation is costless; // the char array value of the writer buffer and the content string @@ -418,59 +464,57 @@ public class LucenePDFDocument // not occur here. String contents = writer.getBuffer().toString(); - StringReader reader = new StringReader( contents ); + StringReader reader = new StringReader(contents); // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. 
- addTextField( document, "contents", reader ); + addTextField(document, "contents", reader); PDDocumentInformation info = pdfDocument.getDocumentInformation(); - if( info != null ) + if (info != null) { - addTextField( document, "Author", info.getAuthor() ); + addTextField(document, "Author", info.getAuthor()); try { - addTextField( document, "CreationDate", info.getCreationDate() ); + addTextField(document, "CreationDate", info.getCreationDate()); } - catch( IOException io ) + catch (IOException io) { - //ignore, bad date but continue with indexing + // ignore, bad date but continue with indexing } - addTextField( document, "Creator", info.getCreator() ); - addTextField( document, "Keywords", info.getKeywords() ); + addTextField(document, "Creator", info.getCreator()); + addTextField(document, "Keywords", info.getKeywords()); try { - addTextField( document, "ModificationDate", info.getModificationDate() ); + addTextField(document, "ModificationDate", info.getModificationDate()); } - catch( IOException io ) + catch (IOException io) { - //ignore, bad date but continue with indexing + // ignore, bad date but continue with indexing } - addTextField( document, "Producer", info.getProducer() ); - addTextField( document, "Subject", info.getSubject() ); - addTextField( document, "Title", info.getTitle() ); - addTextField( document, "Trapped", info.getTrapped() ); + addTextField(document, "Producer", info.getProducer()); + addTextField(document, "Subject", info.getSubject()); + addTextField(document, "Title", info.getTitle()); + addTextField(document, "Trapped", info.getTrapped()); } - int summarySize = Math.min( contents.length(), 500 ); - String summary = contents.substring( 0, summarySize ); + int summarySize = Math.min(contents.length(), 500); + String summary = contents.substring(0, summarySize); // Add the summary as an UnIndexed field, so that it is stored and returned // with hit documents for display. 
- addUnindexedField( document, "summary", summary ); + addUnindexedField(document, "summary", summary); } - catch( CryptographyException e ) + catch (CryptographyException e) { - throw new IOException( "Error decrypting document(" + documentLocation + "): " + e ); + throw new IOException("Error decrypting document(" + documentLocation + "): " + e); } - catch( InvalidPasswordException e ) + catch (InvalidPasswordException e) { - //they didn't suppply a password and the default of "" was wrong. - throw new IOException( - "Error: The document(" + documentLocation + - ") is encrypted and will not be indexed." ); + // they didn't supply a password and the default of "" was wrong. + throw new IOException("Error: The document(" + documentLocation + ") is encrypted and will not be indexed."); } finally { - if( pdfDocument != null ) + if (pdfDocument != null) { pdfDocument.close(); } @@ -478,22 +522,27 @@ public class LucenePDFDocument } /** - * This will test creating a document. - * - * usage: java pdfparser.searchengine.lucene.LucenePDFDocument <pdf-document> - * - * @param args command line arguments. - * - * @throws IOException If there is an error. - */ - public static void main( String[] args ) throws IOException - { - if( args.length != 1 ) - { - String us = LucenePDFDocument.class.getName(); - System.err.println( "usage: java " + us + " <pdf-document>" ); - System.exit( 1 ); - } - System.out.println( "Document=" + getDocument( new File( args[0] ) ) ); + * Create a UID for the given URL using the given time. + * + * @param url the URL we have to create a UID for + * @param time the time to use in the UID + * + * @return the created UID + */ + public static String createUID(URL url, long time) + { + return url.toExternalForm().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + timeToString(time); + } + + /** + * Create a UID for the given file. 
+ * + * @param file the file we have to create a UID for + * + * @return the created UID + */ + public static String createUID(File file) + { + return file.getPath().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + timeToString(file.lastModified()); } } Copied: pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html (from r1502161, pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/package.html) URL: http://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html?p2=pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html&p1=pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/package.html&r1=1502161&r2=1530357&rev=1530357&view=diff ============================================================================== --- pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/package.html (original) +++ pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html Tue Oct 8 17:15:09 2013 @@ -20,6 +20,6 @@ </head> <body> -This package holds classes that are used to integrate the PDFBox project with lucene. +This example shows how to integrate the PDFBox project with lucene. </body> </html> Modified: pdfbox/trunk/pom.xml URL: http://svn.apache.org/viewvc/pdfbox/trunk/pom.xml?rev=1530357&r1=1530356&r2=1530357&view=diff ============================================================================== --- pdfbox/trunk/pom.xml (original) +++ pdfbox/trunk/pom.xml Tue Oct 8 17:15:09 2013 @@ -50,7 +50,6 @@ <module>pdfbox</module> <module>preflight</module> <module>preflight-app</module> - <module>lucene</module> <module>ant</module> <module>war</module> <module>app</module>