svn commit: r1530357 - in /pdfbox/trunk: ./ examples/ examples/src/main/java/org/apache/pdfbox/examples/lucene/ lucene/ lucene/src/

lehmi Tue, 08 Oct 2013 10:16:01 -0700

Author: lehmi
Date: Tue Oct  8 17:15:09 2013
New Revision: 1530357

URL: http://svn.apache.org/r1530357
Log:
PDFBOX-1356: moved the lucene stuff to the examples subproject, remove the 
lucene subproject


Added:
    pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/
    
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java
   (with props)
    
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java
      - copied, changed from r1502161, 
pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/LucenePDFDocument.java
    
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html
      - copied, changed from r1502161, 
pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/package.html
Removed:
    pdfbox/trunk/lucene/pom.xml
    pdfbox/trunk/lucene/src/
Modified:
    pdfbox/trunk/examples/pom.xml
    pdfbox/trunk/pom.xml

Modified: pdfbox/trunk/examples/pom.xml
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/examples/pom.xml?rev=1530357&r1=1530356&r2=1530357&view=diff
==============================================================================
--- pdfbox/trunk/examples/pom.xml (original)
+++ pdfbox/trunk/examples/pom.xml Tue Oct  8 17:15:09 2013
@@ -37,6 +37,10 @@
   </description>
   <inceptionYear>2002</inceptionYear>
 
+  <properties>
+    <lucene.version>4.3.1</lucene.version>
+  </properties>
+
   <dependencies>
     <dependency>
       <groupId>org.apache.pdfbox</groupId>
@@ -49,6 +53,21 @@
       <version>1.48</version>
       <optional>true</optional>
     </dependency>
+    <dependency>
+        <groupId>${project.groupId}</groupId>
+        <artifactId>pdfbox</artifactId>
+        <version>${project.version}</version>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.lucene</groupId>
+        <artifactId>lucene-core</artifactId>
+        <version>${lucene.version}</version>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.lucene</groupId>
+        <artifactId>lucene-analyzers-common</artifactId>
+        <version>${lucene.version}</version>
+    </dependency>  
   </dependencies>
 
   <build>

Added: 
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java?rev=1530357&view=auto
==============================================================================
--- 
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java
 (added)
+++ 
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java
 Tue Oct  8 17:15:09 2013
@@ -0,0 +1,234 @@
+package org.apache.pdfbox.examples.lucene;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Date;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
+
+/**
+ * Index all pdf files under a directory.
+ * <p>
+ * This is a command-line application demonstrating simple Lucene indexing. 
Run it with no command-line arguments for
+ * usage information.
+ * <p>
+ * It's based on a demo provided by the lucene project.
+ */
+public class IndexPDFFiles
+{
+
+    private IndexPDFFiles()
+    {
+    }
+
+    /**
+     * Index all PDF files under a directory.
+     * 
+     * @param args command line arguments
+     * 
+     */
+    public static void main(String[] args)
+    {
+        String usage = "java org.apache.pdfbox.lucene.IndexPDFFiles"
+                + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
+                + "This indexes all PDF documents in DOCS_PATH, creating a 
Lucene index"
+                + "in INDEX_PATH that can be searched with SearchFiles";
+        String indexPath = "index";
+        String docsPath = null;
+        boolean create = true;
+        for (int i = 0; i < args.length; i++)
+        {
+            if ("-index".equals(args[i]))
+            {
+                indexPath = args[i + 1];
+                i++;
+            }
+            else if ("-docs".equals(args[i]))
+            {
+                docsPath = args[i + 1];
+                i++;
+            }
+            else if ("-update".equals(args[i]))
+            {
+                create = false;
+            }
+        }
+
+        if (docsPath == null)
+        {
+            System.err.println("Usage: " + usage);
+            System.exit(1);
+        }
+
+        final File docDir = new File(docsPath);
+        if (!docDir.exists() || !docDir.canRead())
+        {
+            System.out.println("Document directory '" + 
docDir.getAbsolutePath()
+                    + "' does not exist or is not readable, please check the 
path");
+            System.exit(1);
+        }
+
+        Date start = new Date();
+        try
+        {
+            System.out.println("Indexing to directory '" + indexPath + "'...");
+
+            Directory dir = FSDirectory.open(new File(indexPath));
+            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
+            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, 
analyzer);
+
+            if (create)
+            {
+                // Create a new index in the directory, removing any
+                // previously indexed documents:
+                iwc.setOpenMode(OpenMode.CREATE);
+            }
+            else
+            {
+                // Add new documents to an existing index:
+                iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
+            }
+
+            // Optional: for better indexing performance, if you
+            // are indexing many documents, increase the RAM
+            // buffer. But if you do this, increase the max heap
+            // size to the JVM (eg add -Xmx512m or -Xmx1g):
+            //
+            // iwc.setRAMBufferSizeMB(256.0);
+
+            IndexWriter writer = new IndexWriter(dir, iwc);
+            indexDocs(writer, docDir);
+
+            // NOTE: if you want to maximize search performance,
+            // you can optionally call forceMerge here. This can be
+            // a terribly costly operation, so generally it's only
+            // worth it when your index is relatively static (ie
+            // you're done adding documents to it):
+            //
+            // writer.forceMerge(1);
+
+            writer.close();
+
+            Date end = new Date();
+            System.out.println(end.getTime() - start.getTime() + " total 
milliseconds");
+
+        }
+        catch (IOException e)
+        {
+            System.out.println(" caught a " + e.getClass() + "\n with message: 
" + e.getMessage());
+        }
+    }
+
+    /**
+     * Indexes the given file using the given writer, or if a directory is 
given, recurses over files and directories
+     * found under the given directory.
+     * 
+     * NOTE: This method indexes one document per input file. This is slow. 
For good throughput, put multiple documents
+     * into your input file(s). An example of this is in the benchmark module, 
which can create "line doc" files, one
+     * document per line, using the <a
+     * 
href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
+     * >WriteLineDocTask</a>.
+     * 
+     * @param writer Writer to the index where the given file/dir info will be 
stored
+     * @param file The file to index, or the directory to recurse into to find 
files to index
+     * @throws IOException If there is a low-level I/O error
+     */
+    static void indexDocs(IndexWriter writer, File file) throws IOException
+    {
+        // do not try to index files that cannot be read
+        if (file.canRead())
+        {
+            if (file.isDirectory())
+            {
+                String[] files = file.list();
+                // an IO error could occur
+                if (files != null)
+                {
+                    for (int i = 0; i < files.length; i++)
+                    {
+                        indexDocs(writer, new File(file, files[i]));
+                    }
+                }
+            }
+            else
+            {
+
+                FileInputStream fis;
+                try
+                {
+                    fis = new FileInputStream(file);
+                }
+                catch (FileNotFoundException fnfe)
+                {
+                    // at least on windows, some temporary files raise this 
exception with an "access denied" message
+                    // checking if the file can be read doesn't help
+                    return;
+                }
+
+                try
+                {
+
+                    String path = file.getName().toUpperCase();
+                    Document doc = null;
+                    if (path.endsWith(".PDF"))
+                    {
+                        System.out.println("Indexing PDF document: " + file);
+                        doc = LucenePDFDocument.getDocument(file);
+                    }
+                    else
+                    {
+                        System.out.println("Skipping " + file);
+                        return;
+                    }
+
+                    if (writer.getConfig().getOpenMode() == OpenMode.CREATE)
+                    {
+                        // New index, so we just add the document (no old 
document can be there):
+                        System.out.println("adding " + file);
+                        writer.addDocument(doc);
+                    }
+                    else
+                    {
+                        // Existing index (an old copy of this document may 
have been indexed) so
+                        // we use updateDocument instead to replace the old 
one matching the exact
+                        // path, if present:
+                        System.out.println("updating " + file);
+                        writer.updateDocument(new Term("uid", 
LucenePDFDocument.createUID(file)), doc);
+                    }
+                }
+                finally
+                {
+                    fis.close();
+                }
+            }
+        }
+    }
+}

Propchange: 
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/IndexPDFFiles.java
------------------------------------------------------------------------------
    svn:eol-style = native

Copied: 
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java
 (from r1502161, 
pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/LucenePDFDocument.java)
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java?p2=pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java&p1=pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/LucenePDFDocument.java&r1=1502161&r2=1530357&rev=1530357&view=diff
==============================================================================
--- 
pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/LucenePDFDocument.java
 (original)
+++ 
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/LucenePDFDocument.java
 Tue Oct  8 17:15:09 2013
@@ -14,272 +14,268 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.pdfbox.lucene;
+package org.apache.pdfbox.examples.lucene;
 
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.InputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.Reader;
 import java.io.StringReader;
 import java.io.StringWriter;
-import java.util.Calendar;
-
 import java.net.URL;
 import java.net.URLConnection;
-
+import java.util.Calendar;
 import java.util.Date;
 
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDDocumentInformation;
-
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
 import org.apache.pdfbox.exceptions.CryptographyException;
 import org.apache.pdfbox.exceptions.InvalidPasswordException;
-
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.pdfbox.util.PDFTextStripper;
 
 /**
- * This class is used to create a document for the lucene search engine.
- * This should easily plug into the IndexHTML or IndexFiles that comes with
- * the lucene project.  This class will populate the following fields.
+ * This class is used to create a document for the lucene search engine. It is used by the IndexPDFFiles example
+ * in this package. This class will populate the following fields.
  * <table>
- *      <tr>
- *          <th>Lucene Field Name</th>
- *          <th>Description</th>
- *      </tr>
- *      <tr>
- *          <td>path</td>
- *          <td>File system path if loaded from a file</td>
- *      </tr>
- *      <tr>
- *          <td>url</td>
- *          <td>URL to PDF document</td>
- *      </tr>
- *      <tr>
- *          <td>contents</td>
- *          <td>Entire contents of PDF document, indexed but not stored</td>
- *      </tr>
- *      <tr>
- *          <td>summary</td>
- *          <td>First 500 characters of content</td>
- *      </tr>
- *      <tr>
- *          <td>modified</td>
- *          <td>The modified date/time according to the url or path</td>
- *      </tr>
- *      <tr>
- *          <td>uid</td>
- *          <td>A unique identifier for the Lucene document.</td>
- *      </tr>
- *      <tr>
- *          <td>CreationDate</td>
- *          <td>From PDF meta-data if available</td>
- *      </tr>
- *      <tr>
- *          <td>Creator</td>
- *          <td>From PDF meta-data if available</td>
- *      </tr>
- *      <tr>
- *          <td>Keywords</td>
- *          <td>From PDF meta-data if available</td>
- *      </tr>
- *      <tr>
- *          <td>ModificationDate</td>
- *          <td>From PDF meta-data if available</td>
- *      </tr>
- *      <tr>
- *          <td>Producer</td>
- *          <td>From PDF meta-data if available</td>
- *      </tr>
- *      <tr>
- *          <td>Subject</td>
- *          <td>From PDF meta-data if available</td>
- *      </tr>
- *      <tr>
- *          <td>Trapped</td>
- *          <td>From PDF meta-data if available</td>
- *      </tr>
+ * <tr>
+ * <th>Lucene Field Name</th>
+ * <th>Description</th>
+ * </tr>
+ * <tr>
+ * <td>path</td>
+ * <td>File system path if loaded from a file</td>
+ * </tr>
+ * <tr>
+ * <td>url</td>
+ * <td>URL to PDF document</td>
+ * </tr>
+ * <tr>
+ * <td>contents</td>
+ * <td>Entire contents of PDF document, indexed but not stored</td>
+ * </tr>
+ * <tr>
+ * <td>summary</td>
+ * <td>First 500 characters of content</td>
+ * </tr>
+ * <tr>
+ * <td>modified</td>
+ * <td>The modified date/time according to the url or path</td>
+ * </tr>
+ * <tr>
+ * <td>uid</td>
+ * <td>A unique identifier for the Lucene document.</td>
+ * </tr>
+ * <tr>
+ * <td>CreationDate</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
+ * <tr>
+ * <td>Creator</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
+ * <tr>
+ * <td>Keywords</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
+ * <tr>
+ * <td>ModificationDate</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
+ * <tr>
+ * <td>Producer</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
+ * <tr>
+ * <td>Subject</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
+ * <tr>
+ * <td>Trapped</td>
+ * <td>From PDF meta-data if available</td>
+ * </tr>
  * </table>
- *
+ * 
  * @author <a href="mailto:b...@benlitchfield.com";>Ben Litchfield</a>
- * @version $Revision: 1.23 $
+ * 
  */
 public class LucenePDFDocument
 {
     private static final char FILE_SEPARATOR = 
System.getProperty("file.separator").charAt(0);
 
     // given caveat of increased search times when using
-    //MICROSECOND, only use SECOND by default
-    private DateTools.Resolution dateTimeResolution = 
DateTools.Resolution.SECOND;
+    // MICROSECOND, only use SECOND by default
+    private static final DateTools.Resolution DATE_TIME_RES = 
DateTools.Resolution.SECOND;
 
     private PDFTextStripper stripper = null;
 
-    /**
-     * Constructor.
-     */
-    public LucenePDFDocument()
+    private boolean useNonSeqParser;
+
+    /** Field type for values that are stored and tokenized, but not indexed. */
+    public static final FieldType TYPE_STORED_NOT_INDEXED = new FieldType();
+
+    static
     {
+        TYPE_STORED_NOT_INDEXED.setIndexed(false);
+        TYPE_STORED_NOT_INDEXED.setStored(true);
+        TYPE_STORED_NOT_INDEXED.setTokenized(true);
+        TYPE_STORED_NOT_INDEXED.freeze();
     }
 
     /**
-     * Set the text stripper that will be used during extraction.
-     *
-     * @param aStripper The new pdf text stripper.
+     * Constructor.
      */
-    public void setTextStripper( PDFTextStripper aStripper )
+    public LucenePDFDocument()
     {
-        stripper = aStripper;
+        this(false);
     }
 
     /**
-     * Get the Lucene data time resolution.
-     *
-     * @return current date/time resolution
+     * Constructor.
+     * 
+     * @param nonSequentialParser indicates if the non-sequential parser 
should be used
+     * 
      */
-    public DateTools.Resolution getDateTimeResolution()
+    public LucenePDFDocument(boolean nonSequentialParser)
     {
-        return dateTimeResolution;
+        useNonSeqParser = nonSequentialParser;
     }
 
     /**
-     * Set the Lucene data time resolution.
-     *
-     * @param resolution set new date/time resolution
+     * Set the text stripper that will be used during extraction.
+     * 
+     * @param aStripper The new pdf text stripper.
      */
-    public void setDateTimeResolution( DateTools.Resolution resolution )
+    public void setTextStripper(PDFTextStripper aStripper)
     {
-        dateTimeResolution = resolution;
+        stripper = aStripper;
     }
 
-    //
-    // compatibility methods for lucene-1.9+
-    //
-    private String timeToString( long time )
+    private static String timeToString(long time)
     {
-        return DateTools.timeToString( time, dateTimeResolution );
+        return DateTools.timeToString(time, DATE_TIME_RES);
     }
 
-    private void addKeywordField( Document document, String name, String value 
)
+    private void addKeywordField(Document document, String name, String value)
     {
-        if ( value != null )
+        if (value != null)
         {
-            document.add( new Field( name, value, Field.Store.YES, 
Field.Index.NOT_ANALYZED ) );
+            document.add(new StringField(name, value, Field.Store.YES));
         }
     }
 
-    private void addTextField( Document document, String name, Reader value )
+    private void addTextField(Document document, String name, Reader value)
     {
-        if ( value != null )
+        if (value != null)
         {
-            document.add( new Field( name, value ) );
+            document.add(new TextField(name, value));
         }
     }
 
-    private void addTextField( Document document, String name, String value )
+    private void addTextField(Document document, String name, String value)
     {
-        if ( value != null )
+        if (value != null)
         {
-            document.add( new Field( name, value, Field.Store.YES, 
Field.Index.ANALYZED ) );
+            document.add(new TextField(name, value, Field.Store.YES));
         }
     }
 
-    private void addTextField( Document document, String name, Date value )
+    private void addTextField(Document document, String name, Date value)
     {
-        if ( value != null )
+        if (value != null)
         {
-            addTextField( document, name, DateTools.dateToString( value, 
dateTimeResolution ) );
+            addTextField(document, name, DateTools.dateToString(value, 
DATE_TIME_RES));
         }
     }
 
-    private void addTextField( Document document, String name, Calendar value )
+    private void addTextField(Document document, String name, Calendar value)
     {
-        if ( value != null )
+        if (value != null)
         {
-            addTextField( document, name, value.getTime() );
+            addTextField(document, name, value.getTime());
         }
     }
 
-    private static void addUnindexedField( Document document, String name, 
String value )
+    private static void addUnindexedField(Document document, String name, 
String value)
     {
-        if ( value != null )
+        if (value != null)
         {
-            document.add( new Field( name, value, Field.Store.YES, 
Field.Index.NO ) );
+            document.add(new Field(name, value, TYPE_STORED_NOT_INDEXED));
         }
     }
 
-    private void addUnstoredKeywordField( Document document, String name, 
String value )
+    private void addUnstoredKeywordField(Document document, String name, 
String value)
     {
-        if ( value != null )
+        if (value != null)
         {
-            document.add( new Field( name, value, Field.Store.NO, 
Field.Index.NOT_ANALYZED ) );
+            document.add(new Field(name, value, TextField.TYPE_NOT_STORED));
         }
     }
 
     /**
      * Convert the PDF stream to a lucene document.
-     *
+     * 
      * @param is The input stream.
      * @return The input stream converted to a lucene document.
      * @throws IOException If there is an error converting the PDF.
      */
-    public Document convertDocument( InputStream is ) throws IOException
+    public Document convertDocument(InputStream is) throws IOException
     {
         Document document = new Document();
-        addContent( document, is, "<inputstream>" );
+        addContent(document, is, "<inputstream>");
         return document;
 
     }
 
     /**
      * This will take a reference to a PDF document and create a lucene 
document.
-     *
+     * 
      * @param file A reference to a PDF document.
      * @return The converted lucene document.
-     *
+     * 
      * @throws IOException If there is an exception while converting the 
document.
      */
-    public Document convertDocument( File file ) throws IOException
+    public Document convertDocument(File file) throws IOException
     {
         Document document = new Document();
 
-        // Add the url as a field named "url".  Use an UnIndexed field, so
+        // Add the url as a field named "url". Use an UnIndexed field, so
         // that the url is just stored with the document, but is not 
searchable.
-        addUnindexedField( document, "path", file.getPath() );
-        addUnindexedField( document, "url", 
file.getPath().replace(FILE_SEPARATOR, '/') );
+        addUnindexedField(document, "path", file.getPath());
+        addUnindexedField(document, "url", 
file.getPath().replace(FILE_SEPARATOR, '/'));
 
-        // Add the last modified date of the file a field named "modified".  
Use a
+        // Add the last modified date of the file as a field named "modified". 
Use a
         // Keyword field, so that it's searchable, but so that no attempt is 
made
         // to tokenize the field into words.
-        addKeywordField( document, "modified", timeToString( 
file.lastModified() ) );
+        addKeywordField(document, "modified", 
timeToString(file.lastModified()));
 
-        String uid = file.getPath().replace(FILE_SEPARATOR,'\u0000')
-                     + "\u0000"
-                     + timeToString( file.lastModified() );
+        String uid = createUID(file);
 
         // Add the uid as a field, so that index can be incrementally 
maintained.
         // This field is not stored with document, it is indexed, but it is not
         // tokenized prior to indexing.
-        addUnstoredKeywordField( document, "uid", uid );
+        addUnstoredKeywordField(document, "uid", uid);
 
         FileInputStream input = null;
         try
         {
-            input = new FileInputStream( file );
-            addContent( document, input, file.getPath() );
+            input = new FileInputStream(file);
+            addContent(document, input, file.getPath());
         }
         finally
         {
-            if( input != null )
+            if (input != null)
             {
                 input.close();
             }
         }
 
-
         // return the document
 
         return document;
@@ -287,43 +283,41 @@ public class LucenePDFDocument
 
     /**
      * Convert the document from a PDF to a lucene document.
-     *
+     * 
      * @param url A url to a PDF document.
      * @return The PDF converted to a lucene document.
      * @throws IOException If there is an error while converting the document.
      */
-    public Document convertDocument( URL url ) throws IOException
+    public Document convertDocument(URL url) throws IOException
     {
         Document document = new Document();
         URLConnection connection = url.openConnection();
         connection.connect();
-        // Add the url as a field named "url".  Use an UnIndexed field, so
+        // Add the url as a field named "url". Use an UnIndexed field, so
         // that the url is just stored with the document, but is not 
searchable.
-        addUnindexedField( document, "url", url.toExternalForm() );
+        addUnindexedField(document, "url", url.toExternalForm());
 
-        // Add the last modified date of the file a field named "modified".  
Use a
+        // Add the last modified date of the file as a field named "modified". 
Use a
         // Keyword field, so that it's searchable, but so that no attempt is 
made
         // to tokenize the field into words.
-        addKeywordField( document, "modified", 
timeToString(connection.getLastModified() ) );
+        addKeywordField(document, "modified", 
timeToString(connection.getLastModified()));
 
-        String uid = url.toExternalForm().replace(FILE_SEPARATOR, '\u0000')
-                     + "\u0000"
-                     + timeToString( connection.getLastModified() );
+        String uid = createUID(url, connection.getLastModified());
 
         // Add the uid as a field, so that index can be incrementally 
maintained.
         // This field is not stored with document, it is indexed, but it is not
         // tokenized prior to indexing.
-        addUnstoredKeywordField( document, "uid", uid );
+        addUnstoredKeywordField(document, "uid", uid);
 
         InputStream input = null;
         try
         {
             input = connection.getInputStream();
-            addContent( document, input,url.toExternalForm() );
+            addContent(document, input, url.toExternalForm());
         }
         finally
         {
-            if( input != null )
+            if (input != null)
             {
                 input.close();
             }
@@ -335,74 +329,126 @@ public class LucenePDFDocument
 
     /**
      * This will get a lucene document from a PDF file.
-     *
+     * 
      * @param is The stream to read the PDF from.
-     *
+     * 
      * @return The lucene document.
-     *
+     * 
      * @throws IOException If there is an error parsing or indexing the 
document.
      */
-    public static Document getDocument( InputStream is ) throws IOException
+    public static Document getDocument(InputStream is) throws IOException
     {
-        LucenePDFDocument converter = new LucenePDFDocument();
-        return converter.convertDocument( is );
+        return getDocument(is, false);
     }
 
     /**
      * This will get a lucene document from a PDF file.
-     *
+     * 
+     * @param is The stream to read the PDF from.
+     * @param nonSeqParser indicates if the non-sequential parser should be 
used
+     * 
+     * @return The lucene document.
+     * 
+     * @throws IOException If there is an error parsing or indexing the 
document.
+     */
+    public static Document getDocument(InputStream is, boolean nonSeqParser) 
throws IOException
+    {
+        LucenePDFDocument converter = new LucenePDFDocument(nonSeqParser);
+        return converter.convertDocument(is);
+    }
+
+    /**
+     * This will get a lucene document from a PDF file.
+     * 
+     * @param file The file to get the document for.
+     * 
+     * @return The lucene document.
+     * 
+     * @throws IOException If there is an error parsing or indexing the 
document.
+     */
+    public static Document getDocument(File file) throws IOException
+    {
+        return getDocument(file, false);
+    }
+
+    /**
+     * This will get a lucene document from a PDF file.
+     * 
      * @param file The file to get the document for.
-     *
+     * @param nonSeqParser indicates if the non-sequential parser should be 
used
+     * 
      * @return The lucene document.
-     *
+     * 
      * @throws IOException If there is an error parsing or indexing the 
document.
      */
-    public static Document getDocument( File file ) throws IOException
+    public static Document getDocument(File file, boolean nonSeqParser) throws 
IOException
     {
-        LucenePDFDocument converter = new LucenePDFDocument();
-        return converter.convertDocument( file );
+        LucenePDFDocument converter = new LucenePDFDocument(nonSeqParser);
+        return converter.convertDocument(file);
     }
 
     /**
      * This will get a lucene document from a PDF file.
-     *
+     * 
      * @param url The URL of the PDF to get the document for.
-     *
+     * 
      * @return The lucene document.
-     *
+     * 
      * @throws IOException If there is an error parsing or indexing the 
document.
      */
-    public static Document getDocument( URL url ) throws IOException
+    public static Document getDocument(URL url) throws IOException
     {
-        LucenePDFDocument converter = new LucenePDFDocument();
-        return converter.convertDocument( url );
+        return getDocument(url, false);
+    }
+
+    /**
+     * This will get a lucene document from a PDF file.
+     * 
+     * @param url The URL of the PDF to get the document for.
+     * @param nonSeqParser indicates if the non-sequential parser should be 
used
+     * 
+     * @return The lucene document.
+     * 
+     * @throws IOException If there is an error parsing or indexing the 
document.
+     */
+    public static Document getDocument(URL url, boolean nonSeqParser) throws 
IOException
+    {
+        LucenePDFDocument converter = new LucenePDFDocument(nonSeqParser);
+        return converter.convertDocument(url);
     }
 
     /**
      * This will add the contents to the lucene document.
-     *
+     * 
      * @param document The document to add the contents to.
      * @param is The stream to get the contents from.
      * @param documentLocation The location of the document, used just for 
debug messages.
-     *
+     * 
      * @throws IOException If there is an error parsing the document.
      */
-    private void addContent( Document document, InputStream is, String 
documentLocation ) throws IOException
+    private void addContent(Document document, InputStream is, String 
documentLocation) throws IOException
     {
         PDDocument pdfDocument = null;
         try
         {
-            pdfDocument = PDDocument.load( is );
-
-            if( pdfDocument.isEncrypted() )
+            if (useNonSeqParser)
             {
-                //Just try using the default password and move on
-                pdfDocument.decrypt( "" );
+                pdfDocument = PDDocument.loadNonSeq(is, null, "");
+            }
+            else
+            {
+                pdfDocument = PDDocument.load(is);
+
+                if (pdfDocument.isEncrypted())
+                {
+                    // Just try using the default password and move on
+                    pdfDocument.decrypt("");
+                }
             }
 
-            //create a writer where to append the text content.
+            // create a writer where to append the text content.
             StringWriter writer = new StringWriter();
-            if( stripper == null )
+            if (stripper == null)
             {
                 stripper = new PDFTextStripper();
             }
@@ -410,7 +456,7 @@ public class LucenePDFDocument
             {
                 stripper.resetEngine();
             }
-            stripper.writeText( pdfDocument, writer );
+            stripper.writeText(pdfDocument, writer);
 
             // Note: the buffer to string operation is costless;
             // the char array value of the writer buffer and the content string
@@ -418,59 +464,57 @@ public class LucenePDFDocument
             // not occur here.
             String contents = writer.getBuffer().toString();
 
-            StringReader reader = new StringReader( contents );
+            StringReader reader = new StringReader(contents);
 
             // Add the tag-stripped contents as a Reader-valued Text field so 
it will
             // get tokenized and indexed.
-            addTextField( document, "contents", reader );
+            addTextField(document, "contents", reader);
 
             PDDocumentInformation info = pdfDocument.getDocumentInformation();
-            if( info != null )
+            if (info != null)
             {
-                addTextField( document, "Author", info.getAuthor() );
+                addTextField(document, "Author", info.getAuthor());
                 try
                 {
-                    addTextField( document, "CreationDate", 
info.getCreationDate() );
+                    addTextField(document, "CreationDate", 
info.getCreationDate());
                 }
-                catch( IOException io )
+                catch (IOException io)
                 {
-                    //ignore, bad date but continue with indexing
+                    // ignore, bad date but continue with indexing
                 }
-                addTextField( document, "Creator", info.getCreator() );
-                addTextField( document, "Keywords", info.getKeywords() );
+                addTextField(document, "Creator", info.getCreator());
+                addTextField(document, "Keywords", info.getKeywords());
                 try
                 {
-                    addTextField( document, "ModificationDate", 
info.getModificationDate() );
+                    addTextField(document, "ModificationDate", 
info.getModificationDate());
                 }
-                catch( IOException io )
+                catch (IOException io)
                 {
-                    //ignore, bad date but continue with indexing
+                    // ignore, bad date but continue with indexing
                 }
-                addTextField( document, "Producer", info.getProducer() );
-                addTextField( document, "Subject", info.getSubject() );
-                addTextField( document, "Title", info.getTitle() );
-                addTextField( document, "Trapped", info.getTrapped() );
+                addTextField(document, "Producer", info.getProducer());
+                addTextField(document, "Subject", info.getSubject());
+                addTextField(document, "Title", info.getTitle());
+                addTextField(document, "Trapped", info.getTrapped());
             }
-            int summarySize = Math.min( contents.length(), 500 );
-            String summary = contents.substring( 0, summarySize );
+            int summarySize = Math.min(contents.length(), 500);
+            String summary = contents.substring(0, summarySize);
             // Add the summary as an UnIndexed field, so that it is stored and 
returned
             // with hit documents for display.
-            addUnindexedField( document, "summary", summary );
+            addUnindexedField(document, "summary", summary);
         }
-        catch( CryptographyException e )
+        catch (CryptographyException e)
         {
-            throw new IOException( "Error decrypting document(" + 
documentLocation + "): " + e );
+            throw new IOException("Error decrypting document(" + 
documentLocation + "): " + e);
         }
-        catch( InvalidPasswordException e )
+        catch (InvalidPasswordException e)
         {
-            //they didn't suppply a password and the default of "" was wrong.
-            throw new IOException(
-                "Error: The document(" + documentLocation +
-                ") is encrypted and will not be indexed." );
+            // they didn't supply a password and the default of "" was wrong.
+            throw new IOException("Error: The document(" + documentLocation + 
") is encrypted and will not be indexed.");
         }
         finally
         {
-            if( pdfDocument != null )
+            if (pdfDocument != null)
             {
                 pdfDocument.close();
             }
@@ -478,22 +522,27 @@ public class LucenePDFDocument
     }
 
     /**
-     * This will test creating a document.
-     *
-     * usage: java pdfparser.searchengine.lucene.LucenePDFDocument 
&lt;pdf-document&gt;
-     *
-     * @param args command line arguments.
-     *
-     * @throws IOException If there is an error.
-     */
-    public static void main( String[] args ) throws IOException
-    {
-        if( args.length != 1 )
-        {
-            String us = LucenePDFDocument.class.getName();
-            System.err.println( "usage: java " + us + " <pdf-document>" );
-            System.exit( 1 );
-        }
-        System.out.println( "Document=" + getDocument( new File( args[0] ) ) );
+     * Create an UID for the given URL using the given time.
+     * 
+     * @param url the URL we have to create an UID for
+     * @param time the time to use for the UID
+     * 
+     * @return the created UID
+     */
+    public static String createUID(URL url, long time)
+    {
+        return url.toExternalForm().replace(FILE_SEPARATOR, '\u0000') + 
"\u0000" + timeToString(time);
+    }
+
+    /**
+     * Create an UID for the given file.
+     * 
+     * @param file the file we have to create an UID for
+     * 
+     * @return the created UID
+     */
+    public static String createUID(File file)
+    {
+        return file.getPath().replace(FILE_SEPARATOR, '\u0000') + "\u0000" + 
timeToString(file.lastModified());
     }
 }

Copied: 
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html
 (from r1502161, 
pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/package.html)
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html?p2=pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html&p1=pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/package.html&r1=1502161&r2=1530357&rev=1530357&view=diff
==============================================================================
--- pdfbox/trunk/lucene/src/main/java/org/apache/pdfbox/lucene/package.html 
(original)
+++ 
pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/lucene/package.html
 Tue Oct  8 17:15:09 2013
@@ -20,6 +20,6 @@
 
 </head>
 <body>
-This package holds classes that are used to integrate the PDFBox project with 
lucene.
+This example shows how to integrate the PDFBox project with lucene.
 </body>
 </html>

Modified: pdfbox/trunk/pom.xml
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/pom.xml?rev=1530357&r1=1530356&r2=1530357&view=diff
==============================================================================
--- pdfbox/trunk/pom.xml (original)
+++ pdfbox/trunk/pom.xml Tue Oct  8 17:15:09 2013
@@ -50,7 +50,6 @@
     <module>pdfbox</module>
     <module>preflight</module>
     <module>preflight-app</module>
-    <module>lucene</module>
     <module>ant</module>
     <module>war</module>
     <module>app</module>

svn commit: r1530357 - in /pdfbox/trunk: ./ examples/ examples/src/main/java/org/apache/pdfbox/examples/lucene/ lucene/ lucene/src/

Reply via email to