PDFParser.java

jukka Mon, 12 Nov 2007 17:04:58 -0800

Author: jukka
Date: Mon Nov 12 17:04:30 2007
New Revision: 594376

URL: http://svn.apache.org/viewvc?rev=594376&view=rev
Log:
TIKA-100 - Structured PDF parsing
    - Customized the PDFTextStripper class to produce XHTML SAX events
      (there's a somewhat similar PDFText2HTML class in PDFBox, but
      that class produces a character stream instead of SAX events)


Added:
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java   
(with props)
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=594376&r1=594375&r2=594376&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Mon Nov 12 17:04:30 2007
@@ -125,3 +125,5 @@
 56. TIKA-84 - Add MimeTypes.getMimeType(InputStream) (jukka)
 
 57. TIKA-85 - Add glob patterns from the ASF svn:eol-style documentation (jukka)
+
+58. TIKA-100 - Structured PDF parsing (jukka)

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=594376&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Mon Nov 12 17:04:30 2007
@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.pdmodel.PDPage;
+import org.pdfbox.util.PDFTextStripper;
+import org.pdfbox.util.TextPosition;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Utility class that overrides the {@link PDFTextStripper} functionality
+ * to produce semi-structured XHTML SAX events instead of a plain text
+ * stream.
+ * <p>
+ * Each overridden callback forwards to an {@link XHTMLContentHandler}.
+ * The callbacks are declared to throw only {@link IOException}, so a
+ * {@link SAXException} raised by the handler is wrapped as the cause of an
+ * IOException and unwrapped again in
+ * {@link #process(PDDocument, ContentHandler, Metadata)}.
+ */
+class PDF2XHTML extends PDFTextStripper {
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     * 
+     * @param document PDF document
+     * @param handler SAX content handler
+     * @param metadata PDF metadata
+     * @throws SAXException if the content handler fails to process SAX events
+     * @throws TikaException if the PDF document can not be processed
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, Metadata metadata)
+            throws SAXException, TikaException {
+        try {
+            // The returned text is discarded: the overridden callbacks send
+            // their output to the SAX handler instead.
+            new PDF2XHTML(handler, metadata).getText(document);
+        } catch (IOException e) {
+            // The callbacks tunnel SAXExceptions as the cause of an
+            // IOException; hand the original exception back to the caller.
+            if (e.getCause() instanceof SAXException) {
+                throw (SAXException) e.getCause();
+            } else {
+                throw new TikaException("Unable to extract PDF content", e);
+            }
+        }
+    }
+
+    /** XHTML-producing wrapper around the caller's content handler. */
+    private final XHTMLContentHandler handler;
+
+    /**
+     * Creates a stripper that emits its output through the given handler.
+     *
+     * @param handler SAX content handler that receives the XHTML events
+     * @param metadata metadata handed to the {@link XHTMLContentHandler}
+     * @throws IOException presumably propagated from the
+     *         {@link PDFTextStripper} constructor - TODO confirm
+     */
+    private PDF2XHTML(ContentHandler handler, Metadata metadata)
+            throws IOException {
+        this.handler = new XHTMLContentHandler(handler, metadata);
+    }
+
+    /**
+     * Starts the XHTML document on the wrapped handler.
+     *
+     * @param pdf ignored
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.startDocument();
+        } catch (SAXException e) {
+            throw new IOException("Unable to start a document", e);
+        }
+    }
+
+    /**
+     * Ends the XHTML document on the wrapped handler.
+     *
+     * @param pdf ignored
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            handler.endDocument();
+        } catch (SAXException e) {
+            throw new IOException("Unable to end a document", e);
+        }
+    }
+
+    /**
+     * Opens a <code>div</code> element for a page.
+     *
+     * @param page ignored
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            handler.startElement("div");
+        } catch (SAXException e) {
+            throw new IOException("Unable to start a page", e);
+        }
+    }
+
+    /**
+     * Closes the <code>div</code> element of a page.
+     *
+     * @param page ignored
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void endPage(PDPage page) throws IOException {
+        try {
+            handler.endElement("div");
+        } catch (SAXException e) {
+            throw new IOException("Unable to end a page", e);
+        }
+    }
+
+    /**
+     * Opens a <code>p</code> element for a paragraph.
+     *
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void startParagraph() throws IOException {
+        try {
+            handler.startElement("p");
+        } catch (SAXException e) {
+            throw new IOException("Unable to start a paragraph", e);
+        }
+    }
+
+    /**
+     * Closes the <code>p</code> element of a paragraph.
+     *
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void endParagraph() throws IOException {
+        try {
+            handler.endElement("p");
+        } catch (SAXException e) {
+            throw new IOException("Unable to end a paragraph", e);
+        }
+    }
+
+    /**
+     * Sends the character content of the given text position to the
+     * handler as character data.
+     *
+     * @param text text position whose characters are written
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void writeCharacters(TextPosition text) throws IOException {
+        try {
+            handler.characters(text.getCharacter());
+        } catch (SAXException e) {
+            throw new IOException("Unable to write characters", e);
+        }
+    }
+
+    /**
+     * Writes a line separator as a single newline character.
+     *
+     * @param p ignored
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void processLineSeparator(TextPosition p) throws IOException {
+        try {
+            handler.characters("\n");
+        } catch (SAXException e) {
+            throw new IOException("Unable to write a newline", e);
+        }
+    }
+
+    /**
+     * Writes a word separator as a single space character.
+     *
+     * @param a ignored
+     * @param b ignored
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    protected void processWordSeparator(TextPosition a, TextPosition b)
+            throws IOException {
+        try {
+            handler.characters(" ");
+        } catch (SAXException e) {
+            throw new IOException("Unable to write a space", e);
+        }
+    }
+
+}

Propchange: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=594376&r1=594375&r2=594376&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Mon Nov 12 17:04:30 2007
@@ -18,17 +18,14 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.StringWriter;
 import java.util.Calendar;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
 
 import org.pdfbox.pdmodel.PDDocument;
 import org.pdfbox.pdmodel.PDDocumentInformation;
-import org.pdfbox.util.PDFTextStripper;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -40,64 +37,55 @@
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException, TikaException {
+        // Load the PDF before entering the try block; the finally clause
+        // below closes the document on every path once it has been loaded.
+        PDDocument pdfDocument = PDDocument.load(stream);
         try {
-            PDDocument pdfDocument = PDDocument.load(stream);
-            try {
-                if (pdfDocument.isEncrypted()) {
+            if (pdfDocument.isEncrypted()) {
+                try {
                     pdfDocument.decrypt("");
+                } catch (Exception e) {
+                    // Best effort: decrypting with the empty password
+                    // failed, so carry on with the document as loaded.
                 }
-
-                PDDocumentInformation info =
-                    pdfDocument.getDocumentInformation();
-                if (info.getTitle() != null) {
-                    metadata.set(Metadata.TITLE, info.getTitle());
-                }
-                if (info.getAuthor() != null) {
-                    metadata.set(Metadata.AUTHOR, info.getAuthor());
-                }
-                if (info.getCreator() != null) {
-                    metadata.set(Metadata.CREATOR, info.getCreator());
-                }
-                if (info.getKeywords() != null) {
-                    metadata.set(Metadata.KEYWORDS, info.getKeywords());
-                }
-                if (info.getProducer() != null) {
-                    // TODO: Need a Metadata key for producer
-                    metadata.set("producer", info.getProducer());
-                }
-                if (info.getSubject() != null) {
-                    metadata.set(Metadata.SUBJECT, info.getSubject());
-                }
-                if (info.getTrapped() != null) {
-                    // TODO: Need a Metadata key for producer
-                    metadata.set("trapped", info.getTrapped());
-                }
-                Calendar created = info.getCreationDate();
-                if (created != null) {
-                    metadata.set("created", created.getTime().toString());
-                }
-                Calendar modified = info.getModificationDate();
-                if (modified != null) {
-                    metadata.set(
-                            Metadata.LAST_MODIFIED,
-                            modified.getTime().toString());
-                }
-
-                StringWriter writer = new StringWriter();
-                new PDFTextStripper().writeText(pdfDocument, writer);
-
-                XHTMLContentHandler xhtml =
-                    new XHTMLContentHandler(handler, metadata);
-                xhtml.startDocument();
-                xhtml.element("p", writer.getBuffer().toString());
-                xhtml.endDocument();
-            } finally {
-                pdfDocument.close();
             }
+            // Record the media type, copy the document information fields
+            // into the metadata, then stream the content as XHTML SAX events.
+            metadata.add(Metadata.CONTENT_TYPE, "application/pdf");
+            extractMetadata(pdfDocument, metadata);
+            PDF2XHTML.process(pdfDocument, handler, metadata);
+        } finally {
+            pdfDocument.close();
+        }
+    }
+
+    /**
+     * Copies the PDF document information fields (title, author, creator,
+     * keywords, producer, subject, trapped, creation date and modification
+     * date) into the given metadata. Null values are skipped by the
+     * addMetadata helpers, and a date whose getter throws an IOException
+     * is left out.
+     *
+     * @param document PDF document to read the information from
+     * @param metadata metadata object to populate
+     * @throws TikaException declared by the signature; no statement in
+     *         this method throws it directly
+     */
+    private void extractMetadata(PDDocument document, Metadata metadata)
+            throws TikaException {
+        PDDocumentInformation info = document.getDocumentInformation();
+        addMetadata(metadata, Metadata.TITLE, info.getTitle());
+        addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
+        addMetadata(metadata, Metadata.CREATOR, info.getCreator());
+        addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
+        addMetadata(metadata, "producer", info.getProducer());
+        addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
+        addMetadata(metadata, "trapped", info.getTrapped());
+        try {
+            addMetadata(metadata, "created", info.getCreationDate());
+        } catch (IOException e) {
+            // Invalid date format, just ignore
+        }
+        try {
+            Calendar modified = info.getModificationDate(); 
+            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
         } catch (IOException e) {
-            throw e;
-        } catch (Exception e) {
-            throw new TikaException("Error parsing a PDF document", e);
+            // Invalid date format, just ignore
+        }
+    }
+
+    /**
+     * Adds the given string value to the named metadata entry through
+     * {@code Metadata.add}; a null value is skipped.
+     *
+     * @param metadata metadata object to update
+     * @param name metadata key
+     * @param value value to add, may be null
+     */
+    private void addMetadata(Metadata metadata, String name, String value) {
+        if (value != null) {
+            metadata.add(name, value);
+        }
+    }
+
+    /**
+     * Sets the named metadata entry to the string form of the given date
+     * ({@code value.getTime().toString()}); a null value is skipped.
+     * NOTE(review): this overload calls {@code Metadata.set} while the
+     * String overload calls {@code Metadata.add} - confirm the difference
+     * is intended.
+     *
+     * @param metadata metadata object to update
+     * @param name metadata key
+     * @param value date to store, may be null
+     */
+    private void addMetadata(Metadata metadata, String name, Calendar value) {
+        if (value != null) {
+            metadata.set(name, value.getTime().toString());
         }
     }

svn commit: r594376 - in /incubator/tika/trunk: CHANGES.txt src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Reply via email to