Author: jukka Date: Mon Nov 12 17:04:30 2007 New Revision: 594376 URL: http://svn.apache.org/viewvc?rev=594376&view=rev Log: TIKA-100 - Structured PDF parsing - Customized the PDFTextStripper class to produce XHTML SAX events (there's a somewhat similar PDFText2HTML class in PDFBox, but that class produces a character stream instead of SAX events)
Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (with props) Modified: incubator/tika/trunk/CHANGES.txt incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Modified: incubator/tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=594376&r1=594375&r2=594376&view=diff ============================================================================== --- incubator/tika/trunk/CHANGES.txt (original) +++ incubator/tika/trunk/CHANGES.txt Mon Nov 12 17:04:30 2007 @@ -125,3 +125,5 @@ 56. TIKA-84 - Add MimeTypes.getMimeType(InputStream) (jukka) 57. TIKA-85 - Add glob patterns from the ASF svn:eol-style documentation (jukka) + +58. TIKA-100 - Structured PDF parsing (jukka) Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=594376&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Mon Nov 12 17:04:30 2007 @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pdf; + +import java.io.IOException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.sax.XHTMLContentHandler; +import org.pdfbox.pdmodel.PDDocument; +import org.pdfbox.pdmodel.PDPage; +import org.pdfbox.util.PDFTextStripper; +import org.pdfbox.util.TextPosition; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Utility class that overrides the {@link PDFTextStripper} functionality + * to produce semi-structured XHTML SAX events instead of a plain text + * stream. + */ +class PDF2XHTML extends PDFTextStripper { + + /** + * Converts the given PDF document (and related metadata) to a stream + * of XHTML SAX events sent to the given content handler. + * + * @param document PDF document + * @param handler SAX content handler + * @param metadata PDF metadata + * @throws SAXException if the content handler fails to process SAX events + * @throws TikaException if the PDF document can not be processed + */ + public static void process( + PDDocument document, ContentHandler handler, Metadata metadata) + throws SAXException, TikaException { + try { + new PDF2XHTML(handler, metadata).getText(document); + } catch (IOException e) { + if (e.getCause() instanceof SAXException) { + throw (SAXException) e.getCause(); + } else { + throw new TikaException("Unable to extract PDF content", e); + } + } + } + + private final XHTMLContentHandler handler; + + private PDF2XHTML(ContentHandler handler, Metadata metadata) + throws IOException { + this.handler = new XHTMLContentHandler(handler, metadata); + } + + protected void startDocument(PDDocument pdf) throws IOException { + try { + handler.startDocument(); + } catch (SAXException e) { + throw new IOException("Unable to start a document", e); + } + } + + protected void 
endDocument(PDDocument pdf) throws IOException { + try { + handler.endDocument(); + } catch (SAXException e) { + throw new IOException("Unable to end a document", e); + } + } + + protected void startPage(PDPage page) throws IOException { + try { + handler.startElement("div"); + } catch (SAXException e) { + throw new IOException("Unable to start a page", e); + } + } + + protected void endPage(PDPage page) throws IOException { + try { + handler.endElement("div"); + } catch (SAXException e) { + throw new IOException("Unable to end a page", e); + } + } + + protected void startParagraph() throws IOException { + try { + handler.startElement("p"); + } catch (SAXException e) { + throw new IOException("Unable to start a paragraph", e); + } + } + + protected void endParagraph() throws IOException { + try { + handler.endElement("p"); + } catch (SAXException e) { + throw new IOException("Unable to end a paragraph", e); + } + } + + protected void writeCharacters(TextPosition text) throws IOException { + try { + handler.characters(text.getCharacter()); + } catch (SAXException e) { + throw new IOException("Unable to write a newline", e); + } + } + + protected void processLineSeparator(TextPosition p) throws IOException { + try { + handler.characters("\n"); + } catch (SAXException e) { + throw new IOException("Unable to write a newline", e); + } + } + + protected void processWordSeparator(TextPosition a, TextPosition b) + throws IOException { + try { + handler.characters(" "); + } catch (SAXException e) { + throw new IOException("Unable to write a space", e); + } + } + +} Propchange: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=594376&r1=594375&r2=594376&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Mon Nov 12 17:04:30 2007 @@ -18,17 +18,14 @@ import java.io.IOException; import java.io.InputStream; -import java.io.StringWriter; import java.util.Calendar; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; -import org.apache.tika.sax.XHTMLContentHandler; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDDocumentInformation; -import org.pdfbox.util.PDFTextStripper; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -40,64 +37,55 @@ public void parse( InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { + PDDocument pdfDocument = PDDocument.load(stream); try { - PDDocument pdfDocument = PDDocument.load(stream); - try { - if (pdfDocument.isEncrypted()) { + if (pdfDocument.isEncrypted()) { + try { pdfDocument.decrypt(""); + } catch (Exception e) { + // Ignore } - - PDDocumentInformation info = - pdfDocument.getDocumentInformation(); - if (info.getTitle() != null) { - metadata.set(Metadata.TITLE, info.getTitle()); - } - if (info.getAuthor() != null) { - metadata.set(Metadata.AUTHOR, info.getAuthor()); - } - if (info.getCreator() != null) { - metadata.set(Metadata.CREATOR, info.getCreator()); - } - if (info.getKeywords() != null) { - metadata.set(Metadata.KEYWORDS, info.getKeywords()); - } - if (info.getProducer() != null) { - // TODO: Need a Metadata key for producer - metadata.set("producer", info.getProducer()); - } - if (info.getSubject() != null) { - metadata.set(Metadata.SUBJECT, info.getSubject()); - 
} - if (info.getTrapped() != null) { - // TODO: Need a Metadata key for producer - metadata.set("trapped", info.getTrapped()); - } - Calendar created = info.getCreationDate(); - if (created != null) { - metadata.set("created", created.getTime().toString()); - } - Calendar modified = info.getModificationDate(); - if (modified != null) { - metadata.set( - Metadata.LAST_MODIFIED, - modified.getTime().toString()); - } - - StringWriter writer = new StringWriter(); - new PDFTextStripper().writeText(pdfDocument, writer); - - XHTMLContentHandler xhtml = - new XHTMLContentHandler(handler, metadata); - xhtml.startDocument(); - xhtml.element("p", writer.getBuffer().toString()); - xhtml.endDocument(); - } finally { - pdfDocument.close(); } + metadata.add(Metadata.CONTENT_TYPE, "application/pdf"); + extractMetadata(pdfDocument, metadata); + PDF2XHTML.process(pdfDocument, handler, metadata); + } finally { + pdfDocument.close(); + } + } + + private void extractMetadata(PDDocument document, Metadata metadata) + throws TikaException { + PDDocumentInformation info = document.getDocumentInformation(); + addMetadata(metadata, Metadata.TITLE, info.getTitle()); + addMetadata(metadata, Metadata.AUTHOR, info.getAuthor()); + addMetadata(metadata, Metadata.CREATOR, info.getCreator()); + addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords()); + addMetadata(metadata, "producer", info.getProducer()); + addMetadata(metadata, Metadata.SUBJECT, info.getSubject()); + addMetadata(metadata, "trapped", info.getTrapped()); + try { + addMetadata(metadata, "created", info.getCreationDate()); + } catch (IOException e) { + // Invalid date format, just ignore + } + try { + Calendar modified = info.getModificationDate(); + addMetadata(metadata, Metadata.LAST_MODIFIED, modified); } catch (IOException e) { - throw e; - } catch (Exception e) { - throw new TikaException("Error parsing a PDF document", e); + // Invalid date format, just ignore + } + } + + private void addMetadata(Metadata metadata, String 
name, String value) { + if (value != null) { + metadata.add(name, value); + } + } + + private void addMetadata(Metadata metadata, String name, Calendar value) { + if (value != null) { + metadata.set(name, value.getTime().toString()); } }