Author: mikemccand
Date: Wed Oct 5 10:43:33 2011
New Revision: 1179152
URL: http://svn.apache.org/viewvc?rev=1179152&view=rev
Log:
TIKA-742: extract paragraphs inside PDF pages
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPageNumber.pdf
(with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1179152&r1=1179151&r2=1179152&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Oct 5 10:43:33 2011
@@ -12,6 +12,9 @@ Release 0.11 - Current Development
as Unicode zero-width space (U+200B), and non-breaking hyphen as
Unicode non-breaking hyphen (U+2011).
+ * TIKA-742: Paragraphs are now extracted within each page of a PDF
+ document.
+
Release 0.10 - 09/25/2011
The most notable changes in Tika 0.10 over previous releases are:
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1179152&r1=1179151&r2=1179152&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Wed Oct 5 10:43:33 2011
@@ -37,6 +37,9 @@ import org.xml.sax.SAXException;
*/
class PDF2XHTML extends PDFTextStripper {
+ // True while a <p> element is open. TODO: remove once PDFBOX-1130 is fixed:
+ private boolean inParagraph = false;
+
/**
* Converts the given PDF document (and related metadata) to a stream
* of XHTML SAX events sent to the given content handler.
@@ -121,6 +124,37 @@ class PDF2XHTML extends PDFTextStripper
}
@Override
+ protected void writeParagraphStart() throws IOException {
+ // Workaround: tolerate a start while a paragraph is still open. TODO: remove once PDFBOX-1130 is fixed
+ if (inParagraph) {
+ // Close the still-open paragraph first so the <p> opened below is not nested in it
+ writeParagraphEnd();
+ }
+ assert !inParagraph;
+ inParagraph = true;
+ try {
+ handler.startElement("p");
+ } catch (SAXException se) {
+ throw new IOException(se); // rethrow as IOException, keeping the SAXException as cause
+ }
+ }
+
+ @Override
+ protected void writeParagraphEnd() throws IOException {
+ // Workaround: tolerate an end with no open paragraph. TODO: remove once PDFBOX-1130 is fixed
+ if (!inParagraph) { // open a paragraph first so the </p> emitted below has a matching <p>
+ writeParagraphStart();
+ }
+ assert inParagraph;
+ inParagraph = false;
+ try {
+ handler.endElement("p");
+ } catch (SAXException se) {
+ throw new IOException(se); // rethrow as IOException, keeping the SAXException as cause
+ }
+ }
+
+ @Override
protected void writeString(String text) throws IOException {
try {
handler.characters(text);
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1179152&r1=1179151&r2=1179152&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Wed Oct 5 10:43:33 2011
@@ -17,6 +17,12 @@
package org.apache.tika.parser.pdf;
import java.io.InputStream;
+import java.io.StringWriter;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
@@ -25,7 +31,6 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
-
/**
* Test case for parsing pdf files.
*/
@@ -230,4 +235,41 @@ public class PDFParserTest extends TikaT
assertContains("Here is some text", content);
assertContains("Here is a comment", content);
}
+
+ public void testPageNumber() throws Exception {
+ final XMLResult result = getXML("testPageNumber.pdf");
+ final String content = result.xml.replaceAll("\\s+",""); // drop all whitespace so the match ignores layout
+ assertContains("<p>1</p>", content); // "1" (presumably the page number) must be a paragraph of its own
+ }
+
+ private static class XMLResult { // immutable holder: serialized XML output plus the parse metadata
+ public final String xml;
+ public final Metadata metadata;
+
+ public XMLResult(String xml, Metadata metadata) {
+ this.xml = xml;
+ this.metadata = metadata;
+ }
+ }
+
+ private XMLResult getXML(String filename) throws Exception {
+ Metadata metadata = new Metadata();
+ Parser parser = new AutoDetectParser(); // no explicit PDF parser: relies on type auto-detection
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+ handler.setResult(new StreamResult(sw));
+
+ // Parse the named test document; the handler serializes the SAX events as XML into sw
+ InputStream input = getResourceAsStream("/test-documents/" + filename);
+ try {
+ parser.parse(input, handler, metadata, new ParseContext());
+ return new XMLResult(sw.toString(), metadata);
+ } finally {
+ input.close();
+ }
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPageNumber.pdf
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPageNumber.pdf?rev=1179152&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPageNumber.pdf
------------------------------------------------------------------------------
svn:mime-type = application/pdf