Author: mikemccand
Date: Wed Oct  5 10:43:33 2011
New Revision: 1179152

URL: http://svn.apache.org/viewvc?rev=1179152&view=rev
Log:
TIKA-742: extract paragraphs inside PDF pages

Added:
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testPageNumber.pdf   
(with props)
Modified:
    tika/trunk/CHANGES.txt
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1179152&r1=1179151&r2=1179152&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Oct  5 10:43:33 2011
@@ -12,6 +12,9 @@ Release 0.11 - Current Development
    as Unicode zero-width space (U+200B), and non-breaking hyphen as
    Unicode non-breaking hyphen (U+2011).
 
+ * TIKA-742: Paragraphs are now extracted within each page of a PDF
+   document.
+
 Release 0.10 - 09/25/2011
 
 The most notable changes in Tika 0.10 over previous releases are:

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1179152&r1=1179151&r2=1179152&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java 
Wed Oct  5 10:43:33 2011
@@ -37,6 +37,9 @@ import org.xml.sax.SAXException;
  */
 class PDF2XHTML extends PDFTextStripper {
 
+    // Tracks whether a <p> element is currently open. TODO: remove once PDFBOX-1130 is fixed
+    private boolean inParagraph = false;
+
     /**
      * Converts the given PDF document (and related metadata) to a stream
      * of XHTML SAX events sent to the given content handler.
@@ -121,6 +124,37 @@ class PDF2XHTML extends PDFTextStripper 
     }
 
     @Override
+    protected void writeParagraphStart() throws IOException {
+        // Emits <p>; TODO: remove the inParagraph tracking once PDFBOX-1130 is fixed
+        if (inParagraph) {
+            // A paragraph is still open: close it first so <p> elements never nest
+            writeParagraphEnd();
+        }
+        assert !inParagraph;
+        inParagraph = true;
+        try {
+            handler.startElement("p");
+        } catch (SAXException se) {
+            throw new IOException(se);
+        }
+    }
+
+    @Override
+    protected void writeParagraphEnd() throws IOException {
+        // Emits </p> (opening <p> first if none is open); TODO: remove once PDFBOX-1130 is fixed
+        if (!inParagraph) {
+            writeParagraphStart();
+        }
+        assert inParagraph;
+        inParagraph = false;
+        try {
+            handler.endElement("p");
+        } catch (SAXException se) {
+            throw new IOException(se);
+        }
+    }
+
+    @Override
     protected void writeString(String text) throws IOException {
         try {
             handler.characters(text);

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1179152&r1=1179151&r2=1179152&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 Wed Oct  5 10:43:33 2011
@@ -17,6 +17,12 @@
 package org.apache.tika.parser.pdf;
 
 import java.io.InputStream;
+import java.io.StringWriter;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
@@ -25,7 +31,6 @@ import org.apache.tika.parser.ParseConte
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
-
 /**
  * Test case for parsing pdf files.
  */
@@ -230,4 +235,41 @@ public class PDFParserTest extends TikaT
         assertContains("Here is some text", content);
         assertContains("Here is a comment", content);
     }
+
+    public void testPageNumber() throws Exception {
+        // Strip all whitespace before matching so layout in the serialized XML is irrelevant
+        final String content = getXML("testPageNumber.pdf").xml.replaceAll("\\s+","");
+        assertContains("<p>1</p>", content);
+    }
+
+    /** Holder pairing the XML text of a parse with the Metadata passed alongside it. */
+    private static class XMLResult {
+        public final Metadata metadata;
+        public final String xml;
+        public XMLResult(String xml, Metadata metadata) {
+            this.metadata = metadata;
+            this.xml = xml;
+        }
+    }
+
+    private XMLResult getXML(String filename) throws Exception {
+        // Serialize the SAX events emitted by the parser as unindented XML text
+        StringWriter xmlText = new StringWriter();
+        TransformerHandler serializer =
+                ((SAXTransformerFactory) SAXTransformerFactory.newInstance()).newTransformerHandler();
+        serializer.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        serializer.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
+        serializer.setResult(new StreamResult(xmlText));
+
+        // Parse /test-documents/<filename>; AutoDetectParser should pick the right parser
+        Metadata metadata = new Metadata();
+        Parser parser = new AutoDetectParser();
+        InputStream input = getResourceAsStream("/test-documents/" + filename);
+        try {
+            parser.parse(input, serializer, metadata, new ParseContext());
+            return new XMLResult(xmlText.toString(), metadata);
+        } finally {
+            input.close();
+        }
+    }
 }

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPageNumber.pdf
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPageNumber.pdf?rev=1179152&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPageNumber.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/pdf


Reply via email to