TestParsingExample.java

tallison Wed, 28 Jan 2015 12:08:26 -0800

Author: tallison
Date: Wed Jan 28 20:08:04 2015
New Revision: 1655449

URL: http://svn.apache.org/r1655449
Log:
TIKA-1329, added examples for the RecursiveParserWrapper


Added:
    
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test_recursive_embedded.docx
   (with props)
Modified:
    tika/trunk/tika-example/pom.xml
    
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
    
tika/trunk/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java

Modified: tika/trunk/tika-example/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/pom.xml?rev=1655449&r1=1655448&r2=1655449&view=diff
==============================================================================
--- tika/trunk/tika-example/pom.xml (original)
+++ tika/trunk/tika-example/pom.xml Wed Jan 28 20:08:04 2015
@@ -63,11 +63,14 @@
     </dependency>
     <dependency>
       <groupId>org.apache.tika</groupId>
+      <artifactId>tika-serialization</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
       <artifactId>tika-translate</artifactId>
       <version>${project.version}</version>
     </dependency>
-
-
     <dependency>
       <groupId>org.apache.tika</groupId>
       <artifactId>tika-parsers</artifactId>

Modified: 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java?rev=1655449&r1=1655448&r2=1655449&view=diff
==============================================================================
--- 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
 (original)
+++ 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
 Wed Jan 28 20:08:04 2015
@@ -16,15 +16,24 @@
  */
 package org.apache.tika.example;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.List;
+
 import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerFactory;
 import org.xml.sax.SAXException;
-
-import java.io.IOException;
-import java.io.InputStream;
+import org.xml.sax.helpers.DefaultHandler;
 
 public class ParsingExample {
 
@@ -32,6 +41,9 @@ public class ParsingExample {
      * Example of how to use Tika's parseToString method to parse the content 
of a file,
      * and return any text found.
      *
+     * Note: Tika.parseToString() will extract content from the outer container
+     * document and any embedded/attached documents.
+     *
      * @return The content of a file.
      */
     public String parseToStringExample() throws IOException, SAXException, 
TikaException {
@@ -45,7 +57,7 @@ public class ParsingExample {
     }
 
     /**
-     * Example of how to use Tika to parse an file when you do not know its 
file type
+     * Example of how to use Tika to parse a file when you do not know its 
file type
      * ahead of time.
      *
      * AutoDetectParser attempts to discover the file's type automatically, 
then call
@@ -61,6 +73,11 @@ public class ParsingExample {
      * The Metadata object will be filled by the Parser with Metadata 
discovered about
      * the file being parsed.
      *
+     * Note: This example will extract content from the outer document and all
+     * embedded documents.  However, if you choose to use a {@link 
ParseContext},
+     * make sure to set a {@link Parser} or else embedded content will not be
+     * parsed.
+     *
      * @return The content of a file.
      */
     public String parseExample() throws IOException, SAXException, 
TikaException {
@@ -75,4 +92,116 @@ public class ParsingExample {
             stream.close();
         }
     }
+
+    /**
+     * If you don't want content from embedded documents, send in
+     * a {@link org.apache.tika.parser.ParseContext} that does not contain a
+     * {@link Parser}.
+     *
+     * @return The content of a file.
+     */
+    public String parseNoEmbeddedExample() throws IOException, SAXException, 
TikaException {
+        InputStream stream = 
ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx");
+        AutoDetectParser parser = new AutoDetectParser();
+        BodyContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+            return handler.toString();
+        } finally {
+            stream.close();
+        }
+    }
+
+
+    /**
+     * This example shows how to extract content from the outer document and 
all
+     * embedded documents.  The key is to specify a {@link Parser} in the 
{@link ParseContext}.
+     *
+     * @return content, including from embedded documents
+     * @throws IOException
+     * @throws SAXException
+     * @throws TikaException
+     */
+    public String parseEmbeddedExample() throws IOException, SAXException, 
TikaException {
+        InputStream stream = 
ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx");
+        AutoDetectParser parser = new AutoDetectParser();
+        BodyContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, parser);
+        try {
+            parser.parse(stream, handler, metadata, context);
+            return handler.toString();
+        } finally {
+            stream.close();
+        }
+
+    }
+
+    /**
+     * For documents that may contain embedded documents, it might be helpful
+     * to create a list of metadata objects, one for the container document and
+     * one for each embedded document.  This allows easy access to both the
+     * extracted content and the metadata of each embedded document.
+     * Note that many document formats can contain embedded documents,
+     * including traditional container formats -- zip, tar and others -- but 
also
+     * common office document formats including: MSWord, MSExcel,
+     * MSPowerPoint, RTF, PDF, MSG and several others.
+     * <p>
+     * The "content" format is determined by the ContentHandlerFactory, and
+     * the content is stored in {@link 
org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT}
+     * <p>
+     * The drawback to the RecursiveParserWrapper is that it caches metadata 
and contents
+     * in memory.  This should not be used on files whose contents are too big 
to be handled
+     * in memory.
+     *
+     * @return a list of metadata objects, one each for the container file and 
each embedded file
+     * @throws IOException
+     * @throws SAXException
+     * @throws TikaException
+     */
+    public List<Metadata> recursiveParserWrapperExample() throws IOException,
+            SAXException, TikaException {
+
+        Parser p = new AutoDetectParser();
+        ContentHandlerFactory factory = new BasicContentHandlerFactory(
+                BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
+
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, 
factory);
+        InputStream stream = 
ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx");
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, 
"test_recursive_embedded.docx");
+        ParseContext context = new ParseContext();
+
+        try {
+            wrapper.parse(stream, new DefaultHandler(), metadata, context);
+        } finally {
+            stream.close();
+        }
+        return wrapper.getMetadata();
+    }
+
+    /**
+     * We include a simple JSON serializer for a list of metadata with
+     * {@link org.apache.tika.metadata.serialization.JsonMetadataList}.
+     * That class also includes a deserializer to convert from JSON
+     * back to a {@code List<Metadata>}.
+     * <p>
+     * This functionality is also available in tika-app's GUI, and
+     * with the -J option on tika-app's commandline.  For tika-server
+     * users, there is the "rmeta" service that will return this format.
+     *
+     * @return a JSON representation of a list of Metadata objects
+     * @throws IOException
+     * @throws SAXException
+     * @throws TikaException
+     */
+    public String serializedRecursiveParserWrapperExample() throws IOException,
+            SAXException, TikaException {
+        List metadataList = recursiveParserWrapperExample();
+        StringWriter writer = new StringWriter();
+        JsonMetadataList.toJson(metadataList, writer);
+        return writer.toString();
+    }
 }

Added: 
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test_recursive_embedded.docx
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test_recursive_embedded.docx?rev=1655449&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test_recursive_embedded.docx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: 
tika/trunk/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java?rev=1655449&r1=1655448&r2=1655449&view=diff
==============================================================================
--- 
tika/trunk/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java
 (original)
+++ 
tika/trunk/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java
 Wed Jan 28 20:08:04 2015
@@ -17,15 +17,21 @@
 
 package org.apache.tika.example;
 
+import static junit.framework.TestCase.assertFalse;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.List;
+
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
 import org.junit.Before;
 import org.junit.Test;
 import org.xml.sax.SAXException;
 
-import java.io.IOException;
-
-import static org.junit.Assert.assertEquals;
-
 public class TestParsingExample {
     ParsingExample parsingExample;
     @Before
@@ -45,4 +51,52 @@ public class TestParsingExample {
         assertEquals("Expected 'test', but got '" + result + "'", "test", 
result);
     }
 
+    @Test
+    public void testNoEmbeddedExample() throws IOException, SAXException, 
TikaException {
+        String result = parsingExample.parseNoEmbeddedExample();
+        assertContains("embed_0", result);
+        assertNotContains("embed1/embed1a.txt", result);
+        assertNotContains("embed3/embed3.txt", result);
+        assertNotContains("When in the Course", result);
+    }
+
+
+    @Test
+    public void testRecursiveParseExample() throws IOException, SAXException, 
TikaException {
+        String result = parsingExample.parseEmbeddedExample();
+        assertContains("embed_0", result);
+        assertContains("embed1/embed1a.txt", result);
+        assertContains("embed3/embed3.txt", result);
+        assertContains("When in the Course", result);
+    }
+
+    @Test
+    public void testRecursiveParserWrapperExample() throws IOException, 
SAXException, TikaException {
+        List<Metadata> metadataList = 
parsingExample.recursiveParserWrapperExample();
+        assertEquals("Number of embedded documents + 1 for the container 
document", 12, metadataList.size());
+        Metadata m = metadataList.get(6);
+        //this is the location of the embed3.txt text file within the outer .docx
+        
assertEquals("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed3.zip/embed3.txt",
+                m.get("X-TIKA:embedded_resource_path"));
+        //it contains some html encoded content
+        assertContains("When in the Course", m.get("X-TIKA:content"));
+    }
+
+    @Test
+    public void testSerializedRecursiveParserWrapperExample() throws 
IOException, SAXException, TikaException {
+        String json = parsingExample.serializedRecursiveParserWrapperExample();
+        assertTrue(json.indexOf("When in the Course") > -1);
+        //now try deserializing the JSON
+        List<Metadata> metadataList = JsonMetadataList.fromJson(new 
StringReader(json));
+        assertEquals(12, metadataList.size());
+    }
+
+    public static void assertContains(String needle, String haystack) {
+        assertTrue("Should have found " + needle + " in: " + haystack, 
haystack.contains(needle));
+    }
+
+    public static void assertNotContains(String needle, String haystack) {
+        assertFalse("Should not have found " + needle + " in: " + haystack, 
haystack.contains(needle));
+    }
+
 }

svn commit: r1655449 - in /tika/trunk/tika-example: pom.xml src/main/java/org/apache/tika/example/ParsingExample.java src/main/resources/org/apache/tika/example/test_recursive_embedded.docx src/test/java/org/apache/tika/example/TestParsingExample.java

Reply via email to