Author: tallison
Date: Wed Jan 28 20:08:04 2015
New Revision: 1655449
URL: http://svn.apache.org/r1655449
Log:
TIKA-1329, added examples for the RecursiveParserWrapper
Added:
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test_recursive_embedded.docx
(with props)
Modified:
tika/trunk/tika-example/pom.xml
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java
Modified: tika/trunk/tika-example/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/pom.xml?rev=1655449&r1=1655448&r2=1655449&view=diff
==============================================================================
--- tika/trunk/tika-example/pom.xml (original)
+++ tika/trunk/tika-example/pom.xml Wed Jan 28 20:08:04 2015
@@ -63,11 +63,14 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
+ <artifactId>tika-serialization</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
<artifactId>tika-translate</artifactId>
<version>${project.version}</version>
</dependency>
-
-
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java?rev=1655449&r1=1655448&r2=1655449&view=diff
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
(original)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
Wed Jan 28 20:08:04 2015
@@ -16,15 +16,24 @@
*/
package org.apache.tika.example;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.List;
+
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.xml.sax.SAXException;
-
-import java.io.IOException;
-import java.io.InputStream;
+import org.xml.sax.helpers.DefaultHandler;
public class ParsingExample {
@@ -32,6 +41,9 @@ public class ParsingExample {
* Example of how to use Tika's parseToString method to parse the content
of a file,
* and return any text found.
*
+ * Note: Tika.parseToString() will extract content from the outer container
+ * document and any embedded/attached documents.
+ *
* @return The content of a file.
*/
public String parseToStringExample() throws IOException, SAXException,
TikaException {
@@ -45,7 +57,7 @@ public class ParsingExample {
}
/**
- * Example of how to use Tika to parse an file when you do not know its
file type
+ * Example of how to use Tika to parse a file when you do not know its
file type
* ahead of time.
*
* AutoDetectParser attempts to discover the file's type automatically,
then call
@@ -61,6 +73,11 @@ public class ParsingExample {
* The Metadata object will be filled by the Parser with Metadata
discovered about
* the file being parsed.
*
+ * Note: This example will extract content from the outer document and all
+ * embedded documents. However, if you choose to use a {@link
ParseContext},
+ * make sure to set a {@link Parser} or else embedded content will not be
+ * parsed.
+ *
* @return The content of a file.
*/
public String parseExample() throws IOException, SAXException,
TikaException {
@@ -75,4 +92,116 @@ public class ParsingExample {
stream.close();
}
}
+
+ /**
+ * If you don't want content from embedded documents, send in
+ * a {@link org.apache.tika.parser.ParseContext} that does not contain a
+ * {@link Parser}.
+ *
+ * @return The content of a file.
+ */
+ public String parseNoEmbeddedExample() throws IOException, SAXException,
TikaException {
+ InputStream stream =
ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx");
+ AutoDetectParser parser = new AutoDetectParser();
+ BodyContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ return handler.toString();
+ } finally {
+ stream.close();
+ }
+ }
+
+
+ /**
+ * This example shows how to extract content from the outer document and
all
+ * embedded documents. The key is to specify a {@link Parser} in the
{@link ParseContext}.
+ *
+ * @return content, including from embedded documents
+ * @throws IOException
+ * @throws SAXException
+ * @throws TikaException
+ */
+ public String parseEmbeddedExample() throws IOException, SAXException,
TikaException {
+ InputStream stream =
ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx");
+ AutoDetectParser parser = new AutoDetectParser();
+ BodyContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, parser);
+ try {
+ parser.parse(stream, handler, metadata, context);
+ return handler.toString();
+ } finally {
+ stream.close();
+ }
+
+ }
+
+ /**
+ * For documents that may contain embedded documents, it might be helpful
+ * to create a list of metadata objects, one for the container document and
+ * one for each embedded document. This allows easy access to both the
+ * extracted content and the metadata of each embedded document.
+ * Note that many document formats can contain embedded documents,
+ * including traditional container formats -- zip, tar and others -- but
also
+ * common office document formats including: MSWord, MSExcel,
+ * MSPowerPoint, RTF, PDF, MSG and several others.
+ * <p>
+ * The "content" format is determined by the ContentHandlerFactory, and
+ * the content is stored in {@link
org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT}
+ * <p>
+ * The drawback to the RecursiveParserWrapper is that it caches metadata
and contents
+ * in memory. This should not be used on files whose contents are too big
to be handled
+ * in memory.
+ *
+ * @return a list of metadata objects, one each for the container file and
each embedded file
+ * @throws IOException
+ * @throws SAXException
+ * @throws TikaException
+ */
+ public List<Metadata> recursiveParserWrapperExample() throws IOException,
+ SAXException, TikaException {
+
+ Parser p = new AutoDetectParser();
+ ContentHandlerFactory factory = new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
+
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
factory);
+ InputStream stream =
ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx");
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY,
"test_recursive_embedded.docx");
+ ParseContext context = new ParseContext();
+
+ try {
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ } finally {
+ stream.close();
+ }
+ return wrapper.getMetadata();
+ }
+
+ /**
+ * We include a simple JSON serializer for a list of metadata with
+ * {@link org.apache.tika.metadata.serialization.JsonMetadataList}.
+ * That class also includes a deserializer to convert from JSON
+ * back to a {@code List<Metadata>}.
+ * <p>
+ * This functionality is also available in tika-app's GUI, and
+ * with the -J option on tika-app's commandline. For tika-server
+ * users, there is the "rmeta" service that will return this format.
+ *
+ * @return a JSON representation of a list of Metadata objects
+ * @throws IOException
+ * @throws SAXException
+ * @throws TikaException
+ */
+ public String serializedRecursiveParserWrapperExample() throws IOException,
+ SAXException, TikaException {
+ List metadataList = recursiveParserWrapperExample();
+ StringWriter writer = new StringWriter();
+ JsonMetadataList.toJson(metadataList, writer);
+ return writer.toString();
+ }
}
Added:
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test_recursive_embedded.docx
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test_recursive_embedded.docx?rev=1655449&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test_recursive_embedded.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java?rev=1655449&r1=1655448&r2=1655449&view=diff
==============================================================================
---
tika/trunk/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java
(original)
+++
tika/trunk/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java
Wed Jan 28 20:08:04 2015
@@ -17,15 +17,21 @@
package org.apache.tika.example;
+import static junit.framework.TestCase.assertFalse;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.List;
+
import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.junit.Before;
import org.junit.Test;
import org.xml.sax.SAXException;
-import java.io.IOException;
-
-import static org.junit.Assert.assertEquals;
-
public class TestParsingExample {
ParsingExample parsingExample;
@Before
@@ -45,4 +51,52 @@ public class TestParsingExample {
assertEquals("Expected 'test', but got '" + result + "'", "test",
result);
}
+ @Test
+ public void testNoEmbeddedExample() throws IOException, SAXException,
TikaException {
+ String result = parsingExample.parseNoEmbeddedExample();
+ assertContains("embed_0", result);
+ assertNotContains("embed1/embed1a.txt", result);
+ assertNotContains("embed3/embed3.txt", result);
+ assertNotContains("When in the Course", result);
+ }
+
+
+ @Test
+ public void testRecursiveParseExample() throws IOException, SAXException,
TikaException {
+ String result = parsingExample.parseEmbeddedExample();
+ assertContains("embed_0", result);
+ assertContains("embed1/embed1a.txt", result);
+ assertContains("embed3/embed3.txt", result);
+ assertContains("When in the Course", result);
+ }
+
+ @Test
+ public void testRecursiveParserWrapperExample() throws IOException,
SAXException, TikaException {
+ List<Metadata> metadataList =
parsingExample.recursiveParserWrapperExample();
+ assertEquals("Number of embedded documents + 1 for the container
document", 12, metadataList.size());
+ Metadata m = metadataList.get(6);
+ //this is the location of the embed3.txt text file within the outer .docx
+
assertEquals("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed3.zip/embed3.txt",
+ m.get("X-TIKA:embedded_resource_path"));
+ //it contains some html encoded content
+ assertContains("When in the Course", m.get("X-TIKA:content"));
+ }
+
+ @Test
+ public void testSerializedRecursiveParserWrapperExample() throws
IOException, SAXException, TikaException {
+ String json = parsingExample.serializedRecursiveParserWrapperExample();
+ assertTrue(json.indexOf("When in the Course") > -1);
+ //now try deserializing the JSON
+ List<Metadata> metadataList = JsonMetadataList.fromJson(new
StringReader(json));
+ assertEquals(12, metadataList.size());
+ }
+
+ public static void assertContains(String needle, String haystack) {
+ assertTrue("Should have found " + needle + " in: " + haystack,
haystack.contains(needle));
+ }
+
+ public static void assertNotContains(String needle, String haystack) {
+ assertFalse("Should not have found " + needle + " in: " + haystack,
haystack.contains(needle));
+ }
+
}