This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0d92bc862c3c344d65d3f6c260b0f5ea4c389fc0 Author: Nick Burch <n...@apache.org> AuthorDate: Wed Oct 18 15:50:59 2017 +0100 Add notes on why we can't get the Numbers or Pages type just yet - need to call out to another library or decode the Document.iwa snappy stream ourselves --- .../parser/iwork/iwana/IWork13PackageParser.java | 34 +++++++++++++--------- .../tika/detect/TestContainerAwareDetector.java | 2 ++ .../tika/parser/iwork/iwana/IWork13ParserTest.java | 14 ++++----- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java index b96cc39..a090e84 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java @@ -17,17 +17,6 @@ package org.apache.tika.parser.iwork.iwana; -import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; -import org.apache.commons.compress.archivers.zip.ZipFile; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractParser; -import org.apache.tika.parser.ParseContext; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import java.io.IOException; import java.io.InputStream; import java.util.Arrays; @@ -38,6 +27,16 @@ import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import org.apache.commons.compress.archivers.zip.ZipFile; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import 
org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + public class IWork13PackageParser extends AbstractParser { public enum IWork13DocumentType { @@ -64,16 +63,19 @@ public class IWork13PackageParser extends AbstractParser { type = IWork13DocumentType.detectIfPossible(entry); if (type != null) return type; } + + // If we get here, we don't know what it is return UNKNOWN13.getType(); } /** * @return Specific type if this identifies one, otherwise null */ - public static MediaType detectIfPossible(ZipEntry entry) { + protected static MediaType detectIfPossible(ZipEntry entry) { String name = entry.getName(); if (! name.endsWith(".iwa")) return null; + // Is it a uniquely identifying filename? if (name.equals("Index/MasterSlide.iwa") || name.startsWith("Index/MasterSlide-")) { return KEYNOTE13.getType(); @@ -82,7 +84,13 @@ public class IWork13PackageParser extends AbstractParser { name.startsWith("Index/Slide-")) { return KEYNOTE13.getType(); } - //TODO: figure out how to distinguish numbers from pages + + // Is it the main document? 
+ if (name.equals("Index/Document.iwa")) { + // TODO Decode the snappy stream, and check for the Message Type + // = 2 (TN::SheetArchive), it is a numbers file; + // = 10000 (TP::DocumentArchive), that's a pages file + } // Unknown return null; diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java index b6a79eb..e4117c4 100644 --- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java +++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java @@ -320,6 +320,8 @@ public class TestContainerAwareDetector { public void testDetectIWork2013() throws Exception { assertTypeByData("testKeynote2013.key", IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString()); + // Without decoding the Document snappy stream, we can't tell the + // difference between these two just based on the zip entries assertTypeByData("testNumbers2013.numbers", IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString()); assertTypeByData("testPages2013.pages", diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java index 4bbbcbf..60477a5 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java @@ -16,16 +16,11 @@ */ package org.apache.tika.parser.iwork.iwana; -import static org.apache.tika.TikaTest.assertContains; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; import java.io.InputStream; -import java.util.Arrays; -import java.util.List; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; import 
org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; @@ -56,8 +51,7 @@ public class IWork13ParserTest { ContentHandler handler = new BodyContentHandler(); iWorkParser.parse(input, handler, metadata, parseContext); - // Currently parsing is a no-op - // Will only get type + // Currently parsing is a no-op, so will only get the Type assertEquals(1, metadata.size()); assertEquals("", handler.toString()); assertEquals(IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString(), @@ -71,7 +65,8 @@ public class IWork13ParserTest { ContentHandler handler = new BodyContentHandler(); iWorkParser.parse(input, handler, metadata, parseContext); - // Currently parsing is a no-op + // Currently parsing is a no-op, and we can't get the type without + // decoding the Snappy stream // TODO Test properly when a full Parser is added assertEquals(0, metadata.size()); assertEquals("", handler.toString()); @@ -84,7 +79,8 @@ public class IWork13ParserTest { ContentHandler handler = new BodyContentHandler(); iWorkParser.parse(input, handler, metadata, parseContext); - // Currently parsing is a no-op + // Currently parsing is a no-op, and we can't get the type without + // decoding the Snappy stream // TODO Test properly when a full Parser is added assertEquals(0, metadata.size()); assertEquals("", handler.toString()); -- To stop receiving notification emails like this one, please contact "commits@tika.apache.org" <commits@tika.apache.org>.