Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Fri May 29 14:36:21 2015 @@ -28,300 +28,299 @@ import org.junit.Test; /** * Tests that the various POI powered parsers are - * able to extract their embedded contents. + * able to extract their embedded contents. */ public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTest { - + /** * For office files which don't have anything embedded in them */ @Test public void testWithoutEmbedded() throws Exception { - ContainerExtractor extractor = new ParserContainerExtractor(); - - String[] files = new String[] { - "testEXCEL.xls", "testWORD.doc", "testPPT.ppt", - "testVISIO.vsd", "test-outlook.msg" - }; - for(String file : files) { - // Process it without recursing - TrackingHandler handler = process(file, extractor, false); - - // Won't have fired - assertEquals(0, handler.filenames.size()); - assertEquals(0, handler.mediaTypes.size()); - - // Ditto with recursing - handler = process(file, extractor, true); - assertEquals(0, handler.filenames.size()); - assertEquals(0, handler.mediaTypes.size()); - } + ContainerExtractor extractor = new ParserContainerExtractor(); + + String[] files = new String[]{ + "testEXCEL.xls", "testWORD.doc", "testPPT.ppt", + "testVISIO.vsd", "test-outlook.msg" + }; + for (String file : files) { + // Process it without recursing + TrackingHandler handler = process(file, extractor, false); + + // Won't have fired + assertEquals(0, handler.filenames.size()); + assertEquals(0, handler.mediaTypes.size()); + + // Ditto with recursing + handler = process(file, extractor, true); + assertEquals(0, handler.filenames.size()); + assertEquals(0, handler.mediaTypes.size()); + } } - + /** * Office files with embedded images, but no other - * office files in them + * office files in them */ @Test public void testEmbeddedImages() throws Exception { - ContainerExtractor extractor = new ParserContainerExtractor(); - TrackingHandler handler; - - // Excel with 1 image - handler = process("testEXCEL_1img.xls", extractor, false); - assertEquals(1, handler.filenames.size()); - assertEquals(1, handler.mediaTypes.size()); - - assertEquals(null, handler.filenames.get(0)); - assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); - - - // PowerPoint with 2 images + sound - // TODO - - - // Word with 1 image - handler = process("testWORD_1img.doc", extractor, false); - assertEquals(1, handler.filenames.size()); - assertEquals(1, handler.mediaTypes.size()); - - assertEquals("image1.png", handler.filenames.get(0)); - assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); - - - // Word with 3 images - handler = process("testWORD_3imgs.doc", extractor, false); - assertEquals(3, handler.filenames.size()); - assertEquals(3, handler.mediaTypes.size()); - - assertEquals("image1.png", handler.filenames.get(0)); - assertEquals("image2.jpg", handler.filenames.get(1)); - assertEquals("image3.png", handler.filenames.get(2)); - assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); - assertEquals(TYPE_JPG, handler.mediaTypes.get(1)); - assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); + ContainerExtractor extractor = new ParserContainerExtractor(); + TrackingHandler handler; + + // Excel with 1 image + handler = process("testEXCEL_1img.xls", extractor, false); + assertEquals(1, handler.filenames.size()); + assertEquals(1, handler.mediaTypes.size()); + + assertEquals(null, handler.filenames.get(0)); + assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); + + + // PowerPoint with 2 images + sound + // TODO + + + // Word with 1 image + handler = process("testWORD_1img.doc", extractor, false); + assertEquals(1, handler.filenames.size()); + assertEquals(1, handler.mediaTypes.size()); + + assertEquals("image1.png", handler.filenames.get(0)); + assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); + + + // Word with 3 images + handler = process("testWORD_3imgs.doc", extractor, false); + assertEquals(3, handler.filenames.size()); + assertEquals(3, handler.mediaTypes.size()); + + assertEquals("image1.png", handler.filenames.get(0)); + assertEquals("image2.jpg", handler.filenames.get(1)); + assertEquals("image3.png", handler.filenames.get(2)); + assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); + assertEquals(TYPE_JPG, handler.mediaTypes.get(1)); + assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); } - + /** * Office files which have other office files - * embedded into them. The embedded office files - * will sometimes have images in them. - * - * eg xls - * -> word - * -> image - * -> image - * -> powerpoint - * -> excel - * -> image + * embedded into them. The embedded office files + * will sometimes have images in them. + * <p/> + * eg xls + * -> word + * -> image + * -> image + * -> powerpoint + * -> excel + * -> image */ @Test public void testEmbeddedOfficeFiles() throws Exception { - ContainerExtractor extractor = new ParserContainerExtractor(); - TrackingHandler handler; - - - // Excel with a word doc and a powerpoint doc, both of which have images in them - // Without recursion, should see both documents + the images - handler = process("testEXCEL_embeded.xls", extractor, false); - assertEquals(5, handler.filenames.size()); - assertEquals(5, handler.mediaTypes.size()); - - // We don't know their filenames - assertEquals(null, handler.filenames.get(0)); - assertEquals(null, handler.filenames.get(1)); - assertEquals(null, handler.filenames.get(2)); - assertEquals("MBD0003271D.ppt", handler.filenames.get(3)); - assertEquals("MBD00032A24.doc", handler.filenames.get(4)); - // But we do know their types - assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image - assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office doc - assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office doc - - - // With recursion, should get the images embedded in the office files too - handler = process("testEXCEL_embeded.xls", extractor, true); - assertEquals(17, handler.filenames.size()); - assertEquals(17, handler.mediaTypes.size()); - - assertEquals(null, handler.filenames.get(0)); - assertEquals(null, handler.filenames.get(1)); - assertEquals(null, handler.filenames.get(2)); - assertEquals("MBD0003271D.ppt", handler.filenames.get(3)); - assertEquals("1", handler.filenames.get(4)); - assertEquals(null, handler.filenames.get(5)); - assertEquals("2", handler.filenames.get(6)); - assertEquals("image1.png", handler.filenames.get(7)); - assertEquals("image2.jpg", handler.filenames.get(8)); - assertEquals("image3.png", handler.filenames.get(9)); - assertEquals("image1.png", handler.filenames.get(16)); - - assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image - assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded presentation - assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS - assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image - assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image - assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image - assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image - assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image - - // Word with .docx, powerpoint and excel - handler = process("testWORD_embeded.doc", extractor, false); - assertEquals(9, handler.filenames.size()); - assertEquals(9, handler.mediaTypes.size()); - - // Filenames are a bit iffy... - // Should really be 3*embedded pictures then 3*icons then embedded docs - assertEquals("image1.emf", handler.filenames.get(0)); - assertEquals("image4.png", handler.filenames.get(1)); - assertEquals("image5.jpg", handler.filenames.get(2)); - assertEquals("image6.png", handler.filenames.get(3)); - assertEquals("image2.emf", handler.filenames.get(4)); - assertEquals("image3.emf", handler.filenames.get(5)); - assertEquals(null, handler.filenames.get(6)); - assertEquals("_1345471035.ppt", handler.filenames.get(7)); - assertEquals("_1345470949.xls", handler.filenames.get(8)); - - // But we do know their types - assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc? - assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo - assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe - assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try - assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc? - assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc? - assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc - assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office doc - assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office doc - - - // With recursion, should get their images too - handler = process("testWORD_embeded.doc", extractor, true); - assertEquals(16, handler.filenames.size()); - assertEquals(16, handler.mediaTypes.size()); - - // We don't know their filenames, except for doc images + docx - assertEquals("image1.emf", handler.filenames.get(0)); - assertEquals("image4.png", handler.filenames.get(1)); - assertEquals("image5.jpg", handler.filenames.get(2)); - assertEquals("image6.png", handler.filenames.get(3)); - assertEquals("image2.emf", handler.filenames.get(4)); - assertEquals("image3.emf", handler.filenames.get(5)); - assertEquals(null, handler.filenames.get(6)); - assertEquals("image2.png", handler.filenames.get(7)); - assertEquals("image3.jpeg", handler.filenames.get(8)); - assertEquals("image4.png", handler.filenames.get(9)); - for(int i=11; i<14; i++) { - assertNull(handler.filenames.get(i)); - } - // But we do know their types - assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo - assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe - assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try - assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc - assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside .docx - assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // JPG inside .docx - assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // PNG inside .docx - assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc - assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); // PNG inside .xls - - - // PowerPoint with excel and word - handler = process("testPPT_embeded.ppt", extractor, false); - assertEquals(7, handler.filenames.size()); - assertEquals(7, handler.mediaTypes.size()); - - // We don't get all that helpful filenames - assertEquals("1", handler.filenames.get(0)); - assertEquals("2", handler.filenames.get(1)); - assertEquals(null, handler.filenames.get(2)); - assertEquals(null, handler.filenames.get(3)); - assertEquals(null, handler.filenames.get(4)); - assertEquals(null, handler.filenames.get(5)); - assertEquals(null, handler.filenames.get(6)); - // But we do know their types - assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc - assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image - assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image - assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image - - // Run again on PowerPoint but with recursion - handler = process("testPPT_embeded.ppt", extractor, true); - assertEquals(11, handler.filenames.size()); - assertEquals(11, handler.mediaTypes.size()); - - assertEquals("1", handler.filenames.get(0)); - assertEquals(null, handler.filenames.get(1)); - assertEquals("2", handler.filenames.get(2)); - assertEquals("image1.png", handler.filenames.get(3)); - assertEquals("image2.jpg", handler.filenames.get(4)); - assertEquals("image3.png", handler.filenames.get(5)); - assertEquals(null, handler.filenames.get(6)); - assertEquals(null, handler.filenames.get(7)); - assertEquals(null, handler.filenames.get(8)); - assertEquals(null, handler.filenames.get(9)); - assertEquals(null, handler.filenames.get(10)); - - assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside .xls - assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside .docx - assertEquals(TYPE_JPG, handler.mediaTypes.get(4)); // JPG inside .docx - assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // PNG inside .docx - assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image - assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image - assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image - - - // Word, with a non-office file (PDF) - handler = process("testWORD_embedded_pdf.doc", extractor, true); - assertEquals(2, handler.filenames.size()); - assertEquals(2, handler.mediaTypes.size()); - - assertEquals("image1.emf", handler.filenames.get(0)); - assertEquals("_1402837031.pdf", handler.filenames.get(1)); - - assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded pdf - assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF itself - - - - // Outlook with a text file and a word document - handler = process("testMSG_att_doc.msg", extractor, true); - assertEquals(2, handler.filenames.size()); - assertEquals(2, handler.mediaTypes.size()); - - assertEquals("test-unicode.doc", handler.filenames.get(0)); - assertEquals(TYPE_DOC, handler.mediaTypes.get(0)); - - assertEquals("pj1.txt", handler.filenames.get(1)); - assertEquals(TYPE_TXT, handler.mediaTypes.get(1)); - - - // Outlook with a pdf and another outlook message - handler = process("testMSG_att_msg.msg", extractor, true); - assertEquals(2, handler.filenames.size()); - assertEquals(2, handler.mediaTypes.size()); - - assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0)); - assertEquals(TYPE_MSG, handler.mediaTypes.get(0)); - - assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1)); - assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); + ContainerExtractor extractor = new ParserContainerExtractor(); + TrackingHandler handler; + + + // Excel with a word doc and a powerpoint doc, both of which have images in them + // Without recursion, should see both documents + the images + handler = process("testEXCEL_embeded.xls", extractor, false); + assertEquals(5, handler.filenames.size()); + assertEquals(5, handler.mediaTypes.size()); + + // We don't know their filenames + assertEquals(null, handler.filenames.get(0)); + assertEquals(null, handler.filenames.get(1)); + assertEquals(null, handler.filenames.get(2)); + assertEquals("MBD0003271D.ppt", handler.filenames.get(3)); + assertEquals("MBD00032A24.doc", handler.filenames.get(4)); + // But we do know their types + assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image + assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office doc + assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office doc + + + // With recursion, should get the images embedded in the office files too + handler = process("testEXCEL_embeded.xls", extractor, true); + assertEquals(17, handler.filenames.size()); + assertEquals(17, handler.mediaTypes.size()); + + assertEquals(null, handler.filenames.get(0)); + assertEquals(null, handler.filenames.get(1)); + assertEquals(null, handler.filenames.get(2)); + assertEquals("MBD0003271D.ppt", handler.filenames.get(3)); + assertEquals("1", handler.filenames.get(4)); + assertEquals(null, handler.filenames.get(5)); + assertEquals("2", handler.filenames.get(6)); + assertEquals("image1.png", handler.filenames.get(7)); + assertEquals("image2.jpg", handler.filenames.get(8)); + assertEquals("image3.png", handler.filenames.get(9)); + assertEquals("image1.png", handler.filenames.get(16)); + + assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image + assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded presentation + assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS + assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image + assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image + assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image + assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image + assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image + + // Word with .docx, powerpoint and excel + handler = process("testWORD_embeded.doc", extractor, false); + assertEquals(9, handler.filenames.size()); + assertEquals(9, handler.mediaTypes.size()); + + // Filenames are a bit iffy... + // Should really be 3*embedded pictures then 3*icons then embedded docs + assertEquals("image1.emf", handler.filenames.get(0)); + assertEquals("image4.png", handler.filenames.get(1)); + assertEquals("image5.jpg", handler.filenames.get(2)); + assertEquals("image6.png", handler.filenames.get(3)); + assertEquals("image2.emf", handler.filenames.get(4)); + assertEquals("image3.emf", handler.filenames.get(5)); + assertEquals(null, handler.filenames.get(6)); + assertEquals("_1345471035.ppt", handler.filenames.get(7)); + assertEquals("_1345470949.xls", handler.filenames.get(8)); + + // But we do know their types + assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc? + assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo + assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe + assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try + assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc? + assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc? + assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc + assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office doc + assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office doc + + + // With recursion, should get their images too + handler = process("testWORD_embeded.doc", extractor, true); + assertEquals(16, handler.filenames.size()); + assertEquals(16, handler.mediaTypes.size()); + + // We don't know their filenames, except for doc images + docx + assertEquals("image1.emf", handler.filenames.get(0)); + assertEquals("image4.png", handler.filenames.get(1)); + assertEquals("image5.jpg", handler.filenames.get(2)); + assertEquals("image6.png", handler.filenames.get(3)); + assertEquals("image2.emf", handler.filenames.get(4)); + assertEquals("image3.emf", handler.filenames.get(5)); + assertEquals(null, handler.filenames.get(6)); + assertEquals("image2.png", handler.filenames.get(7)); + assertEquals("image3.jpeg", handler.filenames.get(8)); + assertEquals("image4.png", handler.filenames.get(9)); + for (int i = 11; i < 14; i++) { + assertNull(handler.filenames.get(i)); + } + // But we do know their types + assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo + assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe + assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try + assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc + assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside .docx + assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // JPG inside .docx + assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // PNG inside .docx + assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc + assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); // PNG inside .xls + + + // PowerPoint with excel and word + handler = process("testPPT_embeded.ppt", extractor, false); + assertEquals(7, handler.filenames.size()); + assertEquals(7, handler.mediaTypes.size()); + + // We don't get all that helpful filenames + assertEquals("1", handler.filenames.get(0)); + assertEquals("2", handler.filenames.get(1)); + assertEquals(null, handler.filenames.get(2)); + assertEquals(null, handler.filenames.get(3)); + assertEquals(null, handler.filenames.get(4)); + assertEquals(null, handler.filenames.get(5)); + assertEquals(null, handler.filenames.get(6)); + // But we do know their types + assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc + assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image + assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image + assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image + + // Run again on PowerPoint but with recursion + handler = process("testPPT_embeded.ppt", extractor, true); + assertEquals(11, handler.filenames.size()); + assertEquals(11, handler.mediaTypes.size()); + + assertEquals("1", handler.filenames.get(0)); + assertEquals(null, handler.filenames.get(1)); + assertEquals("2", handler.filenames.get(2)); + assertEquals("image1.png", handler.filenames.get(3)); + assertEquals("image2.jpg", handler.filenames.get(4)); + assertEquals("image3.png", handler.filenames.get(5)); + assertEquals(null, handler.filenames.get(6)); + assertEquals(null, handler.filenames.get(7)); + assertEquals(null, handler.filenames.get(8)); + assertEquals(null, handler.filenames.get(9)); + assertEquals(null, handler.filenames.get(10)); + + assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside .xls + assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside .docx + assertEquals(TYPE_JPG, handler.mediaTypes.get(4)); // JPG inside .docx + assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // PNG inside .docx + assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image + assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image + assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image + + + // Word, with a non-office file (PDF) + handler = process("testWORD_embedded_pdf.doc", extractor, true); + assertEquals(2, handler.filenames.size()); + assertEquals(2, handler.mediaTypes.size()); + + assertEquals("image1.emf", handler.filenames.get(0)); + assertEquals("_1402837031.pdf", handler.filenames.get(1)); + + assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded pdf + assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF itself + + + // Outlook with a text file and a word document + handler = process("testMSG_att_doc.msg", extractor, true); + assertEquals(2, handler.filenames.size()); + assertEquals(2, handler.mediaTypes.size()); + + assertEquals("test-unicode.doc", handler.filenames.get(0)); + assertEquals(TYPE_DOC, handler.mediaTypes.get(0)); + + assertEquals("pj1.txt", handler.filenames.get(1)); + assertEquals(TYPE_TXT, handler.mediaTypes.get(1)); + + + // Outlook with a pdf and another outlook message + handler = process("testMSG_att_msg.msg", extractor, true); + assertEquals(2, handler.filenames.size()); + assertEquals(2, handler.mediaTypes.size()); + + assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0)); + assertEquals(TYPE_MSG, handler.mediaTypes.get(0)); + + assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1)); + assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); } @Test
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Fri May 29 14:36:21 2015 @@ -5,9 +5,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,7 +17,6 @@ package org.apache.tika.parser.microsoft; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; import java.io.InputStream; import java.util.Locale; @@ -148,9 +147,9 @@ public class PowerPointParserTest extend // Make sure boilerplate text didn't come through: assertEquals(-1, content.indexOf("Click to edit Master")); - - //TIKA-1171 - assertEquals(-1, content.indexOf("*")); + + //TIKA-1171 + assertEquals(-1, content.indexOf("*")); } /** @@ -207,54 +206,54 @@ public class PowerPointParserTest extend */ @Test public void testCustomProperties() throws Exception { - InputStream input = PowerPointParserTest.class.getResourceAsStream( - "/test-documents/testPPT_custom_props.ppt"); - Metadata metadata = new Metadata(); - - try { - ContentHandler handler = new BodyContentHandler(-1); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - new OfficeParser().parse(input, handler, metadata, context); - } finally { - input.close(); - } - - assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER)); - assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR)); - assertEquals("2011-08-22T13:32:58Z", metadata.get(TikaCoreProperties.MODIFIED)); - assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE)); - assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED)); - assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE)); - assertEquals("1", metadata.get(Office.SLIDE_COUNT)); - assertEquals("3", metadata.get(Office.WORD_COUNT)); - assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("true", metadata.get("custom:myCustomBoolean")); - assertEquals("3", metadata.get("custom:myCustomNumber")); - assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); - assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate")); - assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); + InputStream input = PowerPointParserTest.class.getResourceAsStream( + "/test-documents/testPPT_custom_props.ppt"); + Metadata metadata = new Metadata(); + + try { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OfficeParser().parse(input, handler, metadata, context); + } finally { + input.close(); + } + + assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR)); + assertEquals("2011-08-22T13:32:58Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE)); + assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE)); + assertEquals("1", metadata.get(Office.SLIDE_COUNT)); + assertEquals("3", metadata.get(Office.WORD_COUNT)); + assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("true", metadata.get("custom:myCustomBoolean")); + assertEquals("3", metadata.get("custom:myCustomNumber")); + assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); + assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate")); + assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); } // TIKA-1025 @Test public void testEmbeddedPlacedholder() throws Exception { - XMLResult result = getXML("testPPT_embedded2.ppt"); - assertContains("<div class=\"embedded\" id=\"1\" />", result.xml); - assertContains("<div class=\"embedded\" id=\"14\" />", result.xml); + XMLResult result = getXML("testPPT_embedded2.ppt"); + assertContains("<div class=\"embedded\" id=\"1\" />", result.xml); + assertContains("<div class=\"embedded\" id=\"14\" />", result.xml); } // TIKA-817 @Test public void testAutoDatePPT() throws Exception { - //decision was made in POI-52367 not to generate - //autodate automatically. For pptx, where value is stored, - //value is extracted. For ppt, however, no date is extracted. - XMLResult result = getXML("testPPT_autodate.ppt"); - assertContains( - "<p class=\"slide-content\">Now<br />\n*<br />\n*<br />", - result.xml); + //decision was made in POI-52367 not to generate + //autodate automatically. For pptx, where value is stored, + //value is extracted. For ppt, however, no date is extracted. + XMLResult result = getXML("testPPT_autodate.ppt"); + assertContains( + "<p class=\"slide-content\">Now<br />\n*<br />\n*<br />", + result.xml); } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java Fri May 29 14:36:21 2015 @@ -5,9 +5,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,21 +31,21 @@ import org.xml.sax.ContentHandler; /** * Tests for Microsoft Project (MPP) Files. - * + * * Note - we don't currently have a dedicated Project * Parser, all we have is the common office metadata */ public class ProjectParserTest { - + @Test public void testProject2003() throws Exception { - InputStream input = ProjectParserTest.class.getResourceAsStream( - "/test-documents/testPROJECT2003.mpp"); - try { - doTestProject(input); - } finally { - input.close(); - } + InputStream input = ProjectParserTest.class.getResourceAsStream( + "/test-documents/testPROJECT2003.mpp"); + try { + doTestProject(input); + } finally { + input.close(); + } } @Test @@ -60,40 +60,40 @@ public class ProjectParserTest { } private void doTestProject(InputStream input) throws Exception { - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - new OfficeParser().parse(input, handler, metadata, new ParseContext()); - - assertEquals( - "application/vnd.ms-project", - metadata.get(Metadata.CONTENT_TYPE)); - - assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT)); - assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT)); - assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("", metadata.get(TikaCoreProperties.MODIFIER)); - assertEquals("Pangram, fox, dog", metadata.get(TikaCoreProperties.KEYWORDS)); - assertEquals("Comment Vulpes vulpes comment", metadata.get(TikaCoreProperties.COMMENTS)); - - assertEquals("Category1", metadata.get(OfficeOpenXMLCore.CATEGORY)); - assertEquals("Mr Burns", metadata.get(OfficeOpenXMLExtended.MANAGER)); - assertEquals("CompanyA", metadata.get(OfficeOpenXMLExtended.COMPANY)); - - assertEquals("2011-11-24T10:58:00Z", metadata.get(TikaCoreProperties.CREATED)); - assertEquals("2011-11-24T10:58:00Z", metadata.get(Metadata.CREATION_DATE)); - assertEquals("2011-11-24T11:31:00Z", metadata.get(TikaCoreProperties.MODIFIED)); - assertEquals("2011-11-24T11:31:00Z", metadata.get(Metadata.DATE)); - - // Custom Project metadata is present with prefix - assertEquals("0%", metadata.get("custom:% Complete")); - assertEquals("0%", metadata.get("custom:% Work Complete")); - assertEquals("\u00a3"+"0.00", metadata.get("custom:Cost")); - assertEquals("2d?", metadata.get("custom:Duration")); - assertEquals("16h", metadata.get("custom:Work")); - - // Currently, we don't do textual contents of the file - String content = handler.toString(); - assertEquals("", content); + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new OfficeParser().parse(input, handler, metadata, new ParseContext()); + + assertEquals( + "application/vnd.ms-project", + metadata.get(Metadata.CONTENT_TYPE)); + + assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT)); + assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("Pangram, fox, dog", metadata.get(TikaCoreProperties.KEYWORDS)); + assertEquals("Comment Vulpes vulpes comment", metadata.get(TikaCoreProperties.COMMENTS)); + + assertEquals("Category1", metadata.get(OfficeOpenXMLCore.CATEGORY)); + assertEquals("Mr Burns", metadata.get(OfficeOpenXMLExtended.MANAGER)); + assertEquals("CompanyA", metadata.get(OfficeOpenXMLExtended.COMPANY)); + + assertEquals("2011-11-24T10:58:00Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2011-11-24T10:58:00Z", metadata.get(Metadata.CREATION_DATE)); + assertEquals("2011-11-24T11:31:00Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2011-11-24T11:31:00Z", metadata.get(Metadata.DATE)); + + // Custom Project metadata is present with prefix + assertEquals("0%", metadata.get("custom:% Complete")); + assertEquals("0%", metadata.get("custom:% Work Complete")); + assertEquals("\u00a3" + "0.00", metadata.get("custom:Cost")); + assertEquals("2d?", metadata.get("custom:Duration")); + assertEquals("16h", metadata.get("custom:Work")); + + // Currently, we don't do textual contents of the file + String content = handler.toString(); + assertEquals("", content); } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java Fri May 29 14:36:21 2015 @@ -5,9 +5,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java Fri May 29 14:36:21 2015 @@ -36,66 +36,66 @@ import org.xml.sax.ContentHandler; * Tests for the TNEF (winmail.dat) parser */ public class TNEFParserTest extends AbstractPOIContainerExtractionTest { - private static final String file = "testWINMAIL.dat"; - - @Test - public void testBasics() throws Exception { - TikaInputStream stream = getTestFile(file); - Detector detector = new DefaultDetector(); - try { - assertEquals( - MediaType.application("vnd.ms-tnef"), - detector.detect(stream, new Metadata())); - } finally { - stream.close(); - } - } - - @Test - public void testMetadata() throws Exception { - TikaInputStream stream = getTestFile(file); - - Metadata metadata = new Metadata(); - ContentHandler handler = new BodyContentHandler(); - - TNEFParser tnef = new TNEFParser(); - tnef.parse(stream, handler, metadata, new ParseContext()); - - assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("This is a test message", metadata.get(Metadata.SUBJECT)); - } - + private static final String file = "testWINMAIL.dat"; + + @Test + public void testBasics() throws Exception { + TikaInputStream stream = getTestFile(file); + Detector detector = new DefaultDetector(); + try { + assertEquals( + MediaType.application("vnd.ms-tnef"), + detector.detect(stream, new Metadata())); + } finally { + stream.close(); + } + } + + @Test + public void testMetadata() throws Exception { + TikaInputStream stream = getTestFile(file); + + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + + TNEFParser tnef = new TNEFParser(); + tnef.parse(stream, handler, metadata, new ParseContext()); + + assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("This is a test message", metadata.get(Metadata.SUBJECT)); + } + /** * Check the Rtf and Attachments are returned - * as expected + * as expected */ - @Test + @Test public void testBodyAndAttachments() throws Exception { - ContainerExtractor extractor = new ParserContainerExtractor(); - - // Process it with recursing - // Will have the message body RTF and the attachments - TrackingHandler handler = process(file, extractor, true); - assertEquals(6, handler.filenames.size()); - assertEquals(6, handler.mediaTypes.size()); - - // We know the filenames for all of them - assertEquals("message.rtf", handler.filenames.get(0)); - assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0)); - - assertEquals("quick.doc", handler.filenames.get(1)); - assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1)); - - assertEquals("quick.html", handler.filenames.get(2)); - assertEquals(MediaType.text("html"), handler.mediaTypes.get(2)); - - assertEquals("quick.pdf", handler.filenames.get(3)); - assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3)); - - assertEquals("quick.txt", handler.filenames.get(4)); - assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4)); - - assertEquals("quick.xml", handler.filenames.get(5)); - assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5)); + ContainerExtractor extractor = new ParserContainerExtractor(); + + // Process it with recursing + // Will have the message body RTF and the attachments + TrackingHandler handler = process(file, extractor, true); + assertEquals(6, handler.filenames.size()); + assertEquals(6, handler.mediaTypes.size()); + + // We know the filenames for all of them + assertEquals("message.rtf", handler.filenames.get(0)); + assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0)); + + assertEquals("quick.doc", handler.filenames.get(1)); + assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1)); + + assertEquals("quick.html", handler.filenames.get(2)); + assertEquals(MediaType.text("html"), handler.mediaTypes.get(2)); + + assertEquals("quick.pdf", handler.filenames.get(3)); + assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3)); + + assertEquals("quick.txt", handler.filenames.get(4)); + assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4)); + + assertEquals("quick.xml", handler.filenames.get(5)); + assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5)); } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java Fri May 29 14:36:21 2015 @@ -5,9 +5,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Fri May 29 14:36:21 2015 @@ -5,9 +5,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -89,8 +89,8 @@ public class WordParserTest extends Tika Metadata metadata = result.metadata; assertEquals( - "application/msword", - metadata.get(Metadata.CONTENT_TYPE)); + "application/msword", + metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); @@ -117,9 +117,9 @@ public class WordParserTest extends Tika xml = getXML("testWORD_3imgs.doc").xml; // Images 1-3 - assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image1.png\"")); - assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image2.jpg\"")); - assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image3.png\"")); + assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image1.png\"")); + assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image2.jpg\"")); + assertTrue("Image not found in:\n" + xml, xml.contains("src=\"embedded:image3.png\"")); // Text too assertTrue(xml.contains("<p>The end!")); @@ -131,7 +131,7 @@ public class WordParserTest extends Tika // Make sure bold text arrived as single // contiguous string even though Word parser // handled this as 3 character runs - assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>")); + assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>")); // TIKA-692: test document containing multiple // character runs within a bold tag: @@ -140,7 +140,7 @@ public class WordParserTest extends Tika // Make sure bold text arrived as single // contiguous string even though Word parser // handled this as 3 character runs - assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>")); + assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>")); } @Test @@ -277,19 +277,19 @@ public class WordParserTest extends Tika */ @Test public void testNoFormat() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); - InputStream stream = WordParserTest.class.getResourceAsStream( - "/test-documents/testWORD_no_format.doc"); - try { - new OfficeParser().parse(stream, handler, metadata, new ParseContext()); - } finally { - stream.close(); - } + InputStream stream = WordParserTest.class.getResourceAsStream( + "/test-documents/testWORD_no_format.doc"); + try { + new OfficeParser().parse(stream, handler, metadata, new ParseContext()); + } finally { + stream.close(); + } - String content = handler.toString(); - assertContains("Will generate an exception", content); + String content = handler.toString(); + assertContains("Will generate an exception", content); } /** @@ -297,55 +297,55 @@ public class WordParserTest extends Tika */ @Test public void testCustomProperties() throws Exception { - InputStream input = WordParserTest.class.getResourceAsStream( - "/test-documents/testWORD_custom_props.doc"); - Metadata metadata = new Metadata(); - - try { - ContentHandler handler = new BodyContentHandler(-1); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - new OfficeParser().parse(input, handler, metadata, context); - } finally { - input.close(); - } - - assertEquals("application/msword", metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER)); - assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR)); - assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED)); - assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE)); - assertEquals("2010-10-05T09:03:00Z", metadata.get(TikaCoreProperties.CREATED)); - assertEquals("2010-10-05T09:03:00Z", metadata.get(Metadata.CREATION_DATE)); - assertEquals("Microsoft Office Word",metadata.get(OfficeOpenXMLExtended.APPLICATION)); - assertEquals("1", metadata.get(Office.PAGE_COUNT)); - assertEquals("2", metadata.get(Office.WORD_COUNT)); - assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS)); - assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE)); - assertEquals("My Comments", metadata.get(TikaCoreProperties.COMMENTS)); - // TODO: Move to OO subject in Tika 2.0 - assertEquals("My subject", metadata.get(Metadata.SUBJECT)); - assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT)); - assertEquals("EDF-DIT", metadata.get(OfficeOpenXMLExtended.COMPANY)); - assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); - assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate")); + InputStream input = WordParserTest.class.getResourceAsStream( + "/test-documents/testWORD_custom_props.doc"); + Metadata metadata = new Metadata(); + + try { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OfficeParser().parse(input, handler, metadata, context); + } finally { + input.close(); + } + + assertEquals("application/msword", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR)); + assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE)); + assertEquals("2010-10-05T09:03:00Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2010-10-05T09:03:00Z", metadata.get(Metadata.CREATION_DATE)); + assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION)); + assertEquals("1", metadata.get(Office.PAGE_COUNT)); + assertEquals("2", metadata.get(Office.WORD_COUNT)); + assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS)); + assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE)); + assertEquals("My Comments", metadata.get(TikaCoreProperties.COMMENTS)); + // TODO: Move to OO subject in Tika 2.0 + assertEquals("My subject", metadata.get(Metadata.SUBJECT)); + assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("EDF-DIT", metadata.get(OfficeOpenXMLExtended.COMPANY)); + assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); + assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate")); } @Test public void testExceptions1() throws Exception { - XMLResult xml; - Level logLevelStart = Logger.getRootLogger().getLevel(); - Logger.getRootLogger().setLevel(Level.ERROR); - try { - xml = getXML("testException1.doc"); - assertContains("total population", xml.xml); - xml = getXML("testException2.doc"); - assertContains("electric charge", xml.xml); - } finally { - Logger.getRootLogger().setLevel(logLevelStart); - } + XMLResult xml; + Level logLevelStart = Logger.getRootLogger().getLevel(); + Logger.getRootLogger().setLevel(Level.ERROR); + try { + xml = getXML("testException1.doc"); + assertContains("total population", xml.xml); + xml = getXML("testException2.doc"); + assertContains("electric charge", xml.xml); + } finally { + Logger.getRootLogger().setLevel(logLevelStart); + } } @Test @@ -364,8 +364,8 @@ public class WordParserTest extends Tika Metadata metadata = result.metadata; assertEquals( - "application/msword", - metadata.get(Metadata.CONTENT_TYPE)); + "application/msword", + metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Lutz Theurer", metadata.get(TikaCoreProperties.CREATOR)); assertContains("example.com", xml); @@ -381,7 +381,7 @@ public class WordParserTest extends Tika @Test public void testControlCharacter() throws Exception { - assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " ")); + assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " ")); } @Test Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java Fri May 29 14:36:21 2015 @@ -27,7 +27,7 @@ import org.junit.Test; import org.xml.sax.ContentHandler; public class WriteProtectedParserTest { - + @Test public void testWriteProtected() throws Exception { InputStream input = ExcelParserTest.class.getResourceAsStream( Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java Fri May 29 14:36:21 2015 @@ -29,11 +29,11 @@ import org.junit.Test; /** * Tests that the various POI OOXML powered parsers are - * able to extract their embedded contents. + * able to extract their embedded contents. */ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtractionTest { private ContainerExtractor extractor; - + @Before public void setUp() { Tika tika = new Tika(); @@ -41,231 +41,231 @@ public class OOXMLContainerExtractionTes tika.getParser(), tika.getDetector()); } - /** + /** * For office files which don't have anything embedded in them */ @Test public void testWithoutEmbedded() throws Exception { - assertEmbeddedFiles(0, "testEXCEL.xlsx" ); - assertEmbeddedFiles(0, "testWORD.docx" ); - assertEmbeddedFiles(1 /* thumbnail as default */, "testPPT.pptx" ); + assertEmbeddedFiles(0, "testEXCEL.xlsx"); + assertEmbeddedFiles(0, "testWORD.docx"); + assertEmbeddedFiles(1 /* thumbnail as default */, "testPPT.pptx"); } - private void assertEmbeddedFiles(int expectedNbFiles, String file ) throws Exception { - // Process it without recursing - TrackingHandler handler = process(file, extractor, false); - - // Won't have fired - assertEquals(expectedNbFiles, handler.filenames.size()); - assertEquals(expectedNbFiles, handler.mediaTypes.size()); - - // Ditto with recursing - handler = process(file, extractor, true); - assertEquals(expectedNbFiles, handler.filenames.size()); - assertEquals(expectedNbFiles, handler.mediaTypes.size()); + private void assertEmbeddedFiles(int expectedNbFiles, String file) throws Exception { + // Process it without recursing + TrackingHandler handler = process(file, extractor, false); + + // Won't have fired + assertEquals(expectedNbFiles, handler.filenames.size()); + assertEquals(expectedNbFiles, handler.mediaTypes.size()); + + // Ditto with recursing + handler = process(file, extractor, true); + assertEquals(expectedNbFiles, handler.filenames.size()); + assertEquals(expectedNbFiles, handler.mediaTypes.size()); } - + /** * Office files with embedded images, but no other - * office files in them + * office files in them */ @Test public void testEmbeddedImages() throws Exception { - TrackingHandler handler; - - // Excel with 1 image - handler = process("testEXCEL_1img.xlsx", extractor, false); - assertEquals(1, handler.filenames.size()); - assertEquals(1, handler.mediaTypes.size()); - - assertEquals("image1.png", handler.filenames.get(0)); - assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); - - - // PowerPoint with 2 images + sound - // TODO Figure out why we can't find the sound anywhere... - handler = process("testPPT_2imgs.pptx", extractor, false); - assertEquals(3 + 1 /*thumbnail */, handler.filenames.size()); - assertEquals(3 + 1 /*thumbnail */, handler.mediaTypes.size()); - - assertEquals("image1.png", handler.filenames.get(0)); - assertEquals("image2.gif", handler.filenames.get(1)); - assertEquals("image3.png", handler.filenames.get(2)); - assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); - assertEquals(TYPE_GIF, handler.mediaTypes.get(1)); // icon of sound - assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); - - - // Word with 1 image - handler = process("testWORD_1img.docx", extractor, false); - assertEquals(1, handler.filenames.size()); - assertEquals(1, handler.mediaTypes.size()); - - assertEquals("image1.png", handler.filenames.get(0)); - assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); - - - // Word with 3 images - handler = process("testWORD_3imgs.docx", extractor, false); - assertEquals(3, handler.filenames.size()); - assertEquals(3, handler.mediaTypes.size()); - - assertEquals("image2.png", handler.filenames.get(0)); - assertEquals("image3.jpeg", handler.filenames.get(1)); - assertEquals("image4.png", handler.filenames.get(2)); - assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); - assertEquals(TYPE_JPG, handler.mediaTypes.get(1)); - assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); + TrackingHandler handler; + + // Excel with 1 image + handler = process("testEXCEL_1img.xlsx", extractor, false); + assertEquals(1, handler.filenames.size()); + assertEquals(1, handler.mediaTypes.size()); + + assertEquals("image1.png", handler.filenames.get(0)); + assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); + + + // PowerPoint with 2 images + sound + // TODO Figure out why we can't find the sound anywhere... + handler = process("testPPT_2imgs.pptx", extractor, false); + assertEquals(3 + 1 /*thumbnail */, handler.filenames.size()); + assertEquals(3 + 1 /*thumbnail */, handler.mediaTypes.size()); + + assertEquals("image1.png", handler.filenames.get(0)); + assertEquals("image2.gif", handler.filenames.get(1)); + assertEquals("image3.png", handler.filenames.get(2)); + assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); + assertEquals(TYPE_GIF, handler.mediaTypes.get(1)); // icon of sound + assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); + + + // Word with 1 image + handler = process("testWORD_1img.docx", extractor, false); + assertEquals(1, handler.filenames.size()); + assertEquals(1, handler.mediaTypes.size()); + + assertEquals("image1.png", handler.filenames.get(0)); + assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); + + + // Word with 3 images + handler = process("testWORD_3imgs.docx", extractor, false); + assertEquals(3, handler.filenames.size()); + assertEquals(3, handler.mediaTypes.size()); + + assertEquals("image2.png", handler.filenames.get(0)); + assertEquals("image3.jpeg", handler.filenames.get(1)); + assertEquals("image4.png", handler.filenames.get(2)); + assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); + assertEquals(TYPE_JPG, handler.mediaTypes.get(1)); + assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); } - + /** * Office files which have other office files - * embedded into them. The embedded office files - * will sometimes have images in them. - * - * eg xls - * -> word - * -> image - * -> image - * -> powerpoint - * -> excel - * -> image + * embedded into them. The embedded office files + * will sometimes have images in them. + * <p/> + * eg xls + * -> word + * -> image + * -> image + * -> powerpoint + * -> excel + * -> image */ @Test public void testEmbeddedOfficeFiles() throws Exception { - TrackingHandler handler; - - - // Excel with a word doc and a powerpoint doc, both of which have images in them - // Without recursion, should see both documents + the images - handler = process("testEXCEL_embeded.xlsx", extractor, false); - assertEquals(7, handler.filenames.size()); - assertEquals(7, handler.mediaTypes.size()); - - // We know the rough filenames - assertEquals("Microsoft_Office_PowerPoint_Presentation1.pptx", handler.filenames.get(0)); - assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(1)); - assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(2)); - assertEquals("image1.png", handler.filenames.get(3)); - assertEquals("image2.emf", handler.filenames.get(4)); - assertEquals("image3.emf", handler.filenames.get(5)); - assertEquals("image4.emf", handler.filenames.get(6)); - // But we do know their types - assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc - assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office doc - assertEquals(TYPE_DOCX, handler.mediaTypes.get(2)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc - - - // With recursion, should get the images embedded in the office files too - handler = process("testEXCEL_embeded.xlsx", extractor, true); - assertEquals(23 + 1 /*thumbnail */, handler.filenames.size()); - assertEquals(23 + 1 /*thumbnail */, handler.mediaTypes.size()); - - assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside .pptx - assertEquals(TYPE_GIF, handler.mediaTypes.get(2)); // PNG inside .pptx - assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside .pptx - assertEquals(TYPE_XLSX, handler.mediaTypes.get(4)); // .xlsx inside .pptx - assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // PNG inside .xlsx inside .pptx - assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // .docx inside .pptx - assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside .docx inside .pptx - assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // JPG inside .docx inside .pptx - assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // PNG inside .docx inside .pptx - assertEquals(TYPE_DOC, handler.mediaTypes.get(10)); // .doc inside .pptx - assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); // PNG inside .doc inside .pptx - assertEquals(TYPE_EMF, handler.mediaTypes.get(12)); // Icon of item inside .pptx - assertEquals(TYPE_EMF, handler.mediaTypes.get(13)); // Icon of item inside .pptx - assertEquals(TYPE_EMF, handler.mediaTypes.get(14)); // Icon of item inside .pptx - assertEquals(TYPE_JPG, handler.mediaTypes.get(15)); // Embedded thumbnail - assertEquals(TYPE_DOC, handler.mediaTypes.get(16)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(17)); // PNG inside .doc - assertEquals(TYPE_DOCX, handler.mediaTypes.get(18)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(19)); // PNG inside .docx - assertEquals(TYPE_PNG, handler.mediaTypes.get(20)); // Embedded image - assertEquals(TYPE_EMF, handler.mediaTypes.get(21)); // Icon of embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(22)); // Icon of embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(23)); // Icon of embedded office doc - - - // Word with .docx, powerpoint and excel - handler = process("testWORD_embeded.docx", extractor, false); - assertEquals(9, handler.filenames.size()); - assertEquals(9, handler.mediaTypes.size()); - - // We know their rough filenames - assertEquals("Microsoft_Office_PowerPoint_Presentation2.pptx", handler.filenames.get(0)); - assertEquals("image6.emf", handler.filenames.get(1)); - assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(2)); - assertEquals("image1.png", handler.filenames.get(3)); - assertEquals("image2.jpeg", handler.filenames.get(4)); - assertEquals("image3.png", handler.filenames.get(5)); - assertEquals("image4.emf", handler.filenames.get(6)); - assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(7)); - assertEquals("image5.emf", handler.filenames.get(8)); - // But we do know their types - assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc - assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - assertEquals(TYPE_JPG, handler.mediaTypes.get(4)); // Embedded image - assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image - assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc - assertEquals(TYPE_XLSX, handler.mediaTypes.get(7)); // Embeded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(8)); // Icon of embedded office doc - - - // With recursion, should get their images too - handler = process("testWORD_embeded.docx", extractor, true); - assertEquals(14 + 1 /* thumbnail */, handler.filenames.size()); - assertEquals(14 + 1 /* thumbnail */, handler.mediaTypes.size()); - - // But we do know their types - assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside .pptx - assertEquals(TYPE_GIF, handler.mediaTypes.get(2)); // GIF inside .pptx - assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside .pptx - assertEquals(TYPE_JPG, handler.mediaTypes.get(4)); // Embedded thumbnail - assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc - assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside .doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image - assertEquals(TYPE_JPG, handler.mediaTypes.get(9)); // Embedded image - assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image - assertEquals(TYPE_EMF, handler.mediaTypes.get(11)); // Icon of embedded office doc - assertEquals(TYPE_XLSX, handler.mediaTypes.get(12)); // Embeded office doc - assertEquals(TYPE_PNG, handler.mediaTypes.get(13)); // PNG inside .xlsx - assertEquals(TYPE_EMF, handler.mediaTypes.get(14)); // Icon of embedded office doc - - - // PowerPoint with excel and word - handler = process("testPPT_embeded.pptx", extractor, false); - assertEquals(9 + 1 /* thumbnail */, handler.filenames.size()); - assertEquals(9 + 1 /* thumbnail */, handler.mediaTypes.size()); - - // We don't know their exact filenames - assertEquals("image4.png", handler.filenames.get(0)); - assertEquals("image5.gif", handler.filenames.get(1)); - assertEquals("image6.png", handler.filenames.get(2)); - assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(3)); - assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(4)); - assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(5)); - assertEquals("image1.emf", handler.filenames.get(6)); - assertEquals("image2.emf", handler.filenames.get(7)); - assertEquals("image3.emf", handler.filenames.get(8)); - // But we do know their types - assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); // Embedded image - assertEquals(TYPE_GIF, handler.mediaTypes.get(1)); // Embedded image - assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image - assertEquals(TYPE_XLSX, handler.mediaTypes.get(3)); // Embedded office doc - assertEquals(TYPE_DOCX, handler.mediaTypes.get(4)); // Embedded office doc - assertEquals(TYPE_DOC, handler.mediaTypes.get(5)); // Embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc - assertEquals(TYPE_EMF, handler.mediaTypes.get(8)); // Icon of embedded office doc + TrackingHandler handler; + + + // Excel with a word doc and a powerpoint doc, both of which have images in them + // Without recursion, should see both documents + the images + handler = process("testEXCEL_embeded.xlsx", extractor, false); + assertEquals(7, handler.filenames.size()); + assertEquals(7, handler.mediaTypes.size()); + + // We know the rough filenames + assertEquals("Microsoft_Office_PowerPoint_Presentation1.pptx", handler.filenames.get(0)); + assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(1)); + assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(2)); + assertEquals("image1.png", handler.filenames.get(3)); + assertEquals("image2.emf", handler.filenames.get(4)); + assertEquals("image3.emf", handler.filenames.get(5)); + assertEquals("image4.emf", handler.filenames.get(6)); + // But we do know their types + assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc + assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office doc + assertEquals(TYPE_DOCX, handler.mediaTypes.get(2)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image + assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc + + + // With recursion, should get the images embedded in the office files too + handler = process("testEXCEL_embeded.xlsx", extractor, true); + assertEquals(23 + 1 /*thumbnail */, handler.filenames.size()); + assertEquals(23 + 1 /*thumbnail */, handler.mediaTypes.size()); + + assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside .pptx + assertEquals(TYPE_GIF, handler.mediaTypes.get(2)); // PNG inside .pptx + assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside .pptx + assertEquals(TYPE_XLSX, handler.mediaTypes.get(4)); // .xlsx inside .pptx + assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // PNG inside .xlsx inside .pptx + assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // .docx inside .pptx + assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside .docx inside .pptx + assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // JPG inside .docx inside .pptx + assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // PNG inside .docx inside .pptx + assertEquals(TYPE_DOC, handler.mediaTypes.get(10)); // .doc inside .pptx + assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); // PNG inside .doc inside .pptx + assertEquals(TYPE_EMF, handler.mediaTypes.get(12)); // Icon of item inside .pptx + assertEquals(TYPE_EMF, handler.mediaTypes.get(13)); // Icon of item inside .pptx + assertEquals(TYPE_EMF, handler.mediaTypes.get(14)); // Icon of item inside .pptx + assertEquals(TYPE_JPG, handler.mediaTypes.get(15)); // Embedded thumbnail + assertEquals(TYPE_DOC, handler.mediaTypes.get(16)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(17)); // PNG inside .doc + assertEquals(TYPE_DOCX, handler.mediaTypes.get(18)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(19)); // PNG inside .docx + assertEquals(TYPE_PNG, handler.mediaTypes.get(20)); // Embedded image + assertEquals(TYPE_EMF, handler.mediaTypes.get(21)); // Icon of embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(22)); // Icon of embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(23)); // Icon of embedded office doc + + + // Word with .docx, powerpoint and excel + handler = process("testWORD_embeded.docx", extractor, false); + assertEquals(9, handler.filenames.size()); + assertEquals(9, handler.mediaTypes.size()); + + // We know their rough filenames + assertEquals("Microsoft_Office_PowerPoint_Presentation2.pptx", handler.filenames.get(0)); + assertEquals("image6.emf", handler.filenames.get(1)); + assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(2)); + assertEquals("image1.png", handler.filenames.get(3)); + assertEquals("image2.jpeg", handler.filenames.get(4)); + assertEquals("image3.png", handler.filenames.get(5)); + assertEquals("image4.emf", handler.filenames.get(6)); + assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(7)); + assertEquals("image5.emf", handler.filenames.get(8)); + // But we do know their types + assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc + assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image + assertEquals(TYPE_JPG, handler.mediaTypes.get(4)); // Embedded image + assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image + assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc + assertEquals(TYPE_XLSX, handler.mediaTypes.get(7)); // Embeded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(8)); // Icon of embedded office doc + + + // With recursion, should get their images too + handler = process("testWORD_embeded.docx", extractor, true); + assertEquals(14 + 1 /* thumbnail */, handler.filenames.size()); + assertEquals(14 + 1 /* thumbnail */, handler.mediaTypes.size()); + + // But we do know their types + assertEquals(TYPE_PPTX, handler.mediaTypes.get(0)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside .pptx + assertEquals(TYPE_GIF, handler.mediaTypes.get(2)); // GIF inside .pptx + assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside .pptx + assertEquals(TYPE_JPG, handler.mediaTypes.get(4)); // Embedded thumbnail + assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc + assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside .doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image + assertEquals(TYPE_JPG, handler.mediaTypes.get(9)); // Embedded image + assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image + assertEquals(TYPE_EMF, handler.mediaTypes.get(11)); // Icon of embedded office doc + assertEquals(TYPE_XLSX, handler.mediaTypes.get(12)); // Embeded office doc + assertEquals(TYPE_PNG, handler.mediaTypes.get(13)); // PNG inside .xlsx + assertEquals(TYPE_EMF, handler.mediaTypes.get(14)); // Icon of embedded office doc + + + // PowerPoint with excel and word + handler = process("testPPT_embeded.pptx", extractor, false); + assertEquals(9 + 1 /* thumbnail */, handler.filenames.size()); + assertEquals(9 + 1 /* thumbnail */, handler.mediaTypes.size()); + + // We don't know their exact filenames + assertEquals("image4.png", handler.filenames.get(0)); + assertEquals("image5.gif", handler.filenames.get(1)); + assertEquals("image6.png", handler.filenames.get(2)); + assertEquals("Microsoft_Office_Excel_Worksheet1.xlsx", handler.filenames.get(3)); + assertEquals("Microsoft_Office_Word_Document2.docx", handler.filenames.get(4)); + assertEquals("Microsoft_Office_Word_97_-_2003_Document1.doc", handler.filenames.get(5)); + assertEquals("image1.emf", handler.filenames.get(6)); + assertEquals("image2.emf", handler.filenames.get(7)); + assertEquals("image3.emf", handler.filenames.get(8)); + // But we do know their types + assertEquals(TYPE_PNG, handler.mediaTypes.get(0)); // Embedded image + assertEquals(TYPE_GIF, handler.mediaTypes.get(1)); // Embedded image + assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image + assertEquals(TYPE_XLSX, handler.mediaTypes.get(3)); // Embedded office doc + assertEquals(TYPE_DOCX, handler.mediaTypes.get(4)); // Embedded office doc + assertEquals(TYPE_DOC, handler.mediaTypes.get(5)); // Embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc + assertEquals(TYPE_EMF, handler.mediaTypes.get(8)); // Icon of embedded office doc } @Test
