Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Fri May 29 14:36:21 2015 @@ -33,15 +33,15 @@ import org.junit.Test; import org.xml.sax.helpers.DefaultHandler; public class JpegParserTest { - + private final Parser parser = new JpegParser(); - + @Test public void testJPEG() throws Exception { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg"); + getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); // Core EXIF/TIFF tags @@ -49,7 +49,7 @@ public class JpegParserTest { assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE)); assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL)); - + assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600 assertEquals("5.6", metadata.get(Metadata.F_NUMBER)); assertEquals("false", metadata.get(Metadata.FLASH_FIRED)); @@ -62,24 +62,24 @@ public class JpegParserTest { assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL)); assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL)); assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT)); - + // Check that EXIF/TIFF tags come through with their raw values too // (This may be removed for Tika 1.0, as we support more of them // with explicit Metadata entries) assertEquals("Canon EOS 40D", metadata.get("Model")); - + // Common tags //assertEquals("2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED)); assertEquals("Date/Time Original for when the photo was taken, unspecified time zone", "2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED)); List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); assertTrue("'canon-55-250' expected in " + keywords, keywords.contains("canon-55-250")); - assertTrue("'moscow-birds' expected in " + keywords, keywords.contains("moscow-birds")); + assertTrue("'moscow-birds' expected in " + keywords, keywords.contains("moscow-birds")); assertTrue("'serbor' expected in " + keywords, keywords.contains("serbor")); assertFalse(keywords.contains("canon-55-250 moscow-birds serbor")); List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT)); assertTrue("'canon-55-250' expected in " + subject, subject.contains("canon-55-250")); - assertTrue("'moscow-birds' expected in " + subject, subject.contains("moscow-birds")); + assertTrue("'moscow-birds' expected in " + subject, subject.contains("moscow-birds")); assertTrue("'serbor' expected in " + subject, subject.contains("serbor")); assertFalse(subject.contains("canon-55-250 moscow-birds serbor")); } @@ -92,19 +92,19 @@ public class JpegParserTest { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg"); + getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); - + // Geo tags assertEquals("12.54321", metadata.get(Metadata.LATITUDE)); assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE)); - + // Core EXIF/TIFF tags assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH)); assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE)); assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL)); - + assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600 assertEquals("5.6", metadata.get(Metadata.F_NUMBER)); assertEquals("false", metadata.get(Metadata.FLASH_FIRED)); @@ -117,7 +117,7 @@ public class JpegParserTest { assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL)); assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL)); assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT)); - + // Common tags assertEquals("Date/Time Original for when the photo was taken, unspecified time zone", "2009-08-11T09:09:45", metadata.get(TikaCoreProperties.CREATED)); @@ -131,48 +131,48 @@ public class JpegParserTest { /** * Test for an image with the geographic information stored in a slightly - * different way, see TIKA-915 for details + * different way, see TIKA-915 for details * Disabled for now, pending a fix to the underlying library */ @Test public void testJPEGGeo2() throws Exception { - Metadata metadata = new Metadata(); - metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); - InputStream stream = - getClass().getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg"); - parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); - - // Geo tags should be there with 5dp, and not rounded - assertEquals("51.575762", metadata.get(Metadata.LATITUDE)); - assertEquals("-1.567886", metadata.get(Metadata.LONGITUDE)); + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); + InputStream stream = + getClass().getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg"); + parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); + + // Geo tags should be there with 5dp, and not rounded + assertEquals("51.575762", metadata.get(Metadata.LATITUDE)); + assertEquals("-1.567886", metadata.get(Metadata.LONGITUDE)); } - + @Test public void testJPEGTitleAndDescription() throws Exception { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg"); + getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); - + // embedded comments with non-ascii characters assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); // Dublin Core // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands" // but we have to replace them with underscore - + List<String> keywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS)); assertTrue(keywords.contains("coast")); assertTrue(keywords.contains("bird watching")); assertEquals(keywords, Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS))); - + // Core EXIF/TIFF tags assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH)); assertEquals("77", metadata.get(Metadata.IMAGE_LENGTH)); assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE)); assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL)); - + assertEquals("1.0E-6", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1000000 assertEquals("2.8", metadata.get(Metadata.F_NUMBER)); assertEquals("4.6", metadata.get(Metadata.FOCAL_LENGTH)); @@ -183,35 +183,35 @@ public class JpegParserTest { assertEquals("1", metadata.get(Metadata.ORIENTATION)); // Not present assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL)); assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL)); - assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT)); + assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT)); } - + @Test public void testJPEGTitleAndDescriptionPhotoshop() throws Exception { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg"); + getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); - + // embedded comments with non-ascii characters assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); List<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); - assertTrue("got " + keywords, keywords.contains("bird watching")); + assertTrue("got " + keywords, keywords.contains("bird watching")); List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT)); - assertTrue("got " + subject, subject.contains("bird watching")); + assertTrue("got " + subject, subject.contains("bird watching")); } - + @Test public void testJPEGTitleAndDescriptionXnviewmp() throws Exception { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg"); + getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); - + // XnViewMp's default comment dialog has only comment, not headline. // Comment is embedded only if "Write comments in XMP" is enabled in settings assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); @@ -220,31 +220,31 @@ public class JpegParserTest { String[] subject = metadata.getValues(TikaCoreProperties.KEYWORDS); List<String> keywords = Arrays.asList(subject); assertTrue("'coast'" + " not in " + keywords, keywords.contains("coast")); - assertTrue("'nature reserve'" + " not in " + keywords, keywords.contains("nature reserve")); + assertTrue("'nature reserve'" + " not in " + keywords, keywords.contains("nature reserve")); } - + @Test public void testJPEGoddTagComponent() throws Exception { - Metadata metadata = new Metadata(); - metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); - InputStream stream = - getClass().getResourceAsStream("/test-documents/testJPEG_oddTagComponent.jpg"); - parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); - - assertEquals(null, metadata.get(TikaCoreProperties.TITLE)); - assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION)); - assertEquals("251", metadata.get(Metadata.IMAGE_WIDTH)); - assertEquals("384", metadata.get(Metadata.IMAGE_LENGTH)); + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); + InputStream stream = + getClass().getResourceAsStream("/test-documents/testJPEG_oddTagComponent.jpg"); + parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); + + assertEquals(null, metadata.get(TikaCoreProperties.TITLE)); + assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION)); + assertEquals("251", metadata.get(Metadata.IMAGE_WIDTH)); + assertEquals("384", metadata.get(Metadata.IMAGE_LENGTH)); } - + @Test public void testJPEGEmptyEXIFDateTime() throws Exception { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); InputStream stream = - getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg"); + getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg"); parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); assertEquals("300.0", metadata.get(TIFF.RESOLUTION_HORIZONTAL)); assertEquals("300.0", metadata.get(TIFF.RESOLUTION_VERTICAL)); - } + } }
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Fri May 29 14:36:21 2015 @@ -54,6 +54,13 @@ import org.xml.sax.helpers.DefaultHandle public class RFC822ParserTest extends TikaTest { + private static InputStream getStream(String name) { + InputStream stream = Thread.currentThread().getContextClassLoader() + .getResourceAsStream(name); + assertNotNull("Test file not found " + name, stream); + return stream; + } + @Test public void testSimple() { Parser parser = new RFC822Parser(); @@ -73,9 +80,9 @@ public class RFC822ParserTest extends Ti verify(handler).endDocument(); //note no leading spaces, and no quotes assertEquals("Julien Nioche (JIRA) <[email protected]>", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", + assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", + assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(Metadata.SUBJECT)); } catch (Exception e) { fail("Exception thrown: " + e.getMessage()); @@ -103,11 +110,11 @@ public class RFC822ParserTest extends Ti verify(handler, times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class)); verify(handler, times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p", "p"); verify(handler).endDocument(); - + } catch (Exception e) { fail("Exception thrown: " + e.getMessage()); } - + //repeat, this time looking at content parser = new RFC822Parser(); metadata = new Metadata(); @@ -172,35 +179,35 @@ public class RFC822ParserTest extends Ti parser.parse(stream, handler, metadata, new ParseContext()); //tests correct decoding of internationalized headers, both //quoted-printable (Q) and Base64 (B). - assertEquals("Keld J\u00F8rn Simonsen <[email protected]>", + assertEquals("Keld J\u00F8rn Simonsen <[email protected]>", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("If you can read this you understand the example.", + assertEquals("If you can read this you understand the example.", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("If you can read this you understand the example.", + assertEquals("If you can read this you understand the example.", metadata.get(Metadata.SUBJECT)); } catch (Exception e) { fail("Exception thrown: " + e.getMessage()); } } - + /** * The from isn't in the usual form. * See TIKA-618 */ @Test public void testUnusualFromAddress() throws Exception { - Parser parser = new RFC822Parser(); - Metadata metadata = new Metadata(); - InputStream stream = getStream("test-documents/testRFC822_oddfrom"); - ContentHandler handler = mock(DefaultHandler.class); - - parser.parse(stream, handler, metadata, new ParseContext()); - assertEquals("Saved by Windows Internet Explorer 7", - metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("Air Permit Programs | Air & Radiation | US EPA", - metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Air Permit Programs | Air & Radiation | US EPA", - metadata.get(Metadata.SUBJECT)); + Parser parser = new RFC822Parser(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("test-documents/testRFC822_oddfrom"); + ContentHandler handler = mock(DefaultHandler.class); + + parser.parse(stream, handler, metadata, new ParseContext()); + assertEquals("Saved by Windows Internet Explorer 7", + metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Air Permit Programs | Air & Radiation | US EPA", + metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Air Permit Programs | Air & Radiation | US EPA", + metadata.get(Metadata.SUBJECT)); } /** @@ -236,36 +243,36 @@ public class RFC822ParserTest extends Ti new ByteArrayInputStream(data), handler, metadata, context); assertEquals(name.trim(), metadata.get(TikaCoreProperties.CREATOR)); } - + /** * Test for TIKA-678 - not all headers may be present */ @Test public void testSomeMissingHeaders() throws Exception { - Parser parser = new RFC822Parser(); - Metadata metadata = new Metadata(); - InputStream stream = getStream("test-documents/testRFC822-limitedheaders"); - ContentHandler handler = new BodyContentHandler(); - - parser.parse(stream, handler, metadata, new ParseContext()); - assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR)); - assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]); - assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]); - assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM)); - assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]); - assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]); - assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO)); - assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]); - assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]); - assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("abcd", metadata.get(Metadata.SUBJECT)); - assertContains("bar biz bat", handler.toString()); + Parser parser = new RFC822Parser(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("test-documents/testRFC822-limitedheaders"); + ContentHandler handler = new BodyContentHandler(); + + parser.parse(stream, handler, metadata, new ParseContext()); + assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR)); + assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]); + assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]); + assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM)); + assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]); + assertEquals("abc", metadata.getValues(Metadata.MESSAGE_FROM)[1]); + assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_TO)); + assertEquals("abc", metadata.getValues(Metadata.MESSAGE_TO)[0]); + assertEquals("def", metadata.getValues(Metadata.MESSAGE_TO)[1]); + assertEquals("abcd", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("abcd", metadata.get(Metadata.SUBJECT)); + assertContains("bar biz bat", handler.toString()); } - + /** * Test TIKA-1028 - If the mail contains an encrypted attachment (or - * an attachment that others triggers an error), parsing should carry - * on for the remainder regardless + * an attachment that others triggers an error), parsing should carry + * on for the remainder regardless */ @Test public void testEncryptedZipAttachment() throws Exception { @@ -275,40 +282,40 @@ public class RFC822ParserTest extends Ti InputStream stream = getStream("test-documents/testRFC822_encrypted_zip"); ContentHandler handler = new BodyContentHandler(); parser.parse(stream, handler, metadata, context); - + // Check we go the metadata assertEquals("Juha Haaga <[email protected]>", metadata.get(Metadata.MESSAGE_FROM)); assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE)); - + // Check we got the message text, for both Plain Text and HTML assertContains("Includes encrypted zip file", handler.toString()); assertContains("password is \"test\".", handler.toString()); assertContains("This is the Plain Text part", handler.toString()); assertContains("This is the HTML part", handler.toString()); - + // We won't get the contents of the zip file, but we will get the name assertContains("text.txt", handler.toString()); assertNotContained("ENCRYPTED ZIP FILES", handler.toString()); - + // Try again, this time with the password supplied // Check that we also get the zip's contents as well context.set(PasswordProvider.class, new PasswordProvider() { public String getPassword(Metadata metadata) { return "test"; } - }); + }); stream = getStream("test-documents/testRFC822_encrypted_zip"); handler = new BodyContentHandler(); parser.parse(stream, handler, metadata, context); - + assertContains("Includes encrypted zip file", handler.toString()); assertContains("password is \"test\".", handler.toString()); assertContains("This is the Plain Text part", handler.toString()); assertContains("This is the HTML part", handler.toString()); - + // We do get the name of the file in the encrypted zip file assertContains("text.txt", handler.toString()); - + // TODO Upgrade to a version of Commons Compress with Encryption // support, then verify we get the contents of the text file // held within the encrypted zip @@ -317,10 +324,10 @@ public class RFC822ParserTest extends Ti assertContains("ENCRYPTED ZIP FILES", handler.toString()); assertContains("TIKA-1028", handler.toString()); } - + /** * Test TIKA-1028 - Ensure we can get the contents of an - * un-encrypted zip file + * un-encrypted zip file */ @Test public void testNormalZipAttachment() throws Exception { @@ -330,26 +337,26 @@ public class RFC822ParserTest extends Ti InputStream stream = getStream("test-documents/testRFC822_normal_zip"); ContentHandler handler = new BodyContentHandler(); parser.parse(stream, handler, metadata, context); - + // Check we go the metadata assertEquals("Juha Haaga <[email protected]>", metadata.get(Metadata.MESSAGE_FROM)); assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE)); - + // Check we got the message text, for both Plain Text and HTML assertContains("Includes a normal, unencrypted zip file", handler.toString()); assertContains("This is the Plain Text part", handler.toString()); assertContains("This is the HTML part", handler.toString()); - + // We get both name and contents of the zip file's contents assertContains("text.txt", handler.toString()); assertContains("TEST DATA FOR TIKA.", handler.toString()); assertContains("This is text inside an unencrypted zip file", handler.toString()); assertContains("TIKA-1028", handler.toString()); } - + /** * TIKA-1222 When requested, ensure that the various attachments of - * the mail come through properly as embedded resources + * the mail come through properly as embedded resources */ @Test public void testGetAttachmentsAsEmbeddedResources() throws Exception { @@ -364,11 +371,11 @@ public class RFC822ParserTest extends Ti if (tis != null) tis.close(); } - + // Check we found all 3 parts assertEquals(3, tracker.filenames.size()); assertEquals(3, tracker.mediaTypes.size()); - + // No filenames available assertEquals(null, tracker.filenames.get(0)); assertEquals(null, tracker.filenames.get(1)); @@ -378,11 +385,4 @@ public class RFC822ParserTest extends Ti assertEquals(MediaType.TEXT_HTML, tracker.mediaTypes.get(1)); assertEquals(MediaType.image("gif"), tracker.mediaTypes.get(2)); } - - private static InputStream getStream(String name) { - InputStream stream = Thread.currentThread().getContextClassLoader() - .getResourceAsStream(name); - assertNotNull("Test file not found " + name, stream); - return stream; - } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java Fri May 29 14:36:21 2015 @@ -35,137 +35,137 @@ import org.xml.sax.ContentHandler; public class MboxParserTest { - protected ParseContext recursingContext; - private Parser autoDetectParser; - private TypeDetector typeDetector; - private MboxParser mboxParser; - - @Before - public void setUp() throws Exception { - typeDetector = new TypeDetector(); - autoDetectParser = new AutoDetectParser(typeDetector); - recursingContext = new ParseContext(); - recursingContext.set(Parser.class, autoDetectParser); - - mboxParser = new MboxParser(); - mboxParser.setTracking(true); - } - - @Test - public void testSimple() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - InputStream stream = getStream("/test-documents/simple.mbox"); - - try { - mboxParser.parse(stream, handler, metadata, recursingContext); - } finally { - stream.close(); - } - - String content = handler.toString(); - assertContains("Test content 1", content); - assertContains("Test content 2", content); - assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE)); - - Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata(); - assertEquals("Nb. Of mails", 2, mailsMetadata.size()); - - Metadata mail1 = mailsMetadata.get(0); - assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE)); - assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", mail1.get("MboxParser-from")); - - Metadata mail2 = mailsMetadata.get(1); - assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE)); - assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", mail2.get("MboxParser-from")); - } - - @Test - public void testHeaders() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - InputStream stream = getStream("/test-documents/headers.mbox"); - - try { - mboxParser.parse(stream, handler, metadata, recursingContext); - } finally { - stream.close(); - } - - assertContains("Test content", handler.toString()); - assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size()); - - Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0); - - assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED)); - assertEquals("<[email protected]>", mailMetadata.get(TikaCoreProperties.CREATOR)); - assertEquals("subject", mailMetadata.get(Metadata.SUBJECT)); - assertEquals("<[email protected]>", mailMetadata.get(Metadata.AUTHOR)); - assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE)); - assertEquals("[email protected]", mailMetadata.get("Message-From")); - assertEquals("<[email protected]>", mailMetadata.get("MboxParser-return-path")); - } - - @Test - public void testMultilineHeader() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - InputStream stream = getStream("/test-documents/multiline.mbox"); - - try { - mboxParser.parse(stream, handler, metadata, recursingContext); - } finally { - stream.close(); - } - - assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size()); - - Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0); - assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received")); - } - - @Test - public void testQuoted() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - InputStream stream = getStream("/test-documents/quoted.mbox"); - - try { - mboxParser.parse(stream, handler, metadata, recursingContext); - } finally { - stream.close(); - } - - assertContains("Test content", handler.toString()); - assertContains("> quoted stuff", handler.toString()); - } - - @Test - public void testComplex() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - InputStream stream = getStream("/test-documents/complex.mbox"); - - try { - mboxParser.parse(stream, handler, metadata, recursingContext); - } finally { - stream.close(); - } - - assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size()); - - Metadata firstMail = mboxParser.getTrackingMetadata().get(0); - assertEquals("Re: question about when shuffle/sort start working", firstMail.get(Metadata.SUBJECT)); - assertEquals("Re: question about when shuffle/sort start working", firstMail.get(TikaCoreProperties.TITLE)); - assertEquals("Jothi Padmanabhan <[email protected]>", firstMail.get(Metadata.AUTHOR)); - assertEquals("Jothi Padmanabhan <[email protected]>", firstMail.get(TikaCoreProperties.CREATOR)); - assertEquals("[email protected]", firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS)); - - assertContains("When a Mapper completes", handler.toString()); - } - - private static InputStream getStream(String name) { - return MboxParserTest.class.getClass().getResourceAsStream(name); - } + protected ParseContext recursingContext; + private Parser autoDetectParser; + private TypeDetector typeDetector; + private MboxParser mboxParser; + + private static InputStream getStream(String name) { + return MboxParserTest.class.getClass().getResourceAsStream(name); + } + + @Before + public void setUp() throws Exception { + typeDetector = new TypeDetector(); + autoDetectParser = new AutoDetectParser(typeDetector); + recursingContext = new ParseContext(); + recursingContext.set(Parser.class, autoDetectParser); + + mboxParser = new MboxParser(); + mboxParser.setTracking(true); + } + + @Test + public void testSimple() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("/test-documents/simple.mbox"); + + try { + mboxParser.parse(stream, handler, metadata, recursingContext); + } finally { + stream.close(); + } + + String content = handler.toString(); + assertContains("Test content 1", content); + assertContains("Test content 2", content); + assertEquals("application/mbox", metadata.get(Metadata.CONTENT_TYPE)); + + Map<Integer, Metadata> mailsMetadata = mboxParser.getTrackingMetadata(); + assertEquals("Nb. Of mails", 2, mailsMetadata.size()); + + Metadata mail1 = mailsMetadata.get(0); + assertEquals("message/rfc822", mail1.get(Metadata.CONTENT_TYPE)); + assertEquals("envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009", mail1.get("MboxParser-from")); + + Metadata mail2 = mailsMetadata.get(1); + assertEquals("message/rfc822", mail2.get(Metadata.CONTENT_TYPE)); + assertEquals("envelope-sender-mailbox-name Mon Jun 01 11:00:00 2010", mail2.get("MboxParser-from")); + } + + @Test + public void testHeaders() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("/test-documents/headers.mbox"); + + try { + mboxParser.parse(stream, handler, metadata, recursingContext); + } finally { + stream.close(); + } + + assertContains("Test content", handler.toString()); + assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size()); + + Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0); + + assertEquals("2009-06-10T03:58:45Z", mailMetadata.get(TikaCoreProperties.CREATED)); + assertEquals("<[email protected]>", mailMetadata.get(TikaCoreProperties.CREATOR)); + assertEquals("subject", mailMetadata.get(Metadata.SUBJECT)); + assertEquals("<[email protected]>", mailMetadata.get(Metadata.AUTHOR)); + assertEquals("message/rfc822", mailMetadata.get(Metadata.CONTENT_TYPE)); + assertEquals("[email protected]", mailMetadata.get("Message-From")); + assertEquals("<[email protected]>", mailMetadata.get("MboxParser-return-path")); + } + + @Test + public void testMultilineHeader() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("/test-documents/multiline.mbox"); + + try { + mboxParser.parse(stream, handler, metadata, recursingContext); + } finally { + stream.close(); + } + + assertEquals("Nb. Of mails", 1, mboxParser.getTrackingMetadata().size()); + + Metadata mailMetadata = mboxParser.getTrackingMetadata().get(0); + assertEquals("from xxx by xxx with xxx; date", mailMetadata.get("MboxParser-received")); + } + + @Test + public void testQuoted() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("/test-documents/quoted.mbox"); + + try { + mboxParser.parse(stream, handler, metadata, recursingContext); + } finally { + stream.close(); + } + + assertContains("Test content", handler.toString()); + assertContains("> quoted stuff", handler.toString()); + } + + @Test + public void testComplex() throws Exception { + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + InputStream stream = getStream("/test-documents/complex.mbox"); + + try { + mboxParser.parse(stream, handler, metadata, recursingContext); + } finally { + stream.close(); + } + + assertEquals("Nb. Of mails", 3, mboxParser.getTrackingMetadata().size()); + + Metadata firstMail = mboxParser.getTrackingMetadata().get(0); + assertEquals("Re: question about when shuffle/sort start working", firstMail.get(Metadata.SUBJECT)); + assertEquals("Re: question about when shuffle/sort start working", firstMail.get(TikaCoreProperties.TITLE)); + assertEquals("Jothi Padmanabhan <[email protected]>", firstMail.get(Metadata.AUTHOR)); + assertEquals("Jothi Padmanabhan <[email protected]>", firstMail.get(TikaCoreProperties.CREATOR)); + assertEquals("[email protected]", firstMail.get(Metadata.MESSAGE_RECIPIENT_ADDRESS)); + + assertContains("When a Mapper completes", handler.toString()); + } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java Fri May 29 14:36:21 2015 @@ -28,7 +28,7 @@ import org.apache.tika.mime.MediaType; /** * Parent class of tests that the various POI powered parsers are - * able to extract their embedded contents. + * able to extract their embedded contents. */ public abstract class AbstractPOIContainerExtractionTest { public static final MediaType TYPE_DOC = MediaType.application("msword"); @@ -38,16 +38,24 @@ public abstract class AbstractPOIContain public static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"); public static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"); public static final MediaType TYPE_MSG = MediaType.application("vnd.ms-outlook"); - + public static final MediaType TYPE_TXT = MediaType.text("plain"); public static final MediaType TYPE_PDF = MediaType.application("pdf"); - + public static final MediaType TYPE_JPG = MediaType.image("jpeg"); public static final MediaType TYPE_GIF = MediaType.image("gif"); public static final MediaType TYPE_PNG = MediaType.image("png"); public static final MediaType TYPE_EMF = MediaType.application("x-emf"); public static final MediaType TYPE_WMF = MediaType.application("x-msmetafile"); + protected static TikaInputStream getTestFile(String filename) throws Exception { + URL input = AbstractPOIContainerExtractionTest.class.getResource( + "/test-documents/" + filename); + assertNotNull(filename + " not found", input); + + return TikaInputStream.get(input); + } + protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception { TikaInputStream stream = getTestFile(filename); try { @@ -55,7 +63,7 @@ public abstract class AbstractPOIContain // Process it TrackingHandler handler = new TrackingHandler(); - if(recurse) { + if (recurse) { extractor.extract(stream, extractor, handler); } else { extractor.extract(stream, null, handler); @@ -67,12 +75,4 @@ public abstract class AbstractPOIContain stream.close(); } } - - protected static TikaInputStream getTestFile(String filename) throws Exception { - URL input = AbstractPOIContainerExtractionTest.class.getResource( - "/test-documents/" + filename); - assertNotNull(filename + " not found", input); - - return TikaInputStream.get(input); - } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Fri May 29 14:36:21 2015 @@ -5,9 +5,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -60,15 +60,15 @@ public class ExcelParserTest { assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); - + // Mon Oct 01 17:13:56 BST 2007 assertEquals("2007-10-01T16:13:56Z", metadata.get(TikaCoreProperties.CREATED)); assertEquals("2007-10-01T16:13:56Z", metadata.get(Metadata.CREATION_DATE)); - + // Mon Oct 01 17:31:43 BST 2007 assertEquals("2007-10-01T16:31:43Z", metadata.get(TikaCoreProperties.MODIFIED)); assertEquals("2007-10-01T16:31:43Z", metadata.get(Metadata.DATE)); - + String content = handler.toString(); assertContains("Sample Excel Worksheet", content); assertContains("Numbers and their Squares", content); @@ -115,7 +115,7 @@ public class ExcelParserTest { // Percentage. assertContains("2.50%", content); // Excel rounds up to 3%, but that requires Java 1.6 or later - if(System.getProperty("java.version").startsWith("1.5")) { + if (System.getProperty("java.version").startsWith("1.5")) { assertContains("2%", content); } else { assertContains("3%", content); @@ -130,31 +130,31 @@ public class ExcelParserTest { // Date Format: m/d/yy assertContains("10/3/09", content); - + // Date/Time Format: m/d/yy h:mm assertContains("1/19/08 4:35", content); // Fraction (2.5): # ?/? assertContains("2 1/2", content); - + // Below assertions represent outstanding formatting issues to be addressed // they are included to allow the issues to be progressed with the Apache POI // team - See TIKA-103. /************************************************************************* - // Custom Number (0 "dollars and" .00 "cents") - assertContains("19 dollars and .99 cents", content); + // Custom Number (0 "dollars and" .00 "cents") + assertContains("19 dollars and .99 cents", content); - // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy) - assertContains("At 4:20 AM on Thursday May 17, 2007", content); - **************************************************************************/ + // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy) + assertContains("At 4:20 AM on Thursday May 17, 2007", content); + **************************************************************************/ } finally { input.close(); } } - + @Test public void testExcelParserPassword() throws Exception { InputStream input = ExcelParserTest.class.getResourceAsStream( @@ -191,11 +191,11 @@ public class ExcelParserTest { assertEquals( "application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE)); - + assertEquals(null, metadata.get(TikaCoreProperties.TITLE)); assertEquals("Antoni", metadata.get(TikaCoreProperties.CREATOR)); assertEquals("2011-11-25T09:52:48Z", metadata.get(TikaCoreProperties.CREATED)); - + String content = handler.toString(); assertContains("This is an Encrypted Excel spreadsheet", content); assertNotContained("9.0", content); @@ -210,24 +210,24 @@ public class ExcelParserTest { @Test public void testExcelParserCharts() throws Exception { InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL-charts.xls"); + "/test-documents/testEXCEL-charts.xls"); try { Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); context.set(Locale.class, Locale.US); ContentHandler handler = new BodyContentHandler(); new OfficeParser().parse(input, handler, metadata, context); - + assertEquals( "application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE)); - + String content = handler.toString(); - + // The first sheet has a pie chart assertContains("charttabyodawg", content); assertContains("WhamPuff", content); - + // The second sheet has a bar chart and some text assertContains("Sheet1", content); assertContains("Test Excel Spreasheet", content); @@ -236,7 +236,7 @@ public class ExcelParserTest { assertContains("fizzlepuff", content); assertContains("whyaxis", content); assertContains("eksaxis", content); - + // The third sheet has some text assertContains("Sheet2", content); assertContains("dingdong", content); @@ -265,7 +265,7 @@ public class ExcelParserTest { input.close(); } } - + @Test public void testWorksSpreadsheet70() throws Exception { InputStream input = ExcelParserTest.class.getResourceAsStream( @@ -291,43 +291,43 @@ public class ExcelParserTest { */ @Test public void testExcelXLSB() throws Exception { - Detector detector = new DefaultDetector(); - AutoDetectParser parser = new AutoDetectParser(); - - InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL.xlsb"); - Metadata m = new Metadata(); - m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb"); - - // Should be detected correctly - MediaType type = null; - try { - type = detector.detect(input, m); - assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString()); - } finally { - input.close(); - } - - // OfficeParser won't handle it - assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type)); - - // OOXMLParser won't handle it - assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type)); - - // AutoDetectParser doesn't break on it - input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb"); - - try { - ContentHandler handler = new BodyContentHandler(-1); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - parser.parse(input, handler, m, context); - - String content = handler.toString(); - assertEquals("", content); - } finally { - input.close(); - } + Detector detector = new DefaultDetector(); + AutoDetectParser parser = new AutoDetectParser(); + + InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL.xlsb"); + Metadata m = new Metadata(); + m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb"); + + // Should be detected correctly + MediaType type = null; + try { + type = detector.detect(input, m); + assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString()); + } finally { + input.close(); + } + + // OfficeParser won't handle it + assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type)); + + // OOXMLParser won't handle it + assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type)); + + // AutoDetectParser doesn't break on it + input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb"); + + try { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + parser.parse(input, handler, m, context); + + String content = handler.toString(); + assertEquals("", content); + } finally { + input.close(); + } } /** @@ -335,32 +335,32 @@ public class ExcelParserTest { */ @Test public void testExcel95() throws Exception { - Detector detector = new DefaultDetector(); - AutoDetectParser parser = new AutoDetectParser(); - InputStream input; - MediaType type; - Metadata m; - - // First try detection of Excel 5 - m = new Metadata(); - m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls"); - input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls"); - try { - type = detector.detect(input, m); - assertEquals("application/vnd.ms-excel", type.toString()); - } finally { - input.close(); - } - - // Now Excel 95 - m = new Metadata(); - m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls"); - input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls"); - try { - type = detector.detect(input, m); - assertEquals("application/vnd.ms-excel", type.toString()); + Detector detector = new DefaultDetector(); + AutoDetectParser parser = new AutoDetectParser(); + InputStream input; + MediaType type; + Metadata m; + + // First try detection of Excel 5 + m = new Metadata(); + m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls"); + input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls"); + try { + type = detector.detect(input, m); + assertEquals("application/vnd.ms-excel", type.toString()); + } finally { + input.close(); + } + + // Now Excel 95 + m = new Metadata(); + m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls"); + input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls"); + try { + type = detector.detect(input, m); + assertEquals("application/vnd.ms-excel", type.toString()); } finally { - input.close(); + input.close(); } // OfficeParser can handle it @@ -368,8 +368,8 @@ public class ExcelParserTest { // OOXMLParser won't handle it assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type)); - - + + // Parse the Excel 5 file m = new Metadata(); input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls"); @@ -380,26 +380,26 @@ public class ExcelParserTest { parser.parse(input, handler, m, context); String content = handler.toString(); - + // Sheet names assertContains("Feuil1", content); assertContains("Feuil3", content); - + // Text assertContains("Sample Excel", content); assertContains("Number", content); - + // Numbers assertContains("15", content); assertContains("225", content); - + // Metadata was also fetched assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE)); assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR)); } finally { input.close(); } - + // Parse the Excel 95 file m = new Metadata(); input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls"); @@ -410,12 +410,12 @@ public class ExcelParserTest { parser.parse(input, handler, m, context); String content = handler.toString(); - + // Sheet name assertContains("Foglio1", content); - + // Very boring file, no actual text or numbers! - + // Metadata was also fetched assertEquals(null, m.get(TikaCoreProperties.TITLE)); assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR)); @@ -423,35 +423,35 @@ public class ExcelParserTest { input.close(); } } - + /** * Ensures that custom OLE2 (HPSF) properties are extracted */ @Test public void testCustomProperties() throws Exception { - InputStream input = ExcelParserTest.class.getResourceAsStream( - "/test-documents/testEXCEL_custom_props.xls"); - Metadata metadata = new Metadata(); - - try { - ContentHandler handler = new BodyContentHandler(-1); - ParseContext context = new ParseContext(); - context.set(Locale.class, Locale.US); - new OfficeParser().parse(input, handler, metadata, context); - } finally { - input.close(); - } - - assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("", metadata.get(TikaCoreProperties.CREATOR)); - assertEquals("", metadata.get(TikaCoreProperties.MODIFIER)); - assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED)); - assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED)); - assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION)); - assertEquals("true", metadata.get("custom:myCustomBoolean")); - assertEquals("3", metadata.get("custom:myCustomNumber")); - assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); - assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate")); - assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); + InputStream input = ExcelParserTest.class.getResourceAsStream( + "/test-documents/testEXCEL_custom_props.xls"); + Metadata metadata = new Metadata(); + + try { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OfficeParser().parse(input, handler, metadata, context); + } finally { + input.close(); + } + + assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION)); + assertEquals("true", metadata.get("custom:myCustomBoolean")); + assertEquals("3", metadata.get("custom:myCustomNumber")); + assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); + assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate")); + assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java Fri May 29 14:36:21 2015 @@ -28,20 +28,19 @@ import org.apache.tika.parser.microsoft. import org.junit.Test; - public class OfficeParserTest extends TikaTest { - @Test - public void parseOfficeWord() throws Exception { - Metadata metadata = new Metadata(); - Parser parser = new OfficeParser(); - - String xml = getXML(getTestDocument("test.doc"), parser, metadata).xml; - - assertTrue(xml.contains("test")); - } - - private InputStream getTestDocument(String name) { - return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name)); -} + @Test + public void parseOfficeWord() throws Exception { + Metadata metadata = new Metadata(); + Parser parser = new OfficeParser(); + + String xml = getXML(getTestDocument("test.doc"), parser, metadata).xml; + + assertTrue(xml.contains("test")); + } + + private InputStream getTestDocument(String name) { + return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name)); + } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java Fri May 29 14:36:21 2015 @@ -65,12 +65,12 @@ public class OldExcelParserTest extends // We can get the content type assertEquals("application/vnd.ms-excel.sheet.4", metadata.get(Metadata.CONTENT_TYPE)); - + // But no other metadata assertEquals(null, metadata.get(TikaCoreProperties.TITLE)); assertEquals(null, metadata.get(Metadata.SUBJECT)); } - + /** * Check we can get the plain text properly */ @@ -85,9 +85,9 @@ public class OldExcelParserTest extends } finally { stream.close(); } - + String text = handler.toString(); - + // Check we find a few words we expect in there assertContains("Size", text); assertContains("Returns", text); @@ -104,15 +104,15 @@ public class OldExcelParserTest extends public void testHTML() throws Exception { XMLResult result = getXML(file); String xml = result.xml; - + // Sheet name not found - only 5+ have sheet names assertNotContained("<p>Sheet 1</p>", xml); - + // String cells assertContains("<p>Table 10 -", xml); assertContains("<p>Tax</p>", xml); assertContains("<p>N/A</p>", xml); - + // Number cells assertContains("<p>(1)</p>", xml); assertContains("<p>5.0</p>", xml); Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Fri May 29 14:36:21 2015 @@ -21,15 +21,14 @@ import static org.junit.Assert.assertEqu import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -import java.io.InputStream; -import java.io.StringWriter; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - import javax.xml.transform.OutputKeys; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; +import java.io.InputStream; +import java.io.StringWriter; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -74,7 +73,7 @@ public class OutlookParserTest { assertEquals( "L'\u00C9quipe Microsoft Outlook Express", metadata.get(Metadata.AUTHOR)); - + // Stored as Thu, 5 Apr 2007 09:26:06 -0700 assertEquals( "2007-04-05T16:26:06Z", @@ -118,7 +117,7 @@ public class OutlookParserTest { } /** - * Test case for TIKA-395, to ensure parser works for new Outlook formats. + * Test case for TIKA-395, to ensure parser works for new Outlook formats. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a> */ @@ -148,29 +147,29 @@ public class OutlookParserTest { assertContains("Streamlined Mail Experience", content); assertContains("Navigation Pane", content); } - + @Test public void testOutlookHTMLVersion() throws Exception { Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); - + // Check the HTML version StringWriter sw = new StringWriter(); SAXTransformerFactory factory = (SAXTransformerFactory) - SAXTransformerFactory.newInstance(); + SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); handler.setResult(new StreamResult(sw)); InputStream stream = OutlookParserTest.class.getResourceAsStream( - "/test-documents/testMSG_chinese.msg"); + "/test-documents/testMSG_chinese.msg"); try { - parser.parse(stream, handler, metadata, new ParseContext()); + parser.parse(stream, handler, metadata, new ParseContext()); } finally { - stream.close(); + stream.close(); } - + // As the HTML version should have been processed, ensure // we got some of the links String content = sw.toString(); @@ -178,7 +177,7 @@ public class OutlookParserTest { assertContains("<p>Alfresco MSG format testing", content); assertContains("<li>1", content); assertContains("<li>2", content); - + // Make sure we don't have nested html docs assertEquals(2, content.split("<body>").length); assertEquals(2, content.split("<\\/body>").length); @@ -188,39 +187,39 @@ public class OutlookParserTest { public void testOutlookForwarded() throws Exception { Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); - + // Check the HTML version StringWriter sw = new StringWriter(); SAXTransformerFactory factory = (SAXTransformerFactory) - SAXTransformerFactory.newInstance(); + SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); handler.setResult(new StreamResult(sw)); InputStream stream = OutlookParserTest.class.getResourceAsStream( - "/test-documents/testMSG_forwarded.msg"); + "/test-documents/testMSG_forwarded.msg"); try { - parser.parse(stream, handler, metadata, new ParseContext()); + parser.parse(stream, handler, metadata, new ParseContext()); } finally { - stream.close(); + stream.close(); } - + // Make sure we don't have nested docs String content = sw.toString(); assertEquals(2, content.split("<body>").length); assertEquals(2, content.split("<\\/body>").length); } - + @Test public void testOutlookHTMLfromRTF() throws Exception { Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); - + // Check the HTML version StringWriter sw = new StringWriter(); SAXTransformerFactory factory = (SAXTransformerFactory) - SAXTransformerFactory.newInstance(); + SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); @@ -229,24 +228,24 @@ public class OutlookParserTest { InputStream stream = OutlookParserTest.class.getResourceAsStream( "/test-documents/test-outlook2003.msg"); try { - parser.parse(stream, handler, metadata, new ParseContext()); + parser.parse(stream, handler, metadata, new ParseContext()); } finally { - stream.close(); + stream.close(); } - + // As the HTML version should have been processed, ensure // we got some of the links - String content = sw.toString().replaceAll("<p>\\s+","<p>"); + String content = sw.toString().replaceAll("<p>\\s+", "<p>"); assertContains("<dd>New Outlook User</dd>", content); assertContains("designed <i>to help you", content); assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content); - + // Link - check text around it, and the link itself assertContains("sign up for a free subscription", content); assertContains("Office Newsletter", content); assertContains("newsletter will be sent to you", content); assertContains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033", content); - + // Make sure we don't have nested html docs assertEquals(2, content.split("<body>").length); assertEquals(2, content.split("<\\/body>").length);
