This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4381 in repository https://gitbox.apache.org/repos/asf/tika.git
commit a840a527fe882afac21d1ac5514188e681c7bc4d Author: tallison <[email protected]> AuthorDate: Thu Feb 6 08:52:46 2025 -0500 TIKA-4381 -- first steps --- .../main/java/org/apache/tika/metadata/MAPI.java | 16 ++++ .../parser/microsoft/AbstractPOIFSExtractor.java | 10 ++- .../tika/parser/microsoft/OutlookExtractor.java | 87 +++++++++++++-------- .../microsoft/msg/ExtendedMetadataExtractor.java | 91 ++++++++++++++++++++++ .../tika/parser/microsoft/OutlookParserTest.java | 9 ++- 5 files changed, 178 insertions(+), 35 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java index 57b46307f..af369aae6 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java @@ -25,6 +25,8 @@ package org.apache.tika.metadata; public interface MAPI { String PREFIX_MAPI_META = "mapi" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + String PREFIX_MAPI_ATTACH_META = "mapi:attach" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + String PREFIX_MAPI_APPT_META = "mapi:appt" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; /** * MAPI message class. What type of .msg/MAPI file is it? @@ -64,4 +66,18 @@ public interface MAPI { Property IMPORTANCE = Property.internalInteger(PREFIX_MAPI_META + "importance"); Property PRIORTY = Property.internalInteger(PREFIX_MAPI_META + "priority"); Property IS_FLAGGED = Property.internalBoolean(PREFIX_MAPI_META + "is-flagged"); + + Property ATTACH_LONG_PATH_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "long-path-name"); + Property ATTACH_LONG_FILE_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "long-file-name"); + Property ATTACH_FILE_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "file-name"); + Property ATTACH_CONTENT_ID = Property.internalText(PREFIX_MAPI_ATTACH_META + "content-id"); + Property ATTACH_CONTENT_LOCATION = Property.internalText(PREFIX_MAPI_ATTACH_META + "content-location"); + Property ATTACH_DISPLAY_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "display-name"); + Property ATTACH_EXTENSION = Property.internalText(PREFIX_MAPI_ATTACH_META + "extension"); + Property ATTACH_MIME = Property.internalText(PREFIX_MAPI_ATTACH_META + "mime"); + Property ATTACH_LANGUAGE = Property.internalText(PREFIX_MAPI_ATTACH_META + "language"); + + Property APPT_START_TIME = Property.internalDate(PREFIX_MAPI_APPT_META + "start-time"); + Property APPT_END_TIME = Property.internalDate(PREFIX_MAPI_APPT_META + "end-time"); + Property APPT_END_REPEAT_TIME = Property.internalDate(PREFIX_MAPI_APPT_META + "end-repeat-time"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java index b42c0f588..8910b1c00 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java @@ -156,6 +156,14 @@ abstract class AbstractPOIFSExtractor { protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, TikaException { + handleEmbeddedOfficeDoc(dir, new Metadata(), resourceName, xhtml, outputHtml); + } + /** + * Handle an office document that's embedded at the POIFS level + */ + protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, Metadata metadata, + String resourceName, XHTMLContentHandler xhtml, boolean outputHtml) + throws IOException, SAXException, TikaException { // Is it an embedded OLE2 document, or an embedded OOXML document? @@ -165,7 +173,6 @@ abstract class AbstractPOIFSExtractor { if (ooxml != null) { // It's OOXML (has a ZipFile): - Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(((DocumentEntry)ooxml).getSize())); try (TikaInputStream stream = TikaInputStream @@ -191,7 +198,6 @@ abstract class AbstractPOIFSExtractor { // It's regular OLE2: // What kind of document is it? - Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, dir.getName()); if (dir.getStorageClsid() != null) { metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index b9c14c115..10aa1310e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -73,12 +73,15 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.html.HtmlEncodingDetector; import org.apache.tika.parser.html.JSoupParser; import org.apache.tika.parser.mailcommons.MailDateParser; +import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor; import org.apache.tika.parser.microsoft.rtf.RTFParser; import org.apache.tika.parser.txt.CharsetDetector; import org.apache.tika.parser.txt.CharsetMatch; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.StringUtils; + /** * Outlook Message Parser. @@ -197,13 +200,11 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { return "UNKNOWN"; } - public void parse(XHTMLContentHandler xhtml) - throws TikaException, SAXException, IOException { + public void parse(XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException { try { _parse(xhtml); } catch (ChunkNotFoundException e) { - throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", - e); + throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e); } /*finally { //You'd think you'd want to call msg.close(). //Don't do that. That closes down the file system. @@ -214,8 +215,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { }*/ } - private void _parse(XHTMLContentHandler xhtml) throws TikaException, SAXException, - IOException, ChunkNotFoundException { + private void _parse(XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException, ChunkNotFoundException { msg.setReturnNullOnMissingChunk(true); // If the message contains strings that aren't stored @@ -229,6 +229,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { handleFromTo(headers, parentMetadata); handleMessageInfo(msg, headers, parentMetadata); + ExtendedMetadataExtractor.extract(msg, parentMetadata); try { for (String recipientAddress : msg.getRecipientEmailAddressList()) { @@ -268,26 +269,55 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { // Process the attachments for (AttachmentChunks attachment : msg.getAttachmentFiles()) { - + Metadata metadata = new Metadata(); + updateAttachmentMetadata(attachment, metadata); String filename = null; - if (attachment.getAttachLongFileName() != null) { - filename = attachment.getAttachLongFileName().getValue(); - } else if (attachment.getAttachFileName() != null) { - filename = attachment.getAttachFileName().getValue(); + if (!StringUtils.isBlank(metadata.get(MAPI.ATTACH_LONG_FILE_NAME))) { + filename = metadata.get(MAPI.ATTACH_LONG_FILE_NAME); + } else if (!StringUtils.isBlank(metadata.get(MAPI.ATTACH_DISPLAY_NAME))) { + filename = metadata.get(MAPI.ATTACH_DISPLAY_NAME); + } else if (!StringUtils.isBlank(metadata.get(MAPI.ATTACH_FILE_NAME))) { + filename = metadata.get(MAPI.ATTACH_FILE_NAME); } - + //this is allowed to be null; + String mimeType = metadata.get(MAPI.ATTACH_MIME); if (attachment.getAttachData() != null) { - handleEmbeddedResource( - TikaInputStream.get(attachment.getAttachData().getValue()), filename, - null, null, xhtml, true); + handleEmbeddedResource(TikaInputStream.get(attachment + .getAttachData() + .getValue()), metadata, filename, null, null, mimeType, xhtml, true); } if (attachment.getAttachmentDirectory() != null) { - handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), filename, - xhtml, true); + handleEmbeddedOfficeDoc(attachment + .getAttachmentDirectory() + .getDirectory(), metadata, filename, xhtml, true); } } } + + private void updateAttachmentMetadata(AttachmentChunks attachment, Metadata metadata) { + addStringChunkToMetadata(MAPI.ATTACH_LONG_PATH_NAME, attachment.getAttachLongPathName(), metadata); + addStringChunkToMetadata(MAPI.ATTACH_LONG_FILE_NAME, attachment.getAttachLongFileName(), metadata); + addStringChunkToMetadata(MAPI.ATTACH_FILE_NAME, attachment.getAttachFileName(), metadata); + addStringChunkToMetadata(MAPI.ATTACH_CONTENT_ID, attachment.getAttachContentId(), metadata); + addStringChunkToMetadata(MAPI.ATTACH_CONTENT_LOCATION, attachment.getAttachContentLocation(), metadata); + addStringChunkToMetadata(MAPI.ATTACH_DISPLAY_NAME, attachment.getAttachDisplayName(), metadata); + addStringChunkToMetadata(MAPI.ATTACH_EXTENSION, attachment.getAttachExtension(), metadata); + addStringChunkToMetadata(MAPI.ATTACH_MIME, attachment.getAttachMimeTag(), metadata); + addStringChunkToMetadata(MAPI.ATTACH_LANGUAGE, attachment.getAttachLanguage(), metadata); + } + + private void addStringChunkToMetadata(Property property, StringChunk stringChunk, Metadata metadata) { + if (stringChunk == null) { + return; + } + String v = stringChunk.getValue(); + if (StringUtils.isBlank(v)) { + return; + } + metadata.set(property, v); + } + private void handleMessageInfo(MAPIMessage msg, Map<String, String[]> headers, Metadata metadata) throws ChunkNotFoundException { //this is the literal subject including "re: " metadata.set(TikaCoreProperties.TITLE, msg.getSubject()); @@ -379,7 +409,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { Date d = MailDateParser.parseDateLenient(date); metadata.set(TikaCoreProperties.CREATED, d); metadata.set(TikaCoreProperties.MODIFIED, d); - } catch (SecurityException e ) { + } catch (SecurityException e) { throw e; } catch (Exception e) { // Store it as-is, and hope for the best... @@ -531,15 +561,11 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { //sometimes in SMTP .msg files there is an email in the sender name field. - setFirstChunk(mainChunks.get(MAPIProperty.SENDER_NAME), Message.MESSAGE_FROM_NAME, - metadata); - setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME), - MAPI.FROM_REPRESENTING_NAME, metadata); + setFirstChunk(mainChunks.get(MAPIProperty.SENDER_NAME), Message.MESSAGE_FROM_NAME, metadata); + setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME), MAPI.FROM_REPRESENTING_NAME, metadata); - setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS), Message.MESSAGE_FROM_EMAIL, - metadata); - setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS), - MAPI.FROM_REPRESENTING_EMAIL, metadata); + setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS), Message.MESSAGE_FROM_EMAIL, metadata); + setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS), MAPI.FROM_REPRESENTING_EMAIL, metadata); for (Recipient recipient : buildRecipients()) { switch (recipient.recipientType) { @@ -555,8 +581,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { break; case BCC: addEvenIfNull(Message.MESSAGE_BCC_NAME, recipient.name, metadata); - addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME, recipient.displayName, - metadata); + addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME, recipient.displayName, metadata); addEvenIfNull(Message.MESSAGE_BCC_EMAIL, recipient.emailAddress, metadata); break; default: @@ -653,8 +678,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties(); if (props != null) { // First choice is a codepage property - for (MAPIProperty prop : new MAPIProperty[]{MAPIProperty.MESSAGE_CODEPAGE, - MAPIProperty.INTERNET_CPID}) { + for (MAPIProperty prop : new MAPIProperty[]{MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID}) { List<PropertyValue> val = props.get(prop); if (val != null && val.size() > 0) { int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue(); @@ -676,8 +700,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { String[] headers = msg.getHeaders(); if (headers != null && headers.length > 0) { // Look for a content type with a charset - Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", - Pattern.CASE_INSENSITIVE); + Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); for (String header : headers) { if (header.startsWith("Content-Type")) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java new file mode 100644 index 000000000..45463d7d3 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.msg; + +import java.util.Calendar; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.poi.hsmf.MAPIMessage; +import org.apache.poi.hsmf.datatypes.MAPIProperty; +import org.apache.poi.hsmf.datatypes.PropertyValue; +import org.apache.poi.hsmf.datatypes.Types; + +import org.apache.tika.metadata.MAPI; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; + +/** + * This class is intended to handle the metadata that is typically not + * included in "Note" types. This focuses on Appointments, Tasks, etc. + */ +public class ExtendedMetadataExtractor { + + static Map<Integer, Property> PROPERTIES = new ConcurrentHashMap<>(); + + static { + //TODO -- figure out how these differ and how they overlap with other types + PROPERTIES.put(0x8003, MAPI.APPT_START_TIME); + PROPERTIES.put(0x8005, MAPI.APPT_START_TIME); + PROPERTIES.put(0x8007, MAPI.APPT_START_TIME); + PROPERTIES.put(0x8009, MAPI.APPT_START_TIME); + PROPERTIES.put(0x801b, MAPI.APPT_START_TIME); + + PROPERTIES.put(0x8004, MAPI.APPT_END_TIME); + PROPERTIES.put(0x8006, MAPI.APPT_END_TIME); + PROPERTIES.put(0x801c, MAPI.APPT_END_TIME); + PROPERTIES.put(0x8015, MAPI.APPT_END_REPEAT_TIME); + } + + public static void extract(MAPIMessage msg, Metadata metadata) { + // + for (Map.Entry<MAPIProperty, List<PropertyValue>> e : msg + .getMainChunks() + .getMessageProperties() + .getProperties() + .entrySet()) { + if (PROPERTIES.containsKey(e.getKey().id)) { + Property p = PROPERTIES.get(e.getKey().id); + List<PropertyValue> values = e.getValue(); + if (p.getValueType() == Property.ValueType.DATE) { + if (!e.getValue() + .isEmpty() && values + .get(0) + .getActualType() == Types.TIME) { + metadata.set(p, (Calendar) values + .get(0) + .getValue()); + } + } + } + /* + Metadata tmp = new Metadata(); + for (PropertyValue v : e.getValue()) { + if (v instanceof PropertyValue.TimePropertyValue) { + MAPIProperty k = e.getKey(); + //System.out.println(k.name + " " + Integer.toHexString(k.id) + + // " " + k.mapiProperty + " :" + v.getValue()); + tmp.set(Property.internalDate(Integer.toHexString(k.id)), (Calendar) v.getValue()); + } + } + for (String n : tmp.names()) { + System.out.println(n + " " + tmp.get(n)); + }*/ + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index a01bd5a8d..5f1a9f402 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -299,7 +299,6 @@ public class OutlookParserTest extends TikaTest { } testMsgClass("NOTE", getXML("test-outlook2003.msg").metadata); - } private void testMsgClass(String expected, Metadata metadata) { @@ -308,6 +307,14 @@ public class OutlookParserTest extends TikaTest { expected + ", but got: " + metadata.get(MAPI.MESSAGE_CLASS)); } + @Test + public void testAppointment() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testMSG_Appointment.msg"); + Metadata m = metadataList.get(0); + assertTrue(m.get(MAPI.APPT_START_TIME).contains("2017-02-28T18")); + assertTrue(m.get(MAPI.APPT_END_TIME).contains("2017-02-28T19")); + } + @Test public void testHandlingAllAlternativesBodies() throws Exception { //test that default only has one body
