This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4381
in repository https://gitbox.apache.org/repos/asf/tika.git

commit a840a527fe882afac21d1ac5514188e681c7bc4d
Author: tallison <[email protected]>
AuthorDate: Thu Feb 6 08:52:46 2025 -0500

    TIKA-4381 -- first steps
---
 .../main/java/org/apache/tika/metadata/MAPI.java   | 16 ++++
 .../parser/microsoft/AbstractPOIFSExtractor.java   | 10 ++-
 .../tika/parser/microsoft/OutlookExtractor.java    | 87 +++++++++++++--------
 .../microsoft/msg/ExtendedMetadataExtractor.java   | 91 ++++++++++++++++++++++
 .../tika/parser/microsoft/OutlookParserTest.java   |  9 ++-
 5 files changed, 178 insertions(+), 35 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
index 57b46307f..af369aae6 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
@@ -25,6 +25,8 @@ package org.apache.tika.metadata;
 public interface MAPI {
 
     String PREFIX_MAPI_META = "mapi" + 
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+    String PREFIX_MAPI_ATTACH_META = "mapi:attach" + 
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+    String PREFIX_MAPI_APPT_META = "mapi:appt" + 
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
 
     /**
      * MAPI message class.  What type of .msg/MAPI file is it?
@@ -64,4 +66,18 @@ public interface MAPI {
     Property IMPORTANCE = Property.internalInteger(PREFIX_MAPI_META + 
"importance");
     Property PRIORTY = Property.internalInteger(PREFIX_MAPI_META + "priority");
     Property IS_FLAGGED = Property.internalBoolean(PREFIX_MAPI_META + 
"is-flagged");
+
+    Property ATTACH_LONG_PATH_NAME = 
Property.internalText(PREFIX_MAPI_ATTACH_META + "long-path-name");
+    Property ATTACH_LONG_FILE_NAME = 
Property.internalText(PREFIX_MAPI_ATTACH_META + "long-file-name");
+    Property ATTACH_FILE_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META 
+ "file-name");
+    Property ATTACH_CONTENT_ID = Property.internalText(PREFIX_MAPI_ATTACH_META 
+ "content-id");
+    Property ATTACH_CONTENT_LOCATION = 
Property.internalText(PREFIX_MAPI_ATTACH_META + "content-location");
+    Property ATTACH_DISPLAY_NAME = 
Property.internalText(PREFIX_MAPI_ATTACH_META + "display-name");
+    Property ATTACH_EXTENSION = Property.internalText(PREFIX_MAPI_ATTACH_META 
+ "extension");
+    Property ATTACH_MIME = Property.internalText(PREFIX_MAPI_ATTACH_META + 
"mime");
+    Property ATTACH_LANGUAGE = Property.internalText(PREFIX_MAPI_ATTACH_META + 
"language");
+
+    Property APPT_START_TIME = Property.internalDate(PREFIX_MAPI_APPT_META + 
"start-time");
+    Property APPT_END_TIME = Property.internalDate(PREFIX_MAPI_APPT_META + 
"end-time");
+    Property APPT_END_REPEAT_TIME = 
Property.internalDate(PREFIX_MAPI_APPT_META + "end-repeat-time");
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index b42c0f588..8910b1c00 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -156,6 +156,14 @@ abstract class AbstractPOIFSExtractor {
     protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String 
resourceName,
                                            XHTMLContentHandler xhtml, boolean 
outputHtml)
             throws IOException, SAXException, TikaException {
+        handleEmbeddedOfficeDoc(dir, new Metadata(), resourceName, xhtml, 
outputHtml);
+    }
+    /**
+     * Handle an office document that's embedded at the POIFS level
+     */
+    protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, Metadata 
metadata,
+                                           String resourceName, 
XHTMLContentHandler xhtml, boolean outputHtml)
+            throws IOException, SAXException, TikaException {
 
 
         // Is it an embedded OLE2 document, or an embedded OOXML document?
@@ -165,7 +173,6 @@ abstract class AbstractPOIFSExtractor {
 
         if (ooxml != null) {
             // It's OOXML (has a ZipFile):
-            Metadata metadata = new Metadata();
             metadata.set(Metadata.CONTENT_LENGTH,
                     Integer.toString(((DocumentEntry)ooxml).getSize()));
             try (TikaInputStream stream = TikaInputStream
@@ -191,7 +198,6 @@ abstract class AbstractPOIFSExtractor {
         // It's regular OLE2:
 
         // What kind of document is it?
-        Metadata metadata = new Metadata();
         metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, 
dir.getName());
         if (dir.getStorageClsid() != null) {
             metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index b9c14c115..10aa1310e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -73,12 +73,15 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.html.HtmlEncodingDetector;
 import org.apache.tika.parser.html.JSoupParser;
 import org.apache.tika.parser.mailcommons.MailDateParser;
+import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor;
 import org.apache.tika.parser.microsoft.rtf.RTFParser;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
+
 
 /**
  * Outlook Message Parser.
@@ -197,13 +200,11 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
         return "UNKNOWN";
     }
 
-    public void parse(XHTMLContentHandler xhtml)
-            throws TikaException, SAXException, IOException {
+    public void parse(XHTMLContentHandler xhtml) throws TikaException, 
SAXException, IOException {
         try {
             _parse(xhtml);
         } catch (ChunkNotFoundException e) {
-            throw new TikaException("POI MAPIMessage broken - didn't return 
null on missing chunk",
-                    e);
+            throw new TikaException("POI MAPIMessage broken - didn't return 
null on missing chunk", e);
         } /*finally {
             //You'd think you'd want to call msg.close().
             //Don't do that.  That closes down the file system.
@@ -214,8 +215,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
         }*/
     }
 
-    private void _parse(XHTMLContentHandler xhtml) throws TikaException, 
SAXException,
-            IOException, ChunkNotFoundException {
+    private void _parse(XHTMLContentHandler xhtml) throws TikaException, 
SAXException, IOException, ChunkNotFoundException {
         msg.setReturnNullOnMissingChunk(true);
 
         // If the message contains strings that aren't stored
@@ -229,6 +229,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
 
         handleFromTo(headers, parentMetadata);
         handleMessageInfo(msg, headers, parentMetadata);
+        ExtendedMetadataExtractor.extract(msg, parentMetadata);
 
         try {
             for (String recipientAddress : msg.getRecipientEmailAddressList()) 
{
@@ -268,26 +269,55 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
 
         // Process the attachments
         for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
-
+            Metadata metadata = new Metadata();
+            updateAttachmentMetadata(attachment, metadata);
             String filename = null;
-            if (attachment.getAttachLongFileName() != null) {
-                filename = attachment.getAttachLongFileName().getValue();
-            } else if (attachment.getAttachFileName() != null) {
-                filename = attachment.getAttachFileName().getValue();
+            if 
(!StringUtils.isBlank(metadata.get(MAPI.ATTACH_LONG_FILE_NAME))) {
+                filename = metadata.get(MAPI.ATTACH_LONG_FILE_NAME);
+            } else if 
(!StringUtils.isBlank(metadata.get(MAPI.ATTACH_DISPLAY_NAME))) {
+                filename = metadata.get(MAPI.ATTACH_DISPLAY_NAME);
+            } else if 
(!StringUtils.isBlank(metadata.get(MAPI.ATTACH_FILE_NAME))) {
+                filename = metadata.get(MAPI.ATTACH_FILE_NAME);
             }
-
+            //this is allowed to be null;
+            String mimeType = metadata.get(MAPI.ATTACH_MIME);
             if (attachment.getAttachData() != null) {
-                handleEmbeddedResource(
-                        
TikaInputStream.get(attachment.getAttachData().getValue()), filename,
-                        null, null, xhtml, true);
+                handleEmbeddedResource(TikaInputStream.get(attachment
+                        .getAttachData()
+                        .getValue()), metadata, filename, null, null, 
mimeType, xhtml, true);
             }
             if (attachment.getAttachmentDirectory() != null) {
-                
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), 
filename,
-                        xhtml, true);
+                handleEmbeddedOfficeDoc(attachment
+                        .getAttachmentDirectory()
+                        .getDirectory(), metadata, filename, xhtml, true);
             }
         }
 
     }
+
+    private void updateAttachmentMetadata(AttachmentChunks attachment, 
Metadata metadata) {
+        addStringChunkToMetadata(MAPI.ATTACH_LONG_PATH_NAME, 
attachment.getAttachLongPathName(), metadata);
+        addStringChunkToMetadata(MAPI.ATTACH_LONG_FILE_NAME, 
attachment.getAttachLongFileName(), metadata);
+        addStringChunkToMetadata(MAPI.ATTACH_FILE_NAME, 
attachment.getAttachFileName(), metadata);
+        addStringChunkToMetadata(MAPI.ATTACH_CONTENT_ID, 
attachment.getAttachContentId(), metadata);
+        addStringChunkToMetadata(MAPI.ATTACH_CONTENT_LOCATION, 
attachment.getAttachContentLocation(), metadata);
+        addStringChunkToMetadata(MAPI.ATTACH_DISPLAY_NAME, 
attachment.getAttachDisplayName(), metadata);
+        addStringChunkToMetadata(MAPI.ATTACH_EXTENSION, 
attachment.getAttachExtension(), metadata);
+        addStringChunkToMetadata(MAPI.ATTACH_MIME, 
attachment.getAttachMimeTag(), metadata);
+        addStringChunkToMetadata(MAPI.ATTACH_LANGUAGE, 
attachment.getAttachLanguage(), metadata);
+    }
+
+    private void addStringChunkToMetadata(Property property, StringChunk 
stringChunk, Metadata metadata) {
+        if (stringChunk == null) {
+            return;
+        }
+        String v = stringChunk.getValue();
+        if (StringUtils.isBlank(v)) {
+            return;
+        }
+        metadata.set(property, v);
+    }
+
     private void handleMessageInfo(MAPIMessage msg, Map<String, String[]> 
headers, Metadata metadata) throws ChunkNotFoundException {
         //this is the literal subject including "re: "
         metadata.set(TikaCoreProperties.TITLE, msg.getSubject());
@@ -379,7 +409,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
                             Date d = MailDateParser.parseDateLenient(date);
                             metadata.set(TikaCoreProperties.CREATED, d);
                             metadata.set(TikaCoreProperties.MODIFIED, d);
-                        } catch (SecurityException e ) {
+                        } catch (SecurityException e) {
                             throw e;
                         } catch (Exception e) {
                             // Store it as-is, and hope for the best...
@@ -531,15 +561,11 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
 
         //sometimes in SMTP .msg files there is an email in the sender name 
field.
 
-        setFirstChunk(mainChunks.get(MAPIProperty.SENDER_NAME), 
Message.MESSAGE_FROM_NAME,
-                metadata);
-        setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME),
-                MAPI.FROM_REPRESENTING_NAME, metadata);
+        setFirstChunk(mainChunks.get(MAPIProperty.SENDER_NAME), 
Message.MESSAGE_FROM_NAME, metadata);
+        setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME), 
MAPI.FROM_REPRESENTING_NAME, metadata);
 
-        setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS), 
Message.MESSAGE_FROM_EMAIL,
-                metadata);
-        
setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS),
-                MAPI.FROM_REPRESENTING_EMAIL, metadata);
+        setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS), 
Message.MESSAGE_FROM_EMAIL, metadata);
+        
setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS), 
MAPI.FROM_REPRESENTING_EMAIL, metadata);
 
         for (Recipient recipient : buildRecipients()) {
             switch (recipient.recipientType) {
@@ -555,8 +581,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
                     break;
                 case BCC:
                     addEvenIfNull(Message.MESSAGE_BCC_NAME, recipient.name, 
metadata);
-                    addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME, 
recipient.displayName,
-                            metadata);
+                    addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME, 
recipient.displayName, metadata);
                     addEvenIfNull(Message.MESSAGE_BCC_EMAIL, 
recipient.emailAddress, metadata);
                     break;
                 default:
@@ -653,8 +678,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
         Map<MAPIProperty, List<PropertyValue>> props = 
mainChunks.getProperties();
         if (props != null) {
             // First choice is a codepage property
-            for (MAPIProperty prop : new 
MAPIProperty[]{MAPIProperty.MESSAGE_CODEPAGE,
-                    MAPIProperty.INTERNET_CPID}) {
+            for (MAPIProperty prop : new 
MAPIProperty[]{MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID}) {
                 List<PropertyValue> val = props.get(prop);
                 if (val != null && val.size() > 0) {
                     int codepage = ((PropertyValue.LongPropertyValue) 
val.get(0)).getValue();
@@ -676,8 +700,7 @@ public class OutlookExtractor extends 
AbstractPOIFSExtractor {
             String[] headers = msg.getHeaders();
             if (headers != null && headers.length > 0) {
                 // Look for a content type with a charset
-                Pattern p = 
Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?",
-                        Pattern.CASE_INSENSITIVE);
+                Pattern p = 
Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", 
Pattern.CASE_INSENSITIVE);
 
                 for (String header : headers) {
                     if (header.startsWith("Content-Type")) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java
new file mode 100644
index 000000000..45463d7d3
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.msg;
+
+import java.util.Calendar;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.datatypes.MAPIProperty;
+import org.apache.poi.hsmf.datatypes.PropertyValue;
+import org.apache.poi.hsmf.datatypes.Types;
+
+import org.apache.tika.metadata.MAPI;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+/**
+ * This class is intended to handle the metadata that is typically not
+ * included in "Note" types. This focuses on Appointments, Tasks, etc.
+ */
+public class ExtendedMetadataExtractor {
+
+    static Map<Integer, Property> PROPERTIES = new ConcurrentHashMap<>();
+
+    static {
+        //TODO -- figure out how these differ and how they overlap with other 
types
+        PROPERTIES.put(0x8003, MAPI.APPT_START_TIME);
+        PROPERTIES.put(0x8005, MAPI.APPT_START_TIME);
+        PROPERTIES.put(0x8007, MAPI.APPT_START_TIME);
+        PROPERTIES.put(0x8009, MAPI.APPT_START_TIME);
+        PROPERTIES.put(0x801b, MAPI.APPT_START_TIME);
+
+        PROPERTIES.put(0x8004, MAPI.APPT_END_TIME);
+        PROPERTIES.put(0x8006, MAPI.APPT_END_TIME);
+        PROPERTIES.put(0x801c, MAPI.APPT_END_TIME);
+        PROPERTIES.put(0x8015, MAPI.APPT_END_REPEAT_TIME);
+    }
+
+    public static void extract(MAPIMessage msg, Metadata metadata) {
+        //
+        for (Map.Entry<MAPIProperty, List<PropertyValue>> e : msg
+                .getMainChunks()
+                .getMessageProperties()
+                .getProperties()
+                .entrySet()) {
+            if (PROPERTIES.containsKey(e.getKey().id)) {
+                Property p = PROPERTIES.get(e.getKey().id);
+                List<PropertyValue> values = e.getValue();
+                if (p.getValueType() == Property.ValueType.DATE) {
+                    if (!e.getValue()
+                            .isEmpty() && values
+                            .get(0)
+                            .getActualType() == Types.TIME) {
+                        metadata.set(p, (Calendar) values
+                                .get(0)
+                                .getValue());
+                    }
+                }
+            }
+            /*
+            Metadata tmp = new Metadata();
+            for (PropertyValue v : e.getValue()) {
+                if (v instanceof PropertyValue.TimePropertyValue) {
+                    MAPIProperty k = e.getKey();
+                    //System.out.println(k.name + " " + 
Integer.toHexString(k.id) +
+                    //      " " + k.mapiProperty + " :" + v.getValue());
+                    tmp.set(Property.internalDate(Integer.toHexString(k.id)), 
(Calendar) v.getValue());
+                }
+            }
+            for (String n : tmp.names()) {
+                System.out.println(n + " " + tmp.get(n));
+            }*/
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index a01bd5a8d..5f1a9f402 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -299,7 +299,6 @@ public class OutlookParserTest extends TikaTest {
         }
 
         testMsgClass("NOTE", getXML("test-outlook2003.msg").metadata);
-
     }
 
     private void testMsgClass(String expected, Metadata metadata) {
@@ -308,6 +307,14 @@ public class OutlookParserTest extends TikaTest {
                 expected + ", but got: " + metadata.get(MAPI.MESSAGE_CLASS));
     }
 
+    @Test
+    public void testAppointment() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testMSG_Appointment.msg");
+        Metadata m = metadataList.get(0);
+        assertTrue(m.get(MAPI.APPT_START_TIME).contains("2017-02-28T18"));
+        assertTrue(m.get(MAPI.APPT_END_TIME).contains("2017-02-28T19"));
+    }
+
     @Test
     public void testHandlingAllAlternativesBodies() throws Exception {
         //test that default only has one body

Reply via email to