This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4381
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 451140128d1f5d3c775e7c8ddd355f6700027134
Author: tallison <[email protected]>
AuthorDate: Wed Feb 12 12:24:40 2025 -0500

    TIKA-4381 -- further updates
---
 .../main/java/org/apache/tika/metadata/MAPI.java   |  18 +++-
 .../microsoft/msg/ExtendedMetadataExtractor.java   | 118 +++++++++++++++------
 .../tika/parser/microsoft/msg/PIDShortID.csv       |   1 +
 .../tika/parser/microsoft/OutlookParserTest.java   |   9 +-
 .../org/apache/tika/parser/pkg/ZipParserTest.java  |   5 +
 5 files changed, 113 insertions(+), 38 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java 
b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
index af369aae6..dba87a5b3 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
@@ -27,20 +27,20 @@ public interface MAPI {
     String PREFIX_MAPI_META = "mapi" + 
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
     String PREFIX_MAPI_ATTACH_META = "mapi:attach" + 
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
     String PREFIX_MAPI_APPT_META = "mapi:appt" + 
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+    String PREFIX_MAPI_REMINDER_META = "mapi:reminder" + 
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+    String PREFIX_MAPI_RAW_META = PREFIX_MAPI_META + "raw" + 
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
 
     /**
      * MAPI message class.  What type of .msg/MAPI file is it?
      * This is normalized via "mapi_message_classes.properties
      */
-    Property MESSAGE_CLASS =
-            Property.internalText(PREFIX_MAPI_META + "message-class");
+    Property MESSAGE_CLASS = Property.internalText(PREFIX_MAPI_META + 
"message-class");
 
     /**
      * MAPI message class.  What type of .msg/MAPI file is it?
      * This is the raw value that is retrieved from the underlying chunk
      */
-    Property MESSAGE_CLASS_RAW =
-            Property.internalText(PREFIX_MAPI_META + "message-class-raw");
+    Property MESSAGE_CLASS_RAW = Property.internalText(PREFIX_MAPI_META + 
"message-class-raw");
 
     Property SENT_BY_SERVER_TYPE = Property.internalText(PREFIX_MAPI_META + 
"sent-by-server-type");
 
@@ -79,5 +79,13 @@ public interface MAPI {
 
     Property APPT_START_TIME = Property.internalDate(PREFIX_MAPI_APPT_META + 
"start-time");
     Property APPT_END_TIME = Property.internalDate(PREFIX_MAPI_APPT_META + 
"end-time");
-    Property APPT_END_REPEAT_TIME = 
Property.internalDate(PREFIX_MAPI_APPT_META + "end-repeat-time");
+    Property APPT_PROPOSED_START_TIME = 
Property.internalDate(PREFIX_MAPI_APPT_META + "proposed-start-time");
+    Property APPT_PROPOSED_END_TIME = 
Property.internalDate(PREFIX_MAPI_APPT_META + "proposed-end-time");
+    Property APPT_LOCATION = Property.internalText(PREFIX_MAPI_APPT_META + 
"location");
+
+    Property REMINDER_TIME = Property.internalDate(PREFIX_MAPI_REMINDER_META + 
"time");
+    //PidLidReminderSignalTime -- when a reminder transitions from pending to 
overdue
+    Property REMINDER_SIGNAL_TIME = 
Property.internalDate(PREFIX_MAPI_REMINDER_META + "signal-time");
+
 }
+
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java
index aefb406f9..1e6597dc1 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java
@@ -21,16 +21,11 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
-import java.time.format.DateTimeFormatter;
 import java.time.temporal.ChronoUnit;
 import java.util.ArrayList;
 import java.util.Calendar;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Locale;
 import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.poi.hsmf.MAPIMessage;
@@ -42,6 +37,7 @@ import org.apache.tika.metadata.MAPI;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.parser.microsoft.OutlookExtractor;
+import org.apache.tika.utils.StringUtils;
 
 /**
  * This class is intended to handle the metadata that is typically not
@@ -56,10 +52,8 @@ public class ExtendedMetadataExtractor {
     }
 
     private static void loadProperties() {
-        Set<String> areas = new TreeSet<>();
         try (BufferedReader r = new BufferedReader(
-                new InputStreamReader(
-                        
OutlookExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/PIDShortID.csv"),
 UTF_8))) {
+                new 
InputStreamReader(OutlookExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/PIDShortID.csv"),
 UTF_8))) {
             String line = r.readLine();
             while (line != null) {
                 if (line.isBlank() || line.startsWith("#")) {
@@ -125,7 +119,9 @@ public class ExtendedMetadataExtractor {
         if (col.startsWith("\"") && col.endsWith("\"")) {
             //this is not robust, but we're running it
             // on known data.
-            return col.substring(1, col.length() - 1).trim();
+            return col
+                    .substring(1, col.length() - 1)
+                    .trim();
         } else {
             throw new IllegalArgumentException("cell must start and end with a 
quote: '" + col + "'");
         }
@@ -135,53 +131,115 @@ public class ExtendedMetadataExtractor {
     static Map<Integer, Property> PROPERTIES = new ConcurrentHashMap<>();
 
     static {
-        //TODO -- figure out how these differ and how they overlap with other 
types
-
-        PROPERTIES.put(0x8003, MAPI.APPT_START_TIME);
-        PROPERTIES.put(0x8005, MAPI.APPT_START_TIME);
-        PROPERTIES.put(0x8007, MAPI.APPT_START_TIME);
-        PROPERTIES.put(0x8009, MAPI.APPT_START_TIME);
-        PROPERTIES.put(0x801b, MAPI.APPT_START_TIME);
-
-        PROPERTIES.put(0x8004, MAPI.APPT_END_TIME);
-        PROPERTIES.put(0x8006, MAPI.APPT_END_TIME);
-        PROPERTIES.put(0x801c, MAPI.APPT_END_TIME);
-//        PROPERTIES.put(0x8015, MAPI.APPT_END_REPEAT_TIME);
+        //PidLidAppointmentStartWhole
+        PROPERTIES.put(0x820D, MAPI.APPT_START_TIME);
+        //PidLidAppointmentProposedStartWhole
+        PROPERTIES.put(0x8250, MAPI.APPT_PROPOSED_START_TIME);
+        //PidLidAppointmentEndWhole
+        PROPERTIES.put(0x820E, MAPI.APPT_END_TIME);
+        //PidLidAppointmentProposedEndWhole
+        PROPERTIES.put(0x8251, MAPI.APPT_PROPOSED_END_TIME);
+
+        PROPERTIES.put(0x8005, MAPI.REMINDER_TIME);
+        PROPERTIES.put(0x8006, MAPI.REMINDER_SIGNAL_TIME);
+
+        //there are other values for this key (ids are not unique; see the note in PIDShortID.csv)
+        PROPERTIES.put(0x8009, MAPI.APPT_LOCATION);
     }
 
     public static void extract(MAPIMessage msg, Metadata metadata) {
-
+        //TODO -- we should map properties to message class types so that 
we're not
+        //reporting contact metadata for an appointment etc...
+        //I started down this path with PIDShortID.csv's "area" field,
+        //but that requires quite a bit of work.
+        //perhaps we could map by Defining Reference?
         for (Map.Entry<MAPIProperty, List<PropertyValue>> e : msg
                 .getMainChunks()
                 .getMessageProperties()
                 .getProperties()
                 .entrySet()) {
-            if (e.getValue().isEmpty()) {
+            List<PropertyValue> props = e.getValue();
+
+            if (props == null || props.isEmpty()) {
                 continue;
             }
+            //we could allow user configured levels for extended properties
+            //small, medium, large...
             MAPIProperty mapiProperty = e.getKey();
-            if (TIKA_MAPI_PROPERTIES.containsKey(mapiProperty.id)) {
+            boolean added = false;
+            if (PROPERTIES.containsKey(mapiProperty.id)) {
+                PropertyValue propertyValue = props.get(0);
+                added = addKnownProperty(PROPERTIES.get(mapiProperty.id), 
propertyValue, metadata);
+            }
+
+            if (!added && TIKA_MAPI_PROPERTIES.containsKey(mapiProperty.id)) {
                 List<TikaMapiProperty> tikaMapiProperties = 
TIKA_MAPI_PROPERTIES.get(mapiProperty.id);
                 for (TikaMapiProperty tikaMapiProperty : tikaMapiProperties) {
-                    for (PropertyValue propertyValue : e.getValue()) {
+                    for (PropertyValue propertyValue : props) {
                         if 
(tikaMapiProperty.containsType(propertyValue.getActualType())) {
-                            updateMetadata(tikaMapiProperty, propertyValue, 
metadata);
+                            added = updateMetadata(tikaMapiProperty, 
propertyValue, metadata);
                         }
                     }
                 }
             }
+            if (!added) {
+                for (PropertyValue propertyValue : e.getValue()) {
+                    //narrowly scoped to current interests...maybe broaden out?
+                    if (propertyValue.getActualType() == Types.TIME) {
+                        String key = MAPI.PREFIX_MAPI_RAW_META + 
"unknown-date-prop:" +
+                                
StringUtils.leftPad(Integer.toHexString(propertyValue.getProperty().id), 4, 
'0');
+                        Calendar cal = (Calendar) propertyValue.getValue();
+                        //truncate to seconds? 
toInstant().truncatedTo(ChronoUnit.SECONDS)....
+                        metadata.add(key, cal
+                                .toInstant()
+                                .toString());
+                    }
+                }
+            }
         }
     }
 
-    private static void updateMetadata(TikaMapiProperty tikaMapiProperty, 
PropertyValue propertyValue, Metadata metadata) {
-        String key = "mapi-raw:" + tikaMapiProperty.name;
+    private static boolean addKnownProperty(Property property, PropertyValue 
propertyValue, Metadata metadata) {
+        //this is quite limited.
+        if (propertyValue.getActualType() == Types.TIME && 
property.getValueType() == Property.ValueType.DATE) {
+            metadata.set(property, (Calendar) propertyValue.getValue());
+            return true;
+        } else if (isString(propertyValue) && property.getValueType() == 
Property.ValueType.TEXT) {
+            metadata.set(property, propertyValue.toString());
+            return true;
+        }
+        return false;
+    }
+
+
+    private static boolean updateMetadata(TikaMapiProperty tikaMapiProperty, 
PropertyValue propertyValue, Metadata metadata) {
+        String key = MAPI.PREFIX_MAPI_RAW_META + tikaMapiProperty.name;
         if (propertyValue.getActualType() == Types.TIME) {
             Calendar calendar = (Calendar) propertyValue.getValue();
-            String calendarString = 
calendar.toInstant().truncatedTo(ChronoUnit.SECONDS).toString();
+            String calendarString = calendar
+                    .toInstant()
+                    .truncatedTo(ChronoUnit.SECONDS)
+                    .toString();
             metadata.add(key, calendarString);
-        } else {
+            return true;
+        } else if (shouldIncludeUnknownType(propertyValue)) {
             metadata.add(key, propertyValue.toString());
+            return true;
         }
+        return false;
+    }
+
+    private static boolean shouldIncludeUnknownType(PropertyValue 
propertyValue) {
+        Types.MAPIType mapiType = propertyValue.getActualType();
+        if (mapiType == Types.BINARY || mapiType == Types.UNKNOWN || mapiType 
== Types.UNSPECIFIED || mapiType == Types.DIRECTORY || mapiType.isPointer()) {
+            return false;
+        }
+        return true;
+    }
+
+    private static boolean isString(PropertyValue propertyValue) {
+        Types.MAPIType mapiType = propertyValue.getActualType();
+        return mapiType == Types.ASCII_STRING || mapiType == 
Types.MV_ASCII_STRING || mapiType == Types.MV_UNICODE_STRING || mapiType == 
Types.UNICODE_STRING;
     }
 
     private static class TikaMapiProperty {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/org/apache/tika/parser/microsoft/msg/PIDShortID.csv
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/org/apache/tika/parser/microsoft/msg/PIDShortID.csv
index 2c8a41474..81cadcc3e 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/org/apache/tika/parser/microsoft/msg/PIDShortID.csv
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/org/apache/tika/parser/microsoft/msg/PIDShortID.csv
@@ -17,6 +17,7 @@
 # This file derives directly from
 # 
https://github.com/rjohnsondev/java-libpst/blob/develop/src/main/resources/PIDShortID.csv
 # under the ASL 2.0 license option
+# These ids are not unique. Some of them are differentiated by data type and 
some by context (note, appt, etc)
 #
 #"ID";"Property";"Description:";"Data type";"Area";"Defining 
reference";"Alternate names"
 "0x0001";"PidTagTemplateData";"Describes the controls used in the template 
that is used to retrieve address book information.";"PtypBinary, 
0x0102";"Address Book";"[MS-OXOABKT] section 2.2.2";"PR_EMS_TEMPLATE_BLOB"
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 4afc08416..fcac0f45e 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -130,6 +130,7 @@ public class OutlookParserTest extends TikaTest {
         assertEquals("[email protected]", 
metadata.get(Message.MESSAGE_TO_EMAIL));
         assertEquals("[email protected]", 
metadata.get(Message.MESSAGE_TO_DISPLAY_NAME));
         assertEquals("", metadata.get(Message.MESSAGE_TO_NAME));
+
     }
 
     /**
@@ -311,9 +312,11 @@ public class OutlookParserTest extends TikaTest {
     public void testAppointment() throws Exception {
         List<Metadata> metadataList = 
getRecursiveMetadata("testMSG_Appointment.msg");
         Metadata m = metadataList.get(0);
-        //debug(m);
-        //assertTrue(m.get(MAPI.APPT_START_TIME).contains("2017-02-28T18"));
-       // assertTrue(m.get(MAPI.APPT_END_TIME).contains("2017-02-28T19"));
+        //For some reason, the normal start/end time properties do not appear to be stored
+        //in this msg file. There are quite a few unknown date properties; this may be an
+        //older format, or an artifact of how the msg file was originally generated.
+        assertTrue(m.get(MAPI.REMINDER_TIME).contains("2017-02-28T18"));
+        assertTrue(m.get(MAPI.REMINDER_SIGNAL_TIME).contains("2017-02-28T19"));
     }
 
     @Test
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 9f9f71357..931e4df78 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -105,6 +105,11 @@ public class ZipParserTest extends AbstractPkgTest {
         assertContains("hello world", 
metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
     }
 
+    @Test
+    public void testZipX() throws Exception {
+        
debug(getRecursiveMetadataFromFullPath("/home/tallison/Downloads/demozipxfile.zipx"));
+    }
+
     @Test
     public void testDataDescriptorWithEmptyEntry() throws Exception {
 

Reply via email to