This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4381 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 451140128d1f5d3c775e7c8ddd355f6700027134 Author: tallison <[email protected]> AuthorDate: Wed Feb 12 12:24:40 2025 -0500 TIKA-4381 -- further updates --- .../main/java/org/apache/tika/metadata/MAPI.java | 18 +++- .../microsoft/msg/ExtendedMetadataExtractor.java | 118 +++++++++++++++------ .../tika/parser/microsoft/msg/PIDShortID.csv | 1 + .../tika/parser/microsoft/OutlookParserTest.java | 9 +- .../org/apache/tika/parser/pkg/ZipParserTest.java | 5 + 5 files changed, 113 insertions(+), 38 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java index af369aae6..dba87a5b3 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java @@ -27,20 +27,20 @@ public interface MAPI { String PREFIX_MAPI_META = "mapi" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; String PREFIX_MAPI_ATTACH_META = "mapi:attach" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; String PREFIX_MAPI_APPT_META = "mapi:appt" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + String PREFIX_MAPI_REMINDER_META = "mapi:reminder" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + String PREFIX_MAPI_RAW_META = PREFIX_MAPI_META + "raw" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; /** * MAPI message class. What type of .msg/MAPI file is it? * This is normalized via "mapi_message_classes.properties */ - Property MESSAGE_CLASS = - Property.internalText(PREFIX_MAPI_META + "message-class"); + Property MESSAGE_CLASS = Property.internalText(PREFIX_MAPI_META + "message-class"); /** * MAPI message class. What type of .msg/MAPI file is it? 
* This is the raw value that is retrieved from the underlying chunk */ - Property MESSAGE_CLASS_RAW = - Property.internalText(PREFIX_MAPI_META + "message-class-raw"); + Property MESSAGE_CLASS_RAW = Property.internalText(PREFIX_MAPI_META + "message-class-raw"); Property SENT_BY_SERVER_TYPE = Property.internalText(PREFIX_MAPI_META + "sent-by-server-type"); @@ -79,5 +79,13 @@ public interface MAPI { Property APPT_START_TIME = Property.internalDate(PREFIX_MAPI_APPT_META + "start-time"); Property APPT_END_TIME = Property.internalDate(PREFIX_MAPI_APPT_META + "end-time"); - Property APPT_END_REPEAT_TIME = Property.internalDate(PREFIX_MAPI_APPT_META + "end-repeat-time"); + Property APPT_PROPOSED_START_TIME = Property.internalDate(PREFIX_MAPI_APPT_META + "proposed-start-time"); + Property APPT_PROPOSED_END_TIME = Property.internalDate(PREFIX_MAPI_APPT_META + "proposed-end-time"); + Property APPT_LOCATION = Property.internalText(PREFIX_MAPI_APPT_META + "location"); + + Property REMINDER_TIME = Property.internalDate(PREFIX_MAPI_REMINDER_META + "time"); + //PidLidReminderSignalTime -- when a reminder transitions from pending to overdue + Property REMINDER_SIGNAL_TIME = Property.internalDate(PREFIX_MAPI_REMINDER_META + "signal-time"); + } + diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java index aefb406f9..1e6597dc1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java @@ -21,16 +21,11 @@ import static java.nio.charset.StandardCharsets.UTF_8; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; -import java.time.format.DateTimeFormatter; import java.time.temporal.ChronoUnit; import java.util.ArrayList; import java.util.Calendar; -import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Map; -import java.util.Set; -import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; import org.apache.poi.hsmf.MAPIMessage; @@ -42,6 +37,7 @@ import org.apache.tika.metadata.MAPI; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.parser.microsoft.OutlookExtractor; +import org.apache.tika.utils.StringUtils; /** * This class is intended to handle the metadata that is typically not @@ -56,10 +52,8 @@ public class ExtendedMetadataExtractor { } private static void loadProperties() { - Set<String> areas = new TreeSet<>(); try (BufferedReader r = new BufferedReader( - new InputStreamReader( - OutlookExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/PIDShortID.csv"), UTF_8))) { + new InputStreamReader(OutlookExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/PIDShortID.csv"), UTF_8))) { String line = r.readLine(); while (line != null) { if (line.isBlank() || line.startsWith("#")) { @@ -125,7 +119,9 @@ public class ExtendedMetadataExtractor { if (col.startsWith("\"") && col.endsWith("\"")) { //this is not robust, but we're running it // on known data. 
- return col.substring(1, col.length() - 1).trim(); + return col + .substring(1, col.length() - 1) + .trim(); } else { throw new IllegalArgumentException("cell must start and end with a quote: '" + col + "'"); } @@ -135,53 +131,115 @@ public class ExtendedMetadataExtractor { static Map<Integer, Property> PROPERTIES = new ConcurrentHashMap<>(); static { - //TODO -- figure out how these differ and how they overlap with other types - - PROPERTIES.put(0x8003, MAPI.APPT_START_TIME); - PROPERTIES.put(0x8005, MAPI.APPT_START_TIME); - PROPERTIES.put(0x8007, MAPI.APPT_START_TIME); - PROPERTIES.put(0x8009, MAPI.APPT_START_TIME); - PROPERTIES.put(0x801b, MAPI.APPT_START_TIME); - - PROPERTIES.put(0x8004, MAPI.APPT_END_TIME); - PROPERTIES.put(0x8006, MAPI.APPT_END_TIME); - PROPERTIES.put(0x801c, MAPI.APPT_END_TIME); -// PROPERTIES.put(0x8015, MAPI.APPT_END_REPEAT_TIME); + //PidLidAppointmentStartWhole + PROPERTIES.put(0x820D, MAPI.APPT_START_TIME); + //PidLidAppointmentProposedStartWhole + PROPERTIES.put(0x8250, MAPI.APPT_PROPOSED_START_TIME); + //PidLidAppointmentEndWhole + PROPERTIES.put(0x820E, MAPI.APPT_END_TIME); + //PidLidAppointmentProposedEndWhole + PROPERTIES.put(0x8251, MAPI.APPT_PROPOSED_END_TIME); + + PROPERTIES.put(0x8005, MAPI.REMINDER_TIME); + PROPERTIES.put(0x8006, MAPI.REMINDER_SIGNAL_TIME); + + //this id is not unique -- other properties share it; see PIDShortID.csv + PROPERTIES.put(0x8009, MAPI.APPT_LOCATION); } public static void extract(MAPIMessage msg, Metadata metadata) { - + //TODO -- we should map properties to message class types so that we're not + //reporting contact metadata for an appointment etc... + //I started down this path with PIDShortID.csv's "area" field, + //but that requires quite a bit of work. + //perhaps we could map by Defining Reference? 
for (Map.Entry<MAPIProperty, List<PropertyValue>> e : msg .getMainChunks() .getMessageProperties() .getProperties() .entrySet()) { - if (e.getValue().isEmpty()) { + List<PropertyValue> props = e.getValue(); + + if (props == null || props.isEmpty()) { continue; } + //we could allow user configured levels for extended properties + //small, medium, large... MAPIProperty mapiProperty = e.getKey(); - if (TIKA_MAPI_PROPERTIES.containsKey(mapiProperty.id)) { + boolean added = false; + if (PROPERTIES.containsKey(mapiProperty.id)) { + PropertyValue propertyValue = props.get(0); + added = addKnownProperty(PROPERTIES.get(mapiProperty.id), propertyValue, metadata); + } + + if (!added && TIKA_MAPI_PROPERTIES.containsKey(mapiProperty.id)) { List<TikaMapiProperty> tikaMapiProperties = TIKA_MAPI_PROPERTIES.get(mapiProperty.id); for (TikaMapiProperty tikaMapiProperty : tikaMapiProperties) { - for (PropertyValue propertyValue : e.getValue()) { + for (PropertyValue propertyValue : props) { if (tikaMapiProperty.containsType(propertyValue.getActualType())) { - updateMetadata(tikaMapiProperty, propertyValue, metadata); + added = updateMetadata(tikaMapiProperty, propertyValue, metadata); } } } } + if (!added) { + for (PropertyValue propertyValue : e.getValue()) { + //narrowly scoped to current interests...maybe broaden out? + if (propertyValue.getActualType() == Types.TIME) { + String key = MAPI.PREFIX_MAPI_RAW_META + "unknown-date-prop:" + + StringUtils.leftPad(Integer.toHexString(propertyValue.getProperty().id), 4, '0'); + Calendar cal = (Calendar) propertyValue.getValue(); + //truncate to seconds? toInstant().truncatedTo(ChronoUnit.SECONDS).... 
+ metadata.add(key, cal + .toInstant() + .toString()); + } + } + } } } - private static void updateMetadata(TikaMapiProperty tikaMapiProperty, PropertyValue propertyValue, Metadata metadata) { - String key = "mapi-raw:" + tikaMapiProperty.name; + private static boolean addKnownProperty(Property property, PropertyValue propertyValue, Metadata metadata) { + //this is quite limited. + if (propertyValue.getActualType() == Types.TIME && property.getValueType() == Property.ValueType.DATE) { + metadata.set(property, (Calendar) propertyValue.getValue()); + return true; + } else if (isString(propertyValue) && property.getValueType() == Property.ValueType.TEXT) { + metadata.set(property, propertyValue.toString()); + return true; + } + return false; + } + + + private static boolean updateMetadata(TikaMapiProperty tikaMapiProperty, PropertyValue propertyValue, Metadata metadata) { + String key = MAPI.PREFIX_MAPI_RAW_META + tikaMapiProperty.name; if (propertyValue.getActualType() == Types.TIME) { Calendar calendar = (Calendar) propertyValue.getValue(); - String calendarString = calendar.toInstant().truncatedTo(ChronoUnit.SECONDS).toString(); + String calendarString = calendar + .toInstant() + .truncatedTo(ChronoUnit.SECONDS) + .toString(); metadata.add(key, calendarString); - } else { + return true; + } else if (shouldIncludeUnknownType(propertyValue)) { metadata.add(key, propertyValue.toString()); + return true; } + return false; + } + + private static boolean shouldIncludeUnknownType(PropertyValue propertyValue) { + Types.MAPIType mapiType = propertyValue.getActualType(); + if (mapiType == Types.BINARY || mapiType == Types.UNKNOWN || mapiType == Types.UNSPECIFIED || mapiType == Types.DIRECTORY || mapiType.isPointer()) { + return false; + } + return true; + } + + private static boolean isString(PropertyValue propertyValue) { + Types.MAPIType mapiType = propertyValue.getActualType(); + return mapiType == Types.ASCII_STRING || mapiType == Types.MV_ASCII_STRING || mapiType == 
Types.MV_UNICODE_STRING || mapiType == Types.UNICODE_STRING; } private static class TikaMapiProperty { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/org/apache/tika/parser/microsoft/msg/PIDShortID.csv b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/org/apache/tika/parser/microsoft/msg/PIDShortID.csv index 2c8a41474..81cadcc3e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/org/apache/tika/parser/microsoft/msg/PIDShortID.csv +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/org/apache/tika/parser/microsoft/msg/PIDShortID.csv @@ -17,6 +17,7 @@ # This file derives directly from # https://github.com/rjohnsondev/java-libpst/blob/develop/src/main/resources/PIDShortID.csv # under the ASL 2.0 license option +# These ids are not unique. 
Some of them are differentiated by data type and some by context (note, appt, etc.) # #"ID";"Property";"Description:";"Data type";"Area";"Defining reference";"Alternate names" "0x0001";"PidTagTemplateData";"Describes the controls used in the template that is used to retrieve address book information.";"PtypBinary, 0x0102";"Address Book";"[MS-OXOABKT] section 2.2.2";"PR_EMS_TEMPLATE_BLOB" diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index 4afc08416..fcac0f45e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -130,6 +130,7 @@ public class OutlookParserTest extends TikaTest { assertEquals("[email protected]", metadata.get(Message.MESSAGE_TO_EMAIL)); assertEquals("[email protected]", metadata.get(Message.MESSAGE_TO_DISPLAY_NAME)); assertEquals("", metadata.get(Message.MESSAGE_TO_NAME)); + } /** @@ -311,9 +312,11 @@ public class OutlookParserTest extends TikaTest { public void testAppointment() throws Exception { List<Metadata> metadataList = getRecursiveMetadata("testMSG_Appointment.msg"); Metadata m = metadataList.get(0); - //debug(m); - //assertTrue(m.get(MAPI.APPT_START_TIME).contains("2017-02-28T18")); - // assertTrue(m.get(MAPI.APPT_END_TIME).contains("2017-02-28T19")); + //for some reason, the normal start/end time properties do not appear to be stored + //in this msg file. There are quite a few unknown date properties...this may be an older + //format,
or it may be an artifact of how I generated the msg file originally. + assertTrue(m.get(MAPI.REMINDER_TIME).contains("2017-02-28T18")); + assertTrue(m.get(MAPI.REMINDER_SIGNAL_TIME).contains("2017-02-28T19")); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java index 9f9f71357..931e4df78 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java @@ -105,6 +105,11 @@ public class ZipParserTest extends AbstractPkgTest { assertContains("hello world", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); } + @Test + public void testZipX() throws Exception { + debug(getRecursiveMetadataFromFullPath("/home/tallison/Downloads/demozipxfile.zipx")); + } + @Test public void testDataDescriptorWithEmptyEntry() throws Exception {
