This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 5737f0923 TIKA-4389 cleanups for TIKA-4381 (#2144)
5737f0923 is described below
commit 5737f09234cae63b92f199b0503db8b97d00d5bf
Author: Tim Allison <[email protected]>
AuthorDate: Wed Feb 26 13:07:50 2025 -0500
TIKA-4389 cleanups for TIKA-4381 (#2144)
* TIKA-4389 cleanups for TIKA-4381
---
.../main/java/org/apache/tika/metadata/MAPI.java | 2 +-
.../microsoft/msg/ExtendedMetadataExtractor.java | 129 ++++++++++++---------
.../parser/microsoft/msg/TikaNameIdChunks.java | 6 +-
.../tika/parser/microsoft/OutlookParserTest.java | 47 +++++---
4 files changed, 107 insertions(+), 77 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
index 45fbad1bc..5f4ef12ae 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
@@ -26,7 +26,7 @@ public interface MAPI {
String PREFIX_MAPI_META = "mapi" +
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
String PREFIX_MAPI_ATTACH_META = "mapi:attach" +
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
- String PREFIX_MAPI_RAW_META = PREFIX_MAPI_META + "raw" +
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+ String PREFIX_MAPI_PROPERTY = PREFIX_MAPI_META + "property" +
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
/**
* MAPI message class. What type of .msg/MAPI file is it?
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java
index 6d4880c51..877bd796e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java
@@ -42,7 +42,6 @@ import org.slf4j.LoggerFactory;
import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.microsoft.OutlookExtractor;
import org.apache.tika.utils.StringUtils;
/**
@@ -59,60 +58,23 @@ public class ExtendedMetadataExtractor {
loadProperties();
}
-
- private static List<Types.MAPIType> parseDataTypes(String[] arr) {
- if (arr.length == 1) {
- Types.MAPIType type = parseDataType(arr[0]);
- if (type != null) {
- return List.of(type);
- }
- return Collections.EMPTY_LIST;
- }
- List<Types.MAPIType> types = new ArrayList<>();
- for (String s : arr) {
- Types.MAPIType type = parseDataType(s);
- if (type != null) {
- types.add(type);
- }
- }
- return types;
- }
-
- private static Types.MAPIType parseDataType(String s) {
- if (StringUtils.isBlank(s)) {
- return null;
- }
- String[] parts = s.split(", ");
- if (parts.length != 2) {
- throw new IllegalArgumentException("expected two parts: " + s);
- }
- String num = parts[1];
- if (num.startsWith("0x")) {
- num = num.substring(2);
+ public static void extract(MAPIMessage msg, Metadata metadata) {
+ if (msg.getNameIdChunks() == null) {
+ return;
}
- int id = Integer.parseInt(num, 16);
- Types.MAPIType type = Types.getById(id);
- if (type == null) {
- //TODO:
- /*
- PtypRestriction, 0x00FD
- PtypRuleAction, 0x00FE
- PtypServerId, 0x00FB
- */
- return Types.createCustom(id);
+ if (msg.getMainChunks() == null ||
msg.getMainChunks().getRawProperties() == null) {
+ return;
}
- return type;
- }
-
-
- public static void extract(MAPIMessage msg, Metadata metadata) {
- //prep our custom nameIdchunk handler
+ //prep our custom nameIdChunk handler
TikaNameIdChunks tikaNameIdChunks = new TikaNameIdChunks();
//short-circuit for files that have an empty nameIdChunk
long len = 0;
for (Chunk chunk : msg
.getNameIdChunks()
.getAll()) {
+ if (chunk == null) {
+ continue;
+ }
tikaNameIdChunks.record(chunk);
if (chunk instanceof ByteChunk) {
byte[] value = ((ByteChunk)chunk).getValue();
@@ -124,7 +86,11 @@ public class ExtendedMetadataExtractor {
if (len == 0) {
return;
}
- tikaNameIdChunks.chunksComplete();
+ try {
+ tikaNameIdChunks.chunksComplete();
+ } catch (IllegalStateException e) {
+ LOGGER.warn("bad namechunks stream", e);
+ }
for (Map.Entry<MAPIProperty, PropertyValue> e : msg
.getMainChunks()
.getRawProperties()
@@ -132,6 +98,9 @@ public class ExtendedMetadataExtractor {
//the mapiproperties from POI are the literal storage id for that particular file.
//Those storage ids must be mapped via the name chunk ids into a known id
PropertyValue v = e.getValue();
+ if (v == null) {
+ continue;
+ }
List<MAPITag> mapiTags = tikaNameIdChunks.getTags(e.getKey().id);
MAPITagPair pair = null;
for (MAPITag mapiTag : mapiTags) {
@@ -146,7 +115,6 @@ public class ExtendedMetadataExtractor {
}
updateMetadata(pair, v, metadata);
}
-
}
@@ -180,7 +148,7 @@ public class ExtendedMetadataExtractor {
if (!includeType(propertyValue)) {
return;
}
- String key = MAPI.PREFIX_MAPI_RAW_META + pair.tikaMapiProperty.name;
+ String key = MAPI.PREFIX_MAPI_PROPERTY + pair.tikaMapiProperty.name;
Types.MAPIType type = propertyValue.getActualType();
if (type == Types.TIME || type == Types.MV_TIME || type ==
Types.APP_TIME || type == Types.MV_APP_TIME) {
Calendar calendar = (Calendar) propertyValue.getValue();
@@ -190,8 +158,12 @@ public class ExtendedMetadataExtractor {
.toString();
metadata.add(key, calendarString);
} else if (type == Types.BOOLEAN) {
- metadata.add(key, Boolean.toString((boolean)
propertyValue.getValue()));
- } else {
+ Boolean val = (Boolean)propertyValue.getValue();
+ if (val == null) {
+ return;
+ }
+ metadata.add(key, Boolean.toString(val));
+ } else if (! StringUtils.isBlank(propertyValue.toString())) {
metadata.add(key, propertyValue.toString());
}
@@ -205,11 +177,6 @@ public class ExtendedMetadataExtractor {
return true;
}
- private static boolean isString(PropertyValue propertyValue) {
- Types.MAPIType mapiType = propertyValue.getActualType();
- return mapiType == Types.ASCII_STRING || mapiType ==
Types.MV_ASCII_STRING || mapiType == Types.MV_UNICODE_STRING || mapiType ==
Types.UNICODE_STRING;
- }
-
private static class TikaMapiProperty {
String name;
ClassID classID; // can be null
@@ -237,7 +204,7 @@ public class ExtendedMetadataExtractor {
.toUUIDString(), setType.getClassID());
}
try (BufferedReader r = new BufferedReader(
- new
InputStreamReader(OutlookExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/props_table.txt"),
UTF_8))) {
+ new
InputStreamReader(ExtendedMetadataExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/props_table.txt"),
UTF_8))) {
String line = r.readLine();
while (line != null) {
if (line.isBlank() || line.startsWith("#")) {
@@ -309,4 +276,50 @@ public class ExtendedMetadataExtractor {
this.tikaMapiProperty = tikaMapiProperty;
}
}
+
+
+ private static List<Types.MAPIType> parseDataTypes(String[] arr) {
+ if (arr.length == 1) {
+ Types.MAPIType type = parseDataType(arr[0]);
+ if (type != null) {
+ return List.of(type);
+ }
+ return Collections.EMPTY_LIST;
+ }
+ List<Types.MAPIType> types = new ArrayList<>();
+ for (String s : arr) {
+ Types.MAPIType type = parseDataType(s);
+ if (type != null) {
+ types.add(type);
+ }
+ }
+ return types;
+ }
+
+ private static Types.MAPIType parseDataType(String s) {
+ if (StringUtils.isBlank(s)) {
+ return null;
+ }
+ String[] parts = s.split(", ");
+ if (parts.length != 2) {
+ throw new IllegalArgumentException("expected two parts: " + s);
+ }
+ String num = parts[1];
+ if (num.startsWith("0x")) {
+ num = num.substring(2);
+ }
+ int id = Integer.parseInt(num, 16);
+ Types.MAPIType type = Types.getById(id);
+ if (type == null) {
+ //TODO:
+ /*
+ PtypRestriction, 0x00FD
+ PtypRuleAction, 0x00FE
+ PtypServerId, 0x00FB
+ */
+ return Types.createCustom(id);
+ }
+ return type;
+ }
+
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java
index 54e963ee3..ba54f6e4e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java
@@ -18,6 +18,7 @@
package org.apache.tika.parser.microsoft.msg;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
@@ -132,10 +133,11 @@ public final class TikaNameIdChunks implements ChunkGroup
{
loadTags();
}
+ //does not return null
public List<MAPITag> getTags(int storageId) {
List<MAPITag> tags = mapiTagMap.get(storageId);
if (tags == null) {
- return new ArrayList<>();
+ return Collections.emptyList();
}
return tags;
}
@@ -235,7 +237,7 @@ public final class TikaNameIdChunks implements ChunkGroup {
return 0;
}
for (Chunk chunk : chunks) {
- if (chunk.getType() != Types.BINARY || chunk.getChunkId() !=
streamID) {
+ if (chunk == null || chunk.getType() != Types.BINARY ||
chunk.getChunkId() != streamID) {
continue;
}
byte[] matchChunkBytes = ((ByteChunk) chunk).getValue();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 10d2e02b4..6c532a84e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -312,30 +312,45 @@ public class OutlookParserTest extends TikaTest {
public void testAppointmentExtendedMetadata() throws Exception {
List<Metadata> metadataList =
getRecursiveMetadata("testMSG_Appointment.msg");
Metadata m = metadataList.get(0);
- debug(m);
-
assertTrue(m.get("mapi:raw:PidLidAppointmentEndWhole").contains("2017-02-28T19"));
-
assertTrue(m.get("mapi:raw:PidLidAppointmentStartWhole").contains("2017-02-28T18"));
-
assertTrue(m.get("mapi:raw:PidLidClipStart").contains("2017-02-28T18"));
- assertTrue(m.get("mapi:raw:PidLidClipEnd").contains("2017-02-28T19"));
-
assertTrue(m.get("mapi:raw:PidLidCommonStart").contains("2017-02-28T18"));
-
assertTrue(m.get("mapi:raw:PidLidCommonEnd").contains("2017-02-28T19"));
-
assertTrue(m.get("mapi:raw:PidLidReminderSignalTime").contains("4501-01-01T00"));
-
assertTrue(m.get("mapi:raw:PidLidReminderTime").contains("2017-02-28T18"));
-
assertTrue(m.get("mapi:raw:PidLidValidFlagStringProof").contains("2017-02-28T18:42"));
- assertEquals("0", m.get("mapi:raw:PidLidAppointmentSequence"));
- assertEquals("false", m.get("mapi:raw:PidLidRecurring"));
+
assertTrue(m.get("mapi:property:PidLidAppointmentEndWhole").contains("2017-02-28T19"));
+
assertTrue(m.get("mapi:property:PidLidAppointmentStartWhole").contains("2017-02-28T18"));
+
assertTrue(m.get("mapi:property:PidLidClipStart").contains("2017-02-28T18"));
+
assertTrue(m.get("mapi:property:PidLidClipEnd").contains("2017-02-28T19"));
+
assertTrue(m.get("mapi:property:PidLidCommonStart").contains("2017-02-28T18"));
+
assertTrue(m.get("mapi:property:PidLidCommonEnd").contains("2017-02-28T19"));
+
assertTrue(m.get("mapi:property:PidLidReminderSignalTime").contains("4501-01-01T00"));
+
assertTrue(m.get("mapi:property:PidLidReminderTime").contains("2017-02-28T18"));
+
assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:42"));
+ assertEquals("0", m.get("mapi:property:PidLidAppointmentSequence"));
+ assertEquals("false", m.get("mapi:property:PidLidRecurring"));
}
@Test
public void testTaskExtendedMetadata() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("testMSG_Task.msg");
Metadata m = metadataList.get(0);
-
assertTrue(m.get("mapi:raw:PidLidToDoOrdinalDate").contains("2017-02-28T18:44"));
-
assertTrue(m.get("mapi:raw:PidLidValidFlagStringProof").contains("2017-02-28T18:44"));
- assertEquals("0", m.get("mapi:raw:PidLidTaskActualEffort"));
- assertEquals("false", m.get("mapi:raw:PidLidTeamTask"));
+
assertTrue(m.get("mapi:property:PidLidToDoOrdinalDate").contains("2017-02-28T18:44"));
+
assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:44"));
+ assertEquals("0", m.get("mapi:property:PidLidTaskActualEffort"));
+ assertEquals("false", m.get("mapi:property:PidLidTeamTask"));
}
+ @Test
+ public void testContactExtendedMetadata() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testMSG_Contact.msg");
+ Metadata m = metadataList.get(0);
+ assertEquals("2017-02-28T18:41:37Z",
m.get("mapi:property:PidLidValidFlagStringProof"));
+ }
+
+
+ @Test
+ public void testPostExtendedMetadata() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testMSG_Post.msg");
+ Metadata m = metadataList.get(0);
+ assertEquals("2017-02-28T18:47:11Z",
m.get("mapi:property:PidLidValidFlagStringProof"));
+ }
+
+
@Test
public void testHandlingAllAlternativesBodies() throws Exception {
//test that default only has one body