This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 5737f0923 TIKA-4389 cleanups for TIKA-4381 (#2144)
5737f0923 is described below

commit 5737f09234cae63b92f199b0503db8b97d00d5bf
Author: Tim Allison <[email protected]>
AuthorDate: Wed Feb 26 13:07:50 2025 -0500

    TIKA-4389 cleanups for TIKA-4381 (#2144)
    
    * TIKA-4389 cleanups for TIKA-4381
---
 .../main/java/org/apache/tika/metadata/MAPI.java   |   2 +-
 .../microsoft/msg/ExtendedMetadataExtractor.java   | 129 ++++++++++++---------
 .../parser/microsoft/msg/TikaNameIdChunks.java     |   6 +-
 .../tika/parser/microsoft/OutlookParserTest.java   |  47 +++++---
 4 files changed, 107 insertions(+), 77 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
index 45fbad1bc..5f4ef12ae 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java
@@ -26,7 +26,7 @@ public interface MAPI {
 
     String PREFIX_MAPI_META = "mapi" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
     String PREFIX_MAPI_ATTACH_META = "mapi:attach" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
-    String PREFIX_MAPI_RAW_META = PREFIX_MAPI_META + "raw" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+    String PREFIX_MAPI_PROPERTY = PREFIX_MAPI_META + "property" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
 
     /**
      * MAPI message class.  What type of .msg/MAPI file is it?
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java
index 6d4880c51..877bd796e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java
@@ -42,7 +42,6 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.tika.metadata.MAPI;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.microsoft.OutlookExtractor;
 import org.apache.tika.utils.StringUtils;
 
 /**
@@ -59,60 +58,23 @@ public class ExtendedMetadataExtractor {
         loadProperties();
     }
 
-
-    private static List<Types.MAPIType> parseDataTypes(String[] arr) {
-        if (arr.length == 1) {
-            Types.MAPIType type = parseDataType(arr[0]);
-            if (type != null) {
-                return List.of(type);
-            }
-            return Collections.EMPTY_LIST;
-        }
-        List<Types.MAPIType> types = new ArrayList<>();
-        for (String s : arr) {
-            Types.MAPIType type = parseDataType(s);
-            if (type != null) {
-                types.add(type);
-            }
-        }
-        return types;
-    }
-
-    private static Types.MAPIType parseDataType(String s) {
-        if (StringUtils.isBlank(s)) {
-            return null;
-        }
-        String[] parts = s.split(", ");
-        if (parts.length != 2) {
-            throw new IllegalArgumentException("expected two parts: " + s);
-        }
-        String num = parts[1];
-        if (num.startsWith("0x")) {
-            num = num.substring(2);
+    public static void extract(MAPIMessage msg, Metadata metadata) {
+        if (msg.getNameIdChunks() == null) {
+            return;
         }
-        int id = Integer.parseInt(num, 16);
-        Types.MAPIType type = Types.getById(id);
-        if (type == null) {
-            //TODO:
-            /*
-                PtypRestriction, 0x00FD
-                PtypRuleAction, 0x00FE
-                PtypServerId, 0x00FB
-             */
-            return Types.createCustom(id);
+        if (msg.getMainChunks() == null || msg.getMainChunks().getRawProperties() == null) {
+            return;
         }
-        return type;
-    }
-
-
-    public static void extract(MAPIMessage msg, Metadata metadata) {
-        //prep our custom nameIdchunk handler
+        //prep our custom nameIdChunk handler
         TikaNameIdChunks tikaNameIdChunks = new TikaNameIdChunks();
         //short-circuit for files that have an empty nameIdChunk
         long len = 0;
         for (Chunk chunk : msg
                 .getNameIdChunks()
                 .getAll()) {
+            if (chunk == null) {
+                continue;
+            }
             tikaNameIdChunks.record(chunk);
             if (chunk instanceof ByteChunk) {
                 byte[] value = ((ByteChunk)chunk).getValue();
@@ -124,7 +86,11 @@ public class ExtendedMetadataExtractor {
         if (len == 0) {
             return;
         }
-        tikaNameIdChunks.chunksComplete();
+        try {
+            tikaNameIdChunks.chunksComplete();
+        } catch (IllegalStateException e) {
+            LOGGER.warn("bad namechunks stream", e);
+        }
         for (Map.Entry<MAPIProperty, PropertyValue> e : msg
                 .getMainChunks()
                 .getRawProperties()
@@ -132,6 +98,9 @@ public class ExtendedMetadataExtractor {
             //the mapiproperties from POI are the literal storage id for that particular file.
             //Those storage ids must be mapped via the name chunk ids into a known id
             PropertyValue v = e.getValue();
+            if (v == null) {
+                continue;
+            }
             List<MAPITag> mapiTags = tikaNameIdChunks.getTags(e.getKey().id);
             MAPITagPair pair = null;
             for (MAPITag mapiTag : mapiTags) {
@@ -146,7 +115,6 @@ public class ExtendedMetadataExtractor {
             }
             updateMetadata(pair, v, metadata);
         }
-
     }
 
 
@@ -180,7 +148,7 @@ public class ExtendedMetadataExtractor {
         if (!includeType(propertyValue)) {
             return;
         }
-        String key = MAPI.PREFIX_MAPI_RAW_META + pair.tikaMapiProperty.name;
+        String key = MAPI.PREFIX_MAPI_PROPERTY + pair.tikaMapiProperty.name;
         Types.MAPIType type = propertyValue.getActualType();
         if (type == Types.TIME || type == Types.MV_TIME || type == Types.APP_TIME || type == Types.MV_APP_TIME) {
             Calendar calendar = (Calendar) propertyValue.getValue();
@@ -190,8 +158,12 @@ public class ExtendedMetadataExtractor {
                     .toString();
             metadata.add(key, calendarString);
         } else if (type == Types.BOOLEAN) {
-            metadata.add(key, Boolean.toString((boolean) propertyValue.getValue()));
-        } else {
+            Boolean val = (Boolean)propertyValue.getValue();
+            if (val == null) {
+                return;
+            }
+            metadata.add(key, Boolean.toString(val));
+        } else if (! StringUtils.isBlank(propertyValue.toString())) {
             metadata.add(key, propertyValue.toString());
         }
 
@@ -205,11 +177,6 @@ public class ExtendedMetadataExtractor {
         return true;
     }
 
-    private static boolean isString(PropertyValue propertyValue) {
-        Types.MAPIType mapiType = propertyValue.getActualType();
-        return mapiType == Types.ASCII_STRING || mapiType == Types.MV_ASCII_STRING || mapiType == Types.MV_UNICODE_STRING || mapiType == Types.UNICODE_STRING;
-    }
-
     private static class TikaMapiProperty {
         String name;
         ClassID classID; // can be null
@@ -237,7 +204,7 @@ public class ExtendedMetadataExtractor {
                     .toUUIDString(), setType.getClassID());
         }
         try (BufferedReader r = new BufferedReader(
-                new InputStreamReader(OutlookExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/props_table.txt"), UTF_8))) {
+                new InputStreamReader(ExtendedMetadataExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/props_table.txt"), UTF_8))) {
             String line = r.readLine();
             while (line != null) {
                 if (line.isBlank() || line.startsWith("#")) {
@@ -309,4 +276,50 @@ public class ExtendedMetadataExtractor {
             this.tikaMapiProperty = tikaMapiProperty;
         }
     }
+
+
+    private static List<Types.MAPIType> parseDataTypes(String[] arr) {
+        if (arr.length == 1) {
+            Types.MAPIType type = parseDataType(arr[0]);
+            if (type != null) {
+                return List.of(type);
+            }
+            return Collections.EMPTY_LIST;
+        }
+        List<Types.MAPIType> types = new ArrayList<>();
+        for (String s : arr) {
+            Types.MAPIType type = parseDataType(s);
+            if (type != null) {
+                types.add(type);
+            }
+        }
+        return types;
+    }
+
+    private static Types.MAPIType parseDataType(String s) {
+        if (StringUtils.isBlank(s)) {
+            return null;
+        }
+        String[] parts = s.split(", ");
+        if (parts.length != 2) {
+            throw new IllegalArgumentException("expected two parts: " + s);
+        }
+        String num = parts[1];
+        if (num.startsWith("0x")) {
+            num = num.substring(2);
+        }
+        int id = Integer.parseInt(num, 16);
+        Types.MAPIType type = Types.getById(id);
+        if (type == null) {
+            //TODO:
+            /*
+                PtypRestriction, 0x00FD
+                PtypRuleAction, 0x00FE
+                PtypServerId, 0x00FB
+             */
+            return Types.createCustom(id);
+        }
+        return type;
+    }
+
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java
index 54e963ee3..ba54f6e4e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java
@@ -18,6 +18,7 @@
 package org.apache.tika.parser.microsoft.msg;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
@@ -132,10 +133,11 @@ public final class TikaNameIdChunks implements ChunkGroup {
         loadTags();
     }
 
+    //does not return null
     public List<MAPITag> getTags(int storageId) {
         List<MAPITag> tags = mapiTagMap.get(storageId);
         if (tags == null) {
-            return new ArrayList<>();
+            return Collections.emptyList();
         }
         return tags;
     }
@@ -235,7 +237,7 @@ public final class TikaNameIdChunks implements ChunkGroup {
             return 0;
         }
         for (Chunk chunk : chunks) {
-            if (chunk.getType() != Types.BINARY || chunk.getChunkId() != streamID) {
+            if (chunk == null || chunk.getType() != Types.BINARY || chunk.getChunkId() != streamID) {
                 continue;
             }
             byte[] matchChunkBytes = ((ByteChunk) chunk).getValue();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 10d2e02b4..6c532a84e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -312,30 +312,45 @@ public class OutlookParserTest extends TikaTest {
     public void testAppointmentExtendedMetadata() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testMSG_Appointment.msg");
         Metadata m = metadataList.get(0);
-        debug(m);
-        assertTrue(m.get("mapi:raw:PidLidAppointmentEndWhole").contains("2017-02-28T19"));
-        assertTrue(m.get("mapi:raw:PidLidAppointmentStartWhole").contains("2017-02-28T18"));
-        assertTrue(m.get("mapi:raw:PidLidClipStart").contains("2017-02-28T18"));
-        assertTrue(m.get("mapi:raw:PidLidClipEnd").contains("2017-02-28T19"));
-        assertTrue(m.get("mapi:raw:PidLidCommonStart").contains("2017-02-28T18"));
-        assertTrue(m.get("mapi:raw:PidLidCommonEnd").contains("2017-02-28T19"));
-        assertTrue(m.get("mapi:raw:PidLidReminderSignalTime").contains("4501-01-01T00"));
-        assertTrue(m.get("mapi:raw:PidLidReminderTime").contains("2017-02-28T18"));
-        assertTrue(m.get("mapi:raw:PidLidValidFlagStringProof").contains("2017-02-28T18:42"));
-        assertEquals("0", m.get("mapi:raw:PidLidAppointmentSequence"));
-        assertEquals("false", m.get("mapi:raw:PidLidRecurring"));
+        assertTrue(m.get("mapi:property:PidLidAppointmentEndWhole").contains("2017-02-28T19"));
+        assertTrue(m.get("mapi:property:PidLidAppointmentStartWhole").contains("2017-02-28T18"));
+        assertTrue(m.get("mapi:property:PidLidClipStart").contains("2017-02-28T18"));
+        assertTrue(m.get("mapi:property:PidLidClipEnd").contains("2017-02-28T19"));
+        assertTrue(m.get("mapi:property:PidLidCommonStart").contains("2017-02-28T18"));
+        assertTrue(m.get("mapi:property:PidLidCommonEnd").contains("2017-02-28T19"));
+        assertTrue(m.get("mapi:property:PidLidReminderSignalTime").contains("4501-01-01T00"));
+        assertTrue(m.get("mapi:property:PidLidReminderTime").contains("2017-02-28T18"));
+        assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:42"));
+        assertEquals("0", m.get("mapi:property:PidLidAppointmentSequence"));
+        assertEquals("false", m.get("mapi:property:PidLidRecurring"));
     }
 
     @Test
     public void testTaskExtendedMetadata() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testMSG_Task.msg");
         Metadata m = metadataList.get(0);
-        assertTrue(m.get("mapi:raw:PidLidToDoOrdinalDate").contains("2017-02-28T18:44"));
-        assertTrue(m.get("mapi:raw:PidLidValidFlagStringProof").contains("2017-02-28T18:44"));
-        assertEquals("0", m.get("mapi:raw:PidLidTaskActualEffort"));
-        assertEquals("false", m.get("mapi:raw:PidLidTeamTask"));
+        assertTrue(m.get("mapi:property:PidLidToDoOrdinalDate").contains("2017-02-28T18:44"));
+        assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:44"));
+        assertEquals("0", m.get("mapi:property:PidLidTaskActualEffort"));
+        assertEquals("false", m.get("mapi:property:PidLidTeamTask"));
     }
 
+    @Test
+    public void testContactExtendedMetadata() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testMSG_Contact.msg");
+        Metadata m = metadataList.get(0);
+        assertEquals("2017-02-28T18:41:37Z", m.get("mapi:property:PidLidValidFlagStringProof"));
+    }
+
+
+    @Test
+    public void testPostExtendedMetadata() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testMSG_Post.msg");
+        Metadata m = metadataList.get(0);
+        assertEquals("2017-02-28T18:47:11Z", m.get("mapi:property:PidLidValidFlagStringProof"));
+    }
+
+
     @Test
     public void testHandlingAllAlternativesBodies() throws Exception {
         //test that default only has one body

Reply via email to