Copilot commented on code in PR #2840:
URL: https://github.com/apache/tika/pull/2840#discussion_r3308498289


##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java:
##########
@@ -157,85 +193,164 @@ private String getDocSecurityString(int docSecurityFlag) 
{
         }
     }
 
-    private void extractMetadata(POIXMLProperties.CustomProperties properties, 
Metadata metadata) {
-        
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties 
props =
-                properties.getUnderlyingProperties();
-        for (int i = 0; i < props.sizeOfPropertyArray(); i++) {
-            CTProperty property = props.getPropertyArray(i);
-            String val = null;
-            Date date = null;
-
-            if (property.isSetLpwstr()) {
-                val = property.getLpwstr();
-            } else if (property.isSetLpstr()) {
-                val = property.getLpstr();
-            } else if (property.isSetDate()) {
-                date = property.getDate().getTime();
-            } else if (property.isSetFiletime()) {
-                date = property.getFiletime().getTime();
-            } else if (property.isSetBool()) {
-                val = Boolean.toString(property.getBool());
+    /**
+     * Parse {@code docProps/custom.xml} directly via SAX, bypassing
+     * POI/XMLBeans. The XMLBeans path materializes an attacker-controlled
+     * {@code <vt:decimal>} through {@link BigDecimal#BigDecimal(String)}
+     * during XML deserialization, which is O(n²) in the digit count on
+     * JDK 17. By reading the part ourselves we can cap both the buffered
+     * text content ({@link #MAX_TEXT_BUFFER_LENGTH}) and the decimal
+     * literal length ({@link #MAX_DECIMAL_LENGTH}) before any slow parse
+     * runs.
+     */
+    private void extractCustomPropertiesViaSAX(OPCPackage opcPackage, Metadata 
metadata) {
+        if (opcPackage == null) {
+            return;
+        }
+        try {
+            PackagePart custPart = getRelatedPart(opcPackage, 
CUSTOM_PROPERTIES_REL);
+            if (custPart == null) {
+                return;
+            }
+            CustomPropertiesHandler handler = new CustomPropertiesHandler();
+            try (InputStream is = custPart.getInputStream()) {
+                XMLReaderUtils.parseSAX(is, handler, new ParseContext());
             }
+            handler.applyTo(metadata);
+        } catch (Exception e) {
+            //swallow
+        }
+    }
 
-            // Integers
-            else if (property.isSetI1()) {
-                val = Integer.toString(property.getI1());
-            } else if (property.isSetI2()) {
-                val = Integer.toString(property.getI2());
-            } else if (property.isSetI4()) {
-                val = Integer.toString(property.getI4());
-            } else if (property.isSetI8()) {
-                val = Long.toString(property.getI8());
-            } else if (property.isSetInt()) {
-                val = Integer.toString(property.getInt());
+    private static PackagePart getRelatedPart(OPCPackage opcPackage, String 
relationshipType) {
+        try {
+            PackageRelationshipCollection rels =
+                    opcPackage.getRelationshipsByType(relationshipType);
+            if (rels == null || rels.size() == 0) {
+                return null;
             }
+            PackageRelationship rel = rels.getRelationship(0);
+            if (rel == null) {
+                return null;
+            }
+            return opcPackage.getPart(rel);
+        } catch (Exception e) {
+            return null;
+        }
+    }
 
-            // Unsigned Integers
-            else if (property.isSetUi1()) {
-                val = Integer.toString(property.getUi1());
-            } else if (property.isSetUi2()) {
-                val = Integer.toString(property.getUi2());
-            } else if (property.isSetUi4()) {
-                val = Long.toString(property.getUi4());
-            } else if (property.isSetUi8()) {
-                val = property.getUi8().toString();
-            } else if (property.isSetUint()) {
-                val = Long.toString(property.getUint());
+    /**
+     * Append SAX {@code characters()} content to {@code buf}, but stop 
accepting
+     * once {@link #MAX_TEXT_BUFFER_LENGTH} is reached. Excess characters are
+     * silently dropped; truncated values still flow through downstream 
parsing.
+     */
+    static void appendCapped(StringBuilder buf, char[] ch, int start, int 
length) {
+        if (buf.length() >= MAX_TEXT_BUFFER_LENGTH) {
+            return;
+        }
+        int remaining = MAX_TEXT_BUFFER_LENGTH - buf.length();
+        buf.append(ch, start, Math.min(length, remaining));
+    }
+
+    /**
+     * SAX handler for {@code docProps/custom.xml} (custom properties).
+     * Matches the schema defined by Microsoft's
+     * {@code 
http://schemas.openxmlformats.org/officeDocument/2006/custom-properties}
+     * namespace, with value types coming from the {@code vt:} namespace.
+     */
+    static class CustomPropertiesHandler extends DefaultHandler {
+
+        private static final String VT_NS =
+                
"http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes";;
+
+        private final Metadata customMetadata = new Metadata();
+        private String currentPropertyName;
+        private String currentValueType;
+        private final StringBuilder textBuffer = new StringBuilder();
+
+        @Override
+        public void startElement(String uri, String localName, String qName, 
Attributes atts) {
+            if ("property".equals(localName)) {
+                currentPropertyName = atts.getValue("name");
+                currentValueType = null;
+            } else if (VT_NS.equals(uri) && currentPropertyName != null) {
+                currentValueType = localName;
+                textBuffer.setLength(0);
             }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) {
+            appendCapped(textBuffer, ch, start, length);
+        }
 
-            // Reals
-            else if (property.isSetR4()) {
-                val = Float.toString(property.getR4());
-            } else if (property.isSetR8()) {
-                val = Double.toString(property.getR8());
-            } else if (property.isSetDecimal()) {
-                BigDecimal d = property.getDecimal();
-                if (d == null) {
-                    val = null;
-                } else {
-                    val = d.toPlainString();
+        @Override
+        public void endElement(String uri, String localName, String qName) {
+            if (VT_NS.equals(uri) && currentValueType != null &&
+                    localName.equals(currentValueType) && currentPropertyName 
!= null) {
+                String val = textBuffer.toString().trim();
+                String propName = "custom:" + currentPropertyName;
+                switch (currentValueType) {
+                    case "lpwstr":
+                    case "lpstr":
+                    case "bstr":
+                        customMetadata.set(propName, val);
+                        break;
+                    case "filetime":
+                    case "date":
+                        Property tikaProp = Property.externalDate(propName);
+                        customMetadata.set(tikaProp, val);
+                        break;
+                    case "bool":
+                        customMetadata.set(propName, val);

Review Comment:
   This stores the raw XML boolean text, which is not equivalent to the 
previous XMLBeans path. OOXML booleans can use the schema lexical values `1` 
and `0`; the old code emitted `true` or `false` via 
`Boolean.toString(property.getBool())`, but this would now expose `custom:*` 
metadata as `1` or `0`. Normalize boolean values before setting metadata to 
preserve the existing API behavior.



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java:
##########
@@ -157,85 +193,164 @@ private String getDocSecurityString(int docSecurityFlag) 
{
         }
     }
 
-    private void extractMetadata(POIXMLProperties.CustomProperties properties, 
Metadata metadata) {
-        
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties 
props =
-                properties.getUnderlyingProperties();
-        for (int i = 0; i < props.sizeOfPropertyArray(); i++) {
-            CTProperty property = props.getPropertyArray(i);
-            String val = null;
-            Date date = null;
-
-            if (property.isSetLpwstr()) {
-                val = property.getLpwstr();
-            } else if (property.isSetLpstr()) {
-                val = property.getLpstr();
-            } else if (property.isSetDate()) {
-                date = property.getDate().getTime();
-            } else if (property.isSetFiletime()) {
-                date = property.getFiletime().getTime();
-            } else if (property.isSetBool()) {
-                val = Boolean.toString(property.getBool());
+    /**
+     * Parse {@code docProps/custom.xml} directly via SAX, bypassing
+     * POI/XMLBeans. The XMLBeans path materializes an attacker-controlled
+     * {@code <vt:decimal>} through {@link BigDecimal#BigDecimal(String)}
+     * during XML deserialization, which is O(n²) in the digit count on
+     * JDK 17. By reading the part ourselves we can cap both the buffered
+     * text content ({@link #MAX_TEXT_BUFFER_LENGTH}) and the decimal
+     * literal length ({@link #MAX_DECIMAL_LENGTH}) before any slow parse
+     * runs.
+     */
+    private void extractCustomPropertiesViaSAX(OPCPackage opcPackage, Metadata 
metadata) {
+        if (opcPackage == null) {
+            return;
+        }
+        try {
+            PackagePart custPart = getRelatedPart(opcPackage, 
CUSTOM_PROPERTIES_REL);
+            if (custPart == null) {
+                return;
+            }
+            CustomPropertiesHandler handler = new CustomPropertiesHandler();
+            try (InputStream is = custPart.getInputStream()) {
+                XMLReaderUtils.parseSAX(is, handler, new ParseContext());
             }
+            handler.applyTo(metadata);
+        } catch (Exception e) {
+            //swallow
+        }
+    }
 
-            // Integers
-            else if (property.isSetI1()) {
-                val = Integer.toString(property.getI1());
-            } else if (property.isSetI2()) {
-                val = Integer.toString(property.getI2());
-            } else if (property.isSetI4()) {
-                val = Integer.toString(property.getI4());
-            } else if (property.isSetI8()) {
-                val = Long.toString(property.getI8());
-            } else if (property.isSetInt()) {
-                val = Integer.toString(property.getInt());
+    private static PackagePart getRelatedPart(OPCPackage opcPackage, String 
relationshipType) {
+        try {
+            PackageRelationshipCollection rels =
+                    opcPackage.getRelationshipsByType(relationshipType);
+            if (rels == null || rels.size() == 0) {
+                return null;
             }
+            PackageRelationship rel = rels.getRelationship(0);
+            if (rel == null) {
+                return null;
+            }
+            return opcPackage.getPart(rel);
+        } catch (Exception e) {
+            return null;
+        }
+    }
 
-            // Unsigned Integers
-            else if (property.isSetUi1()) {
-                val = Integer.toString(property.getUi1());
-            } else if (property.isSetUi2()) {
-                val = Integer.toString(property.getUi2());
-            } else if (property.isSetUi4()) {
-                val = Long.toString(property.getUi4());
-            } else if (property.isSetUi8()) {
-                val = property.getUi8().toString();
-            } else if (property.isSetUint()) {
-                val = Long.toString(property.getUint());
+    /**
+     * Append SAX {@code characters()} content to {@code buf}, but stop 
accepting
+     * once {@link #MAX_TEXT_BUFFER_LENGTH} is reached. Excess characters are
+     * silently dropped; truncated values still flow through downstream 
parsing.
+     */
+    static void appendCapped(StringBuilder buf, char[] ch, int start, int 
length) {
+        if (buf.length() >= MAX_TEXT_BUFFER_LENGTH) {
+            return;
+        }
+        int remaining = MAX_TEXT_BUFFER_LENGTH - buf.length();
+        buf.append(ch, start, Math.min(length, remaining));
+    }
+
+    /**
+     * SAX handler for {@code docProps/custom.xml} (custom properties).
+     * Matches the schema defined by Microsoft's
+     * {@code 
http://schemas.openxmlformats.org/officeDocument/2006/custom-properties}
+     * namespace, with value types coming from the {@code vt:} namespace.
+     */
+    static class CustomPropertiesHandler extends DefaultHandler {
+
+        private static final String VT_NS =
+                
"http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes";;
+
+        private final Metadata customMetadata = new Metadata();
+        private String currentPropertyName;
+        private String currentValueType;
+        private final StringBuilder textBuffer = new StringBuilder();
+
+        @Override
+        public void startElement(String uri, String localName, String qName, 
Attributes atts) {
+            if ("property".equals(localName)) {
+                currentPropertyName = atts.getValue("name");
+                currentValueType = null;
+            } else if (VT_NS.equals(uri) && currentPropertyName != null) {
+                currentValueType = localName;
+                textBuffer.setLength(0);
             }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) {
+            appendCapped(textBuffer, ch, start, length);
+        }
 
-            // Reals
-            else if (property.isSetR4()) {
-                val = Float.toString(property.getR4());
-            } else if (property.isSetR8()) {
-                val = Double.toString(property.getR8());
-            } else if (property.isSetDecimal()) {
-                BigDecimal d = property.getDecimal();
-                if (d == null) {
-                    val = null;
-                } else {
-                    val = d.toPlainString();
+        @Override
+        public void endElement(String uri, String localName, String qName) {
+            if (VT_NS.equals(uri) && currentValueType != null &&
+                    localName.equals(currentValueType) && currentPropertyName 
!= null) {
+                String val = textBuffer.toString().trim();

Review Comment:
   Applying `trim()` to every value changes string custom properties: leading 
and trailing whitespace in `<vt:lpwstr>`, `<vt:lpstr>`, and `<vt:bstr>` was 
previously preserved by `property.getLpwstr()/getLpstr()` and is 
user-controlled metadata content. Keep the raw buffered text for string types 
and only normalize whitespace for typed values that require parsing, such as 
decimal/date/bool/numeric values.



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java:
##########
@@ -157,85 +193,164 @@ private String getDocSecurityString(int docSecurityFlag) 
{
         }
     }
 
-    private void extractMetadata(POIXMLProperties.CustomProperties properties, 
Metadata metadata) {
-        
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties 
props =
-                properties.getUnderlyingProperties();
-        for (int i = 0; i < props.sizeOfPropertyArray(); i++) {
-            CTProperty property = props.getPropertyArray(i);
-            String val = null;
-            Date date = null;
-
-            if (property.isSetLpwstr()) {
-                val = property.getLpwstr();
-            } else if (property.isSetLpstr()) {
-                val = property.getLpstr();
-            } else if (property.isSetDate()) {
-                date = property.getDate().getTime();
-            } else if (property.isSetFiletime()) {
-                date = property.getFiletime().getTime();
-            } else if (property.isSetBool()) {
-                val = Boolean.toString(property.getBool());
+    /**
+     * Parse {@code docProps/custom.xml} directly via SAX, bypassing
+     * POI/XMLBeans. The XMLBeans path materializes an attacker-controlled
+     * {@code <vt:decimal>} through {@link BigDecimal#BigDecimal(String)}
+     * during XML deserialization, which is O(n²) in the digit count on
+     * JDK 17. By reading the part ourselves we can cap both the buffered
+     * text content ({@link #MAX_TEXT_BUFFER_LENGTH}) and the decimal
+     * literal length ({@link #MAX_DECIMAL_LENGTH}) before any slow parse
+     * runs.
+     */
+    private void extractCustomPropertiesViaSAX(OPCPackage opcPackage, Metadata 
metadata) {
+        if (opcPackage == null) {
+            return;
+        }
+        try {
+            PackagePart custPart = getRelatedPart(opcPackage, 
CUSTOM_PROPERTIES_REL);
+            if (custPart == null) {
+                return;
+            }
+            CustomPropertiesHandler handler = new CustomPropertiesHandler();
+            try (InputStream is = custPart.getInputStream()) {
+                XMLReaderUtils.parseSAX(is, handler, new ParseContext());
             }
+            handler.applyTo(metadata);
+        } catch (Exception e) {
+            //swallow
+        }
+    }
 
-            // Integers
-            else if (property.isSetI1()) {
-                val = Integer.toString(property.getI1());
-            } else if (property.isSetI2()) {
-                val = Integer.toString(property.getI2());
-            } else if (property.isSetI4()) {
-                val = Integer.toString(property.getI4());
-            } else if (property.isSetI8()) {
-                val = Long.toString(property.getI8());
-            } else if (property.isSetInt()) {
-                val = Integer.toString(property.getInt());
+    private static PackagePart getRelatedPart(OPCPackage opcPackage, String 
relationshipType) {
+        try {
+            PackageRelationshipCollection rels =
+                    opcPackage.getRelationshipsByType(relationshipType);
+            if (rels == null || rels.size() == 0) {
+                return null;
             }
+            PackageRelationship rel = rels.getRelationship(0);
+            if (rel == null) {
+                return null;
+            }
+            return opcPackage.getPart(rel);
+        } catch (Exception e) {
+            return null;
+        }
+    }
 
-            // Unsigned Integers
-            else if (property.isSetUi1()) {
-                val = Integer.toString(property.getUi1());
-            } else if (property.isSetUi2()) {
-                val = Integer.toString(property.getUi2());
-            } else if (property.isSetUi4()) {
-                val = Long.toString(property.getUi4());
-            } else if (property.isSetUi8()) {
-                val = property.getUi8().toString();
-            } else if (property.isSetUint()) {
-                val = Long.toString(property.getUint());
+    /**
+     * Append SAX {@code characters()} content to {@code buf}, but stop 
accepting
+     * once {@link #MAX_TEXT_BUFFER_LENGTH} is reached. Excess characters are
+     * silently dropped; truncated values still flow through downstream 
parsing.
+     */
+    static void appendCapped(StringBuilder buf, char[] ch, int start, int 
length) {
+        if (buf.length() >= MAX_TEXT_BUFFER_LENGTH) {
+            return;
+        }
+        int remaining = MAX_TEXT_BUFFER_LENGTH - buf.length();
+        buf.append(ch, start, Math.min(length, remaining));
+    }
+
+    /**
+     * SAX handler for {@code docProps/custom.xml} (custom properties).
+     * Matches the schema defined by Microsoft's
+     * {@code 
http://schemas.openxmlformats.org/officeDocument/2006/custom-properties}
+     * namespace, with value types coming from the {@code vt:} namespace.
+     */
+    static class CustomPropertiesHandler extends DefaultHandler {
+
+        private static final String VT_NS =
+                
"http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes";;
+
+        private final Metadata customMetadata = new Metadata();
+        private String currentPropertyName;
+        private String currentValueType;
+        private final StringBuilder textBuffer = new StringBuilder();
+
+        @Override
+        public void startElement(String uri, String localName, String qName, 
Attributes atts) {
+            if ("property".equals(localName)) {
+                currentPropertyName = atts.getValue("name");
+                currentValueType = null;
+            } else if (VT_NS.equals(uri) && currentPropertyName != null) {
+                currentValueType = localName;
+                textBuffer.setLength(0);

Review Comment:
   Accepting any `vt:` element while a property is active changes the handling 
of compound custom properties. For `<vt:vector>` or `<vt:array>`, nested scalar 
elements such as `<vt:lpstr>` will overwrite `currentValueType` and be emitted 
as a scalar custom property, while the previous implementation explicitly 
skipped vectors/arrays. Track the direct child of `<property>` (or an 
unsupported compound depth) so nested values remain ignored unless vector/array 
support is intentionally added.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to