This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4338 in repository https://gitbox.apache.org/repos/asf/tika.git
commit bdd38ef7901ae4786c48ae2fe1666d99dd11a4bb Author: tallison <[email protected]> AuthorDate: Fri Oct 25 09:28:30 2024 -0400 TIKA-4338 -- remove tag soup everywhere else -- revert MatchingContentHandler and update tika-bundle-standard --- tika-bundles/tika-bundle-standard/pom.xml | 2 +- .../main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml index 3bc8cec76..1f47b5f50 100644 --- a/tika-bundles/tika-bundle-standard/pom.xml +++ b/tika-bundles/tika-bundle-standard/pom.xml @@ -174,7 +174,7 @@ jackcess| jackcess-encrypt| commons-lang3| - tagsoup| + jsoup| asm| juniversalchardet| vorbis-java-core| diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java index 0648098d1..831611c06 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java @@ -69,6 +69,10 @@ public class MatchingContentHandler extends ContentHandlerDecorator { if (matcher.matchesElement()) { super.endElement(uri, localName, name); } + // this was originally added for tagsoup, but we need it generally + if (!matchers.isEmpty()) { + matcher = matchers.removeFirst(); + } } public void characters(char[] ch, int start, int length) throws SAXException {
