[ https://issues.apache.org/jira/browse/TIKA-1599?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17769269#comment-17769269 ]
Hudson commented on TIKA-1599: ------------------------------ UNSTABLE: Integrated in Jenkins build Tika » tika-main-jdk11 #1264 (See [https://ci-builds.apache.org/job/Tika/job/tika-main-jdk11/1264/]) TIKA-1599 (#1356) (github: [https://github.com/apache/tika/commit/5361b6d12c679bae6ea536119f8902c898217557]) * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_embedded_img.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_3.html * (add) tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_charset_utf16le.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_head.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/DataURISchemeParseException.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/DataURIScheme.java * (edit) tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/StandardHtmlEncodingDetector.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testXHTML.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/DefaultHtmlMapper.java * (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/charsets/XUserDefinedCharset.java * (edit) tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_embedded_img_in_js.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/tika434.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_metadata.html * (edit) tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/resources/org/apache/tika/parser/html/tagsoup/StandardCharsets_unsupported_by_IANA.txt * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup/SrcDocTest.java * (edit) tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml * (edit) tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_utf8.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_embedded_data_uri_js.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_charset_utf8.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/PreScanner.java * (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/pom.xml * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/charsets/ReplacementCharset.java * (edit) tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/org/apache/tika/parser/html/tika-config.xml * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/CharsetAliases.java * (edit) tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/resources/2.4.1-tesseract.txt * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/HtmlMapper.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-package/pom.xml * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTMLBadScript.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/MetaProcessor.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_4.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testUserDefinedCharset.mhtml * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup/StandardHtmlEncodingDetectorTest.java * (edit) tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java * (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser * (edit) tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTMLGoodScript.html * (edit) tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_1.html * (edit) tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java * (edit) pom.xml * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/boilerplate-whitespace.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_metadata_two_titles.html * (edit) tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/resources/2.4.0-no-tesseract.txt * (delete) tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/HtmlParser.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testSrcDoc.html * (edit) tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java * (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup/HtmlEncodingDetectorTest.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup/HtmlParserTest.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/boilerplate.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_2.html * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/XHTMLDowngradeHandler.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/DataURISchemeUtil.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/HtmlHandler.java * (edit) tika-app/src/test/resources/test-data/tika-config1.xml * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/HtmlEncodingDetector.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/CharsetDetectionResult.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup/DataURISchemeParserTest.java * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testBoilerplateMissingSpace.html * (edit) tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/resources/2.4.0-tesseract.txt * (edit) tika-parent/pom.xml * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/IdentityHtmlMapper.java * (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java * (edit) tika-bom/pom.xml * (edit) tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/resources/2.4.1-no-tesseract.txt * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/org/apache/tika/parser/html/tika-config.xml * (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/big-preamble.html * (edit) CHANGES.txt > Switch from TagSoup to JSoup > ---------------------------- > > Key: TIKA-1599 > URL: https://issues.apache.org/jira/browse/TIKA-1599 > Project: Tika > Issue Type: Improvement > Components: parser > Affects Versions: 1.7, 1.8 > Reporter: Kenneth William Krugler > Assignee: Kenneth William Krugler > Priority: Major > Fix For: 3.0.0-BETA > > Attachments: TIKA-1599-crazy-files.tar.gz, consumentenbond.html, > tagsoup_vs_jsoup_reports.zip > > > There are several Tika issues related to how TagSoup cleans up HTML > ([TIKA-381], [TIKA-985], maybe [TIKA-715]), but TagSoup doesn't seem to be > under active development. > On the other hand I know of several projects that are now using > [JSoup|https://github.com/jhy/jsoup], which is an active project (albeit only > one main contributor) under the MIT license. > I haven't looked into how hard it would be to switch this dependency. -- This message was sent by Atlassian Jira (v8.20.10#820010)