[ 
https://issues.apache.org/jira/browse/TIKA-1599?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17769269#comment-17769269
 ] 

Hudson commented on TIKA-1599:
------------------------------

UNSTABLE: Integrated in Jenkins build Tika » tika-main-jdk11 #1264 (See 
[https://ci-builds.apache.org/job/Tika/job/tika-main-jdk11/1264/])
TIKA-1599 (#1356) (github: 
[https://github.com/apache/tika/commit/5361b6d12c679bae6ea536119f8902c898217557])
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_embedded_img.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_3.html
* (add) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_charset_utf16le.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_head.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/DataURISchemeParseException.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/DataURIScheme.java
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/StandardHtmlEncodingDetector.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testXHTML.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/DefaultHtmlMapper.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/charsets/XUserDefinedCharset.java
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_embedded_img_in_js.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/tika434.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_metadata.html
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/resources/org/apache/tika/parser/html/tagsoup/StandardCharsets_unsupported_by_IANA.txt
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup/SrcDocTest.java
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/pom.xml
* (edit) 
tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_utf8.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_embedded_data_uri_js.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_charset_utf8.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/PreScanner.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
* (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/pom.xml
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/charsets/ReplacementCharset.java
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/org/apache/tika/parser/html/tika-config.xml
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/CharsetAliases.java
* (edit) 
tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/resources/2.4.1-tesseract.txt
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/HtmlMapper.java
* (add) tika-parsers/tika-parsers-extended/tika-parser-tagsoup-package/pom.xml
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTMLBadScript.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/MetaProcessor.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_4.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testUserDefinedCharset.mhtml
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup/StandardHtmlEncodingDetectorTest.java
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTMLGoodScript.html
* (edit) tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_1.html
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
* (edit) pom.xml
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/boilerplate-whitespace.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTML_metadata_two_titles.html
* (edit) 
tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/resources/2.4.0-no-tesseract.txt
* (delete) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/HtmlParser.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testSrcDoc.html
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup/HtmlEncodingDetectorTest.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup/HtmlParserTest.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/boilerplate.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_2.html
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/XHTMLDowngradeHandler.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/DataURISchemeUtil.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/HtmlHandler.java
* (edit) tika-app/src/test/resources/test-data/tika-config1.xml
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/HtmlEncodingDetector.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/charsetdetector/CharsetDetectionResult.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup/DataURISchemeParserTest.java
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/testBoilerplateMissingSpace.html
* (edit) 
tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/resources/2.4.0-tesseract.txt
* (edit) tika-parent/pom.xml
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup/IdentityHtmlMapper.java
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
* (edit) tika-bom/pom.xml
* (edit) 
tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/resources/2.4.1-no-tesseract.txt
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/org/apache/tika/parser/html/tika-config.xml
* (add) 
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/test/resources/test-documents/big-preamble.html
* (edit) CHANGES.txt


> Switch from TagSoup to JSoup
> ----------------------------
>
>                 Key: TIKA-1599
>                 URL: https://issues.apache.org/jira/browse/TIKA-1599
>             Project: Tika
>          Issue Type: Improvement
>          Components: parser
>    Affects Versions: 1.7, 1.8
>            Reporter: Kenneth William Krugler
>            Assignee: Kenneth William Krugler
>            Priority: Major
>             Fix For: 3.0.0-BETA
>
>         Attachments: TIKA-1599-crazy-files.tar.gz, consumentenbond.html, 
> tagsoup_vs_jsoup_reports.zip
>
>
> There are several Tika issues related to how TagSoup cleans up HTML 
> ([TIKA-381], [TIKA-985], maybe [TIKA-715]), but TagSoup doesn't seem to be 
> under active development.
> On the other hand I know of several projects that are now using 
> [JSoup|https://github.com/jhy/jsoup], which is an active project (albeit only 
> one main contributor) under the MIT license.
> I haven't looked into how hard it would be to switch this dependency.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to