[ 
https://issues.apache.org/jira/browse/TIKA-4720?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18076029#comment-18076029
 ] 

Hudson commented on TIKA-4720:
------------------------------

SUCCESS: Integrated in Jenkins build Tika » tika-main-jdk17 #1328 (See 
[https://ci-builds.apache.org/job/Tika/job/tika-main-jdk17/1328/])
TIKA-4720 -- Move charset detection to byte-bigram Naive Bayes pipeline (#2784) 
(github: 
[https://github.com/apache/tika/commit/e63170f51ee66958bc89ba59786c597165a5878e])
* (add) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/HtmlByteStripper.java
* (edit) tika-ml/tika-ml-chardetect/pom.xml
* (delete) tika-ml/tika-ml-chardetect/src/test/python/anneal.py
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
* (delete) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CharsetDetectionRegressionTest.java
* (delete) 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/DiagnoseCharsetDetector.java
* (edit) tika-encoding-detectors/tika-encoding-detector-mojibuster/pom.xml
* (delete) 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java
* (add) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/docs/performance/20260422-0.95-model-partial-20B.txt
* (delete) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkEncodingRules.java
* (edit) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java
* (add) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/docs/performance/20260422-v4-normalized-devtest.txt
* (edit) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/Utf16SpecialistEncodingDetector.java
* (add) 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainNaiveBayesBigram.java
* (edit) 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BenchmarkCharsetDetectors.java
* (add) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesBigramEncodingDetector.java
* (add) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/NaiveBayesPipelineEncodingDetector.java
* (delete) 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
* (edit) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/WideUnicodeDetector.java
* (edit) tika-ml/tika-ml-core/src/main/java/org/apache/tika/ml/LinearModel.java
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json
* (edit) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CharsetConfusables.java
* (edit) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
* (add) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/nb-bigram.bin
* (delete) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin
* (edit) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java
* (delete) 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BucketCollisionAudit.java
* (edit) tika-ml/tika-ml-chardetect/README.md
* (edit) docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
* (delete) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/EbcdicRoutingTest.java
* (add) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/docs/performance/20260422-0.98-model.txt
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
* (delete) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java
* (edit) 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java
* (edit) docs/modules/ROOT/pages/configuration/encoding-detectors.adoc
* (edit) 
tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
* (delete) 
tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractorTest.java
* (add) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/docs/performance/20260422-0.999-int8-model.txt
* (edit) tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java
* (edit) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
* (edit) 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java
* (delete) 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/Utf16DiagnosticReport.java
* (add) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/docs/performance/20260422-0.999-model.txt
* (delete) 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java


> Improve charset detection in 4.x, take 2
> ----------------------------------------
>
>                 Key: TIKA-4720
>                 URL: https://issues.apache.org/jira/browse/TIKA-4720
>             Project: Tika
>          Issue Type: Task
>            Reporter: Tim Allison
>            Priority: Minor
>
> I had really good luck with a simple Naive Bayes model with careful scaling.
>  
> This ticket covers the move to Naive Bayes as the main charset detector. It 
> also includes work to improve our default HTML charset detector so that it 
> gets some of the benefits of our StandardHtml charset detector without its 
> rigidity.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to