[ 
https://issues.apache.org/jira/browse/TIKA-4662?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18059502#comment-18059502
 ] 

Hudson commented on TIKA-4662:
------------------------------

SUCCESS: Integrated in Jenkins build Tika » tika-main-jdk17 #1215 (See 
[https://ci-builds.apache.org/job/Tika/job/tika-main-jdk17/1215/])
TIKA-4662 -- update language detection (#2610) (github: 
[https://github.com/apache/tika/commit/9c67c522b48dcfd7c225bdca1100db0e7d249f2a])
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nep
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/quz
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tgl
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ita
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kor
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ban
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/grn
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/sin
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tam
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/msa
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/python/filter_pashto.py
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/bih
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tat
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/cat
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/chv
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/bjn
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tuk
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/gle
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tgk
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ConfusionDumper.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/tyv
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ssw
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/que
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/TrainLanguageModel.java
* (delete) 
tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ina
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/nav
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/DuplicateChecker.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/bam
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/vie
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ron
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ido
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/min
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/dan
* (edit) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenCountPriorityQueue.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/hat
* (add) 
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/arg
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/mzn
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nso
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mal
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/lmo
* (edit) 
tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file1.pdf.json
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nds
* (edit) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CompositeTextStatsCalculator.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ilo
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/yid
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/lad
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/bul
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/ful
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/TrigramAblation.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/pam
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/fao
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/roh
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/tel-rom
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/xmf
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ConfusableDiff.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/kal
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/uzn
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/gug
* (add) 
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/FeatureExtractor.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/deu
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kir
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/hrv
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lav
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/myv
* (edit) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenContraster.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/cym
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/che
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ext
* (edit) 
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/util/LanguageIdTest.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/hbs
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/oss
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/mrj
* (delete) 
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/TokenCounterTest.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/kom
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/sme
* (edit) 
tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java
* (add) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TikaEvalTokenizer.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/bos
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/vro
* (edit) docs/modules/ROOT/nav.adoc
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/fry
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/WordTokenizerTest.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/hin-rom
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/arz
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/spa
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/run
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/scn
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/mya-zaw
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/wol
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/sun
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/srd
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/hif
* (edit) 
tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/jpn
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/sqi
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ndo
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/slv
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nno
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/gla
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/uzb
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/amh
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/por
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/urd
* (add) 
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupModel.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ewe
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/mai
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/wln
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/diq
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ben
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/vol
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/lus
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/sot
* (delete) 
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/LuceneTokenCounter.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ile
* (edit) tika-langdetect/pom.xml
* (edit) 
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lit
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/cmn
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tur
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/fin
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/oci
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/isl
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lao
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/pes
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ydd
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ukr
* (edit) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerManager.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lat
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/knn
* (delete) 
tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/swa
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mlg
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lug
* (edit) 
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/langid/LangIdTest.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ori
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/sah
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/nap
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/bak
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kur
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/afr
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/uig
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/pfl
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/zea
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ckb
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractorTest.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/hin
* (edit) 
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/textstats/TextStatsTest.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/sco
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/eus
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/eng
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/san
* (edit) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/langid/LanguageIDWrapper.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/aze
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/div
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/Phase2SmokeTest.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mon
* (edit) 
tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/AnalyzerManagerTest.java
* (delete) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/ben-rom
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/khk
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/koi
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/sna
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/swe
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nob
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tha
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/zho
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/asm
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ibo
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kan
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupModelTest.java
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CrossDomainEval.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/fra
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nan
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/zho-trad
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/orm
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/mya
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/nld
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/rus
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/gsw
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kin
* (delete) tika-eval/tika-eval-core/src/main/resources/lucene-analyzers.json
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/gom
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ces
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/lin
* (add) 
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/frr
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mhr
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/snd
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/rue
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/xho
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/hsb
* (edit) 
tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/tools/TopCommonTokenCounterTest.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/bar
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/jav
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusReader.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/khm
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/lvs
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/zul
* (edit) tika-eval/tika-eval-core/pom.xml
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/LangIdRegressionTest.java
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/AblationRunner.java
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/python/download_corpus.py
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/bcl
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/udm
* (add) docs/modules/ROOT/pages/advanced/language-detection.adoc
* (delete) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounter.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/new
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/LabeledSentence.java
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/Phase2Trainer.java
* (delete) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/est
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/som
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/hun
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/bpy
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/wuu
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/epo
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/smi
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kat
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/eml
* (add) 
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/TikaEvalTokenizerFuzzTest.java
* (edit) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCountPriorityQueue.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/mwl
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/glv
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/kaz
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ltz
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mlt
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tsn
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/vls
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/hye
* (add) tika-langdetect/tika-langdetect-charsoup-core/pom.xml
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/QuickF1Eval.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ast
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/bel
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/fas
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/yor
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/tel
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/resources/org/apache/tika/langdetect/charsoup/comparison-report.txt
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/prs
* (add) 
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/WordTokenizer.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/glg
* (add) 
tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/tools/CommonTokenGenerator.java
* (add) docs/modules/ROOT/pages/advanced/language-detection-build.adoc
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/guj
* (delete) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/pnb
* (edit) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenCountManager.java
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/DataSplitter.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ara
* (edit) 
tika-pipes/tika-pipes-config-store-ignite/src/test/java/org/apache/tika/pipes/ignite/IgniteConfigStoreTest.java
* (delete) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractorTest.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/szl
* (add) tika-langdetect/tika-langdetect-charsoup/pom.xml
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/krc
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mar
* (edit) 
tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokens.java
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/tam-rom
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mkd
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/mri
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/swh
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/urd-rom
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/lim
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ksh
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/war
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/tso
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ModelQuantizer.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/sgs
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/pol
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CompareDetectors.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/pus
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/pan
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/bua
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/cos
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/pms
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/slk
* (add) 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/BucketSaturationAnalyzer.java
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/hau
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/lup
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/srp
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/zho-simp
* (add) 
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/csb
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ceb
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/pap
* (delete) tika-eval/tika-eval-core/src/main/resources/common_tokens/ekk
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/bre
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/ven
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ell
* (edit) docs/modules/ROOT/pages/advanced/index.adoc
* (add) tika-eval/tika-eval-core/src/main/resources/common_tokens/dsb
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/ind
* (edit) tika-bom/pom.xml
* (edit) tika-eval/tika-eval-core/src/main/resources/common_tokens/heb
* (add) 
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect.bin
* (delete) 
tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java


> Modernize lang-detector for at least 4.x
> ----------------------------------------
>
>                 Key: TIKA-4662
>                 URL: https://issues.apache.org/jira/browse/TIKA-4662
>             Project: Tika
>          Issue Type: Task
>            Reporter: Tim Allison
>            Priority: Minor
>
> We were using opennlp's maxent code with a custom built model. I recently did 
> some work to modernize that a bit and to improve feature extraction, speed 
> and model size. Let's upgrade for 4.x (at least).



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to