This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch TIKA-4692-improve-ooxml-sax-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
from 7117454ca1 checkpoint - wip
add 80a973185c TIKA-4692 - script block features and model updates (#2702)
add 1185f7dbd0 specify eol to prevent checkstyle surprises on windows
(#2701)
add 963bbd8018 Merge branch 'main' into TIKA-4692-improve-ooxml-sax-parsers
No new revisions were added by this update.
Summary of changes:
.../ROOT/pages/advanced/charset-detection-eval.txt | 274 +
.../advanced/charsoup-supported-languages.adoc | 207 +-
.../ROOT/pages/advanced/flores-eval-20260320.txt | 580 +
.../pages/advanced/generative-language-model.adoc | 140 +-
.../pages/advanced/language-detection-build.adoc | 621 +-
.../ROOT/pages/advanced/language-detection.adoc | 2 +-
.../charsoup/CharSoupEncodingDetector.java | 128 +-
.../eval/core/metadata/TikaEvalMetadataFilter.java | 10 +-
.../src/main/resources/common_tokens/ace | 3261 +-
.../src/main/resources/common_tokens/afr | 58582 +++++++++---------
.../src/main/resources/common_tokens/aka | 19339 +++---
.../src/main/resources/common_tokens/alt | 5673 +-
.../src/main/resources/common_tokens/amh | 19160 +++---
.../src/main/resources/common_tokens/ami | 5460 +-
.../src/main/resources/common_tokens/ara | 59812 +++++++++---------
.../src/main/resources/common_tokens/arg | 48885 ++++++++-------
.../src/main/resources/common_tokens/asm | 10416 ++--
.../src/main/resources/common_tokens/ava | 5305 +-
.../src/main/resources/common_tokens/avk | 14236 +++--
.../src/main/resources/common_tokens/azb | 48707 ++++++++-------
.../src/main/resources/common_tokens/aze | 59620 +++++++++---------
.../src/main/resources/common_tokens/bak | 59078 +++++++++---------
.../src/main/resources/common_tokens/ban | 14234 +++--
.../src/main/resources/common_tokens/bar | 31353 +++++-----
.../src/main/resources/common_tokens/bcl | 24806 ++++----
.../src/main/resources/common_tokens/be-x-old | 59662 +++++++++---------
.../src/main/resources/common_tokens/bel | 59732 +++++++++---------
.../src/main/resources/common_tokens/ben | 28389 +++++----
.../src/main/resources/common_tokens/bjn | 10079 ++--
.../src/main/resources/common_tokens/bod | 527 +
.../src/main/resources/common_tokens/bre | 56480 ++++++++---------
.../src/main/resources/common_tokens/bul | 59778 +++++++++---------
.../src/main/resources/common_tokens/bxr | 7601 +--
.../src/main/resources/common_tokens/cat | 59066 +++++++++---------
.../src/main/resources/common_tokens/cdo-x-rom | 1279 +-
.../src/main/resources/common_tokens/ceb | 53368 ++++++++---------
.../src/main/resources/common_tokens/ces | 59868 +++++++++---------
.../src/main/resources/common_tokens/che | 56598 ++++++++---------
.../src/main/resources/common_tokens/chv | 33025 +++++-----
.../src/main/resources/common_tokens/ckb | 58370 +++++++++---------
.../src/main/resources/common_tokens/cnh | 35018 ++++++-----
.../src/main/resources/common_tokens/cor | 7559 +--
.../src/main/resources/common_tokens/cos | 16434 ++---
.../src/main/resources/common_tokens/csb | 4494 +-
.../src/main/resources/common_tokens/cym | 57934 +++++++++---------
.../src/main/resources/common_tokens/dag | 9356 +--
.../src/main/resources/common_tokens/dan | 59036 +++++++++---------
.../src/main/resources/common_tokens/deu | 59112 +++++++++---------
.../src/main/resources/common_tokens/diq | 35052 +++++++++--
.../src/main/resources/common_tokens/div | 15474 ++++-
.../src/main/resources/common_tokens/dsb | 5063 +-
.../src/main/resources/common_tokens/ell | 59866 +++++++++---------
.../src/main/resources/common_tokens/eng | 58222 +++++++++---------
.../src/main/resources/common_tokens/epo | 58852 +++++++++---------
.../src/main/resources/common_tokens/est | 59338 +++++++++---------
.../src/main/resources/common_tokens/eus | 59158 +++++++++---------
.../src/main/resources/common_tokens/ewe | 22320 +++----
.../src/main/resources/common_tokens/ext | 6421 +-
.../src/main/resources/common_tokens/fao | 20477 ++++---
.../src/main/resources/common_tokens/fas | 57870 +++++++++---------
.../src/main/resources/common_tokens/fin | 59528 +++++++++---------
.../src/main/resources/common_tokens/fra | 58382 +++++++++---------
.../src/main/resources/common_tokens/frr | 10142 ++--
.../src/main/resources/common_tokens/fry | 59206 +++++++++---------
.../src/main/resources/common_tokens/gla | 14147 +++--
.../src/main/resources/common_tokens/gle | 52606 ++++++++--------
.../src/main/resources/common_tokens/glg | 59292 +++++++++---------
.../src/main/resources/common_tokens/glv | 7881 +--
.../src/main/resources/common_tokens/gom | 10524 ++--
.../src/main/resources/common_tokens/grn | 9707 +--
.../src/main/resources/common_tokens/gsw | 59314 +++++++++---------
.../src/main/resources/common_tokens/guj | 11744 ++--
.../src/main/resources/common_tokens/hak-x-rom | 1421 +-
.../src/main/resources/common_tokens/hau | 45690 +++++++-------
.../src/main/resources/common_tokens/heb | 59994 +++++++++---------
.../src/main/resources/common_tokens/hil | 38597 ++++++------
.../src/main/resources/common_tokens/hin | 16128 +++--
.../src/main/resources/common_tokens/hrv | 59876 +++++++++---------
.../src/main/resources/common_tokens/hsb | 17857 +++---
.../src/main/resources/common_tokens/hun | 59516 +++++++++---------
.../src/main/resources/common_tokens/hye | 59522 +++++++++---------
.../src/main/resources/common_tokens/hyw | 40638 +++++++------
.../src/main/resources/common_tokens/ibo | 36791 ++++++------
.../src/main/resources/common_tokens/ido | 28746 +++++----
.../src/main/resources/common_tokens/ile | 6617 +-
.../src/main/resources/common_tokens/ilo | 13590 +++--
.../src/main/resources/common_tokens/ina | 19257 +++---
.../src/main/resources/common_tokens/ind | 57836 +++++++++---------
.../src/main/resources/common_tokens/isl | 59426 +++++++++---------
.../src/main/resources/common_tokens/ita | 59098 +++++++++---------
.../src/main/resources/common_tokens/jav | 58619 +++++++++---------
.../src/main/resources/common_tokens/jbo | 2284 +-
.../src/main/resources/common_tokens/jpn | 59914 +++++++++---------
.../src/main/resources/common_tokens/kaa | 9998 +--
.../src/main/resources/common_tokens/kab | 6933 ++-
.../src/main/resources/common_tokens/kan | 57362 +++++++++---------
.../src/main/resources/common_tokens/kat | 59766 +++++++++---------
.../src/main/resources/common_tokens/kaz | 59410 +++++++++---------
.../src/main/resources/common_tokens/kha | 20048 ++++---
.../src/main/resources/common_tokens/khm | 19506 +++---
.../src/main/resources/common_tokens/kin | 16652 ++---
.../src/main/resources/common_tokens/kir | 59714 +++++++++---------
.../src/main/resources/common_tokens/kor | 59970 +++++++++---------
.../src/main/resources/common_tokens/kpv | 7300 ++-
.../src/main/resources/common_tokens/ksh | 5224 +-
.../src/main/resources/common_tokens/kur | 41528 +++++++------
.../src/main/resources/common_tokens/lao | 1670 +-
.../src/main/resources/common_tokens/lat | 58786 +++++++++---------
.../src/main/resources/common_tokens/lav | 59622 +++++++++---------
.../src/main/resources/common_tokens/lez | 8326 +--
.../src/main/resources/common_tokens/lfn | 11797 ++--
.../src/main/resources/common_tokens/lim | 40637 +++++++------
.../src/main/resources/common_tokens/lit | 59478 +++++++++---------
.../src/main/resources/common_tokens/ltz | 57304 +++++++++---------
.../src/main/resources/common_tokens/lug | 10940 ++--
.../src/main/resources/common_tokens/lus | 55276 ++++++++---------
.../src/main/resources/common_tokens/mal | 57366 +++++++++---------
.../src/main/resources/common_tokens/mar | 20721 ++++---
.../src/main/resources/common_tokens/mhr | 11844 ++--
.../src/main/resources/common_tokens/min | 33269 +++++-----
.../src/main/resources/common_tokens/mkd | 59796 +++++++++---------
.../src/main/resources/common_tokens/mlg | 55190 ++++++++---------
.../src/main/resources/common_tokens/mlt | 37519 ++++++------
.../src/main/resources/common_tokens/mon | 57267 +++++++++---------
.../src/main/resources/common_tokens/mrj | 5384 +-
.../src/main/resources/common_tokens/msa | 56960 +++++++++---------
.../src/main/resources/common_tokens/mwl | 30033 +++++-----
.../src/main/resources/common_tokens/mya | 59952 +++++++++---------
.../src/main/resources/common_tokens/myv | 4626 +-
.../src/main/resources/common_tokens/mzn | 10865 ++--
.../src/main/resources/common_tokens/nds | 55806 ++++++++---------
.../src/main/resources/common_tokens/nep | 12508 ++--
.../src/main/resources/common_tokens/nld | 56386 ++++++++---------
.../src/main/resources/common_tokens/nno | 58638 +++++++++---------
.../src/main/resources/common_tokens/nob | 58518 +++++++++---------
.../src/main/resources/common_tokens/nqo | 5836 +-
.../src/main/resources/common_tokens/nso | 2416 +-
.../src/main/resources/common_tokens/nya | 57552 +++++++++---------
.../src/main/resources/common_tokens/olo | 4824 +-
.../src/main/resources/common_tokens/ori | 11158 ++--
.../src/main/resources/common_tokens/orm | 58182 +++++++++---------
.../src/main/resources/common_tokens/oss | 9637 +--
.../src/main/resources/common_tokens/pam | 7385 ++-
.../src/main/resources/common_tokens/pan | 6048 +-
.../src/main/resources/common_tokens/pap | 7386 +--
.../src/main/resources/common_tokens/pfl | 4325 +-
.../src/main/resources/common_tokens/pnb | 57824 +++++++++---------
.../src/main/resources/common_tokens/pol | 59670 +++++++++---------
.../src/main/resources/common_tokens/por | 59182 +++++++++---------
.../src/main/resources/common_tokens/pus | 59644 +++++++++---------
.../src/main/resources/common_tokens/roh | 25077 ++++----
.../src/main/resources/common_tokens/ron | 58502 +++++++++---------
.../src/main/resources/common_tokens/rue | 8460 +--
.../src/main/resources/common_tokens/rus | 59874 +++++++++---------
.../src/main/resources/common_tokens/sah | 37487 ++++++------
.../src/main/resources/common_tokens/san | 11790 ++--
.../src/main/resources/common_tokens/sat | 13787 +++--
.../src/main/resources/common_tokens/sgs | 6831 ++-
.../src/main/resources/common_tokens/sin | 31996 +++++-----
.../src/main/resources/common_tokens/skr | 13330 ++--
.../src/main/resources/common_tokens/slk | 59810 +++++++++---------
.../src/main/resources/common_tokens/slv | 59774 +++++++++---------
.../src/main/resources/common_tokens/sme | 3991 +-
.../src/main/resources/common_tokens/smn | 6356 +-
.../src/main/resources/common_tokens/smo | 42362 ++++++-------
.../src/main/resources/common_tokens/sna | 13057 ++--
.../src/main/resources/common_tokens/snd | 29833 ++++-----
.../src/main/resources/common_tokens/som | 18438 +++---
.../src/main/resources/common_tokens/spa | 59330 +++++++++---------
.../src/main/resources/common_tokens/sqi | 59562 +++++++++---------
.../src/main/resources/common_tokens/srp | 59966 +++++++++---------
.../src/main/resources/common_tokens/stq | 7243 ++-
.../src/main/resources/common_tokens/sun | 35409 ++++++-----
.../src/main/resources/common_tokens/swe | 51742 ++++++++--------
.../src/main/resources/common_tokens/swh | 50058 +++++++++-------
.../src/main/resources/common_tokens/szl | 13274 ++--
.../src/main/resources/common_tokens/szy | 10252 ++--
.../src/main/resources/common_tokens/tam | 30387 +++++-----
.../src/main/resources/common_tokens/tat | 54684 ++++++++---------
.../src/main/resources/common_tokens/tay | 2576 +-
.../src/main/resources/common_tokens/tel | 32278 +++++-----
.../src/main/resources/common_tokens/tet | 42675 ++++++-------
.../src/main/resources/common_tokens/tgk | 55669 +++++++++--------
.../src/main/resources/common_tokens/tgl | 59024 +++++++++---------
.../src/main/resources/common_tokens/tha | 47313 ++++++++-------
.../src/main/resources/common_tokens/tir | 59808 +++++++++---------
.../src/main/resources/common_tokens/trv | 6993 ++-
.../src/main/resources/common_tokens/tsn | 24409 ++++----
.../src/main/resources/common_tokens/tso | 22879 +++----
.../src/main/resources/common_tokens/tuk | 23762 ++++----
.../src/main/resources/common_tokens/tum | 10273 ++--
.../src/main/resources/common_tokens/tur | 59726 +++++++++---------
.../src/main/resources/common_tokens/tyv | 12173 ++--
.../src/main/resources/common_tokens/udm | 58990 +++++++++---------
.../src/main/resources/common_tokens/uig | 38074 ++++++------
.../src/main/resources/common_tokens/ukr | 59798 +++++++++---------
.../src/main/resources/common_tokens/urd | 56988 +++++++++---------
.../src/main/resources/common_tokens/uzb | 59316 +++++++++---------
.../src/main/resources/common_tokens/vep | 15464 ++---
.../src/main/resources/common_tokens/vie | 36981 ++++++------
.../src/main/resources/common_tokens/vls | 17456 +++---
.../src/main/resources/common_tokens/vol | 7883 +--
.../src/main/resources/common_tokens/vro | 6011 +-
.../src/main/resources/common_tokens/war | 44127 +++++++-------
.../src/main/resources/common_tokens/wln | 14345 +++--
.../src/main/resources/common_tokens/xho | 58710 +++++++++---------
.../src/main/resources/common_tokens/xmf | 21162 ++++---
.../src/main/resources/common_tokens/ydd | 26151 ++++----
.../src/main/resources/common_tokens/yor | 11592 ++--
.../src/main/resources/common_tokens/yue | 23964 +++++---
.../src/main/resources/common_tokens/zho | 60008 +++++++++----------
.../src/main/resources/common_tokens/zul | 10937 ++--
.../core/tokens/tools/CommonTokenGenerator.java | 7 +-
.../tika/langdetect/charsoup/CharSoupModel.java | 59 +-
.../charsoup/GenerativeLanguageModel.java | 953 +-
.../langdetect/charsoup/GlmScriptCategory.java | 174 +
.../charsoup/SaltedNgramFeatureExtractor.java | 502 +
.../charsoup/ScriptAwareFeatureExtractor.java | 79 +-
.../tika/langdetect/charsoup/ScriptCategory.java | 44 +-
.../charsoup/ShortTextFeatureExtractor.java | 70 +-
.../langdetect/charsoup/langdetect-20260320.bin | Bin 0 -> 6687361 bytes
.../charsoup/langdetect-generative-v1-20260310.bin | Bin 6679371 -> 0 bytes
.../charsoup/langdetect-generative-v4-20260320.bin | Bin 0 -> 7493241 bytes
.../charsoup/langdetect-short-v1-20260310.bin | Bin 3999308 -> 0 bytes
.../langdetect/charsoup/langdetect-v7-20260306.bin | Bin 3328628 -> 0 bytes
.../charsoup/CharSoupDetectorConfig.java | 60 +-
.../charsoup/CharSoupLanguageDetector.java | 523 +-
.../charsoup/CharSoupDetectorConfigTest.java | 44 +-
.../charsoup/CharSoupModelRoutingTest.java | 153 +-
.../langdetect/charsoup/LangIdRegressionTest.java | 26 +-
.../charsoup/ScriptAwareFeatureExtractorTest.java | 82 +-
.../charsoup/tools/CompareDetectors.java | 103 +-
.../langdetect/charsoup/tools/CorpusAliases.java | 79 +
.../tika/langdetect/charsoup/tools/FloresNorm.java | 109 +
.../charsoup/tools/GlmAdjudicateDiagnostic.java | 149 +
.../charsoup/tools/GlmNoiseSensitivityReport.java | 469 +
.../langdetect/charsoup/tools/GlmRerankerEval.java | 311 +
.../charsoup/tools/MarginDiagnostic.java | 94 +
.../langdetect/charsoup/tools/ModelQuantizer.java | 13 +-
.../langdetect/charsoup/tools/Phase2Trainer.java | 110 +-
.../langdetect/charsoup/tools/PrepareCorpus.java | 127 +-
.../langdetect/charsoup/tools/PrintDiscLabels.java | 19 +-
.../langdetect/charsoup/tools/RecalibrateGlm.java | 197 +
.../charsoup/tools/ResearchFeatureExtractor.java | 67 +-
.../tools/TrainGenerativeLanguageModel.java | 75 +-
.../charsoup/tools/TrainLanguageModel.java | 44 +-
.../langdetect/charsoup/tools/TrainShortModel.java | 179 -
tika-parent/pom.xml | 1 +
248 files changed, 3610388 insertions(+), 3370332 deletions(-)
create mode 100644 docs/modules/ROOT/pages/advanced/charset-detection-eval.txt
create mode 100644 docs/modules/ROOT/pages/advanced/flores-eval-20260320.txt
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bod
create mode 100644
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GlmScriptCategory.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/SaltedNgramFeatureExtractor.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-20260320.bin
delete mode 100644
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-generative-v1-20260310.bin
create mode 100644
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-generative-v4-20260320.bin
delete mode 100644
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-short-v1-20260310.bin
delete mode 100644
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-v7-20260306.bin
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusAliases.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/FloresNorm.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/GlmAdjudicateDiagnostic.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/GlmNoiseSensitivityReport.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/GlmRerankerEval.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/MarginDiagnostic.java
copy
tika-example/src/main/java/org/apache/tika/example/SimpleTypeDetector.java =>
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/PrintDiscLabels.java
(68%)
mode change 100755 => 100644
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/RecalibrateGlm.java
delete mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/TrainShortModel.java