This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4692-improve-ooxml-sax-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git


    from 7117454ca1 checkpoint - wip
     add 80a973185c TIKA-4692 - script block features and model updates (#2702)
     add 1185f7dbd0 specify eol to prevent checkstyle surprises on windows (#2701)
     add 963bbd8018 Merge branch 'main' into TIKA-4692-improve-ooxml-sax-parsers

No new revisions were added by this update.

Summary of changes:
 .../ROOT/pages/advanced/charset-detection-eval.txt |   274 +
 .../advanced/charsoup-supported-languages.adoc     |   207 +-
 .../ROOT/pages/advanced/flores-eval-20260320.txt   |   580 +
 .../pages/advanced/generative-language-model.adoc  |   140 +-
 .../pages/advanced/language-detection-build.adoc   |   621 +-
 .../ROOT/pages/advanced/language-detection.adoc    |     2 +-
 .../charsoup/CharSoupEncodingDetector.java         |   128 +-
 .../eval/core/metadata/TikaEvalMetadataFilter.java |    10 +-
 .../src/main/resources/common_tokens/ace           |  3261 +-
 .../src/main/resources/common_tokens/afr           | 58582 +++++++++---------
 .../src/main/resources/common_tokens/aka           | 19339 +++---
 .../src/main/resources/common_tokens/alt           |  5673 +-
 .../src/main/resources/common_tokens/amh           | 19160 +++---
 .../src/main/resources/common_tokens/ami           |  5460 +-
 .../src/main/resources/common_tokens/ara           | 59812 +++++++++---------
 .../src/main/resources/common_tokens/arg           | 48885 ++++++++-------
 .../src/main/resources/common_tokens/asm           | 10416 ++--
 .../src/main/resources/common_tokens/ava           |  5305 +-
 .../src/main/resources/common_tokens/avk           | 14236 +++--
 .../src/main/resources/common_tokens/azb           | 48707 ++++++++-------
 .../src/main/resources/common_tokens/aze           | 59620 +++++++++---------
 .../src/main/resources/common_tokens/bak           | 59078 +++++++++---------
 .../src/main/resources/common_tokens/ban           | 14234 +++--
 .../src/main/resources/common_tokens/bar           | 31353 +++++-----
 .../src/main/resources/common_tokens/bcl           | 24806 ++++----
 .../src/main/resources/common_tokens/be-x-old      | 59662 +++++++++---------
 .../src/main/resources/common_tokens/bel           | 59732 +++++++++---------
 .../src/main/resources/common_tokens/ben           | 28389 +++++----
 .../src/main/resources/common_tokens/bjn           | 10079 ++--
 .../src/main/resources/common_tokens/bod           |   527 +
 .../src/main/resources/common_tokens/bre           | 56480 ++++++++---------
 .../src/main/resources/common_tokens/bul           | 59778 +++++++++---------
 .../src/main/resources/common_tokens/bxr           |  7601 +--
 .../src/main/resources/common_tokens/cat           | 59066 +++++++++---------
 .../src/main/resources/common_tokens/cdo-x-rom     |  1279 +-
 .../src/main/resources/common_tokens/ceb           | 53368 ++++++++---------
 .../src/main/resources/common_tokens/ces           | 59868 +++++++++---------
 .../src/main/resources/common_tokens/che           | 56598 ++++++++---------
 .../src/main/resources/common_tokens/chv           | 33025 +++++-----
 .../src/main/resources/common_tokens/ckb           | 58370 +++++++++---------
 .../src/main/resources/common_tokens/cnh           | 35018 ++++++-----
 .../src/main/resources/common_tokens/cor           |  7559 +--
 .../src/main/resources/common_tokens/cos           | 16434 ++---
 .../src/main/resources/common_tokens/csb           |  4494 +-
 .../src/main/resources/common_tokens/cym           | 57934 +++++++++---------
 .../src/main/resources/common_tokens/dag           |  9356 +--
 .../src/main/resources/common_tokens/dan           | 59036 +++++++++---------
 .../src/main/resources/common_tokens/deu           | 59112 +++++++++---------
 .../src/main/resources/common_tokens/diq           | 35052 +++++++++--
 .../src/main/resources/common_tokens/div           | 15474 ++++-
 .../src/main/resources/common_tokens/dsb           |  5063 +-
 .../src/main/resources/common_tokens/ell           | 59866 +++++++++---------
 .../src/main/resources/common_tokens/eng           | 58222 +++++++++---------
 .../src/main/resources/common_tokens/epo           | 58852 +++++++++---------
 .../src/main/resources/common_tokens/est           | 59338 +++++++++---------
 .../src/main/resources/common_tokens/eus           | 59158 +++++++++---------
 .../src/main/resources/common_tokens/ewe           | 22320 +++----
 .../src/main/resources/common_tokens/ext           |  6421 +-
 .../src/main/resources/common_tokens/fao           | 20477 ++++---
 .../src/main/resources/common_tokens/fas           | 57870 +++++++++---------
 .../src/main/resources/common_tokens/fin           | 59528 +++++++++---------
 .../src/main/resources/common_tokens/fra           | 58382 +++++++++---------
 .../src/main/resources/common_tokens/frr           | 10142 ++--
 .../src/main/resources/common_tokens/fry           | 59206 +++++++++---------
 .../src/main/resources/common_tokens/gla           | 14147 +++--
 .../src/main/resources/common_tokens/gle           | 52606 ++++++++--------
 .../src/main/resources/common_tokens/glg           | 59292 +++++++++---------
 .../src/main/resources/common_tokens/glv           |  7881 +--
 .../src/main/resources/common_tokens/gom           | 10524 ++--
 .../src/main/resources/common_tokens/grn           |  9707 +--
 .../src/main/resources/common_tokens/gsw           | 59314 +++++++++---------
 .../src/main/resources/common_tokens/guj           | 11744 ++--
 .../src/main/resources/common_tokens/hak-x-rom     |  1421 +-
 .../src/main/resources/common_tokens/hau           | 45690 +++++++-------
 .../src/main/resources/common_tokens/heb           | 59994 +++++++++---------
 .../src/main/resources/common_tokens/hil           | 38597 ++++++------
 .../src/main/resources/common_tokens/hin           | 16128 +++--
 .../src/main/resources/common_tokens/hrv           | 59876 +++++++++---------
 .../src/main/resources/common_tokens/hsb           | 17857 +++---
 .../src/main/resources/common_tokens/hun           | 59516 +++++++++---------
 .../src/main/resources/common_tokens/hye           | 59522 +++++++++---------
 .../src/main/resources/common_tokens/hyw           | 40638 +++++++------
 .../src/main/resources/common_tokens/ibo           | 36791 ++++++------
 .../src/main/resources/common_tokens/ido           | 28746 +++++----
 .../src/main/resources/common_tokens/ile           |  6617 +-
 .../src/main/resources/common_tokens/ilo           | 13590 +++--
 .../src/main/resources/common_tokens/ina           | 19257 +++---
 .../src/main/resources/common_tokens/ind           | 57836 +++++++++---------
 .../src/main/resources/common_tokens/isl           | 59426 +++++++++---------
 .../src/main/resources/common_tokens/ita           | 59098 +++++++++---------
 .../src/main/resources/common_tokens/jav           | 58619 +++++++++---------
 .../src/main/resources/common_tokens/jbo           |  2284 +-
 .../src/main/resources/common_tokens/jpn           | 59914 +++++++++---------
 .../src/main/resources/common_tokens/kaa           |  9998 +--
 .../src/main/resources/common_tokens/kab           |  6933 ++-
 .../src/main/resources/common_tokens/kan           | 57362 +++++++++---------
 .../src/main/resources/common_tokens/kat           | 59766 +++++++++---------
 .../src/main/resources/common_tokens/kaz           | 59410 +++++++++---------
 .../src/main/resources/common_tokens/kha           | 20048 ++++---
 .../src/main/resources/common_tokens/khm           | 19506 +++---
 .../src/main/resources/common_tokens/kin           | 16652 ++---
 .../src/main/resources/common_tokens/kir           | 59714 +++++++++---------
 .../src/main/resources/common_tokens/kor           | 59970 +++++++++---------
 .../src/main/resources/common_tokens/kpv           |  7300 ++-
 .../src/main/resources/common_tokens/ksh           |  5224 +-
 .../src/main/resources/common_tokens/kur           | 41528 +++++++------
 .../src/main/resources/common_tokens/lao           |  1670 +-
 .../src/main/resources/common_tokens/lat           | 58786 +++++++++---------
 .../src/main/resources/common_tokens/lav           | 59622 +++++++++---------
 .../src/main/resources/common_tokens/lez           |  8326 +--
 .../src/main/resources/common_tokens/lfn           | 11797 ++--
 .../src/main/resources/common_tokens/lim           | 40637 +++++++------
 .../src/main/resources/common_tokens/lit           | 59478 +++++++++---------
 .../src/main/resources/common_tokens/ltz           | 57304 +++++++++---------
 .../src/main/resources/common_tokens/lug           | 10940 ++--
 .../src/main/resources/common_tokens/lus           | 55276 ++++++++---------
 .../src/main/resources/common_tokens/mal           | 57366 +++++++++---------
 .../src/main/resources/common_tokens/mar           | 20721 ++++---
 .../src/main/resources/common_tokens/mhr           | 11844 ++--
 .../src/main/resources/common_tokens/min           | 33269 +++++-----
 .../src/main/resources/common_tokens/mkd           | 59796 +++++++++---------
 .../src/main/resources/common_tokens/mlg           | 55190 ++++++++---------
 .../src/main/resources/common_tokens/mlt           | 37519 ++++++------
 .../src/main/resources/common_tokens/mon           | 57267 +++++++++---------
 .../src/main/resources/common_tokens/mrj           |  5384 +-
 .../src/main/resources/common_tokens/msa           | 56960 +++++++++---------
 .../src/main/resources/common_tokens/mwl           | 30033 +++++-----
 .../src/main/resources/common_tokens/mya           | 59952 +++++++++---------
 .../src/main/resources/common_tokens/myv           |  4626 +-
 .../src/main/resources/common_tokens/mzn           | 10865 ++--
 .../src/main/resources/common_tokens/nds           | 55806 ++++++++---------
 .../src/main/resources/common_tokens/nep           | 12508 ++--
 .../src/main/resources/common_tokens/nld           | 56386 ++++++++---------
 .../src/main/resources/common_tokens/nno           | 58638 +++++++++---------
 .../src/main/resources/common_tokens/nob           | 58518 +++++++++---------
 .../src/main/resources/common_tokens/nqo           |  5836 +-
 .../src/main/resources/common_tokens/nso           |  2416 +-
 .../src/main/resources/common_tokens/nya           | 57552 +++++++++---------
 .../src/main/resources/common_tokens/olo           |  4824 +-
 .../src/main/resources/common_tokens/ori           | 11158 ++--
 .../src/main/resources/common_tokens/orm           | 58182 +++++++++---------
 .../src/main/resources/common_tokens/oss           |  9637 +--
 .../src/main/resources/common_tokens/pam           |  7385 ++-
 .../src/main/resources/common_tokens/pan           |  6048 +-
 .../src/main/resources/common_tokens/pap           |  7386 +--
 .../src/main/resources/common_tokens/pfl           |  4325 +-
 .../src/main/resources/common_tokens/pnb           | 57824 +++++++++---------
 .../src/main/resources/common_tokens/pol           | 59670 +++++++++---------
 .../src/main/resources/common_tokens/por           | 59182 +++++++++---------
 .../src/main/resources/common_tokens/pus           | 59644 +++++++++---------
 .../src/main/resources/common_tokens/roh           | 25077 ++++----
 .../src/main/resources/common_tokens/ron           | 58502 +++++++++---------
 .../src/main/resources/common_tokens/rue           |  8460 +--
 .../src/main/resources/common_tokens/rus           | 59874 +++++++++---------
 .../src/main/resources/common_tokens/sah           | 37487 ++++++------
 .../src/main/resources/common_tokens/san           | 11790 ++--
 .../src/main/resources/common_tokens/sat           | 13787 +++--
 .../src/main/resources/common_tokens/sgs           |  6831 ++-
 .../src/main/resources/common_tokens/sin           | 31996 +++++-----
 .../src/main/resources/common_tokens/skr           | 13330 ++--
 .../src/main/resources/common_tokens/slk           | 59810 +++++++++---------
 .../src/main/resources/common_tokens/slv           | 59774 +++++++++---------
 .../src/main/resources/common_tokens/sme           |  3991 +-
 .../src/main/resources/common_tokens/smn           |  6356 +-
 .../src/main/resources/common_tokens/smo           | 42362 ++++++-------
 .../src/main/resources/common_tokens/sna           | 13057 ++--
 .../src/main/resources/common_tokens/snd           | 29833 ++++-----
 .../src/main/resources/common_tokens/som           | 18438 +++---
 .../src/main/resources/common_tokens/spa           | 59330 +++++++++---------
 .../src/main/resources/common_tokens/sqi           | 59562 +++++++++---------
 .../src/main/resources/common_tokens/srp           | 59966 +++++++++---------
 .../src/main/resources/common_tokens/stq           |  7243 ++-
 .../src/main/resources/common_tokens/sun           | 35409 ++++++-----
 .../src/main/resources/common_tokens/swe           | 51742 ++++++++--------
 .../src/main/resources/common_tokens/swh           | 50058 +++++++++-------
 .../src/main/resources/common_tokens/szl           | 13274 ++--
 .../src/main/resources/common_tokens/szy           | 10252 ++--
 .../src/main/resources/common_tokens/tam           | 30387 +++++-----
 .../src/main/resources/common_tokens/tat           | 54684 ++++++++---------
 .../src/main/resources/common_tokens/tay           |  2576 +-
 .../src/main/resources/common_tokens/tel           | 32278 +++++-----
 .../src/main/resources/common_tokens/tet           | 42675 ++++++-------
 .../src/main/resources/common_tokens/tgk           | 55669 +++++++++--------
 .../src/main/resources/common_tokens/tgl           | 59024 +++++++++---------
 .../src/main/resources/common_tokens/tha           | 47313 ++++++++-------
 .../src/main/resources/common_tokens/tir           | 59808 +++++++++---------
 .../src/main/resources/common_tokens/trv           |  6993 ++-
 .../src/main/resources/common_tokens/tsn           | 24409 ++++----
 .../src/main/resources/common_tokens/tso           | 22879 +++----
 .../src/main/resources/common_tokens/tuk           | 23762 ++++----
 .../src/main/resources/common_tokens/tum           | 10273 ++--
 .../src/main/resources/common_tokens/tur           | 59726 +++++++++---------
 .../src/main/resources/common_tokens/tyv           | 12173 ++--
 .../src/main/resources/common_tokens/udm           | 58990 +++++++++---------
 .../src/main/resources/common_tokens/uig           | 38074 ++++++------
 .../src/main/resources/common_tokens/ukr           | 59798 +++++++++---------
 .../src/main/resources/common_tokens/urd           | 56988 +++++++++---------
 .../src/main/resources/common_tokens/uzb           | 59316 +++++++++---------
 .../src/main/resources/common_tokens/vep           | 15464 ++---
 .../src/main/resources/common_tokens/vie           | 36981 ++++++------
 .../src/main/resources/common_tokens/vls           | 17456 +++---
 .../src/main/resources/common_tokens/vol           |  7883 +--
 .../src/main/resources/common_tokens/vro           |  6011 +-
 .../src/main/resources/common_tokens/war           | 44127 +++++++-------
 .../src/main/resources/common_tokens/wln           | 14345 +++--
 .../src/main/resources/common_tokens/xho           | 58710 +++++++++---------
 .../src/main/resources/common_tokens/xmf           | 21162 ++++---
 .../src/main/resources/common_tokens/ydd           | 26151 ++++----
 .../src/main/resources/common_tokens/yor           | 11592 ++--
 .../src/main/resources/common_tokens/yue           | 23964 +++++---
 .../src/main/resources/common_tokens/zho           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/zul           | 10937 ++--
 .../core/tokens/tools/CommonTokenGenerator.java    |     7 +-
 .../tika/langdetect/charsoup/CharSoupModel.java    |    59 +-
 .../charsoup/GenerativeLanguageModel.java          |   953 +-
 .../langdetect/charsoup/GlmScriptCategory.java     |   174 +
 .../charsoup/SaltedNgramFeatureExtractor.java      |   502 +
 .../charsoup/ScriptAwareFeatureExtractor.java      |    79 +-
 .../tika/langdetect/charsoup/ScriptCategory.java   |    44 +-
 .../charsoup/ShortTextFeatureExtractor.java        |    70 +-
 .../langdetect/charsoup/langdetect-20260320.bin    |   Bin 0 -> 6687361 bytes
 .../charsoup/langdetect-generative-v1-20260310.bin |   Bin 6679371 -> 0 bytes
 .../charsoup/langdetect-generative-v4-20260320.bin |   Bin 0 -> 7493241 bytes
 .../charsoup/langdetect-short-v1-20260310.bin      |   Bin 3999308 -> 0 bytes
 .../langdetect/charsoup/langdetect-v7-20260306.bin |   Bin 3328628 -> 0 bytes
 .../charsoup/CharSoupDetectorConfig.java           |    60 +-
 .../charsoup/CharSoupLanguageDetector.java         |   523 +-
 .../charsoup/CharSoupDetectorConfigTest.java       |    44 +-
 .../charsoup/CharSoupModelRoutingTest.java         |   153 +-
 .../langdetect/charsoup/LangIdRegressionTest.java  |    26 +-
 .../charsoup/ScriptAwareFeatureExtractorTest.java  |    82 +-
 .../charsoup/tools/CompareDetectors.java           |   103 +-
 .../langdetect/charsoup/tools/CorpusAliases.java   |    79 +
 .../tika/langdetect/charsoup/tools/FloresNorm.java |   109 +
 .../charsoup/tools/GlmAdjudicateDiagnostic.java    |   149 +
 .../charsoup/tools/GlmNoiseSensitivityReport.java  |   469 +
 .../langdetect/charsoup/tools/GlmRerankerEval.java |   311 +
 .../charsoup/tools/MarginDiagnostic.java           |    94 +
 .../langdetect/charsoup/tools/ModelQuantizer.java  |    13 +-
 .../langdetect/charsoup/tools/Phase2Trainer.java   |   110 +-
 .../langdetect/charsoup/tools/PrepareCorpus.java   |   127 +-
 .../langdetect/charsoup/tools/PrintDiscLabels.java |    19 +-
 .../langdetect/charsoup/tools/RecalibrateGlm.java  |   197 +
 .../charsoup/tools/ResearchFeatureExtractor.java   |    67 +-
 .../tools/TrainGenerativeLanguageModel.java        |    75 +-
 .../charsoup/tools/TrainLanguageModel.java         |    44 +-
 .../langdetect/charsoup/tools/TrainShortModel.java |   179 -
 tika-parent/pom.xml                                |     1 +
 248 files changed, 3610388 insertions(+), 3370332 deletions(-)
 create mode 100644 docs/modules/ROOT/pages/advanced/charset-detection-eval.txt
 create mode 100644 docs/modules/ROOT/pages/advanced/flores-eval-20260320.txt
 create mode 100644 tika-eval/tika-eval-core/src/main/resources/common_tokens/bod
 create mode 100644 tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GlmScriptCategory.java
 create mode 100644 tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/SaltedNgramFeatureExtractor.java
 create mode 100644 tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-20260320.bin
 delete mode 100644 tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-generative-v1-20260310.bin
 create mode 100644 tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-generative-v4-20260320.bin
 delete mode 100644 tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-short-v1-20260310.bin
 delete mode 100644 tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-v7-20260306.bin
 create mode 100644 tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusAliases.java
 create mode 100644 tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/FloresNorm.java
 create mode 100644 tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/GlmAdjudicateDiagnostic.java
 create mode 100644 tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/GlmNoiseSensitivityReport.java
 create mode 100644 tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/GlmRerankerEval.java
 create mode 100644 tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/MarginDiagnostic.java
 copy tika-example/src/main/java/org/apache/tika/example/SimpleTypeDetector.java => tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/PrintDiscLabels.java (68%)
 mode change 100755 => 100644
 create mode 100644 tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/RecalibrateGlm.java
 delete mode 100644 tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/TrainShortModel.java

Reply via email to