This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch elasticsearch-emitter in repository https://gitbox.apache.org/repos/asf/tika.git
commit 663e01237452dbf630569c23d403a73f8b183ae0 Merge: 8e548d6adb c4d67657a0 Author: tballison <[email protected]> AuthorDate: Fri Feb 20 10:33:01 2026 -0500 Merge remote-tracking branch 'origin/main' into elasticsearch-emitter .github/workflows/main-jdk17-build.yml | 1 + .../main-jdk17-windows-build-multi-locale.yml | 1 + .github/workflows/main-jdk17-windows-build.yml | 1 + .github/workflows/main-jdk21-build.yml | 1 + .github/workflows/main-jdk25-build.yml | 1 + .gitignore | 1 + docs/modules/ROOT/examples/tess4j-basic.json | 10 + docs/modules/ROOT/examples/tess4j-full.json | 18 + docs/modules/ROOT/nav.adoc | 5 + docs/modules/ROOT/pages/advanced/index.adoc | 2 + .../pages/advanced/language-detection-build.adoc | 550 + .../ROOT/pages/advanced/language-detection.adoc | 401 + .../pages/configuration/parsers/tess4j-parser.adoc | 282 + .../pages/migration-to-4x/chunk-strategies.adoc | 257 + .../pages/migration-to-4x/design-notes-4x.adoc | 2 +- .../inference-handler-requirements.adoc | 282 + docs/pom.xml | 2 +- pom.xml | 2 +- tika-annotation-processor/pom.xml | 2 +- tika-app/pom.xml | 2 +- tika-bom/pom.xml | 155 +- tika-bundles/pom.xml | 4 +- tika-bundles/tika-bundle-standard/pom.xml | 4 +- tika-core/pom.xml | 2 +- .../apache/tika/metadata/TikaCoreProperties.java | 12 +- .../java/org/apache/tika/parser/ParseContext.java | 15 +- .../tika/sax/BasicContentHandlerFactory.java | 7 +- .../org/apache/tika/sax/ContentHandlerFactory.java | 15 + .../tika/sax/RecursiveParserWrapperHandler.java | 2 + .../org/apache/tika/parser/ParseContextTest.java | 104 + tika-detectors/pom.xml | 2 +- tika-detectors/tika-detector-magika/pom.xml | 2 +- tika-detectors/tika-detector-siegfried/pom.xml | 2 +- tika-e2e-tests/pom.xml | 32 +- tika-e2e-tests/tika-grpc/pom.xml | 2 +- tika-eval/pom.xml | 2 +- tika-eval/tika-eval-app/pom.xml | 2 +- .../org/apache/tika/eval/app/ProfilerBase.java | 2 +- .../eval/app/tools/BatchTopCommonTokenCounter.java | 60 - .../eval/app/tools/SlowCompositeReaderWrapper.java | 391 - .../tika/eval/app/tools/TopCommonTokenCounter.java | 363 - .../apache/tika/eval/app/AnalyzerManagerTest.java | 76 +- .../apache/tika/eval/app/SimpleComparerTest.java | 12 +- .../eval/app/tools/TopCommonTokenCounterTest.java | 80 +- .../resources/test-dirs/extractsB/file1.pdf.json | 2 +- tika-eval/tika-eval-core/pom.xml | 26 +- .../tika/eval/core/langid/LanguageIDWrapper.java | 9 +- .../tika/eval/core/textstats/CommonTokens.java | 22 +- .../textstats/CompositeTextStatsCalculator.java | 90 +- .../core/textstats/TokenCountPriorityQueue.java | 47 +- .../core/tokens/AlphaIdeographFilterFactory.java | 94 - .../eval/core/tokens/AnalyzerDeserializer.java | 198 - .../tika/eval/core/tokens/AnalyzerManager.java | 79 +- .../tokens/CJKBigramAwareLengthFilterFactory.java | 96 - .../eval/core/tokens/CommonTokenCountManager.java | 7 +- .../tika/eval/core/tokens/TikaEvalTokenizer.java | 300 + .../tika/eval/core/tokens/TokenContraster.java | 45 +- .../eval/core/tokens/TokenCountPriorityQueue.java | 53 +- .../apache/tika/eval/core/tokens/TokenCounter.java | 152 - .../tokens/URLEmailNormalizingFilterFactory.java | 88 - .../src/main/resources/common_tokens/afr | 60004 +++++++++--------- .../src/main/resources/common_tokens/amh | 25580 ++++---- .../src/main/resources/common_tokens/ara | 60008 +++++++++--------- .../src/main/resources/common_tokens/arg | 9618 +++ .../src/main/resources/common_tokens/arz | 17123 ++++++ .../src/main/resources/common_tokens/asm | 14736 ++--- .../src/main/resources/common_tokens/ast | 43989 +++++++++----- .../src/main/resources/common_tokens/aze | 60004 +++++++++--------- .../src/main/resources/common_tokens/bak | 53066 +++++++++------- .../src/main/resources/common_tokens/bam | 1192 + .../src/main/resources/common_tokens/ban | 9402 +-- .../src/main/resources/common_tokens/bar | 17900 ++++++ .../src/main/resources/common_tokens/bcl | 2863 + .../src/main/resources/common_tokens/bel | 60008 +++++++++--------- .../src/main/resources/common_tokens/ben | 43908 +++++++++----- .../src/main/resources/common_tokens/ben-rom | 30022 ---------- .../src/main/resources/common_tokens/bih | 903 - .../src/main/resources/common_tokens/bjn | 4081 ++ .../src/main/resources/common_tokens/bos | 60004 +++++++++--------- .../src/main/resources/common_tokens/bpy | 1001 + .../src/main/resources/common_tokens/bre | 26954 ++++++--- .../src/main/resources/common_tokens/bua | 2734 + .../src/main/resources/common_tokens/bul | 59964 +++++++++--------- .../src/main/resources/common_tokens/cat | 60000 +++++++++--------- .../src/main/resources/common_tokens/ceb | 55570 +++++++++-------- .../src/main/resources/common_tokens/ces | 59978 +++++++++--------- .../src/main/resources/common_tokens/che | 10343 ++-- .../src/main/resources/common_tokens/chv | 10199 ++++ .../src/main/resources/common_tokens/ckb | 16758 +++++- .../src/main/resources/common_tokens/cmn | 30022 ---------- .../src/main/resources/common_tokens/cos | 2835 + .../src/main/resources/common_tokens/csb | 1668 + .../src/main/resources/common_tokens/cym | 30933 ++++++---- .../src/main/resources/common_tokens/dan | 59964 +++++++++--------- .../src/main/resources/common_tokens/deu | 59986 +++++++++--------- .../src/main/resources/common_tokens/diq | 5984 ++ .../src/main/resources/common_tokens/div | 60010 +++++++++---------- .../src/main/resources/common_tokens/dsb | 1320 + .../src/main/resources/common_tokens/ekk | 30022 ---------- .../src/main/resources/common_tokens/ell | 60004 +++++++++--------- .../src/main/resources/common_tokens/eml | 5000 ++ .../src/main/resources/common_tokens/eng | 59996 +++++++++--------- .../src/main/resources/common_tokens/epo | 59970 +++++++++--------- .../src/main/resources/common_tokens/est | 59990 +++++++++--------- .../src/main/resources/common_tokens/eus | 59746 +++++++++--------- .../src/main/resources/common_tokens/ewe | 1484 + .../src/main/resources/common_tokens/ext | 2768 + .../src/main/resources/common_tokens/fao | 59778 +++++++++--------- .../src/main/resources/common_tokens/fas | 60010 +++++++++---------- .../src/main/resources/common_tokens/fin | 59978 +++++++++--------- .../src/main/resources/common_tokens/fra | 59992 +++++++++--------- .../src/main/resources/common_tokens/frr | 1994 + .../src/main/resources/common_tokens/fry | 46948 +++++++++------ .../src/main/resources/common_tokens/ful | 9679 --- .../src/main/resources/common_tokens/gla | 26778 --------- .../src/main/resources/common_tokens/gle | 47428 ++++++++------- .../src/main/resources/common_tokens/glg | 60010 +++++++++---------- .../src/main/resources/common_tokens/glv | 2583 + .../src/main/resources/common_tokens/gom | 18860 +++--- .../src/main/resources/common_tokens/grn | 3448 ++ .../src/main/resources/common_tokens/gsw | 42562 +++++++++---- .../src/main/resources/common_tokens/gug | 5528 -- .../src/main/resources/common_tokens/guj | 46584 ++++++-------- .../src/main/resources/common_tokens/hat | 11055 ++-- .../src/main/resources/common_tokens/hau | 37315 +++--------- .../src/main/resources/common_tokens/hbs | 30020 ++++++++++ .../src/main/resources/common_tokens/heb | 60010 +++++++++---------- .../src/main/resources/common_tokens/hif | 2137 + .../src/main/resources/common_tokens/hin | 60006 +++++++++--------- .../src/main/resources/common_tokens/hin-rom | 30022 ---------- .../src/main/resources/common_tokens/hrv | 60008 +++++++++--------- .../src/main/resources/common_tokens/hsb | 5205 ++ .../src/main/resources/common_tokens/hun | 60000 +++++++++--------- .../src/main/resources/common_tokens/hye | 60010 +++++++++---------- .../src/main/resources/common_tokens/ibo | 11859 +--- .../src/main/resources/common_tokens/ido | 8656 +++ .../src/main/resources/common_tokens/ile | 956 + .../src/main/resources/common_tokens/ilo | 3845 ++ .../src/main/resources/common_tokens/ina | 6880 +++ .../src/main/resources/common_tokens/ind | 59978 +++++++++--------- .../src/main/resources/common_tokens/isl | 59592 +++++++++--------- .../src/main/resources/common_tokens/ita | 59976 +++++++++--------- .../src/main/resources/common_tokens/jav | 42845 +++++++++---- .../src/main/resources/common_tokens/jpn | 60010 +++++++++---------- .../src/main/resources/common_tokens/kal | 6341 ++ .../src/main/resources/common_tokens/kan | 43538 +++++++++----- .../src/main/resources/common_tokens/kat | 60004 +++++++++--------- .../src/main/resources/common_tokens/kaz | 60010 +++++++++---------- .../src/main/resources/common_tokens/khk | 4187 ++ .../src/main/resources/common_tokens/khm | 30022 ---------- .../src/main/resources/common_tokens/kin | 12065 ++-- .../src/main/resources/common_tokens/kir | 55878 +++++++++-------- .../src/main/resources/common_tokens/knn | 5022 -- .../src/main/resources/common_tokens/koi | 1373 + .../src/main/resources/common_tokens/kom | 2382 + .../src/main/resources/common_tokens/kor | 60010 +++++++++---------- .../src/main/resources/common_tokens/krc | 1974 + .../src/main/resources/common_tokens/ksh | 2841 + .../src/main/resources/common_tokens/kur | 42426 ++++--------- .../src/main/resources/common_tokens/lad | 1681 + .../src/main/resources/common_tokens/lao | 30826 +--------- .../src/main/resources/common_tokens/lat | 42560 +++++++++---- .../src/main/resources/common_tokens/lav | 60008 +++++++++--------- .../src/main/resources/common_tokens/lim | 41907 +++++++++---- .../src/main/resources/common_tokens/lin | 5655 -- .../src/main/resources/common_tokens/lit | 60004 +++++++++--------- .../src/main/resources/common_tokens/lmo | 6924 +++ .../src/main/resources/common_tokens/ltz | 40377 +++++++++---- .../src/main/resources/common_tokens/lug | 53486 +++++++++-------- .../src/main/resources/common_tokens/lup | 905 + .../src/main/resources/common_tokens/lus | 10034 ++++ .../src/main/resources/common_tokens/lvs | 30022 ---------- .../src/main/resources/common_tokens/mai | 755 + .../src/main/resources/common_tokens/mal | 31778 +++++++++- .../src/main/resources/common_tokens/mar | 56710 +++++++++--------- .../src/main/resources/common_tokens/mhr | 8357 ++- .../src/main/resources/common_tokens/min | 16659 +++-- .../src/main/resources/common_tokens/mkd | 60010 +++++++++---------- .../src/main/resources/common_tokens/mlg | 11905 ++-- .../src/main/resources/common_tokens/mlt | 53228 ++++++++-------- .../src/main/resources/common_tokens/mon | 44624 +++++++++----- .../src/main/resources/common_tokens/mri | 14151 +++-- .../src/main/resources/common_tokens/mrj | 1994 + .../src/main/resources/common_tokens/msa | 58366 +++++++++--------- .../src/main/resources/common_tokens/mwl | 12264 ++++ .../src/main/resources/common_tokens/mya | 30022 ---------- .../src/main/resources/common_tokens/mya-zaw | 30022 ---------- .../src/main/resources/common_tokens/myv | 1132 + .../src/main/resources/common_tokens/mzn | 5501 ++ .../src/main/resources/common_tokens/nan | 9022 ++- .../src/main/resources/common_tokens/nap | 2039 + .../src/main/resources/common_tokens/nav | 533 + .../src/main/resources/common_tokens/ndo | 3142 + .../src/main/resources/common_tokens/nds | 35077 +++++++---- .../src/main/resources/common_tokens/nep | 50092 +++++++++------- .../src/main/resources/common_tokens/new | 3537 +- .../src/main/resources/common_tokens/nld | 59966 +++++++++--------- .../src/main/resources/common_tokens/nno | 60010 +++++++++---------- .../src/main/resources/common_tokens/nob | 60000 +++++++++--------- .../src/main/resources/common_tokens/nso | 8919 ++- .../src/main/resources/common_tokens/oci | 12539 ---- .../src/main/resources/common_tokens/ori | 13578 +++-- .../src/main/resources/common_tokens/orm | 31985 +--------- .../src/main/resources/common_tokens/oss | 3369 ++ .../src/main/resources/common_tokens/pam | 3054 + .../src/main/resources/common_tokens/pan | 18234 +++--- .../src/main/resources/common_tokens/pap | 9143 +++ .../src/main/resources/common_tokens/pes | 30022 ---------- .../src/main/resources/common_tokens/pfl | 2452 + .../src/main/resources/common_tokens/pms | 6552 ++ .../src/main/resources/common_tokens/pnb | 37685 +++++++++--- .../src/main/resources/common_tokens/pol | 60000 +++++++++--------- .../src/main/resources/common_tokens/por | 59990 +++++++++--------- .../src/main/resources/common_tokens/prs | 12167 ++++ .../src/main/resources/common_tokens/pus | 34381 +++++++---- .../src/main/resources/common_tokens/que | 2170 + .../src/main/resources/common_tokens/quz | 4441 -- .../src/main/resources/common_tokens/roh | 35391 ++++++----- .../src/main/resources/common_tokens/ron | 60004 +++++++++--------- .../src/main/resources/common_tokens/rue | 2797 + .../src/main/resources/common_tokens/run | 3534 ++ .../src/main/resources/common_tokens/rus | 60002 +++++++++--------- .../src/main/resources/common_tokens/sah | 14433 +++++ .../src/main/resources/common_tokens/san | 11972 +++- .../src/main/resources/common_tokens/scn | 7559 +++ .../src/main/resources/common_tokens/sco | 12070 ++++ .../src/main/resources/common_tokens/sgs | 2547 + .../src/main/resources/common_tokens/sin | 27552 ++++++--- .../src/main/resources/common_tokens/slk | 60008 +++++++++--------- .../src/main/resources/common_tokens/slv | 60010 +++++++++---------- .../src/main/resources/common_tokens/sme | 4120 ++ .../src/main/resources/common_tokens/smi | 1676 + .../src/main/resources/common_tokens/sna | 23750 ++++++++ .../src/main/resources/common_tokens/snd | 18834 ++++-- .../src/main/resources/common_tokens/som | 37241 +++++++----- .../src/main/resources/common_tokens/sot | 3535 ++ .../src/main/resources/common_tokens/spa | 59990 +++++++++--------- .../src/main/resources/common_tokens/sqi | 60010 +++++++++---------- .../src/main/resources/common_tokens/srd | 4285 +- .../src/main/resources/common_tokens/srp | 60010 +++++++++---------- .../src/main/resources/common_tokens/ssw | 2278 +- .../src/main/resources/common_tokens/sun | 29316 +++++++++ .../src/main/resources/common_tokens/swa | 9604 --- .../src/main/resources/common_tokens/swe | 59992 +++++++++--------- .../src/main/resources/common_tokens/swh | 4172 ++ .../src/main/resources/common_tokens/szl | 2654 + .../src/main/resources/common_tokens/tam | 42694 +++++++++---- .../src/main/resources/common_tokens/tam-rom | 30022 ---------- .../src/main/resources/common_tokens/tat | 59968 +++++++++--------- .../src/main/resources/common_tokens/tel | 54347 +++++++++-------- .../src/main/resources/common_tokens/tel-rom | 30022 ---------- .../src/main/resources/common_tokens/tgk | 49781 +++++++++------ .../src/main/resources/common_tokens/tgl | 59996 +++++++++--------- .../src/main/resources/common_tokens/tha | 33064 +++++++++- .../src/main/resources/common_tokens/tsn | 19224 ++---- .../src/main/resources/common_tokens/tso | 4677 ++ .../src/main/resources/common_tokens/tuk | 35478 +++++------ .../src/main/resources/common_tokens/tur | 60006 +++++++++--------- .../src/main/resources/common_tokens/tyv | 2933 + .../src/main/resources/common_tokens/udm | 1401 + .../src/main/resources/common_tokens/uig | 34819 +++++------ .../src/main/resources/common_tokens/ukr | 59998 +++++++++--------- .../src/main/resources/common_tokens/urd | 59962 +++++++++--------- .../src/main/resources/common_tokens/urd-rom | 30022 ---------- .../src/main/resources/common_tokens/uzb | 56433 +++++++++-------- .../src/main/resources/common_tokens/uzn | 30020 ++++++++++ .../src/main/resources/common_tokens/ven | 2457 + .../src/main/resources/common_tokens/vie | 55964 +++++++++-------- .../src/main/resources/common_tokens/vls | 7607 +++ .../src/main/resources/common_tokens/vol | 5010 +- .../src/main/resources/common_tokens/vro | 2069 + .../src/main/resources/common_tokens/war | 47024 +++++++++------ .../src/main/resources/common_tokens/wln | 5792 ++ .../src/main/resources/common_tokens/wol | 4502 -- .../src/main/resources/common_tokens/wuu | 30020 ++++++++++ .../src/main/resources/common_tokens/xho | 31235 +++++++--- .../src/main/resources/common_tokens/xmf | 6407 ++ .../src/main/resources/common_tokens/ydd | 9537 +++ .../src/main/resources/common_tokens/yid | 7668 --- .../src/main/resources/common_tokens/yor | 4315 +- .../src/main/resources/common_tokens/zea | 2318 + .../src/main/resources/common_tokens/zho | 30020 ++++++++++ .../src/main/resources/common_tokens/zho-simp | 30022 ---------- .../src/main/resources/common_tokens/zho-trad | 30022 ---------- .../src/main/resources/common_tokens/zul | 45631 +++++++++----- .../src/main/resources/lucene-analyzers.json | 66 - .../apache/tika/eval/core/langid/LangIdTest.java | 18 +- .../core/metadata/TikaEvalMetadataFilterTest.java | 8 +- .../tika/eval/core/textstats/TextStatsTest.java | 18 +- .../tika/eval/core/tokens/LuceneTokenCounter.java | 186 - .../core/tokens/TikaEvalTokenizerFuzzTest.java | 164 + .../tika/eval/core/tokens/TokenCounterTest.java | 127 - .../core/tokens/tools/CommonTokenGenerator.java | 337 + .../apache/tika/eval/core/util/LanguageIdTest.java | 13 +- tika-example/pom.xml | 2 +- tika-grpc/pom.xml | 2 +- tika-handlers/pom.xml | 2 +- tika-handlers/tika-handler-boilerpipe/pom.xml | 2 +- tika-integration-tests/pom.xml | 2 +- .../tika-pipes-kafka-integration-tests/pom.xml | 2 +- .../pom.xml | 2 +- .../tika-pipes-s3-integration-tests/pom.xml | 2 +- .../tika-pipes-solr-integration-tests/pom.xml | 2 +- .../tika-resource-loading-tests/pom.xml | 2 +- tika-integration-tests/tika-woodstox-tests/pom.xml | 2 +- tika-java7/pom.xml | 2 +- tika-langdetect/pom.xml | 4 +- .../pom.xml | 22 +- .../charsoup/CharSoupFeatureExtractor.java | 456 + .../tika/langdetect/charsoup/CharSoupModel.java | 363 + .../tika/langdetect/charsoup/FeatureExtractor.java | 67 + .../charsoup/ScriptAwareFeatureExtractor.java | 314 + .../tika/langdetect/charsoup/ScriptCategory.java | 117 + .../tika/langdetect/charsoup/WordTokenizer.java | 225 + .../apache/tika/langdetect/charsoup/langdetect.bin | Bin 0 -> 1641016 bytes .../pom.xml | 29 +- .../charsoup/CharSoupLanguageDetector.java | 393 + .../charsoup/CharSoupFeatureExtractorTest.java | 442 + .../langdetect/charsoup/CharSoupModelTest.java | 153 + .../langdetect/charsoup/LangIdRegressionTest.java | 154 + .../charsoup/ScriptAwareFeatureExtractorTest.java | 422 + .../langdetect/charsoup/WordTokenizerTest.java | 193 + .../langdetect/charsoup/tools/AblationRunner.java | 199 + .../charsoup/tools/BucketSaturationAnalyzer.java | 193 + .../charsoup/tools/CompareDetectors.java | 746 + .../langdetect/charsoup/tools/ConfusableDiff.java | 297 + .../langdetect/charsoup/tools/ConfusionDumper.java | 174 + .../langdetect/charsoup/tools/CorpusReader.java | 168 + .../langdetect/charsoup/tools/CrossDomainEval.java | 1049 + .../langdetect/charsoup/tools/DataSplitter.java | 156 + .../charsoup/tools/DuplicateChecker.java | 291 + .../langdetect/charsoup/tools/LabeledSentence.java | 44 +- .../langdetect/charsoup/tools/ModelQuantizer.java | 94 + .../langdetect/charsoup/tools/Phase2SmokeTest.java | 214 + .../langdetect/charsoup/tools/Phase2Trainer.java | 1456 + .../langdetect/charsoup/tools/QuickF1Eval.java | 605 + .../charsoup/tools/TrainLanguageModel.java | 1018 + .../langdetect/charsoup/tools/TrigramAblation.java | 180 + .../src/test/python/download_corpus.py | 370 + .../src/test/python/filter_pashto.py | 89 + .../tika/langdetect/charsoup/comparison-report.txt | 239 + tika-langdetect/tika-langdetect-lingo24/pom.xml | 2 +- tika-langdetect/tika-langdetect-mitll-text/pom.xml | 2 +- tika-langdetect/tika-langdetect-opennlp/pom.xml | 2 +- tika-langdetect/tika-langdetect-optimaize/pom.xml | 2 +- .../tika-langdetect-test-commons/pom.xml | 2 +- tika-langdetect/tika-langdetect-tika/pom.xml | 2 +- tika-parent/pom.xml | 31 +- tika-parsers/pom.xml | 2 +- tika-parsers/tika-parsers-extended/pom.xml | 2 +- .../tika-parser-scientific-module/pom.xml | 2 +- .../tika-parser-scientific-package/pom.xml | 2 +- .../tika-parser-sqlite3-module/pom.xml | 2 +- .../tika-parser-sqlite3-package/pom.xml | 2 +- .../pom.xml | 2 +- tika-parsers/tika-parsers-ml/pom.xml | 4 +- .../pom.xml | 24 +- .../tika/inference/AbstractEmbeddingFilter.java | 234 + .../main/java/org/apache/tika/inference/Chunk.java | 88 + .../org/apache/tika/inference/ChunkSerializer.java | 229 + .../tika/inference/ImageEmbeddingConfig.java | 149 + .../org/apache/tika/inference/InferenceConfig.java | 260 + .../org/apache/tika/inference/MarkdownChunker.java | 210 + .../tika/inference/OpenAIEmbeddingFilter.java | 227 + .../tika/inference/OpenAIImageEmbeddingParser.java | 443 + .../apache/tika/inference/VectorSerializer.java | 56 + .../apache/tika/inference/locator/Locators.java | 121 + .../tika/inference/locator/PaginatedLocator.java | 56 + .../tika/inference/locator/SpatialLocator.java | 54 + .../tika/inference/locator/TemporalLocator.java | 40 +- .../apache/tika/inference/locator/TextLocator.java | 39 +- .../apache/tika/inference/ChunkSerializerTest.java | 183 + .../apache/tika/inference/MarkdownChunkerTest.java | 138 + .../tika/inference/OpenAIEmbeddingFilterTest.java | 268 + .../inference/OpenAIImageEmbeddingParserTest.java | 340 + .../tika/inference/VectorSerializerTest.java | 58 + .../tika-parsers-ml/tika-parser-nlp-module/pom.xml | 2 +- .../tika-parser-nlp-package/pom.xml | 2 +- .../tika-parser-tess4j-module}/pom.xml | 40 +- .../tika/parser/ocr/tess4j/Tess4JConfig.java | 355 + .../tika/parser/ocr/tess4j/Tess4JParser.java | 516 + .../tika/parser/ocr/tess4j/Tess4JConfigTest.java | 140 + .../tika/parser/ocr/tess4j/Tess4JParserTest.java | 255 + .../src/test/resources/test-documents/testOCR.jpg | Bin 0 -> 3408 bytes .../tika-parser-vlm-ocr-module/pom.xml | 2 +- .../tika-parsers-ml/tika-transcribe-aws/pom.xml | 2 +- tika-parsers/tika-parsers-standard/pom.xml | 2 +- .../tika-parsers-standard-modules/pom.xml | 2 +- .../tika-parser-apple-module/pom.xml | 2 +- .../tika-parser-audiovideo-module/pom.xml | 2 +- .../tika-parser-cad-module/pom.xml | 2 +- .../tika-parser-code-module/pom.xml | 2 +- .../tika-parser-crypto-module/pom.xml | 2 +- .../tika-parser-digest-commons/pom.xml | 2 +- .../tika-parser-font-module/pom.xml | 2 +- .../tika-parser-html-module/pom.xml | 2 +- .../tika-parser-image-module/pom.xml | 2 +- .../tika-parser-jdbc-commons/pom.xml | 2 +- .../tika-parser-mail-commons/pom.xml | 2 +- .../tika-parser-mail-module/pom.xml | 2 +- .../tika-parser-microsoft-module/pom.xml | 2 +- .../tika-parser-miscoffice-module/pom.xml | 2 +- .../tika-parser-news-module/pom.xml | 2 +- .../tika-parser-ocr-module/pom.xml | 2 +- .../tika-parser-pdf-module/pom.xml | 2 +- .../tika-parser-pkg-module/pom.xml | 2 +- .../tika-parser-text-module/pom.xml | 2 +- .../tika-parser-webarchive-module/pom.xml | 2 +- .../tika-parser-xml-module/pom.xml | 2 +- .../tika-parser-xmp-commons/pom.xml | 2 +- .../tika-parser-zip-commons/pom.xml | 2 +- .../tika-parsers-standard-package/pom.xml | 2 +- tika-pipes/pom.xml | 2 +- tika-pipes/tika-async-cli/pom.xml | 2 +- .../apache/tika/async/cli/AsyncProcessorTest.java | 4 + tika-pipes/tika-httpclient-commons/pom.xml | 2 +- tika-pipes/tika-pipes-api/pom.xml | 2 +- tika-pipes/tika-pipes-config-store-ignite/pom.xml | 2 +- .../tika/pipes/ignite/IgniteConfigStoreTest.java | 10 +- tika-pipes/tika-pipes-core/pom.xml | 2 +- .../tika/pipes/core/PerClientServerManager.java | 20 +- .../org/apache/tika/pipes/core/PipesClient.java | 153 +- .../tika/pipes/core/protocol/PipesMessage.java | 169 + .../tika/pipes/core/protocol/PipesMessageType.java | 96 + .../core/protocol/ProtocolDesyncException.java | 28 +- .../core/protocol/ShutDownReceivedException.java | 30 +- .../serialization/FetchEmitTupleDeserializer.java | 8 - .../tika/pipes/core/server/ConnectionHandler.java | 255 +- .../tika/pipes/core/server/ParseHandler.java | 2 + .../apache/tika/pipes/core/server/PipesServer.java | 364 +- .../tika/pipes/core/server/ServerProtocolIO.java | 133 + .../tika/pipes/core/protocol/PipesMessageTest.java | 202 + tika-pipes/tika-pipes-fork-parser/pom.xml | 2 +- tika-pipes/tika-pipes-integration-tests/pom.xml | 2 +- .../apache/tika/pipes/core/PipesClientTest.java | 61 +- tika-pipes/tika-pipes-iterator-commons/pom.xml | 2 +- tika-pipes/tika-pipes-plugins/pom.xml | 2 +- .../tika-pipes-atlassian-jwt/pom.xml | 2 +- .../tika-pipes-plugins/tika-pipes-az-blob/pom.xml | 2 +- .../tika-pipes-plugins/tika-pipes-csv/pom.xml | 2 +- .../tika-pipes-file-system/pom.xml | 2 +- .../tika-pipes-plugins/tika-pipes-gcs/pom.xml | 2 +- .../tika-pipes-google-drive/pom.xml | 2 +- .../tika-pipes-plugins/tika-pipes-http/pom.xml | 71 +- .../test/resources/configs/tika-config-http.json | 2 +- .../tika-pipes-plugins/tika-pipes-jdbc/pom.xml | 2 +- .../tika-pipes-plugins/tika-pipes-json/pom.xml | 2 +- .../tika-pipes-plugins/tika-pipes-kafka/pom.xml | 2 +- .../tika-pipes-microsoft-graph/pom.xml | 2 +- .../tika-pipes-opensearch/pom.xml | 2 +- .../tika-pipes-plugins/tika-pipes-s3/pom.xml | 2 +- .../tika-pipes-plugins/tika-pipes-solr/pom.xml | 2 +- tika-pipes/tika-pipes-reporter-commons/pom.xml | 2 +- tika-plugins-core/pom.xml | 2 +- tika-serialization/pom.xml | 2 +- .../org/apache/tika/config/loader/TikaLoader.java | 8 +- tika-server/pom.xml | 2 +- tika-server/tika-server-client/pom.xml | 2 +- tika-server/tika-server-core/pom.xml | 2 +- tika-server/tika-server-standard/pom.xml | 2 +- tika-translate/pom.xml | 2 +- tika-xmp/pom.xml | 2 +- 462 files changed, 3422355 insertions(+), 3048376 deletions(-)
