This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch TIKA-4663-handler-type-metadata
in repository https://gitbox.apache.org/repos/asf/tika.git
from 1ff2f0f733 TIKA-4663 - add content handler type metadata and switch default to markdown
add 8918c66384 TIKA-4664 - add Poppler renderer, replace MuPDF, add OCR safety limits (#2612)
add 2c98c63677 TIKA-4666 - add VLM parsers (Claude, Gemini, OpenAI) (#2614)
add 720fa5a850 Merge branch 'main' into TIKA-4663-handler-type-metadata
add 24311eeff2 TIKA-4663 -- fix handlertypename
No new revisions were added by this update.
Summary of changes:
docs/modules/ROOT/examples/claude-vlm-basic.json | 10 +
docs/modules/ROOT/examples/claude-vlm-full.json | 18 +
docs/modules/ROOT/examples/gemini-vlm-basic.json | 10 +
docs/modules/ROOT/examples/gemini-vlm-full.json | 18 +
docs/modules/ROOT/examples/openai-vlm-basic.json | 11 +
docs/modules/ROOT/examples/openai-vlm-full.json | 18 +
docs/modules/ROOT/examples/vlm-pdf-parsing.json | 16 +
docs/modules/ROOT/nav.adoc | 2 +
docs/modules/ROOT/pages/advanced/index.adoc | 2 +
.../ROOT/pages/advanced/local-vlm-server.adoc | 445 ++++++++++++++++++++
.../pages/configuration/parsers/vlm-parsers.adoc | 236 +++++++++++
.../tika/sax/BasicContentHandlerFactory.java | 2 +-
.../org/apache/tika/sax/ContentHandlerFactory.java | 2 +-
.../tika/sax/RecursiveParserWrapperHandler.java | 2 +-
tika-parsers/tika-parsers-ml/pom.xml | 1 +
.../tika-parser-vlm-ocr-module/pom.xml | 132 ++++++
.../apache/tika/parser/vlm/AbstractVLMParser.java | 464 +++++++++++++++++++++
.../apache/tika/parser/vlm/ClaudeVLMParser.java | 227 ++++++++++
.../apache/tika/parser/vlm/GeminiVLMParser.java | 238 +++++++++++
.../tika/parser/vlm/MarkdownToXHTMLEmitter.java | 409 ++++++++++++++++++
.../apache/tika/parser/vlm/OpenAIVLMParser.java | 266 ++++++++++++
.../org/apache/tika/parser/vlm/VLMOCRConfig.java | 307 ++++++++++++++
.../tika/parser/vlm/ClaudeVLMParserTest.java | 285 +++++++++++++
.../tika/parser/vlm/GeminiVLMParserTest.java | 260 ++++++++++++
.../parser/vlm/MarkdownToXHTMLEmitterTest.java | 253 +++++++++++
.../tika/parser/vlm/OpenAIVLMParserTest.java | 291 +++++++++++++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 27 ++
.../java/org/apache/tika/parser/pdf/OcrConfig.java | 60 +++
.../apache/tika/parser/pdf/PDFParserConfig.java | 33 ++
.../tika/renderer/pdf/mutool/MuPDFRenderer.java | 150 -------
.../tika/renderer/pdf/poppler/PopplerRenderer.java | 293 +++++++++++++
.../renderer/pdf/poppler/PopplerRendererTest.java | 167 ++++++++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 21 +-
...fig.json => tika-rendering-poppler-config.json} | 2 +-
.../tika/pipes/core/server/ParseHandler.java | 2 +-
35 files changed, 4513 insertions(+), 167 deletions(-)
create mode 100644 docs/modules/ROOT/examples/claude-vlm-basic.json
create mode 100644 docs/modules/ROOT/examples/claude-vlm-full.json
create mode 100644 docs/modules/ROOT/examples/gemini-vlm-basic.json
create mode 100644 docs/modules/ROOT/examples/gemini-vlm-full.json
create mode 100644 docs/modules/ROOT/examples/openai-vlm-basic.json
create mode 100644 docs/modules/ROOT/examples/openai-vlm-full.json
create mode 100644 docs/modules/ROOT/examples/vlm-pdf-parsing.json
create mode 100644 docs/modules/ROOT/pages/advanced/local-vlm-server.adoc
create mode 100644 docs/modules/ROOT/pages/configuration/parsers/vlm-parsers.adoc
create mode 100644 tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/pom.xml
create mode 100644 tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java
create mode 100644 tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java
create mode 100644 tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java
create mode 100644 tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java
create mode 100644 tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java
create mode 100644 tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java
create mode 100644 tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java
create mode 100644 tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java
create mode 100644 tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java
create mode 100644 tika-parsers/tika-parsers-ml/tika-parser-vlm-ocr-module/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java
delete mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/poppler/PopplerRenderer.java
create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java
rename tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/{tika-rendering-mupdf-config.json => tika-rendering-poppler-config.json} (85%)