This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 79d017047 TIKA-4508 -- allow tika async to inherit from existing
tika-config.xml
79d017047 is described below
commit 79d0170477b2d2d0f198c4f75960bb6f9ca677d0
Author: tallison <[email protected]>
AuthorDate: Mon Oct 6 10:50:09 2025 -0400
TIKA-4508 -- allow tika async to inherit from existing tika-config.xml
---
.../apache/tika/async/cli/SimpleAsyncConfig.java | 9 +-
.../org/apache/tika/async/cli/TikaAsyncCLI.java | 11 ++-
.../tika/async/cli/TikaConfigAsyncWriter.java | 81 ++++++++++++++--
.../tika/async/cli/TikaConfigAsyncWriterTest.java | 102 +++++++++++++++++++++
.../test/resources/configs/TIKA-4508-emitters.xml | 58 ++++++++++++
.../test/resources/configs/TIKA-4508-parsers.xml | 49 ++++++++++
6 files changed, 299 insertions(+), 11 deletions(-)
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
index 34458beb0..603f80e3d 100644
---
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
@@ -24,14 +24,17 @@ class SimpleAsyncConfig {
private Long timeoutMs;
private String xmx;
private String fileList;
+ private String tikaConfig;//path to the tikaConfig file to be used in the
forked process
- public SimpleAsyncConfig(String inputDir, String outputDir, Integer
numClients, Long timeoutMs, String xmx, String fileList) {
+ //TODO -- switch to a builder
+ public SimpleAsyncConfig(String inputDir, String outputDir, Integer
numClients, Long timeoutMs, String xmx, String fileList, String tikaConfig) {
this.inputDir = inputDir;
this.outputDir = outputDir;
this.numClients = numClients;
this.timeoutMs = timeoutMs;
this.xmx = xmx;
this.fileList = fileList;
+ this.tikaConfig = tikaConfig;
}
public String getInputDir() {
@@ -57,4 +60,8 @@ class SimpleAsyncConfig {
public String getFileList() {
return fileList;
}
+
+ /**
+  * @return the path of the tika-config file that was passed to the constructor,
+  *         or {@code null} if none was supplied
+  */
+ public String getTikaConfig() {
+     return this.tikaConfig;
+ }
}
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index 769265ed1..2a87a4b1a 100644
---
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -49,6 +49,8 @@ public class TikaAsyncCLI {
options.addOption("?", "help", false, "this help message");
options.addOption("t", "timeoutMs", true, "timeout for each parse in
milliseconds");
options.addOption("l", "fileList", true, "file list");
+ options.addOption("c", "config", true, "tikaConfig to inherit from --
" +
+ "commandline options will not overwrite existing iterators,
emitters, fetchers and async");
return options;
}
@@ -83,7 +85,7 @@ public class TikaAsyncCLI {
//not private for testing purposes
static SimpleAsyncConfig parseCommandLine(String[] args) throws
ParseException, IOException {
if (args.length == 2 && ! args[0].startsWith("-")) {
- return new SimpleAsyncConfig(args[0], args[1], null, null, null,
null);
+ return new SimpleAsyncConfig(args[0], args[1], null, null, null,
null, null);
}
Options options = getOptions();
@@ -100,6 +102,7 @@ public class TikaAsyncCLI {
Long timeoutMs = null;
Integer numClients = null;
String fileList = null;
+ String tikaConfig = null;
if (line.hasOption("i")) {
inputDir = line.getOptionValue("i");
}
@@ -118,8 +121,12 @@ public class TikaAsyncCLI {
if (line.hasOption("l")) {
fileList = line.getOptionValue("l");
}
+
+ if (line.hasOption("c")) {
+ tikaConfig = line.getOptionValue("c");
+ }
return new SimpleAsyncConfig(inputDir, outputDir,
- numClients, timeoutMs, xmx, fileList);
+ numClients, timeoutMs, xmx, fileList, tikaConfig);
}
diff --git
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
index ddb5ecdb4..5ff8f5d46 100644
---
a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
+++
b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
@@ -31,13 +31,24 @@ import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.utils.StringUtils;
+import org.apache.tika.utils.XMLReaderUtils;
class TikaConfigAsyncWriter {
+
+ // Log under this class's own name (not TikaAsyncCLI) so that messages emitted while
+ // merging the config are attributed to the writer.
+ private static final Logger LOG = LoggerFactory.getLogger(TikaConfigAsyncWriter.class);
+
private static final String FETCHER_NAME = "fsf";
private static final String EMITTER_NAME = "fse";
@@ -55,11 +66,22 @@ class TikaConfigAsyncWriter {
}
}
- void _write(Path output) throws ParserConfigurationException,
TransformerException, IOException {
- DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
- Document document = dbf.newDocumentBuilder().newDocument();
- Element properties = document.createElement("properties");
- document.appendChild(properties);
+ void _write(Path output) throws ParserConfigurationException,
TransformerException, IOException, TikaException, SAXException {
+ Document document = null;
+ Element properties = null;
+ if (simpleAsyncConfig.getTikaConfig() != null) {
+ document =
XMLReaderUtils.buildDOM(Paths.get(simpleAsyncConfig.getTikaConfig()));
+ properties = document.getDocumentElement();
+ if (! "properties".equals(properties.getLocalName())) {
+ throw new TikaConfigException("Document element must be
'<properties>' in " +
+ simpleAsyncConfig.getTikaConfig());
+ }
+ } else {
+ DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+ document = dbf.newDocumentBuilder().newDocument();
+ properties = document.createElement("properties");
+ document.appendChild(properties);
+ }
writePipesIterator(document, properties);
writeFetchers(document, properties);
writeEmitters(document, properties);
@@ -77,6 +99,11 @@ class TikaConfigAsyncWriter {
}
private void writePipesIterator(Document document, Element properties) {
+ Element pipesIterator = findChild("pipesIterator", properties);
+ if (pipesIterator != null) {
+ LOG.info("pipesIterator already exists in tika-config. Not
overwriting with commandline");
+ return;
+ }
if (! StringUtils.isBlank(simpleAsyncConfig.getFileList())) {
writeFileListIterator(document, properties);
} else {
@@ -104,7 +131,13 @@ class TikaConfigAsyncWriter {
}
private void writeEmitters(Document document, Element properties) {
- Element emitters = createAndGetElement(document, properties,
"emitters");
+ Element emitters = findChild("emitters", properties);
+ if (emitters != null) {
+ LOG.info("emitters already exist in tika-config. Not overwriting
with commandline");
+ return;
+ }
+
+ emitters = createAndGetElement(document, properties, "emitters");
Element emitter = createAndGetElement( document, emitters, "emitter",
"class", "org.apache.tika.pipes.emitter.fs.FileSystemEmitter");
appendTextElement(document, emitter, "name", EMITTER_NAME);
@@ -113,7 +146,13 @@ class TikaConfigAsyncWriter {
}
private void writeFetchers(Document document, Element properties) {
- Element fetchers = createAndGetElement(document, properties,
"fetchers");
+ Element fetchers = findChild("fetchers", properties);
+ if (fetchers != null) {
+ LOG.info("fetchers already exist in tika-config. Not overwriting
with commandline");
+ return;
+ }
+
+ fetchers = createAndGetElement(document, properties, "fetchers");
Element fetcher = createAndGetElement(document, fetchers, "fetcher",
"class", "org.apache.tika.pipes.fetcher.fs.FileSystemFetcher");
appendTextElement(document, fetcher, "name", FETCHER_NAME);
@@ -128,7 +167,18 @@ class TikaConfigAsyncWriter {
}
private void writeAsync(Document document, Element properties) {
- Element async = createAndGetElement(document, properties, "async");
+ Element async = findChild("async", properties);
+ if (async != null) {
+ LOG.info("async already exists in tika-config. Not overwriting
with commandline");
+ return;
+ }
+
+ async = createAndGetElement(document, properties, "async");
+ Element pipesIterator = findChild("pipesIterator", properties);
+ if (pipesIterator != null) {
+ LOG.info("pipesIterator already exists in tika-config. Not
overwriting with commandline");
+ }
+
properties.appendChild(async);
if (simpleAsyncConfig.getNumClients() != null) {
appendTextElement(document, async, "numClients",
Integer.toString(simpleAsyncConfig.getNumClients()));
@@ -140,6 +190,10 @@ class TikaConfigAsyncWriter {
if (simpleAsyncConfig.getTimeoutMs() != null) {
appendTextElement(document, async, "timeoutMillis",
Long.toString(simpleAsyncConfig.getTimeoutMs()));
}
+ if (simpleAsyncConfig.getTikaConfig() != null) {
+ Path p = Paths.get(simpleAsyncConfig.getTikaConfig());
+ appendTextElement(document, async, "tikaConfig",
p.toAbsolutePath().toString());
+ }
}
private static void appendTextElement(Document document, Element parent,
String itemName, String text, String... attrs) {
@@ -156,4 +210,15 @@ class TikaConfigAsyncWriter {
return el;
}
+ /**
+  * Looks through the direct children of {@code root} (no recursion) and returns the
+  * first one whose local name equals {@code childElementName}.
+  * Children whose {@code getLocalName()} is {@code null} never match.
+  *
+  * @param childElementName local name to look for
+  * @param root element whose direct children are searched
+  * @return the first matching child, or {@code null} if there is none
+  */
+ static Element findChild(String childElementName, Element root) {
+     NodeList children = root.getChildNodes();
+     int idx = 0;
+     while (idx < children.getLength()) {
+         Node candidate = children.item(idx);
+         idx++;
+         if (!childElementName.equals(candidate.getLocalName())) {
+             continue;
+         }
+         return (Element) candidate;
+     }
+     return null;
+ }
+
}
diff --git
a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java
b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java
new file mode 100644
index 000000000..c88db2de0
--- /dev/null
+++
b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.async.cli;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * Tests for {@link TikaConfigAsyncWriter} when it inherits from an existing tika-config
+ * file: sections missing from the file are added, sections already present are kept.
+ */
+public class TikaConfigAsyncWriterTest {
+
+    /**
+     * Inherits from a config that only declares {@code service-loader} and {@code parsers}
+     * and expects the written config to additionally contain {@code pipesIterator},
+     * {@code fetchers}, {@code emitters} and {@code async}.
+     *
+     * @param dir JUnit-managed temporary directory that receives the written config
+     */
+    @Test
+    public void testBasic(@TempDir Path dir) throws Exception {
+        Path p = Paths.get(TikaConfigAsyncWriter.class.getResource("/configs/TIKA-4508-parsers.xml").toURI());
+        SimpleAsyncConfig simpleAsyncConfig = new SimpleAsyncConfig("input", "output", 4,
+                10000L, "-Xmx1g", null, p.toAbsolutePath().toString());
+        //write into the injected temp dir, not a developer-specific absolute path,
+        //so the test runs on any machine and cleans up after itself
+        Path target = dir.resolve("combined.xml");
+        TikaConfigAsyncWriter writer = new TikaConfigAsyncWriter(simpleAsyncConfig);
+        writer.write(target);
+
+        Set<String> expected = Set.of("service-loader", "parsers", "pipesIterator", "fetchers", "emitters", "async");
+        Set<String> properties = loadProperties(target);
+        assertEquals(expected, properties);
+    }
+
+    /**
+     * Inherits from a config that already declares two custom emitters and expects both
+     * to still be present in the written config, alongside the added
+     * {@code pipesIterator}, {@code fetchers} and {@code async} sections.
+     *
+     * @param dir JUnit-managed temporary directory that receives the written config
+     */
+    @Test
+    public void testDontOverwriteEmitters(@TempDir Path dir) throws Exception {
+        Path p = Paths.get(TikaConfigAsyncWriter.class.getResource("/configs/TIKA-4508-emitters.xml").toURI());
+        SimpleAsyncConfig simpleAsyncConfig = new SimpleAsyncConfig("input", "output", 4,
+                10000L, "-Xmx1g", null, p.toAbsolutePath().toString());
+        //write into the injected temp dir, not a developer-specific absolute path
+        Path target = dir.resolve("combined.xml");
+        TikaConfigAsyncWriter writer = new TikaConfigAsyncWriter(simpleAsyncConfig);
+        writer.write(target);
+
+        Set<String> expected = Set.of("parsers", "pipesIterator", "fetchers", "emitters", "async");
+        Set<String> properties = loadProperties(target);
+        assertEquals(expected, properties);
+
+        Document doc = XMLReaderUtils.buildDOM(target);
+        Element emitters = TikaConfigAsyncWriter.findChild("emitters", doc.getDocumentElement());
+        assertNotNull(emitters);
+        //count the <emitter> children whose class attribute starts with "com.custom."
+        int found = 0;
+        for (int i = 0; i < emitters.getChildNodes().getLength(); i++) {
+            Node n = emitters.getChildNodes().item(i);
+            if ("emitter".equals(n.getLocalName())) {
+                Node clazzNode = n.getAttributes().getNamedItem("class");
+                if (clazzNode != null) {
+                    String clazz = clazzNode.getNodeValue();
+                    if (clazz != null && clazz.startsWith("com.custom.")) {
+                        found++;
+                    }
+                }
+            }
+        }
+        assertEquals(2, found);
+
+    }
+
+    /**
+     * Parses the file at {@code path}, asserts that its document element is
+     * {@code properties}, and collects the local names of that element's direct children.
+     * Children without a local name are skipped.
+     *
+     * @param path config file to read
+     * @return the distinct local names of the direct children of the document element
+     */
+    private Set<String> loadProperties(Path path) throws TikaException, IOException, SAXException {
+        Document document = XMLReaderUtils.buildDOM(path);
+        Element properties = document.getDocumentElement();
+        assertEquals("properties", properties.getLocalName());
+        Set<String> children = new HashSet<>();
+        for (int i = 0; i < properties.getChildNodes().getLength(); i++) {
+            Node n = properties.getChildNodes().item(i);
+            if (n.getLocalName() != null) {
+                children.add(n.getLocalName());
+            }
+        }
+        return children;
+    }
+}
diff --git
a/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4508-emitters.xml
b/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4508-emitters.xml
new file mode 100644
index 000000000..5e3eed353
--- /dev/null
+++
b/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4508-emitters.xml
@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<properties>
+ <!-- TIKA-4508 test fixture: a config that already declares two custom emitters
+      (class names under com.custom.) next to the parsers section. It declares no
+      pipesIterator, fetchers or async section. Loaded by
+      TikaConfigAsyncWriterTest.testDontOverwriteEmitters, which expects both
+      emitters to still be present in the written config. -->
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ <parser-exclude
class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
+ <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="extractActions" type="bool">true</param>
+ <param name="extractInlineImages" type="bool">true</param>
+ <param name="extractIncrementalUpdateInfo" type="bool">true</param>
+ <param name="parseIncrementalUpdates" type="bool">true</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <params>
+ <param name="includeDeletedContent" type="bool">true</param>
+ <param name="includeMoveFromContent" type="bool">true</param>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+ <params>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+ <emitters>
+ <emitter class="com.custom.tika.pipes.emitter.fs.FileSystemEmitter">
+ <name>json</name>
+ <basePath>JSON_PATH</basePath>
+ </emitter>
+ <emitter class="com.custom.tika.pipes.emitter.fs.FileSystemEmitter">
+ <name>bytes</name>
+ <basePath>BYTES_PATH</basePath>
+ </emitter>
+ </emitters>
+</properties>
\ No newline at end of file
diff --git
a/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4508-parsers.xml
b/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4508-parsers.xml
new file mode 100644
index 000000000..4b5b8550c
--- /dev/null
+++ b/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4508-parsers.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<properties>
+ <!-- TIKA-4508 test fixture: a config that declares only service-loader and parsers.
+      It declares no pipesIterator, fetchers, emitters or async section. Loaded by
+      TikaConfigAsyncWriterTest.testBasic, which expects those four sections to be
+      present in the written config. -->
+ <service-loader initializableProblemHandler="throw"/>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ <parser-exclude
class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
+ <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="extractActions" type="bool">true</param>
+ <param name="extractInlineImages" type="bool">true</param>
+ <param name="extractIncrementalUpdateInfo" type="bool">true</param>
+ <param name="parseIncrementalUpdates" type="bool">true</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <params>
+ <param name="includeDeletedContent" type="bool">true</param>
+ <param name="includeMoveFromContent" type="bool">true</param>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+ <params>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
\ No newline at end of file