This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 79d017047 TIKA-4508 -- allow tika async to inherit from existing tika-config.xml
79d017047 is described below

commit 79d0170477b2d2d0f198c4f75960bb6f9ca677d0
Author: tallison <[email protected]>
AuthorDate: Mon Oct 6 10:50:09 2025 -0400

    TIKA-4508 -- allow tika async to inherit from existing tika-config.xml
---
 .../apache/tika/async/cli/SimpleAsyncConfig.java   |   9 +-
 .../org/apache/tika/async/cli/TikaAsyncCLI.java    |  11 ++-
 .../tika/async/cli/TikaConfigAsyncWriter.java      |  81 ++++++++++++++--
 .../tika/async/cli/TikaConfigAsyncWriterTest.java  | 102 +++++++++++++++++++++
 .../test/resources/configs/TIKA-4508-emitters.xml  |  58 ++++++++++++
 .../test/resources/configs/TIKA-4508-parsers.xml   |  49 ++++++++++
 6 files changed, 299 insertions(+), 11 deletions(-)

diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
index 34458beb0..603f80e3d 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/SimpleAsyncConfig.java
@@ -24,14 +24,17 @@ class SimpleAsyncConfig {
     private Long timeoutMs;
     private String xmx;
     private String fileList;
+    private String tikaConfig;//path to the tikaConfig file to be used in the forked process
 
-    public SimpleAsyncConfig(String inputDir, String outputDir, Integer numClients, Long timeoutMs, String xmx, String fileList) {
+    //TODO -- switch to a builder
+    public SimpleAsyncConfig(String inputDir, String outputDir, Integer numClients, Long timeoutMs, String xmx, String fileList, String tikaConfig) {
         this.inputDir = inputDir;
         this.outputDir = outputDir;
         this.numClients = numClients;
         this.timeoutMs = timeoutMs;
         this.xmx = xmx;
         this.fileList = fileList;
+        this.tikaConfig = tikaConfig;
     }
 
     public String getInputDir() {
@@ -57,4 +60,8 @@ class SimpleAsyncConfig {
     public String getFileList() {
         return fileList;
     }
+
+    public String getTikaConfig() {
+        return tikaConfig;
+    }
 }
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
index 769265ed1..2a87a4b1a 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaAsyncCLI.java
@@ -49,6 +49,8 @@ public class TikaAsyncCLI {
         options.addOption("?", "help", false, "this help message");
         options.addOption("t", "timeoutMs", true, "timeout for each parse in milliseconds");
         options.addOption("l", "fileList", true, "file list");
+        options.addOption("c", "config", true, "tikaConfig to inherit from -- " +
+                "commandline options will not overwrite existing iterators, emitters, fetchers and async");
 
         return options;
     }
@@ -83,7 +85,7 @@ public class TikaAsyncCLI {
     //not private for testing purposes
     static SimpleAsyncConfig parseCommandLine(String[] args) throws ParseException, IOException {
         if (args.length == 2 && ! args[0].startsWith("-")) {
-            return new SimpleAsyncConfig(args[0], args[1], null, null, null, null);
+            return new SimpleAsyncConfig(args[0], args[1], null, null, null, null, null);
         }
 
         Options options = getOptions();
@@ -100,6 +102,7 @@ public class TikaAsyncCLI {
         Long timeoutMs = null;
         Integer numClients = null;
         String fileList = null;
+        String tikaConfig = null;
         if (line.hasOption("i")) {
             inputDir = line.getOptionValue("i");
         }
@@ -118,8 +121,12 @@ public class TikaAsyncCLI {
         if (line.hasOption("l")) {
             fileList = line.getOptionValue("l");
         }
+
+        if (line.hasOption("c")) {
+            tikaConfig = line.getOptionValue("c");
+        }
         return new SimpleAsyncConfig(inputDir, outputDir,
-                numClients, timeoutMs, xmx, fileList);
+                numClients, timeoutMs, xmx, fileList, tikaConfig);
     }
 
 
diff --git a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
index ddb5ecdb4..5ff8f5d46 100644
--- a/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
+++ b/tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/TikaConfigAsyncWriter.java
@@ -31,13 +31,24 @@ import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
 
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.utils.StringUtils;
+import org.apache.tika.utils.XMLReaderUtils;
 
 class TikaConfigAsyncWriter {
 
+
+    private static final Logger LOG = LoggerFactory.getLogger(TikaAsyncCLI.class);
+
     private static final String FETCHER_NAME = "fsf";
     private static final String EMITTER_NAME = "fse";
 
@@ -55,11 +66,22 @@ class TikaConfigAsyncWriter {
         }
     }
 
-    void _write(Path output) throws ParserConfigurationException, TransformerException, IOException {
-        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
-        Document document = dbf.newDocumentBuilder().newDocument();
-        Element properties = document.createElement("properties");
-        document.appendChild(properties);
+    void _write(Path output) throws ParserConfigurationException, TransformerException, IOException, TikaException, SAXException {
+        Document document = null;
+        Element properties = null;
+        if (simpleAsyncConfig.getTikaConfig() != null) {
+            document = XMLReaderUtils.buildDOM(Paths.get(simpleAsyncConfig.getTikaConfig()));
+            properties = document.getDocumentElement();
+            if (! "properties".equals(properties.getLocalName())) {
+                throw new TikaConfigException("Document element must be '<properties>' in " +
+                        simpleAsyncConfig.getTikaConfig());
+            }
+        } else {
+            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+            document = dbf.newDocumentBuilder().newDocument();
+            properties = document.createElement("properties");
+            document.appendChild(properties);
+        }
         writePipesIterator(document, properties);
         writeFetchers(document, properties);
         writeEmitters(document, properties);
@@ -77,6 +99,11 @@ class TikaConfigAsyncWriter {
     }
 
     private void writePipesIterator(Document document, Element properties) {
+        Element pipesIterator = findChild("pipesIterator", properties);
+        if (pipesIterator != null) {
+            LOG.info("pipesIterator already exists in tika-config. Not overwriting with commandline");
+            return;
+        }
         if (! StringUtils.isBlank(simpleAsyncConfig.getFileList())) {
             writeFileListIterator(document, properties);
         } else {
@@ -104,7 +131,13 @@ class TikaConfigAsyncWriter {
     }
 
     private void writeEmitters(Document document, Element properties) {
-        Element emitters = createAndGetElement(document, properties, "emitters");
+        Element emitters = findChild("emitters", properties);
+        if (emitters != null) {
+            LOG.info("emitters already exist in tika-config. Not overwriting with commandline");
+            return;
+        }
+
+        emitters = createAndGetElement(document, properties, "emitters");
         Element emitter = createAndGetElement( document, emitters, "emitter",
                 "class", "org.apache.tika.pipes.emitter.fs.FileSystemEmitter");
         appendTextElement(document, emitter, "name", EMITTER_NAME);
@@ -113,7 +146,13 @@ class TikaConfigAsyncWriter {
     }
 
     private void writeFetchers(Document document, Element properties) {
-        Element fetchers = createAndGetElement(document, properties, "fetchers");
+        Element fetchers = findChild("fetchers", properties);
+        if (fetchers != null) {
+            LOG.info("fetchers already exist in tika-config. Not overwriting with commandline");
+            return;
+        }
+
+        fetchers = createAndGetElement(document, properties, "fetchers");
         Element fetcher = createAndGetElement(document, fetchers, "fetcher",
                 "class", "org.apache.tika.pipes.fetcher.fs.FileSystemFetcher");
         appendTextElement(document, fetcher, "name", FETCHER_NAME);
@@ -128,7 +167,18 @@ class TikaConfigAsyncWriter {
     }
 
     private void writeAsync(Document document, Element properties) {
-        Element async = createAndGetElement(document, properties, "async");
+        Element async = findChild("async", properties);
+        if (async != null) {
+            LOG.info("async already exists in tika-config. Not overwriting with commandline");
+            return;
+        }
+
+        async = createAndGetElement(document, properties, "async");
+        Element pipesIterator = findChild("pipesIterator", properties);
+        if (pipesIterator != null) {
+            LOG.info("pipesIterator already exists in tika-config. Not overwriting with commandline");
+        }
+
         properties.appendChild(async);
         if (simpleAsyncConfig.getNumClients() != null) {
             appendTextElement(document, async, "numClients", Integer.toString(simpleAsyncConfig.getNumClients()));
@@ -140,6 +190,10 @@ class TikaConfigAsyncWriter {
         if (simpleAsyncConfig.getTimeoutMs() != null) {
             appendTextElement(document, async, "timeoutMillis", Long.toString(simpleAsyncConfig.getTimeoutMs()));
         }
+        if (simpleAsyncConfig.getTikaConfig() != null) {
+            Path p = Paths.get(simpleAsyncConfig.getTikaConfig());
+            appendTextElement(document, async, "tikaConfig", p.toAbsolutePath().toString());
+        }
     }
 
     private static  void appendTextElement(Document document, Element parent, String itemName, String text, String... attrs) {
@@ -156,4 +210,15 @@ class TikaConfigAsyncWriter {
         return el;
     }
 
+    static Element findChild(String childElementName, Element root) {
+        NodeList nodeList = root.getChildNodes();
+        for (int i = 0; i < nodeList.getLength(); i++) {
+            Node child = nodeList.item(i);
+            if (childElementName.equals(child.getLocalName())) {
+                return (Element)child;
+            }
+        }
+        return null;
+    }
+
 }
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java
new file mode 100644
index 000000000..c88db2de0
--- /dev/null
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaConfigAsyncWriterTest.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.async.cli;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.utils.XMLReaderUtils;
+
+public class TikaConfigAsyncWriterTest {
+
+
+    @Test
+    public void testBasic(@TempDir Path dir) throws Exception {
+        Path p = Paths.get(TikaConfigAsyncWriter.class.getResource("/configs/TIKA-4508-parsers.xml").toURI());
+        SimpleAsyncConfig simpleAsyncConfig = new SimpleAsyncConfig("input", "output", 4,
+                10000L, "-Xmx1g", null, p.toAbsolutePath().toString());
+        Path target = Paths.get("/home/tallison/Desktop/tmp/").resolve("combined.xml");
+        TikaConfigAsyncWriter writer = new TikaConfigAsyncWriter(simpleAsyncConfig);
+        writer.write(target);
+
+        Set<String> expected = Set.of("service-loader", "parsers", "pipesIterator", "fetchers", "emitters", "async");
+        Set<String> properties = loadProperties(target);
+        assertEquals(expected, properties);
+    }
+
+    @Test
+    public void testDontOverwriteEmitters(@TempDir Path dir) throws Exception {
+        Path p = Paths.get(TikaConfigAsyncWriter.class.getResource("/configs/TIKA-4508-emitters.xml").toURI());
+        SimpleAsyncConfig simpleAsyncConfig = new SimpleAsyncConfig("input", "output", 4,
+                10000L, "-Xmx1g", null, p.toAbsolutePath().toString());
+        Path target = Paths.get("/home/tallison/Desktop/tmp/").resolve("combined.xml");
+        TikaConfigAsyncWriter writer = new TikaConfigAsyncWriter(simpleAsyncConfig);
+        writer.write(target);
+
+        Set<String> expected = Set.of("parsers", "pipesIterator", "fetchers", "emitters", "async");
+        Set<String> properties = loadProperties(target);
+        assertEquals(expected, properties);
+
+        Document doc = XMLReaderUtils.buildDOM(target);
+        Element emitters = TikaConfigAsyncWriter.findChild("emitters", doc.getDocumentElement());
+        assertNotNull(emitters);
+        int found = 0;
+        for (int i = 0; i < emitters.getChildNodes().getLength(); i++) {
+            Node n = emitters.getChildNodes().item(i);
+            if ("emitter".equals(n.getLocalName())) {
+                Node clazzNode = n.getAttributes().getNamedItem("class");
+                if (clazzNode != null) {
+                    String clazz = clazzNode.getNodeValue();
+                    if (clazz != null && clazz.startsWith("com.custom.")) {
+                        found++;
+                    }
+                }
+            }
+        }
+        assertEquals(2, found);
+
+    }
+
+
+    private Set<String> loadProperties(Path path) throws TikaException, IOException, SAXException {
+        Document document = XMLReaderUtils.buildDOM(path);
+        Element properties = document.getDocumentElement();
+        assertEquals("properties", properties.getLocalName());
+        Set<String> children = new HashSet<>();
+        for (int i = 0; i < properties.getChildNodes().getLength(); i++) {
+            Node n = properties.getChildNodes().item(i);
+            if (n.getLocalName() != null) {
+                children.add(n.getLocalName());
+            }
+        }
+        return children;
+    }
+}
diff --git a/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4508-emitters.xml b/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4508-emitters.xml
new file mode 100644
index 000000000..5e3eed353
--- /dev/null
+++ b/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4508-emitters.xml
@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+      <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
+      <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.pdf.PDFParser">
+      <params>
+        <param name="extractActions" type="bool">true</param>
+        <param name="extractInlineImages" type="bool">true</param>
+        <param name="extractIncrementalUpdateInfo" type="bool">true</param>
+        <param name="parseIncrementalUpdates" type="bool">true</param>
+      </params>
+    </parser>
+    <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+      <params>
+        <param name="includeDeletedContent" type="bool">true</param>
+        <param name="includeMoveFromContent" type="bool">true</param>
+        <param name="extractMacros" type="bool">true</param>
+      </params>
+    </parser>
+    <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+      <params>
+        <param name="extractMacros" type="bool">true</param>
+      </params>
+    </parser>
+  </parsers>
+  <emitters>
+    <emitter class="com.custom.tika.pipes.emitter.fs.FileSystemEmitter">
+      <name>json</name>
+      <basePath>JSON_PATH</basePath>
+    </emitter>
+    <emitter class="com.custom.tika.pipes.emitter.fs.FileSystemEmitter">
+      <name>bytes</name>
+      <basePath>BYTES_PATH</basePath>
+    </emitter>
+  </emitters>
+</properties>
\ No newline at end of file
diff --git a/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4508-parsers.xml b/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4508-parsers.xml
new file mode 100644
index 000000000..4b5b8550c
--- /dev/null
+++ b/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4508-parsers.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<properties>
+  <service-loader initializableProblemHandler="throw"/>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+      <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
+      <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.pdf.PDFParser">
+      <params>
+        <param name="extractActions" type="bool">true</param>
+        <param name="extractInlineImages" type="bool">true</param>
+        <param name="extractIncrementalUpdateInfo" type="bool">true</param>
+        <param name="parseIncrementalUpdates" type="bool">true</param>
+      </params>
+    </parser>
+    <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+      <params>
+        <param name="includeDeletedContent" type="bool">true</param>
+        <param name="includeMoveFromContent" type="bool">true</param>
+        <param name="extractMacros" type="bool">true</param>
+      </params>
+    </parser>
+    <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+      <params>
+        <param name="extractMacros" type="bool">true</param>
+      </params>
+    </parser>
+  </parsers>
+</properties>
\ No newline at end of file

Reply via email to