Re: [PR] Add OCR encode parser module [tika]

via GitHub Wed, 15 Apr 2026 20:03:28 -0700


Copilot commented on code in PR #2769:
URL: https://github.com/apache/tika/pull/2769#discussion_r3090571206



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/main/java/org/apache/tika/parser/ocrencode/EncodeOCRParser.java:
##########
@@ -0,0 +1,330 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocrencode;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.codec.binary.Base64InputStream;
+import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import org.apache.tika.config.Initializable;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ParentContentHandler;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractExternalProcessParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * Parser that base64-encodes image content instead of performing OCR
+ * text extraction. This is useful when you need to preserve the original
+ * image data in the parsed output for downstream processing by an
+ * external service.
+ * <p>
+ * To configure this parser, pass an {@link EncodeOCRConfig} object
+ * through the ParseContext, or configure it via tika-config.xml/json.
+ */
+public class EncodeOCRParser
+        extends AbstractExternalProcessParser
+        implements Initializable {
+
+    private static final String OCR = "ocr-";
+    private static final Logger LOG = LoggerFactory.getLogger(
+            EncodeOCRParser.class);
+    private static final Object[] LOCK = new Object[0];
+    private static final long serialVersionUID = -8167538283213097266L;
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                    MediaType.image(OCR + "png"),
+                    MediaType.image(OCR + "jpeg"),
+                    MediaType.image(OCR + "tiff"),
+                    MediaType.image(OCR + "bmp"),
+                    MediaType.image(OCR + "gif"),
+                    // these are not currently covered by other parsers
+                    MediaType.image("jp2"),
+                    MediaType.image("jpx"),
+                    MediaType.image("x-portable-pixmap"),
+                    // add the ocr- versions as well
+                    MediaType.image(OCR + "jp2"),
+                    MediaType.image(OCR + "jpx"),
+                    MediaType.image(OCR + "x-portable-pixmap")
+            )));
+    private static volatile boolean hasWarned = false;
+
+    private EncodeOCRConfig defaultConfig = new EncodeOCRConfig();
+
+    public EncodeOCRParser() {
+    }
+
+    public EncodeOCRParser(EncodeOCRConfig config) {
+        this.defaultConfig = config;
+    }
+
+    @Override
+    public void initialize() throws TikaConfigException {
+        //no-op
+    }
+
+    public void checkInitialization() throws TikaConfigException {
+        //no-op
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        EncodeOCRConfig config = context.get(EncodeOCRConfig.class);
+        if (config == null || !config.isSkipOcr()) {
+            return SUPPORTED_TYPES;
+        }
+        return Collections.emptySet();
+    }
+
+    @Override
+    public void parse(
+            TikaInputStream tis,
+            ContentHandler handler,
+            Metadata metadata,
+            ParseContext parseContext
+    ) throws IOException, SAXException, TikaException {
+        ParseContext workingContext =
+                parseContext != null ? parseContext : new ParseContext();
+
+        EncodeOCRConfig userConfig = workingContext.get(
+                EncodeOCRConfig.class);
+        EncodeOCRConfig config = defaultConfig;
+        if (userConfig != null) {
+            config = defaultConfig.cloneAndUpdate(userConfig);
+        }
+
+        if (config != null && config.isSkipOcr()) {
+            return;
+        }
+
+        try (TemporaryResources tmp = new TemporaryResources()) {
+            TikaInputStream tikaStream = TikaInputStream.get(
+                    tis, tmp, metadata);
+
+            ContentHandler baseHandler = getContentHandler(
+                    config.isInlineContent(),
+                    handler,
+                    metadata,
+                    workingContext);
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(
+                    baseHandler, metadata);

Review Comment:
   XHTMLContentHandler is constructed without the ParseContext, so 
SAXOutputConfig settings in the ParseContext (e.g., 
includeTitle/writeMetadataToHead) won’t be honored. Use the 
XHTMLContentHandler(ContentHandler, Metadata, ParseContext) constructor like 
other parsers do.
   ```suggestion
                       baseHandler, metadata, workingContext);
   ```



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/test/resources/test-configs/tika-config-encodeocr-full.xml:
##########
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.ocrencode.EncodeOCRParser">
+            <params>
+                <param name="minFileSizeToOcr" type="long">1</param>
+                <param name="maxFileSizeToOcr" type="long">2000000</param>
+                <param name="maxImagesToOcr" type="int">25</param>
+                <param name="skipOCR" type="bool">false</param>

Review Comment:
   The param name "skipOCR" won’t map to EncodeOCRConfig#setSkipOcr(boolean) 
under standard JavaBean naming (property is "skipOcr"). Use "skipOcr" so this 
config file behaves as intended.
   ```suggestion
                   <param name="skipOcr" type="bool">false</param>
   ```



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/test/resources/test-configs/tika-config-encodeocr-skip.xml:
##########
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.ocrencode.EncodeOCRParser">
+            <params>
+                <param name="skipOCR" type="bool">true</param>

Review Comment:
   The param name "skipOCR" won’t map to EncodeOCRConfig#setSkipOcr(boolean) 
under standard JavaBean naming (property is "skipOcr"). Use "skipOcr" here so 
the setting is actually applied when loading via tika-config.
   ```suggestion
                   <param name="skipOcr" type="bool">true</param>
   ```



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/main/java/org/apache/tika/parser/ocrencode/EncodeOCRConfig.java:
##########
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocrencode;
+
+import java.io.Serializable;
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Configuration for EncodeOCRParser. This class is not thread safe and must be
+ * synchronized externally.
+ * <p>
+ * This class will remember all set* field forever, and on
+ * {@link #cloneAndUpdate(EncodeOCRConfig)}, it will update all the fields that
+ * have been set on the "update" config. Create a new update config for each
+ * parse unless you're only changing the same field(s) with every parse.
+ */
+public class EncodeOCRConfig implements Serializable {
+
+    private static final long serialVersionUID = -1761942486845717891L;
+
+    private static final Logger LOG = LoggerFactory.getLogger(
+        EncodeOCRConfig.class
+    );
+
+    // Maximum file size to submit file to ocr.
+    private long maxFileSizeToOcr = Integer.MAX_VALUE;
+    // Minimum file size to submit file to ocr.
+    private long minFileSizeToOcr = 0;
+    private boolean skipOcr = false;
+    private int maxImagesToOcr = 50;
+    private Set<String> userConfigured = new HashSet<>();
+    private boolean inlineContent = false;
+
+    public void setInlineContent(boolean inlineContent) {
+        this.inlineContent = inlineContent;
+        userConfigured.add("inlineContent");
+    }
+
+    public boolean isInlineContent() {
+        return inlineContent;
+    }
+
+    /**
+     * @see #setMinFileSizeToOcr(long minFileSizeToOcr)
+     */
+    public long getMinFileSizeToOcr() {
+        return minFileSizeToOcr;
+    }
+
+    /**
+     * Set minimum file size to submit file to ocr. Default is 0.
+     */
+    public void setMinFileSizeToOcr(long minFileSizeToOcr) {
+        this.minFileSizeToOcr = minFileSizeToOcr;
+        userConfigured.add("minFileSizeToOcr");
+    }
+
+    /**
+     * @see #setMaxFileSizeToOcr(long maxFileSizeToOcr)
+     */
+    public long getMaxFileSizeToOcr() {
+        return maxFileSizeToOcr;
+    }
+
+    /**
+     * Set maximum file size to submit file to ocr. Default is
+     * Integer.MAX_VALUE.
+     */
+    public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
+        this.maxFileSizeToOcr = maxFileSizeToOcr;
+        userConfigured.add("maxFileSizeToOcr");
+    }
+
+    public boolean isSkipOcr() {
+        return skipOcr;
+    }
+
+    /**
+     * If you want to turn off OCR at run time for a specific file, set this to
+     * <code>true</code>
+     *
+     * @param skipOcr
+     */
+    public void setSkipOcr(boolean skipOcr) {
+        this.skipOcr = skipOcr;
+        userConfigured.add("skipOcr");
+    }
+
+    public int getMaxImagesToOcr() {
+        return maxImagesToOcr;
+    }
+
+    /**
+     * Sets the maximum number of images to process with OCR per parse. Defaults
+     * to {@link Integer#MAX_VALUE}, which effectively disables the limit.
+     *
+     * @param maxImagesToOcr maximum number of images to OCR; must be >= 0
+     */
+    public void setMaxImagesToOcr(int maxImagesToOcr) {
+        if (maxImagesToOcr < 0) {
+            throw new IllegalArgumentException(
+                "maxImagesToOcr must be >= 0"
+            );
+        }
+        this.maxImagesToOcr = maxImagesToOcr;
+        userConfigured.add("maxImagesToOcr");
+    }
+
+    public EncodeOCRConfig cloneAndUpdate(EncodeOCRConfig updates)
+        throws TikaException {
+        EncodeOCRConfig updated = new EncodeOCRConfig();
+        for (Field field : this.getClass().getDeclaredFields()) {
+            if (Modifier.isFinal(field.getModifiers())) {
+                continue;
+            } else if (Modifier.isStatic(field.getModifiers())) {
+                continue;
+            }
+            if ("userConfigured".equals(field.getName())) {
+                continue;
+            }
+            if (updates.userConfigured.contains(field.getName())) {
+                try {
+                    field.set(updated, field.get(updates));
+                } catch (IllegalAccessException e) {
+                    throw new TikaException(
+                        "can't update " + field.getName(),
+                        e
+                    );
+                }
+            } else {
+                try {
+                    field.set(updated, field.get(this));
+                } catch (IllegalAccessException e) {
+                    throw new TikaException(
+                        "can't update " + field.getName(),
+                        e
+                    );
+                }
+            }
+        }

Review Comment:
   cloneAndUpdate() creates a new EncodeOCRConfig but never copies/updates the 
userConfigured set (it’s explicitly skipped), and it sets fields via reflection 
(bypassing setters). This means the returned config won’t “remember all set* 
fields forever” as the class-level Javadoc claims, and it can’t be reliably 
used as an “updates” config in a later cloneAndUpdate call. Consider carrying 
forward userConfigured (e.g., union of this.userConfigured and 
updates.userConfigured) and/or recording the fields that were applied to the 
clone.
   ```suggestion
           }
           updated.userConfigured.addAll(this.userConfigured);
           updated.userConfigured.addAll(updates.userConfigured);
   ```



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/main/java/org/apache/tika/parser/ocrencode/EncodeOCRParser.java:
##########
@@ -0,0 +1,330 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocrencode;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.codec.binary.Base64InputStream;
+import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import org.apache.tika.config.Initializable;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ParentContentHandler;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractExternalProcessParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * Parser that base64-encodes image content instead of performing OCR
+ * text extraction. This is useful when you need to preserve the original
+ * image data in the parsed output for downstream processing by an
+ * external service.
+ * <p>
+ * To configure this parser, pass an {@link EncodeOCRConfig} object
+ * through the ParseContext, or configure it via tika-config.xml/json.
+ */
+public class EncodeOCRParser
+        extends AbstractExternalProcessParser
+        implements Initializable {
+
+    private static final String OCR = "ocr-";
+    private static final Logger LOG = LoggerFactory.getLogger(
+            EncodeOCRParser.class);
+    private static final Object[] LOCK = new Object[0];
+    private static final long serialVersionUID = -8167538283213097266L;
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                    MediaType.image(OCR + "png"),
+                    MediaType.image(OCR + "jpeg"),
+                    MediaType.image(OCR + "tiff"),
+                    MediaType.image(OCR + "bmp"),
+                    MediaType.image(OCR + "gif"),
+                    // these are not currently covered by other parsers
+                    MediaType.image("jp2"),
+                    MediaType.image("jpx"),
+                    MediaType.image("x-portable-pixmap"),
+                    // add the ocr- versions as well
+                    MediaType.image(OCR + "jp2"),
+                    MediaType.image(OCR + "jpx"),
+                    MediaType.image(OCR + "x-portable-pixmap")
+            )));
+    private static volatile boolean hasWarned = false;
+
+    private EncodeOCRConfig defaultConfig = new EncodeOCRConfig();
+
+    public EncodeOCRParser() {
+    }
+
+    public EncodeOCRParser(EncodeOCRConfig config) {
+        this.defaultConfig = config;
+    }
+
+    @Override
+    public void initialize() throws TikaConfigException {
+        //no-op
+    }
+
+    public void checkInitialization() throws TikaConfigException {
+        //no-op
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        EncodeOCRConfig config = context.get(EncodeOCRConfig.class);
+        if (config == null || !config.isSkipOcr()) {
+            return SUPPORTED_TYPES;
+        }
+        return Collections.emptySet();
+    }
+
+    @Override
+    public void parse(
+            TikaInputStream tis,
+            ContentHandler handler,
+            Metadata metadata,
+            ParseContext parseContext
+    ) throws IOException, SAXException, TikaException {
+        ParseContext workingContext =
+                parseContext != null ? parseContext : new ParseContext();
+
+        EncodeOCRConfig userConfig = workingContext.get(
+                EncodeOCRConfig.class);
+        EncodeOCRConfig config = defaultConfig;
+        if (userConfig != null) {
+            config = defaultConfig.cloneAndUpdate(userConfig);
+        }
+
+        if (config != null && config.isSkipOcr()) {
+            return;
+        }
+
+        try (TemporaryResources tmp = new TemporaryResources()) {
+            TikaInputStream tikaStream = TikaInputStream.get(
+                    tis, tmp, metadata);
+
+            ContentHandler baseHandler = getContentHandler(
+                    config.isInlineContent(),
+                    handler,
+                    metadata,
+                    workingContext);
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(
+                    baseHandler, metadata);
+            xhtml.startDocument();
+            doEncode(tikaStream, xhtml, metadata, workingContext, config);
+            xhtml.endDocument();
+        }
+    }
+
+    private ContentHandler getContentHandler(
+            boolean isInlineContent,
+            ContentHandler handler,
+            Metadata metadata,
+            ParseContext parseContext) {
+        if (!isInlineContent) {
+            return handler;
+        }
+        ParentContentHandler parentContentHandler = parseContext.get(
+                ParentContentHandler.class);
+        if (parentContentHandler == null) {
+            return handler;
+        }
+        String embeddedType = metadata.get(
+                TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+        if (!TikaCoreProperties.EmbeddedResourceType.INLINE.name()
+                .equals(embeddedType)) {
+            return handler;
+        }
+        return new TeeContentHandler(
+                new EmbeddedContentHandler(
+                        new BodyContentHandler(
+                                parentContentHandler.getContentHandler())),
+                handler);
+    }
+
+    private void doEncode(
+            TikaInputStream tikaInputStream,
+            ContentHandler xhtml,
+            Metadata metadata,
+            ParseContext parseContext,
+            EncodeOCRConfig config
+    ) throws IOException, SAXException, TikaException {
+        warnOnFirstParse();
+
+        long size = tikaInputStream.getLength();
+        if (size >= config.getMinFileSizeToOcr()
+                && size <= config.getMaxFileSizeToOcr()) {
+            if (!reserveImageSlot(parseContext, config)) {
+                OCRImageCounter counter = parseContext.get(
+                        OCRImageCounter.class);
+                int processed = counter != null
+                        ? counter.get()
+                        : config.getMaxImagesToOcr();
+                LOG.info("Skipping OCR encode for image because "
+                                + "the configured limit of {} images "
+                                + "has been reached ({} already processed)",
+                        config.getMaxImagesToOcr(), processed);
+                return;
+            }
+            byte[] bytes = IOUtils.toByteArray(tikaInputStream);
+            encodeToBase64(bytes, xhtml);
+        } else {

Review Comment:
   doEncode() reads the entire image into a byte[] (IOUtils.toByteArray) before 
base64 encoding. With the current default maxFileSizeToOcr (Integer.MAX_VALUE) 
this can cause very large heap spikes/OOMs, and it duplicates data in memory 
(raw bytes + base64 chars). Prefer streaming directly from TikaInputStream 
through Base64InputStream into the ContentHandler, or spool to a temp file and 
stream from disk when size is large.



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/main/java/org/apache/tika/parser/ocrencode/EncodeOCRParser.java:
##########
@@ -0,0 +1,330 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocrencode;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.codec.binary.Base64InputStream;
+import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import org.apache.tika.config.Initializable;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ParentContentHandler;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractExternalProcessParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * Parser that base64-encodes image content instead of performing OCR
+ * text extraction. This is useful when you need to preserve the original
+ * image data in the parsed output for downstream processing by an
+ * external service.
+ * <p>
+ * To configure this parser, pass an {@link EncodeOCRConfig} object
+ * through the ParseContext, or configure it via tika-config.xml/json.
+ */
+public class EncodeOCRParser
+        extends AbstractExternalProcessParser
+        implements Initializable {
+
+    private static final String OCR = "ocr-";
+    private static final Logger LOG = LoggerFactory.getLogger(
+            EncodeOCRParser.class);
+    private static final Object[] LOCK = new Object[0];
+    private static final long serialVersionUID = -8167538283213097266L;
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                    MediaType.image(OCR + "png"),
+                    MediaType.image(OCR + "jpeg"),
+                    MediaType.image(OCR + "tiff"),
+                    MediaType.image(OCR + "bmp"),
+                    MediaType.image(OCR + "gif"),
+                    // these are not currently covered by other parsers
+                    MediaType.image("jp2"),
+                    MediaType.image("jpx"),
+                    MediaType.image("x-portable-pixmap"),
+                    // add the ocr- versions as well
+                    MediaType.image(OCR + "jp2"),
+                    MediaType.image(OCR + "jpx"),
+                    MediaType.image(OCR + "x-portable-pixmap")
+            )));
+    private static volatile boolean hasWarned = false;
+
+    private EncodeOCRConfig defaultConfig = new EncodeOCRConfig();
+
+    public EncodeOCRParser() {
+    }
+
+    public EncodeOCRParser(EncodeOCRConfig config) {
+        this.defaultConfig = config;
+    }
+
+    @Override
+    public void initialize() throws TikaConfigException {
+        //no-op
+    }
+
+    public void checkInitialization() throws TikaConfigException {
+        //no-op
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        EncodeOCRConfig config = context.get(EncodeOCRConfig.class);
+        if (config == null || !config.isSkipOcr()) {
+            return SUPPORTED_TYPES;
+        }
+        return Collections.emptySet();
+    }
+
+    @Override
+    public void parse(
+            TikaInputStream tis,
+            ContentHandler handler,
+            Metadata metadata,
+            ParseContext parseContext
+    ) throws IOException, SAXException, TikaException {
+        ParseContext workingContext =
+                parseContext != null ? parseContext : new ParseContext();
+
+        EncodeOCRConfig userConfig = workingContext.get(
+                EncodeOCRConfig.class);
+        EncodeOCRConfig config = defaultConfig;
+        if (userConfig != null) {
+            config = defaultConfig.cloneAndUpdate(userConfig);
+        }
+
+        if (config != null && config.isSkipOcr()) {
+            return;
+        }

Review Comment:
   This parser advertises and is routed to image/ocr-* types via 
CONTENT_TYPE_PARSER_OVERRIDE, but parse() never normalizes/removes the ocr- 
routing metadata (unlike TesseractOCRParser.normalizeOCRMimeMetadata). This can 
leave ocr-* values and the override key in Metadata after parsing. Consider 
stripping the ocr- prefix from Metadata.CONTENT_TYPE (if present) and removing 
CONTENT_TYPE_PARSER_OVERRIDE when it starts with "ocr-".



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/main/java/org/apache/tika/parser/ocrencode/EncodeOCRParser.java:
##########
@@ -0,0 +1,330 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocrencode;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.codec.binary.Base64InputStream;
+import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import org.apache.tika.config.Initializable;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ParentContentHandler;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractExternalProcessParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * Parser that base64-encodes image content instead of performing OCR
+ * text extraction. This is useful when you need to preserve the original
+ * image data in the parsed output for downstream processing by an
+ * external service.
+ * <p>
+ * To configure this parser, pass an {@link EncodeOCRConfig} object
+ * through the ParseContext, or configure it via tika-config.xml/json.
+ */
+public class EncodeOCRParser
+        extends AbstractExternalProcessParser
+        implements Initializable {
+
+    private static final String OCR = "ocr-";
+    private static final Logger LOG = LoggerFactory.getLogger(
+            EncodeOCRParser.class);
+    private static final Object[] LOCK = new Object[0];
+    private static final long serialVersionUID = -8167538283213097266L;
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                    MediaType.image(OCR + "png"),
+                    MediaType.image(OCR + "jpeg"),
+                    MediaType.image(OCR + "tiff"),
+                    MediaType.image(OCR + "bmp"),
+                    MediaType.image(OCR + "gif"),
+                    // these are not currently covered by other parsers
+                    MediaType.image("jp2"),
+                    MediaType.image("jpx"),
+                    MediaType.image("x-portable-pixmap"),
+                    // add the ocr- versions as well
+                    MediaType.image(OCR + "jp2"),
+                    MediaType.image(OCR + "jpx"),
+                    MediaType.image(OCR + "x-portable-pixmap")
+            )));
+    private static volatile boolean hasWarned = false;
+
+    private EncodeOCRConfig defaultConfig = new EncodeOCRConfig();
+
+    public EncodeOCRParser() {
+    }
+
+    public EncodeOCRParser(EncodeOCRConfig config) {
+        this.defaultConfig = config;
+    }
+
+    @Override
+    public void initialize() throws TikaConfigException {
+        //no-op
+    }
+
+    public void checkInitialization() throws TikaConfigException {
+        //no-op
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        EncodeOCRConfig config = context.get(EncodeOCRConfig.class);
+        if (config == null || !config.isSkipOcr()) {
+            return SUPPORTED_TYPES;
+        }
+        return Collections.emptySet();
+    }

Review Comment:
   getSupportedTypes() only checks EncodeOCRConfig from the ParseContext and 
ignores the parser’s own defaultConfig (set via constructor). If 
defaultConfig.skipOcr is true, this parser will still advertise SUPPORTED_TYPES 
and may be selected by CompositeParser, but parse() will immediately return and 
produce no output. Consider merging defaultConfig with the context config (same 
logic as parse()) and basing supported types on the effective config.



##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/main/java/org/apache/tika/parser/ocrencode/EncodeOCRConfig.java:
##########
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocrencode;
+
+import java.io.Serializable;
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Configuration for EncodeOCRParser. This class is not thread safe and must be
+ * synchronized externally.
+ * <p>
+ * This class will remember all set* field forever, and on
+ * {@link #cloneAndUpdate(EncodeOCRConfig)}, it will update all the fields that
+ * have been set on the "update" config. Create a new update config for each
+ * parse unless you're only changing the same field(s) with every parse.
+ */
+public class EncodeOCRConfig implements Serializable {
+
+    private static final long serialVersionUID = -1761942486845717891L;
+
+    private static final Logger LOG = LoggerFactory.getLogger(
+        EncodeOCRConfig.class
+    );
+
+    // Maximum file size to submit file to ocr.
+    private long maxFileSizeToOcr = Integer.MAX_VALUE;
+    // Minimum file size to submit file to ocr.
+    private long minFileSizeToOcr = 0;
+    private boolean skipOcr = false;
+    private int maxImagesToOcr = 50;
+    private Set<String> userConfigured = new HashSet<>();
+    private boolean inlineContent = false;
+
+    public void setInlineContent(boolean inlineContent) {
+        this.inlineContent = inlineContent;
+        userConfigured.add("inlineContent");
+    }
+
+    public boolean isInlineContent() {
+        return inlineContent;
+    }
+
+    /**
+     * @see #setMinFileSizeToOcr(long minFileSizeToOcr)
+     */
+    public long getMinFileSizeToOcr() {
+        return minFileSizeToOcr;
+    }
+
+    /**
+     * Set minimum file size to submit file to ocr. Default is 0.
+     */
+    public void setMinFileSizeToOcr(long minFileSizeToOcr) {
+        this.minFileSizeToOcr = minFileSizeToOcr;
+        userConfigured.add("minFileSizeToOcr");
+    }
+
+    /**
+     * @see #setMaxFileSizeToOcr(long maxFileSizeToOcr)
+     */
+    public long getMaxFileSizeToOcr() {
+        return maxFileSizeToOcr;
+    }
+
+    /**
+     * Set maximum file size to submit file to ocr. Default is
+     * Integer.MAX_VALUE.
+     */
+    public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
+        this.maxFileSizeToOcr = maxFileSizeToOcr;
+        userConfigured.add("maxFileSizeToOcr");
+    }
+
+    public boolean isSkipOcr() {
+        return skipOcr;
+    }
+
+    /**
+     * If you want to turn off OCR at run time for a specific file, set this to
+     * <code>true</code>
+     *
+     * @param skipOcr
+     */
+    public void setSkipOcr(boolean skipOcr) {
+        this.skipOcr = skipOcr;
+        userConfigured.add("skipOcr");
+    }
+
+    public int getMaxImagesToOcr() {
+        return maxImagesToOcr;
+    }
+
+    /**
+     * Sets the maximum number of images to process with OCR per parse. Defaults
+     * to {@link Integer#MAX_VALUE}, which effectively disables the limit.

Review Comment:
   Javadoc says maxImagesToOcr defaults to Integer.MAX_VALUE, but the field 
default is 50. Update the Javadoc (or the default) so they match; otherwise 
users will configure based on incorrect docs.
   ```suggestion
        * Sets the maximum number of images to process with OCR per parse. Default
        * is 50.
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Add OCR encode parser module [tika]

Reply via email to