zamf commented on code in PR #2769: URL: https://github.com/apache/tika/pull/2769#discussion_r3092867201
########## tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-encode-module/src/main/java/org/apache/tika/parser/ocrencode/EncodeOCRParser.java: ########## @@ -0,0 +1,330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.ocrencode; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.tika.sax.XHTMLContentHandler.XHTML; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.codec.binary.Base64InputStream; +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import org.apache.tika.config.Initializable; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.ParentContentHandler; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractExternalProcessParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.TeeContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; + +/** + * Parser that base64-encodes image content instead of performing OCR + * text extraction. This is useful when you need to preserve the original + * image data in the parsed output for downstream processing by an + * external service. + * <p> + * To configure this parser, pass an {@link EncodeOCRConfig} object + * through the ParseContext, or configure it via tika-config.xml/json. + */ +public class EncodeOCRParser + extends AbstractExternalProcessParser + implements Initializable { + + private static final String OCR = "ocr-"; + private static final Logger LOG = LoggerFactory.getLogger( + EncodeOCRParser.class); + private static final Object[] LOCK = new Object[0]; + private static final long serialVersionUID = -8167538283213097266L; + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<>(Arrays.asList( + MediaType.image(OCR + "png"), + MediaType.image(OCR + "jpeg"), + MediaType.image(OCR + "tiff"), + MediaType.image(OCR + "bmp"), + MediaType.image(OCR + "gif"), + // these are not currently covered by other parsers + MediaType.image("jp2"), + MediaType.image("jpx"), + MediaType.image("x-portable-pixmap"), + // add the ocr- versions as well + MediaType.image(OCR + "jp2"), + MediaType.image(OCR + "jpx"), + MediaType.image(OCR + "x-portable-pixmap") + ))); + private static volatile boolean hasWarned = false; + + private EncodeOCRConfig defaultConfig = new EncodeOCRConfig(); + + public EncodeOCRParser() { + } + + public EncodeOCRParser(EncodeOCRConfig config) { + this.defaultConfig = config; + } + + @Override + public void initialize() throws TikaConfigException { + //no-op + } + + public void checkInitialization() throws TikaConfigException { + //no-op + } + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + EncodeOCRConfig config = context.get(EncodeOCRConfig.class); + if (config == null || !config.isSkipOcr()) { + return SUPPORTED_TYPES; + } + return Collections.emptySet(); + } + + @Override + public void parse( + TikaInputStream tis, + ContentHandler handler, + Metadata metadata, + ParseContext parseContext + ) throws IOException, SAXException, TikaException { + ParseContext workingContext = + parseContext != null ? parseContext : new ParseContext(); + + EncodeOCRConfig userConfig = workingContext.get( + EncodeOCRConfig.class); + EncodeOCRConfig config = defaultConfig; + if (userConfig != null) { + config = defaultConfig.cloneAndUpdate(userConfig); + } + + if (config != null && config.isSkipOcr()) { + return; + } + + try (TemporaryResources tmp = new TemporaryResources()) { + TikaInputStream tikaStream = TikaInputStream.get( + tis, tmp, metadata); + + ContentHandler baseHandler = getContentHandler( + config.isInlineContent(), + handler, + metadata, + workingContext); + XHTMLContentHandler xhtml = new XHTMLContentHandler( + baseHandler, metadata); + xhtml.startDocument(); + doEncode(tikaStream, xhtml, metadata, workingContext, config); + xhtml.endDocument(); + } + } + + private ContentHandler getContentHandler( + boolean isInlineContent, + ContentHandler handler, + Metadata metadata, + ParseContext parseContext) { + if (!isInlineContent) { + return handler; + } + ParentContentHandler parentContentHandler = parseContext.get( + ParentContentHandler.class); + if (parentContentHandler == null) { + return handler; + } + String embeddedType = metadata.get( + TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (!TikaCoreProperties.EmbeddedResourceType.INLINE.name() + .equals(embeddedType)) { + return handler; + } + return new TeeContentHandler( + new EmbeddedContentHandler( + new BodyContentHandler( + parentContentHandler.getContentHandler())), + handler); + } + + private void doEncode( + TikaInputStream tikaInputStream, + ContentHandler xhtml, + Metadata metadata, + ParseContext parseContext, + EncodeOCRConfig config + ) throws IOException, SAXException, TikaException { + warnOnFirstParse(); + + long size = tikaInputStream.getLength(); + if (size >= config.getMinFileSizeToOcr() + && size <= config.getMaxFileSizeToOcr()) { + if (!reserveImageSlot(parseContext, config)) { + OCRImageCounter counter = parseContext.get( + OCRImageCounter.class); + int processed = counter != null + ? counter.get() + : config.getMaxImagesToOcr(); + LOG.info("Skipping OCR encode for image because " + + "the configured limit of {} images " + + "has been reached ({} already processed)", + config.getMaxImagesToOcr(), processed); + return; + } + byte[] bytes = IOUtils.toByteArray(tikaInputStream); + encodeToBase64(bytes, xhtml); + } else { Review Comment: fixed, switched to streaming -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
