[ https://issues.apache.org/jira/browse/NIFI-1815?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15302470#comment-15302470 ]
ASF GitHub Bot commented on NIFI-1815: -------------------------------------- Github user olegz commented on a diff in the pull request: https://github.com/apache/nifi/pull/397#discussion_r64783373 --- Diff: nifi-nar-bundles/nifi-ocr-bundle/nifi-ocr-processors/src/main/java/org/apache/nifi/processors/ocr/TesseractOCRProcessor.java --- @@ -0,0 +1,361 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nifi.processors.ocr; + +import net.sourceforge.tess4j.ITesseract; +import net.sourceforge.tess4j.Tesseract; +import net.sourceforge.tess4j.TesseractException; +import org.apache.commons.lang3.StringUtils; +import org.apache.nifi.annotation.behavior.InputRequirement; +import org.apache.nifi.annotation.lifecycle.OnScheduled; +import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.components.AllowableValue; +import org.apache.nifi.components.ValidationContext; +import org.apache.nifi.components.ValidationResult; +import org.apache.nifi.components.Validator; +import org.apache.nifi.flowfile.FlowFile; +import org.apache.nifi.annotation.documentation.CapabilityDescription; +import org.apache.nifi.annotation.documentation.Tags; + +import org.apache.nifi.processor.AbstractProcessor; +import org.apache.nifi.processor.exception.ProcessException; +import org.apache.nifi.processor.io.StreamCallback; +import org.apache.nifi.processor.util.StandardValidators; +import org.apache.nifi.processor.Relationship; +import org.apache.nifi.processor.ProcessorInitializationContext; +import org.apache.nifi.processor.ProcessContext; +import org.apache.nifi.processor.ProcessSession; + +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; +import java.io.InputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.io.File; +import java.io.FileFilter; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.Map; +import java.util.HashMap; +import java.util.HashSet; +import java.util.ArrayList; +import java.util.concurrent.atomic.AtomicBoolean; + +@Tags({"ocr", "tesseract", "image", "text"}) +@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED) +@CapabilityDescription("Extracts text from images using Optical Character Recognition (OCR). The images are pulled from the incoming" + + " Flowfile's content. 
Supported image types are TIFF, JPEG, GIF, PNG, BMP, and PDF. Any Flowfile that doesn't contain" + + " a supported image type in its content body will be routed to the 'unsupported image format' relationship and no OCR will be performed." + + " This processor uses Tesseract to perform its duties and part of that requires that a valid Tesseract data (Tessdata) directory" + + " be specified in the 'Tessdata Directory' Property. This processor considers a valid Tessdata directory to be an existing directory on the" + + " local NiFi instance that contains one or more files ending with the '.traineddata' extension. The list of supported languages" + + " is built from the Tessdata directory configured by listing all files ending with '.traineddata' and considering those" + + " Tesseract language models. You can create you own Tesseract language models and place them in your Tessedata directory" + + " and the processor will display it in the dropdown list of languages available. All valid Tesseract configuration values" + + " may be passed to this processor by use of the 'Tesseract configuration values' which accepts a comma separated list" + + " of key=value pairs representing Tesseract configurations. 'Tesseract configuration values' is where all of your tuning" + + " values can be passed in to help increase the accuracy of your OCR operations based on your expected input images." 
+ + " TesseractOCRProcessor only supports installations of Tesseract version 3.0 and greater.") +public class TesseractOCRProcessor extends AbstractProcessor { + + public static Set<String> SUPPORTED_LANGUAGES; + private static final String TESS_LANG_EXTENSION = ".traineddata"; + private static List<AllowableValue> PAGE_SEGMENTATION_MODES; + private static ITesseract tessInstance; + private List<PropertyDescriptor> descriptors; + private Set<Relationship> relationships; --- End diff -- Since @OnScheduled (where you are setting it) and onTrigger() are going to be different threads, consider making this 'volatile' to ensure visibility between the threads > Tesseract OCR Processor > ----------------------- > > Key: NIFI-1815 > URL: https://issues.apache.org/jira/browse/NIFI-1815 > Project: Apache NiFi > Issue Type: Improvement > Reporter: Jeremy Dyer > Assignee: Jeremy Dyer > Attachments: 0006-changes-to-the-OCR-processor.patch, > nifi_1815_1.x_patch.zip > > > This ticket is a follow-up to NIFI-1718 minus the use of the Tika library. > Expose OCR capabilities through a new processor which uses the Tesseract > library. Use of this processor would require that Tesseract be installed on > the NiFi host. Since the processor will have a system dependency, care must be > taken to ensure that the overall NiFi cluster continues to function properly > in the absence of the Tesseract system dependency even though the OCR > processor itself will be unable to perform its duties. In the event that the > system dependencies are not detected, the processor should display a > validation warning rather than failing or preventing the NiFi instance from > booting properly. > Properties exposed to configure Tesseract > tesseractPath - Path to tesseract installation folder, if not on system path. > language - Language ID (e.g. "eng"); language dictionary to be used. > pageSegMode - Tesseract page segmentation mode, defaults to 1. 
> minFileSizeToOcr - Minimum file size to submit file to OCR, defaults to 0. > maxFileSizeToOcr - Maximum file size to submit file to OCR, defaults to > Integer.MAX_VALUE. > timeout - Maximum time (in seconds) to wait for the OCR process termination; > defaults to 120. -- This message was sent by Atlassian JIRA (v6.3.4#6332)