This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit b2ca3781f7a27e7c0ca627359d13a66c56940039
Author: Ewan Mellor <[email protected]>
AuthorDate: Wed Feb 21 13:43:44 2018 -0800

    Fix for TIKA-2584 contributed by ewanmellor.
    
    Add TesseractOCRConfig.{add,get}OtherTesseractConfig, plus parsing of
    TesseractOCRConfig.properties to extract any key-value pair where the
    key has an underscore.
    
    Inside TesseractOCRParser, pass these key-value pairs to Tesseract
    using its -c command line option.
    
    This gives a mechanism by which user code can pass arbitrary options
    to Tesseract without Tika having to understand them.
---
 .../apache/tika/parser/ocr/TesseractOCRConfig.java | 42 ++++++++++++++++++++++
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 15 ++++++--
 2 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 4139cd2..07bb7f8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -20,7 +20,9 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Serializable;
+import java.util.HashMap;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Properties;
 
 /**
@@ -100,6 +102,9 @@ public class TesseractOCRConfig implements Serializable {
     // whether or not to apply rotation calculated by the rotation.py script
     private boolean applyRotation = false;
 
+    // See addOtherTesseractConfig.
+    private Map<String, String> otherTesseractConfig = new HashMap<>();
+
 
     /**
      * Default contructor.
@@ -178,6 +183,7 @@ public class TesseractOCRConfig implements Serializable {
         setApplyRotation(
                        getProp(props, "applyRotation", getApplyRotation()));
 
+        loadOtherTesseractConfig(props);
     }
 
     /**
@@ -517,6 +523,28 @@ public class TesseractOCRConfig implements Serializable {
     }
 
     /**
+     * @see #addOtherTesseractConfig(String, String)
+     */
+    public Map<String, String> getOtherTesseractConfig() {
+        return otherTesseractConfig;
+    }
+
+    /**
+     * Add a key-value pair to pass to Tesseract using its -c command line option.
+     * To see the possible options, run tesseract --print-parameters.
+     *
+     * You may also add these parameters in TesseractOCRConfig.properties; any
+     * key-value pair in the properties file where the key contains an underscore
+     * is passed directly to Tesseract.
+     *
+     * @param key
+     * @param value
+     */
+    public void addOtherTesseractConfig(String key, String value) {
+        otherTesseractConfig.put(key, value);
+    }
+
+    /**
      * Get property from the properties file passed in.
      *
      * @param properties     properties file to read from.
@@ -565,4 +593,18 @@ public class TesseractOCRConfig implements Serializable {
                 property, propVal));
     }
 
+    /**
+     * Populate otherTesseractConfig from the given properties.
+     * This assumes that any key-value pair where the key contains
+     * an underscore is an option to be passed opaquely to Tesseract.
+     *
+     * @param properties properties file to read from.
+     */
+    private void loadOtherTesseractConfig(Properties properties) {
+        for (String k : properties.stringPropertyNames()) {
+            if (k.contains("_")) {
+                otherTesseractConfig.put(k, properties.getProperty(k));
+            }
+        }
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 3e15c44..6bf2ab4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -34,6 +34,7 @@ import java.io.Reader;
 import java.nio.charset.Charset;
 import java.nio.file.Files;
 import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
@@ -465,12 +466,20 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
      *           if an input error occurred
      */
     private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
-        String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
+        ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
+                config.getTesseractPath() + getTesseractProg(), input.getPath(),  output.getPath(), "-l",
                 config.getLanguage(), "-psm", config.getPageSegMode(),
-                config.getOutputType().name().toLowerCase(Locale.US),
+                config.getOutputType().name().toLowerCase(Locale.US)
+        ));
+        for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
+            cmd.add("-c");
+            cmd.add(entry.getKey() + "=" + entry.getValue());
+        }
+        cmd.addAll(Arrays.asList(
                 "-c", "page_separator=" + config.getPageSeparator(),
                 "-c",
-                (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"};
+                (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"
+        ));
         ProcessBuilder pb = new ProcessBuilder(cmd);
         setEnv(config, pb);
         final Process process = pb.start();

-- 
To stop receiving notification emails like this one, please contact
[email protected].

Reply via email to