tika git commit: TIKA-2093- Add Tesseract's hOCR output format as an option, via Eric Pugh. This commit also catches 2.x up to trunk; there were clearly some other changes to Tesseract that hadn't yet made it into 2.x.

tallison Thu, 22 Sep 2016 18:23:49 -0700

Repository: tika
Updated Branches:
  refs/heads/2.x d543378a8 -> 673533d0e



TIKA-2093- Add Tesseract's hOCR output format as an option, via Eric Pugh.  
This commit also catches 2.x up to trunk; there were clearly some other changes 
to Tesseract that hadn't yet made it into 2.x.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/673533d0
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/673533d0
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/673533d0

Branch: refs/heads/2.x
Commit: 673533d0e65b2b2613e19bbf952bdb352c628e52
Parents: d543378
Author: tballison <talli...@mitre.org>
Authored: Thu Sep 22 21:23:32 2016 -0400
Committer: tballison <talli...@mitre.org>
Committed: Thu Sep 22 21:23:32 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   3 +
 .../tika/parser/ocr/TesseractOCRConfig.java     | 254 +++++++++++++++++--
 .../tika/parser/ocr/TesseractOCRParser.java     | 236 +++++++++++++++--
 .../tika/parser/ocr/TesseractOCRParserTest.java | 117 ++++-----
 4 files changed, 503 insertions(+), 107 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/673533d0/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 53f9a82..d13a644 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
 
 Release 1.14 - ???
 
+  * Add Tesseract's hOCR output format as an option, via Eric Pugh
+    (TIKA-2093).
+
   * Extract macros from MSOffice files (TIKA-2069).
 
   * Maintain passed-in mime in TXTParser (TIKA-2047).

http://git-wip-us.apache.org/repos/asf/tika/blob/673533d0/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index a35370a..5d06a7a 100644
--- 
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -42,11 +42,16 @@ public class TesseractOCRConfig implements Serializable{
 
        private static final long serialVersionUID = -4861942486845757891L;
 
+       public enum OUTPUT_TYPE {
+               TXT,
+               HOCR
+       }
+
        // Path to tesseract installation folder, if not on system path.
        private  String tesseractPath = "";
 
-    // Path to the 'tessdata' folder, which contains language files and config 
files.
-    private String tessdataPath = "";
+       // Path to the 'tessdata' folder, which contains language files and 
config files.
+       private String tessdataPath = "";
 
        // Language dictionary to be used.
        private  String language = "eng";
@@ -63,6 +68,30 @@ public class TesseractOCRConfig implements Serializable{
        // Maximum time (seconds) to wait for the ocring process termination
        private int timeout = 120;
 
+       // The format of the ocr'ed output to be returned, txt or hocr.
+       private OUTPUT_TYPE outputType = OUTPUT_TYPE.TXT;
+
+       // enable image processing (optional)
+       private int enableImageProcessing = 0;
+
+       // Path to ImageMagick program, if not on system path.
+       private String ImageMagickPath = "";
+
+       // resolution of processed image (in dpi).
+       private int density = 300;
+
+       // number of bits in a color sample within a pixel.
+       private int depth = 4;
+
+       // colorspace of processed image.
+       private String colorspace = "gray";
+
+       // filter to be applied to the processed image.
+       private String filter = "triangle";
+
+       // factor by which image is to be scaled.
+       private int resize = 900;
+
        /**
         * Default constructor.
         */
@@ -99,10 +128,11 @@ public class TesseractOCRConfig implements Serializable{
                        }
                }
 
+               // set parameters for Tesseract
                setTesseractPath(
                                getProp(props, "tesseractPath", 
getTesseractPath()));
-        setTessdataPath(
-                getProp(props, "tessdataPath", getTessdataPath()));
+               setTessdataPath(
+                               getProp(props, "tessdataPath", 
getTessdataPath()));
                setLanguage(
                                getProp(props, "language", getLanguage()));
                setPageSegMode(
@@ -112,7 +142,29 @@ public class TesseractOCRConfig implements Serializable{
                setMaxFileSizeToOcr(
                                getProp(props, "maxFileSizeToOcr", 
getMaxFileSizeToOcr()));
                setTimeout(
-                getProp(props, "timeout", getTimeout()));
+                               getProp(props, "timeout", getTimeout()));
+               String outputTypeString = props.getProperty("outputType");
+               if ("txt".equals(outputTypeString)) {
+                       setOutputType(OUTPUT_TYPE.TXT);
+               } else if ("hocr".equals(outputTypeString)) {
+                       setOutputType(OUTPUT_TYPE.HOCR);
+               }
+
+               // set parameters for ImageMagick
+               setEnableImageProcessing(
+                               getProp(props, "enableImageProcessing", 
isEnableImageProcessing()));
+               setImageMagickPath(
+                               getProp(props, "ImageMagickPath", 
getImageMagickPath()));
+               setDensity(
+                               getProp(props, "density", getDensity()));
+               setDepth(
+                               getProp(props, "depth", getDepth()));
+               setColorspace(
+                               getProp(props, "colorspace", getColorspace()));
+               setFilter(
+                               getProp(props, "filter", getFilter()));
+               setResize(
+                               getProp(props, "resize", getResize()));
 
        }
 
@@ -123,10 +175,10 @@ public class TesseractOCRConfig implements Serializable{
 
        /**
         * Set the path to the Tesseract executable, needed if it is not on 
system path.
-     * <p>
-     * Note that if you set this value, it is highly recommended that you also
-     * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
-     * </p>
+        * <p>
+        * Note that if you set this value, it is highly recommended that you 
also
+        * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
+        * </p>
         */
        public void setTesseractPath(String tesseractPath) {
                if(!tesseractPath.isEmpty() && 
!tesseractPath.endsWith(File.separator))
@@ -135,22 +187,22 @@ public class TesseractOCRConfig implements Serializable{
                this.tesseractPath = tesseractPath;
        }
 
-    /** @see #setTessdataPath(String tessdataPath) */
-    public String getTessdataPath() {
-        return tessdataPath;
-    }
+       /** @see #setTessdataPath(String tessdataPath) */
+       public String getTessdataPath() {
+               return tessdataPath;
+       }
 
-    /**
-     * Set the path to the 'tessdata' folder, which contains language files 
and config files. In some cases (such
-     * as on Windows), this folder is found in the Tesseract installation, but 
in other cases
-     * (such as when Tesseract is built from source), it may be located 
elsewhere.
-     */
-    public void setTessdataPath(String tessdataPath) {
-        if(!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator))
-            tessdataPath += File.separator;
+       /**
+        * Set the path to the 'tessdata' folder, which contains language files 
and config files. In some cases (such
+        * as on Windows), this folder is found in the Tesseract installation, 
but in other cases
+        * (such as when Tesseract is built from source), it may be located 
elsewhere.
+        */
+       public void setTessdataPath(String tessdataPath) {
+               if(!tessdataPath.isEmpty() && 
!tessdataPath.endsWith(File.separator))
+                       tessdataPath += File.separator;
 
-        this.tessdataPath = tessdataPath;
-    }
+               this.tessdataPath = tessdataPath;
+       }
 
        /** @see #setLanguage(String language)*/
        public String getLanguage() {
@@ -218,12 +270,166 @@ public class TesseractOCRConfig implements Serializable{
                this.timeout = timeout;
        }
 
-       /** @see #setTimeout(int timeout)*/
+       /** @see #setTimeout(int timeout)
+        * @return timeout value for Tesseract */
        public int getTimeout() {
                return timeout;
        }
 
        /**
+        * Set output type from ocr process.  Default is "txt", but can be 
"hocr".
+        * The selected type is passed to Tesseract as its output format.
+        */
+       public void setOutputType(OUTPUT_TYPE outputType) {
+               this.outputType = outputType;
+       }
+
+       /** @see #setOutputType(OUTPUT_TYPE outputType) */
+       public OUTPUT_TYPE getOutputType() {
+               return outputType;
+       }
+
+       /** @see #setEnableImageProcessing(int)
+        * @return image processing is enabled or not */
+       public int isEnableImageProcessing() {
+               return enableImageProcessing;
+       }
+
+       /**
+        * Set the value to 1 if image processing is to be enabled.
+        * Default value is 0 (disabled).
+        */
+       public void setEnableImageProcessing(int enableImageProcessing) {
+               this.enableImageProcessing = enableImageProcessing;
+       }
+
+       /**
+        * @return the density
+        */
+       public int getDensity() {
+               return density;
+       }
+
+       /**
+        * @param density the density to set. Valid range of values is 150-1200.
+        * Default value is 300.
+        */
+       public void setDensity(int density) {
+               if(density < 150 || density > 1200) {
+                       throw new IllegalArgumentException("Invalid density 
value. Valid range of values is 150-1200.");
+               }
+               this.density = density;
+       }
+
+       /**
+        * @return the depth
+        */
+       public int getDepth() {
+               return depth;
+       }
+
+       /**
+        * @param depth the depth to set. Valid values are 2, 4, 8, 16, 32, 64, 
256, 4096.
+        * Default value is 4.
+        */
+       public void setDepth(int depth) {
+               int[] allowedValues = {2, 4, 8, 16, 32, 64, 256, 4096};
+               for (int i = 0; i < allowedValues.length; i++) {
+                       if(depth == allowedValues[i]) {
+                               this.depth = depth;
+                               return;
+                       }
+               }
+               throw new IllegalArgumentException("Invalid depth value. Valid 
values are 2, 4, 8, 16, 32, 64, 256, 4096.");
+       }
+
+       /**
+        * @return the colorspace
+        */
+       public String getColorspace() {
+               return colorspace;
+       }
+
+       /**
+        * @param colorspace the colorspace to set
+        * Default value is gray.
+        */
+       public void setColorspace(String colorspace) {
+               if(!colorspace.equals(null)) {
+                       this.colorspace = colorspace;
+               } else {
+                       throw new IllegalArgumentException("Colorspace value 
cannot be null.");
+               }
+       }
+
+       /**
+        * @return the filter
+        */
+       public String getFilter() {
+               return filter;
+       }
+
+       /**
+        * @param filter the filter to set. Valid values are point, hermite, 
cubic, box, gaussian, catrom, triangle, quadratic and mitchell.
+        * Default value is triangle.
+        */
+       public void setFilter(String filter) {
+               if(filter.equals(null)) {
+                       throw new IllegalArgumentException("Filter value cannot 
be null. Valid values are point, hermite, "
+                                       + "cubic, box, gaussian, catrom, 
triangle, quadratic and mitchell.");
+               }
+
+               String[] allowedFilters = {"Point", "Hermite", "Cubic", "Box", 
"Gaussian", "Catrom", "Triangle", "Quadratic", "Mitchell"};
+               for (int i = 0; i < allowedFilters.length; i++) {
+                       if(filter.equalsIgnoreCase(allowedFilters[i])) {
+                               this.filter = filter;
+                               return;
+                       }
+               }
+               throw new IllegalArgumentException("Invalid filter value. Valid 
values are point, hermite, "
+                               + "cubic, box, gaussian, catrom, triangle, 
quadratic and mitchell.");
+       }
+
+       /**
+        * @return the resize
+        */
+       public int getResize() {
+               return resize;
+       }
+
+       /**
+        * @param resize the resize to set. Valid range of values is 100-900.
+        * Default value is 900.
+        */
+       public void setResize(int resize) {
+               for(int i=1;i<10;i++) {
+                       if(resize == i*100) {
+                               this.resize = resize;
+                               return;
+                       }
+               }
+               throw new IllegalArgumentException("Invalid resize value. Valid 
range of values is 100-900.");
+       }
+
+       /** @see #setImageMagickPath(String ImageMagickPath)
+        * @return path to ImageMagick file. */
+       public String getImageMagickPath() {
+
+               return ImageMagickPath;
+       }
+
+       /**
+        * Set the path to the ImageMagick executable, needed if it is not on 
system path.
+        * @param ImageMagickPath to ImageMagick file.
+        */
+       public void setImageMagickPath(String ImageMagickPath) {
+               if(!ImageMagickPath.isEmpty() && 
!ImageMagickPath.endsWith(File.separator))
+                       ImageMagickPath += File.separator;
+
+               this.ImageMagickPath = ImageMagickPath;
+       }
+
+       /**
         * Get property from the properties file passed in.
         * @param properties properties file to read from.
         * @param property the property to fetch.

http://git-wip-us.apache.org/repos/asf/tika/blob/673533d0/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 83fe7fe..c6705b0 100644
--- 
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -16,8 +16,14 @@
  */
 package org.apache.tika.parser.ocr;
 
-import java.awt.*;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import javax.imageio.ImageIO;
+import javax.xml.parsers.SAXParser;
+import java.awt.Image;
 import java.awt.image.BufferedImage;
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
@@ -25,11 +31,14 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.Callable;
@@ -38,7 +47,10 @@ import java.util.concurrent.FutureTask;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 
-import javax.imageio.ImageIO;
+import org.apache.commons.exec.CommandLine;
+import org.apache.commons.exec.DefaultExecutor;
+import org.apache.commons.exec.PumpStreamHandler;
+import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.logging.LogFactory;
 import org.apache.tika.exception.TikaException;
@@ -56,10 +68,12 @@ import org.apache.tika.parser.image.ImageParser;
 import org.apache.tika.parser.image.TiffParser;
 import org.apache.tika.parser.jpeg.JpegParser;
 import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
-import static java.nio.charset.StandardCharsets.UTF_8;
+import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
@@ -85,6 +99,8 @@ public class TesseractOCRParser extends AbstractParser {
             })));
     private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, 
Boolean>();
 
+
+
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         // If Tesseract is installed, offer our supported image types
@@ -117,13 +133,48 @@ public class TesseractOCRParser extends AbstractParser {
         if (TESSERACT_PRESENT.containsKey(tesseract)) {
             return TESSERACT_PRESENT.get(tesseract);
         }
-
         // Try running Tesseract from there, and see if it exists + works
         String[] checkCmd = { tesseract };
         boolean hasTesseract = ExternalParser.check(checkCmd);
         TESSERACT_PRESENT.put(tesseract, hasTesseract);
         return hasTesseract;
-     
+
+    }
+
+    private boolean hasImageMagick(TesseractOCRConfig config) {
+        // Fetch where the config says to find ImageMagick Program
+        String ImageMagick = config.getImageMagickPath() + 
getImageMagickProg();
+
+        // Have we already checked for a copy of ImageMagick Program there?
+        if (TESSERACT_PRESENT.containsKey(ImageMagick)) {
+            return TESSERACT_PRESENT.get(ImageMagick);
+        }
+
+        // Try running ImageMagick program from there, and see if it exists + 
works
+        String[] checkCmd = { ImageMagick };
+        boolean hasImageMagick = ExternalParser.check(checkCmd);
+        TESSERACT_PRESENT.put(ImageMagick, hasImageMagick);
+
+        return hasImageMagick;
+
+    }
+
+    private static boolean hasPython() {
+        // check if python is installed and if the rotation program path has 
been specified correctly
+
+        boolean hasPython = false;
+
+        try {
+            Process proc = Runtime.getRuntime().exec("python -h");
+            BufferedReader stdInput = new BufferedReader(new 
InputStreamReader(proc.getInputStream(), "UTF-8"));
+            if(stdInput.read() != -1) {
+                hasPython = true;
+            }
+        } catch (IOException e) {
+
+        }
+
+        return hasPython;
     }
 
     public void parse(Image image, ContentHandler handler, Metadata metadata, 
ParseContext context) throws IOException,
@@ -153,9 +204,10 @@ public class TesseractOCRParser extends AbstractParser {
     }
 
     @Override
-    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext parseContext)
             throws IOException, SAXException, TikaException {
-        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, 
DEFAULT_CONFIG);
+
+        TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, 
DEFAULT_CONFIG);
         // If Tesseract is not on the path with the current config, do not try 
to run OCR
         // getSupportedTypes shouldn't have listed us as handling it, so this 
should only
         //  occur if someone directly calls this parser, not via DefaultParser 
or similar
@@ -169,12 +221,12 @@ public class TesseractOCRParser extends AbstractParser {
             XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
             xhtml.startDocument();
             File tmpImgFile = tmp.createTemporaryFile();
-            parse(tikaStream, tmpImgFile, xhtml, config);
+            parse(tikaStream, tmpImgFile, parseContext, xhtml, config);
             // Temporary workaround for TIKA-1445 - until we can specify
             //  composite parsers with strategies (eg Composite, Try In Turn),
             //  always send the image onwards to the regular parser to have
             //  the metadata for them extracted as well
-            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new 
EmbeddedContentHandler(xhtml), metadata, context);
+            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new 
EmbeddedContentHandler(xhtml), metadata, parseContext);
             xhtml.endDocument();
         } finally {
             tmp.dispose();
@@ -184,15 +236,37 @@ public class TesseractOCRParser extends AbstractParser {
     /**
      * Use this to parse content without starting a new document.
      * This appends SAX events to xhtml without re-adding the metadata, body 
start, etc.
+     *
      * @param stream inputstream
      * @param xhtml handler
      * @param config TesseractOCRConfig to use for this parse
      * @throws IOException
      * @throws SAXException
      * @throws TikaException
+     *
+     * @deprecated use {@link #parseInline(InputStream, XHTMLContentHandler, 
ParseContext, TesseractOCRConfig)}
      */
     public void parseInline(InputStream stream, XHTMLContentHandler xhtml, 
TesseractOCRConfig config)
             throws IOException, SAXException, TikaException {
+        parseInline(stream, xhtml, new ParseContext(), config);
+    }
+
+    /**
+     * Use this to parse content without starting a new document.
+     * This appends SAX events to xhtml without re-adding the metadata, body 
start, etc.
+     *
+     * @param stream inputstream
+     * @param xhtml handler
+     * @param config TesseractOCRConfig to use for this parse
+     * @throws IOException
+     * @throws SAXException
+     * @throws TikaException
+     *
+     * @param parseContext context for this parse; it supplies the SAX parser
+     *                     used when the output type is hOCR
+     */
+    public void parseInline(InputStream stream, XHTMLContentHandler xhtml, 
ParseContext parseContext,
+                            TesseractOCRConfig config)
+            throws IOException, SAXException, TikaException {
         // If Tesseract is not on the path with the current config, do not try 
to run OCR
         // getSupportedTypes shouldn't have listed us as handling it, so this 
should only
         //  occur if someone directly calls this parser, not via DefaultParser 
or similar
@@ -203,34 +277,99 @@ public class TesseractOCRParser extends AbstractParser {
         try {
             TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
             File tmpImgFile = tmp.createTemporaryFile();
-            parse(tikaStream, tmpImgFile, xhtml, config);
+            parse(tikaStream, tmpImgFile, parseContext, xhtml, config);
         } finally {
             tmp.dispose();
         }
 
     }
 
-    private void parse(TikaInputStream tikaInputStream, File tmpImgFile, 
XHTMLContentHandler xhtml, TesseractOCRConfig config)
+    /**
+     * This method is used to process the image to an OCR-friendly format.
+     * @param streamingObject input image to be processed
+     * @param config TesseractOCRconfig class to get ImageMagick properties
+     * @throws IOException if an input error occurred
+     * @throws TikaException if an exception timed out
+     */
+    private void processImage(File streamingObject, TesseractOCRConfig config) 
throws IOException, TikaException {
+
+        // fetch rotation script from resources
+        InputStream in = getClass().getResourceAsStream("rotation.py");
+        TemporaryResources tmp = new TemporaryResources();
+        File rotationScript = tmp.createTemporaryFile();
+        Files.copy(in, rotationScript.toPath(), 
StandardCopyOption.REPLACE_EXISTING);
+
+        String cmd = "python " + rotationScript.getAbsolutePath() + " -f " + 
streamingObject.getAbsolutePath();
+        String angle = "0";
+
+        DefaultExecutor executor = new DefaultExecutor();
+        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+        PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
+        executor.setStreamHandler(streamHandler);
+
+        // determine the angle of rotation required to make the text horizontal
+        CommandLine cmdLine = CommandLine.parse(cmd);
+        if(hasPython()) {
+            try {
+                executor.execute(cmdLine);
+                angle = outputStream.toString("UTF-8").trim();
+            } catch(Exception e) {
+
+            }
+        }
+
+        // process the image - parameter values can be set in 
TesseractOCRConfig.properties
+        String line = "convert -density " + config.getDensity() + " -depth " + 
config.getDepth() +
+                " -colorspace " + config.getColorspace() +  " -filter " + 
config.getFilter() +
+                " -resize " + config.getResize() + "% -rotate "+ angle + " " + 
streamingObject.getAbsolutePath() +
+                " " + streamingObject.getAbsolutePath();
+        cmdLine = CommandLine.parse(line);
+        try {
+            executor.execute(cmdLine);
+        } catch(Exception e) {
+
+        }
+
+        tmp.close();
+    }
+
+    private void parse(TikaInputStream tikaInputStream, File tmpImgFile, 
ParseContext parseContext,
+                       XHTMLContentHandler xhtml, TesseractOCRConfig config)
             throws IOException, SAXException, TikaException {
         File tmpTxtOutput = null;
-
         try {
             File input = tikaInputStream.getFile();
             long size = tikaInputStream.getLength();
 
             if (size >= config.getMinFileSizeToOcr() && size <= 
config.getMaxFileSizeToOcr()) {
 
-                doOCR(input, tmpImgFile, config);
+                // copy the contents of the original input file into a 
temporary file
+                // which will be processed for OCR
+                TemporaryResources tmp = new TemporaryResources();
+                File tmpFile = tmp.createTemporaryFile();
+                FileUtils.copyFile(input, tmpFile);
+
+                // Process image if ImageMagick Tool is present
+                if(config.isEnableImageProcessing() == 1 && 
hasImageMagick(config)) {
+                    processImage(tmpFile,config);
+                }
 
-                // Tesseract appends .txt to output file name
-                tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt");
+                doOCR(tmpFile, tmpImgFile, config);
+
+                // Tesseract appends the output type (.txt or .hocr) to output 
file name
+                tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + "." + 
config.getOutputType());
 
                 if (tmpTxtOutput.exists()) {
                     try (InputStream is = new FileInputStream(tmpTxtOutput)) {
-                        extractOutput(is, xhtml);
+                        if 
(config.getOutputType().equals(TesseractOCRConfig.OUTPUT_TYPE.HOCR)) {
+                            extractHOCROutput(is, parseContext, xhtml);
+                        } else {
+                            extractOutput(is, xhtml);
+                        }
                     }
                 }
 
+                tmp.close();
             }
 
         } finally {
@@ -240,6 +379,7 @@ public class TesseractOCRParser extends AbstractParser {
         }
     }
 
+
     // TIKA-1445 workaround parser
     private static Parser _TMP_IMAGE_METADATA_PARSER = new 
CompositeImageParser();
     private static class CompositeImageParser extends CompositeParser {
@@ -268,7 +408,7 @@ public class TesseractOCRParser extends AbstractParser {
      */
     private void doOCR(File input, File output, TesseractOCRConfig config) 
throws IOException, TikaException {
         String[] cmd = { config.getTesseractPath() + getTesseractProg(), 
input.getPath(), output.getPath(), "-l",
-                config.getLanguage(), "-psm", config.getPageSegMode() };
+                config.getLanguage(), "-psm", config.getPageSegMode(), 
config.getOutputType().name().toLowerCase(Locale.US)};
 
         ProcessBuilder pb = new ProcessBuilder(cmd);
         setEnv(config, pb);
@@ -334,7 +474,17 @@ public class TesseractOCRParser extends AbstractParser {
             }
         }
         xhtml.endElement("div");
+    }
 
+    private void extractHOCROutput(InputStream is, ParseContext parseContext,
+                                   XHTMLContentHandler xhtml) throws 
TikaException, IOException, SAXException {
+        if (parseContext == null) {
+            parseContext = new ParseContext();
+        }
+        SAXParser parser = parseContext.getSAXParser();
+        xhtml.startElement("div", "class", "ocr");
+        parser.parse(is, new OfflineContentHandler(new 
HOCRPassThroughHandler(xhtml)));
+        xhtml.endElement("div");
     }
 
     /**
@@ -367,4 +517,56 @@ public class TesseractOCRParser extends AbstractParser {
         return System.getProperty("os.name").startsWith("Windows") ? 
"tesseract.exe" : "tesseract";
     }
 
+    static String getImageMagickProg() {
+        return System.getProperty("os.name").startsWith("Windows") ? 
"convert.exe" : "convert";
+    }
+
+
+    private static class HOCRPassThroughHandler extends DefaultHandler {
+        private final ContentHandler xhtml;
+        public static final Set<String> IGNORE = unmodifiableSet(
+                "html", "head", "title", "meta", "body");
+
+        public HOCRPassThroughHandler(ContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        /**
+         * Starts the given element on the wrapped handler, unless its
+         * qualified name is one of the document-level elements listed in
+         * {@link #IGNORE}.
+         */
+        @Override
+        public void startElement(
+                String uri, String local, String name, Attributes attributes)
+                throws SAXException {
+            if (!IGNORE.contains(name)) {
+                xhtml.startElement(uri, local, name, attributes);
+            }
+        }
+
+        /**
+         * Ends the given element on the wrapped handler, unless its
+         * qualified name is listed in {@link #IGNORE}.
+         */
+        @Override
+        public void endElement(String uri, String local, String name) throws 
SAXException {
+            if (!IGNORE.contains(name)) {
+                xhtml.endElement(uri, local, name);
+            }
+        }
+
+        /**
+         * @see <a 
href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
+         */
+        @Override
+        public void characters(char[] ch, int start, int length) throws 
SAXException {
+            xhtml.characters(ch, start, length);
+        }
+
+        private static Set<String> unmodifiableSet(String... elements) {
+            return Collections.unmodifiableSet(
+                    new HashSet<String>(Arrays.asList(elements)));
+        }
+    }
 }
+

http://git-wip-us.apache.org/repos/asf/tika/blob/673533d0/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
 
b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 9ab958e..501364b 100644
--- 
a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ 
b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -18,16 +18,8 @@ package org.apache.tika.parser.ocr;
 
 import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg;
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
 import static org.junit.Assume.assumeTrue;
-import static org.mockito.Matchers.any;
-import static org.mockito.Matchers.eq;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.times;
-import static org.mockito.Mockito.verify;
 
 import java.io.InputStream;
 import java.util.List;
@@ -42,14 +34,9 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.parser.image.ImageParser;
-import org.apache.tika.parser.mail.RFC822Parser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.junit.Ignore;
 import org.junit.Test;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
 import org.xml.sax.helpers.DefaultHandler;
 
 public class TesseractOCRParserTest extends TikaTest {
@@ -112,7 +99,6 @@ public class TesseractOCRParserTest extends TikaTest {
     }
 
     @Test
-    @Ignore("TODO: cyclic reference to pdf-module...maybe move these all to tika-app?")
     public void testPDFOCR() throws Exception {
         String resource = "/test-documents/testOCR.pdf";
         String[] nonOCRContains = new String[0];
@@ -138,15 +124,51 @@ public class TesseractOCRParserTest extends TikaTest {
         testBasicOCR(resource, nonOCRContains, 3);
     }
 
-    private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception {
+    @Test
+    public void testOCROutputsHOCR() throws Exception {
+        assumeTrue(canRun());
+
+        String resource = "/test-documents/testOCR.pdf";
+
+        String[] nonOCRContains = new String[0];
+        String contents = runOCR(resource, nonOCRContains, 2,
+                BasicContentHandlerFactory.HANDLER_TYPE.XML,
+                TesseractOCRConfig.OUTPUT_TYPE.HOCR);
+
+        assertContains("<span class=\"ocrx_word\" id=\"word_1_1\"", contents);
+        assertContains("Happy</span>", contents);
+
+    }
+
+    private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception{
+        String contents = runOCR(resource, nonOCRContains, numMetadatas,
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT, TesseractOCRConfig.OUTPUT_TYPE.TXT);
+        if (canRun()) {
+            if(resource.substring(resource.lastIndexOf('.'), resource.length()).equals(".jpg")) {
+                assertTrue(contents.toString().contains("Apache"));
+            } else {
+                assertTrue(contents.toString().contains("Happy New Year 2003!"));
+            }
+        }
+    }
+
+    private String runOCR(String resource, String[] nonOCRContains, int 
numMetadatas,
+                          BasicContentHandlerFactory.HANDLER_TYPE handlerType,
+                          TesseractOCRConfig.OUTPUT_TYPE outputType) throws 
Exception {
         TesseractOCRConfig config = new TesseractOCRConfig();
+        config.setOutputType(outputType);
+
         Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
                 new BasicContentHandlerFactory(
-                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+                        handlerType, -1));
+
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
 
         ParseContext parseContext = new ParseContext();
         parseContext.set(TesseractOCRConfig.class, config);
         parseContext.set(Parser.class, parser);
+        parseContext.set(PDFParserConfig.class, pdfConfig);
 
         try (InputStream stream = 
TesseractOCRParserTest.class.getResourceAsStream(resource)) {
             parser.parse(stream, new DefaultHandler(), new Metadata(), 
parseContext);
@@ -158,9 +180,7 @@ public class TesseractOCRParserTest extends TikaTest {
         for (Metadata m : metadataList) {
             contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
         }
-        if (canRun()) {
-            assertTrue(contents.toString().contains("Happy New Year 2003!"));
-        }
+
         for (String needle : nonOCRContains) {
             assertContains(needle, contents.toString());
         }
@@ -168,6 +188,8 @@ public class TesseractOCRParserTest extends TikaTest {
         assertTrue(metadataList.get(1).names().length > 10);
         //test at least one value
         assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
+
+        return contents.toString();
     }
 
     @Test
@@ -178,6 +200,15 @@ public class TesseractOCRParserTest extends TikaTest {
     }
 
     @Test
+    public void testImageMagick() throws Exception {
+        InputStream stream = TesseractOCRConfig.class.getResourceAsStream(
+                "/test-properties/TesseractOCR.properties");
+        TesseractOCRConfig config = new TesseractOCRConfig(stream);
+        String[] CheckCmd = {config.getImageMagickPath() + TesseractOCRParser.getImageMagickProg()};
+        assumeTrue(ExternalParser.check(CheckCmd));
+    }
+
+    @Test
     public void getNormalMetadataToo() throws Exception {
         //this should be successful whether or not TesseractOCR is 
installed/active
         //If tesseract is installed, the internal metadata extraction parser 
should
@@ -213,50 +244,4 @@ public class TesseractOCRParserTest extends TikaTest {
         assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
         assertEquals("72 dots per inch", m.get("Y Resolution"));
     }
-    
-    @Test
-    public void testMultipart() {
-        Parser parser = new RFC822Parser();
-        Metadata metadata = new Metadata();
-        InputStream stream = getStream("test-documents/testRFC822-multipart");
-        ContentHandler handler = mock(XHTMLContentHandler.class);
-
-        try {
-            parser.parse(stream, handler, metadata, new ParseContext());
-            verify(handler).startDocument();
-            int bodyExpectedTimes = 4, multipackExpectedTimes = 5;
-            // TIKA-1422. TesseractOCRParser interferes with the number of 
times the handler is invoked.
-            // But, different versions of Tesseract lead to a different number 
of invocations. So, we
-            // only verify the handler if Tesseract cannot run.
-            if (!TesseractOCRParserTest.canRun()) {
-                verify(handler, 
times(bodyExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), 
eq("div"), eq("div"), any(Attributes.class));
-                verify(handler, 
times(bodyExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
-            }
-        } catch (Exception e) {
-            fail("Exception thrown: " + e.getMessage());
-        }
-
-        //repeat, this time looking at content
-        parser = new RFC822Parser();
-        metadata = new Metadata();
-        stream = getStream("test-documents/testRFC822-multipart");
-        handler = new BodyContentHandler();
-        try {
-            parser.parse(stream, handler, metadata, new ParseContext());
-            //tests correct decoding of quoted printable text, including UTF-8 
bytes into Unicode
-            String bodyText = handler.toString();
-            assertTrue(bodyText.contains("body 1"));
-            assertTrue(bodyText.contains("body 2"));
-            assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of 
encoded gif
-        } catch (Exception e) {
-            fail("Exception thrown: " + e.getMessage());
-        }
-    }
-    
-    private static InputStream getStream(String name) {
-        InputStream stream = Thread.currentThread().getContextClassLoader()
-                .getResourceAsStream(name);
-        assertNotNull("Test file not found " + name, stream);
-        return stream;
-    }
 }

tika git commit: TIKA-2093- Add Tesseract's hOCR output format as an option, via Eric Pugh. This commit also catches 2.x up to trunk; there were clearly some other changes to Tesseract that hadn't yet made it into 2.x.

Reply via email to