Repository: tika Updated Branches: refs/heads/2.x d543378a8 -> 673533d0e
TIKA-2093- Add Tesseract's hOCR output format as an option, via Eric Pugh. This commit also catches 2.x up to trunk; there were clearly some other changes to Tesseract that hadn't yet made it into 2.x. Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/673533d0 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/673533d0 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/673533d0 Branch: refs/heads/2.x Commit: 673533d0e65b2b2613e19bbf952bdb352c628e52 Parents: d543378 Author: tballison <talli...@mitre.org> Authored: Thu Sep 22 21:23:32 2016 -0400 Committer: tballison <talli...@mitre.org> Committed: Thu Sep 22 21:23:32 2016 -0400 ---------------------------------------------------------------------- CHANGES.txt | 3 + .../tika/parser/ocr/TesseractOCRConfig.java | 254 +++++++++++++++++-- .../tika/parser/ocr/TesseractOCRParser.java | 236 +++++++++++++++-- .../tika/parser/ocr/TesseractOCRParserTest.java | 117 ++++----- 4 files changed, 503 insertions(+), 107 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/673533d0/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 53f9a82..d13a644 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -17,6 +17,9 @@ Release 2.0 - ??? Release 1.14 - ??? + * Add Tesseract's hOCR output format as an option, via Eric Pugh + (TIKA-2093). + * Extract macros from MSOffice files (TIKA-2069). * Maintain passed-in mime in TXTParser (TIKA-2047). 
http://git-wip-us.apache.org/repos/asf/tika/blob/673533d0/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java index a35370a..5d06a7a 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java @@ -42,11 +42,16 @@ public class TesseractOCRConfig implements Serializable{ private static final long serialVersionUID = -4861942486845757891L; + public enum OUTPUT_TYPE { + TXT, + HOCR + } + // Path to tesseract installation folder, if not on system path. private String tesseractPath = ""; - // Path to the 'tessdata' folder, which contains language files and config files. - private String tessdataPath = ""; + // Path to the 'tessdata' folder, which contains language files and config files. + private String tessdataPath = ""; // Language dictionary to be used. private String language = "eng"; @@ -63,6 +68,30 @@ public class TesseractOCRConfig implements Serializable{ // Maximum time (seconds) to wait for the ocring process termination private int timeout = 120; + // The format of the ocr'ed output to be returned, txt or hocr. + private OUTPUT_TYPE outputType = OUTPUT_TYPE.TXT; + + // enable image processing (optional) + private int enableImageProcessing = 0; + + // Path to ImageMagick program, if not on system path. + private String ImageMagickPath = ""; + + // resolution of processed image (in dpi). + private int density = 300; + + // number of bits in a color sample within a pixel. 
+ private int depth = 4; + + // colorspace of processed image. + private String colorspace = "gray"; + + // filter to be applied to the processed image. + private String filter = "triangle"; + + // factor by which image is to be scaled. + private int resize = 900; + /** * Default contructor. */ @@ -99,10 +128,11 @@ public class TesseractOCRConfig implements Serializable{ } } + // set parameters for Tesseract setTesseractPath( getProp(props, "tesseractPath", getTesseractPath())); - setTessdataPath( - getProp(props, "tessdataPath", getTessdataPath())); + setTessdataPath( + getProp(props, "tessdataPath", getTessdataPath())); setLanguage( getProp(props, "language", getLanguage())); setPageSegMode( @@ -112,7 +142,29 @@ public class TesseractOCRConfig implements Serializable{ setMaxFileSizeToOcr( getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr())); setTimeout( - getProp(props, "timeout", getTimeout())); + getProp(props, "timeout", getTimeout())); + String outputTypeString = props.getProperty("outputType"); + if ("txt".equals(outputTypeString)) { + setOutputType(OUTPUT_TYPE.TXT); + } else if ("hocr".equals(outputTypeString)) { + setOutputType(OUTPUT_TYPE.HOCR); + } + + // set parameters for ImageMagick + setEnableImageProcessing( + getProp(props, "enableImageProcessing", isEnableImageProcessing())); + setImageMagickPath( + getProp(props, "ImageMagickPath", getImageMagickPath())); + setDensity( + getProp(props, "density", getDensity())); + setDepth( + getProp(props, "depth", getDepth())); + setColorspace( + getProp(props, "colorspace", getColorspace())); + setFilter( + getProp(props, "filter", getFilter())); + setResize( + getProp(props, "resize", getResize())); } @@ -123,10 +175,10 @@ public class TesseractOCRConfig implements Serializable{ /** * Set the path to the Tesseract executable, needed if it is not on system path. 
- * <p> - * Note that if you set this value, it is highly recommended that you also - * set the path to the 'tessdata' folder using {@link #setTessdataPath}. - * </p> + * <p> + * Note that if you set this value, it is highly recommended that you also + * set the path to the 'tessdata' folder using {@link #setTessdataPath}. + * </p> */ public void setTesseractPath(String tesseractPath) { if(!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator)) @@ -135,22 +187,22 @@ public class TesseractOCRConfig implements Serializable{ this.tesseractPath = tesseractPath; } - /** @see #setTessdataPath(String tessdataPath) */ - public String getTessdataPath() { - return tessdataPath; - } + /** @see #setTessdataPath(String tessdataPath) */ + public String getTessdataPath() { + return tessdataPath; + } - /** - * Set the path to the 'tessdata' folder, which contains language files and config files. In some cases (such - * as on Windows), this folder is found in the Tesseract installation, but in other cases - * (such as when Tesseract is built from source), it may be located elsewhere. - */ - public void setTessdataPath(String tessdataPath) { - if(!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator)) - tessdataPath += File.separator; + /** + * Set the path to the 'tessdata' folder, which contains language files and config files. In some cases (such + * as on Windows), this folder is found in the Tesseract installation, but in other cases + * (such as when Tesseract is built from source), it may be located elsewhere. 
+ */ + public void setTessdataPath(String tessdataPath) { + if(!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator)) + tessdataPath += File.separator; - this.tessdataPath = tessdataPath; - } + this.tessdataPath = tessdataPath; + } /** @see #setLanguage(String language)*/ public String getLanguage() { @@ -218,12 +270,166 @@ public class TesseractOCRConfig implements Serializable{ this.timeout = timeout; } - /** @see #setTimeout(int timeout)*/ + /** @see #setTimeout(int timeout) + * @return timeout value for Tesseract */ public int getTimeout() { return timeout; } /** + * Set the output type of the OCR process: OUTPUT_TYPE.TXT or OUTPUT_TYPE.HOCR. + * Default value is OUTPUT_TYPE.TXT. + */ + public void setOutputType(OUTPUT_TYPE outputType) { + this.outputType = outputType; + } + + /** @see #setOutputType(OUTPUT_TYPE outputType) */ + public OUTPUT_TYPE getOutputType() { + return outputType; + } + + /** @see #setEnableImageProcessing(int) + * @return image processing is enabled or not */ + public int isEnableImageProcessing() { + return enableImageProcessing; + } + + /** + * Set the value to 1 if image processing is to be enabled. + * Default value is 0 (disabled). + */ + public void setEnableImageProcessing(int enableImageProcessing) { + this.enableImageProcessing = enableImageProcessing; + } + + /** + * @return the density + */ + public int getDensity() { + return density; + } + + /** + * @param density the density to set. Valid range of values is 150-1200. + * Default value is 300. + */ + public void setDensity(int density) { + if(density < 150 || density > 1200) { + throw new IllegalArgumentException("Invalid density value. Valid range of values is 150-1200."); + } + this.density = density; + } + + /** + * @return the depth + */ + public int getDepth() { + return depth; + } + + /** + * @param depth the depth to set. Valid values are 2, 4, 8, 16, 32, 64, 256, 4096. + * Default value is 4. 
+ */ + public void setDepth(int depth) { + int[] allowedValues = {2, 4, 8, 16, 32, 64, 256, 4096}; + for (int i = 0; i < allowedValues.length; i++) { + if(depth == allowedValues[i]) { + this.depth = depth; + return; + } + } + throw new IllegalArgumentException("Invalid depth value. Valid values are 2, 4, 8, 16, 32, 64, 256, 4096."); + } + + /** + * @return the colorspace + */ + public String getColorspace() { + return colorspace; + } + + /** + * @param colorspace the colorspace to set. + * Default value is gray. + */ + public void setColorspace(String colorspace) { + if(!colorspace.equals(null)) { + this.colorspace = colorspace; + } else { + throw new IllegalArgumentException("Colorspace value cannot be null."); + } + } + + /** + * @return the filter + */ + public String getFilter() { + return filter; + } + + /** + * @param filter the filter to set. Valid values are point, hermite, cubic, box, gaussian, catrom, triangle, quadratic and mitchell. + * Default value is triangle. + */ + public void setFilter(String filter) { + if(filter.equals(null)) { + throw new IllegalArgumentException("Filter value cannot be null. Valid values are point, hermite, " + + "cubic, box, gaussian, catrom, triangle, quadratic and mitchell."); + } + + String[] allowedFilters = {"Point", "Hermite", "Cubic", "Box", "Gaussian", "Catrom", "Triangle", "Quadratic", "Mitchell"}; + for (int i = 0; i < allowedFilters.length; i++) { + if(filter.equalsIgnoreCase(allowedFilters[i])) { + this.filter = filter; + return; + } + } + throw new IllegalArgumentException("Invalid filter value. Valid values are point, hermite, " + + "cubic, box, gaussian, catrom, triangle, quadratic and mitchell."); + } + + /** + * @return the resize + */ + public int getResize() { + return resize; + } + + /** + * @param resize the resize percentage to set. Valid values are the multiples of 100 from 100 to 900. + * Default value is 900. 
+ */ + public void setResize(int resize) { + for(int i=1;i<10;i++) { + if(resize == i*100) { + this.resize = resize; + return; + } + } + throw new IllegalArgumentException("Invalid resize value. Valid range of values is 100-900."); + } + + /** @see #setImageMagickPath(String ImageMagickPath) + * @return path to ImageMagick file. */ + public String getImageMagickPath() { + + return ImageMagickPath; + } + + /** + * Set the path to the ImageMagick executable, needed if it is not on system path. + * @param ImageMagickPath to ImageMagick file. + */ + public void setImageMagickPath(String ImageMagickPath) { + if(!ImageMagickPath.isEmpty() && !ImageMagickPath.endsWith(File.separator)) + ImageMagickPath += File.separator; + + this.ImageMagickPath = ImageMagickPath; + } + + /** * Get property from the properties file passed in. * @param properties properties file to read from. * @param property the property to fetch. http://git-wip-us.apache.org/repos/asf/tika/blob/673533d0/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 83fe7fe..c6705b0 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -16,8 +16,14 @@ */ package org.apache.tika.parser.ocr; -import java.awt.*; +import static java.nio.charset.StandardCharsets.UTF_8; + +import javax.imageio.ImageIO; +import javax.xml.parsers.SAXParser; +import java.awt.Image; import java.awt.image.BufferedImage; +import java.io.BufferedReader; +import 
java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; @@ -25,11 +31,14 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.Callable; @@ -38,7 +47,10 @@ import java.util.concurrent.FutureTask; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; -import javax.imageio.ImageIO; +import org.apache.commons.exec.CommandLine; +import org.apache.commons.exec.DefaultExecutor; +import org.apache.commons.exec.PumpStreamHandler; +import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.LogFactory; import org.apache.tika.exception.TikaException; @@ -56,10 +68,12 @@ import org.apache.tika.parser.image.ImageParser; import org.apache.tika.parser.image.TiffParser; import org.apache.tika.parser.jpeg.JpegParser; import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.OfflineContentHandler; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import static java.nio.charset.StandardCharsets.UTF_8; +import org.xml.sax.helpers.DefaultHandler; /** * TesseractOCRParser powered by tesseract-ocr engine. 
To enable this parser, @@ -85,6 +99,8 @@ public class TesseractOCRParser extends AbstractParser { }))); private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>(); + + @Override public Set<MediaType> getSupportedTypes(ParseContext context) { // If Tesseract is installed, offer our supported image types @@ -117,13 +133,48 @@ public class TesseractOCRParser extends AbstractParser { if (TESSERACT_PRESENT.containsKey(tesseract)) { return TESSERACT_PRESENT.get(tesseract); } - // Try running Tesseract from there, and see if it exists + works String[] checkCmd = { tesseract }; boolean hasTesseract = ExternalParser.check(checkCmd); TESSERACT_PRESENT.put(tesseract, hasTesseract); return hasTesseract; - + + } + + private boolean hasImageMagick(TesseractOCRConfig config) { + // Fetch where the config says to find ImageMagick Program + String ImageMagick = config.getImageMagickPath() + getImageMagickProg(); + + // Have we already checked for a copy of ImageMagick Program there? 
+ if (TESSERACT_PRESENT.containsKey(ImageMagick)) { + return TESSERACT_PRESENT.get(ImageMagick); + } + + // Try running ImageMagick program from there, and see if it exists + works + String[] checkCmd = { ImageMagick }; + boolean hasImageMagick = ExternalParser.check(checkCmd); + TESSERACT_PRESENT.put(ImageMagick, hasImageMagick); + + return hasImageMagick; + + } + + private static boolean hasPython() { + // check if python is installed and if the rotation program path has been specified correctly + + boolean hasPython = false; + + try { + Process proc = Runtime.getRuntime().exec("python -h"); + BufferedReader stdInput = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8")); + if(stdInput.read() != -1) { + hasPython = true; + } + } catch (IOException e) { + + } + + return hasPython; } public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, @@ -153,9 +204,10 @@ public class TesseractOCRParser extends AbstractParser { } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException { - TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG); + + TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG); // If Tesseract is not on the path with the current config, do not try to run OCR // getSupportedTypes shouldn't have listed us as handling it, so this should only // occur if someone directly calls this parser, not via DefaultParser or similar @@ -169,12 +221,12 @@ public class TesseractOCRParser extends AbstractParser { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); File tmpImgFile = tmp.createTemporaryFile(); - parse(tikaStream, tmpImgFile, xhtml, config); + parse(tikaStream, 
tmpImgFile, parseContext, xhtml, config); // Temporary workaround for TIKA-1445 - until we can specify // composite parsers with strategies (eg Composite, Try In Turn), // always send the image onwards to the regular parser to have // the metadata for them extracted as well - _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, context); + _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, parseContext); xhtml.endDocument(); } finally { tmp.dispose(); @@ -184,15 +236,37 @@ public class TesseractOCRParser extends AbstractParser { /** * Use this to parse content without starting a new document. * This appends SAX events to xhtml without re-adding the metadata, body start, etc. + * * @param stream inputstream * @param xhtml handler * @param config TesseractOCRConfig to use for this parse * @throws IOException * @throws SAXException * @throws TikaException + * + * @deprecated use {@link #parseInline(InputStream, XHTMLContentHandler, ParseContext, TesseractOCRConfig)} */ public void parseInline(InputStream stream, XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, TikaException { + parseInline(stream, xhtml, new ParseContext(), config); + } + + /** + * Use this to parse content without starting a new document. + * This appends SAX events to xhtml without re-adding the metadata, body start, etc. 
+ * + * @param stream inputstream + * @param xhtml handler + * @param parseContext context that supplies the SAX parser used to read hOCR output + * @param config TesseractOCRConfig to use for this parse + * @throws IOException + * @throws SAXException + * @throws TikaException + * + */ + public void parseInline(InputStream stream, XHTMLContentHandler xhtml, ParseContext parseContext, + TesseractOCRConfig config) + throws IOException, SAXException, TikaException { // If Tesseract is not on the path with the current config, do not try to run OCR // getSupportedTypes shouldn't have listed us as handling it, so this should only // occur if someone directly calls this parser, not via DefaultParser or similar @@ -203,34 +277,99 @@ public class TesseractOCRParser extends AbstractParser { try { TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); File tmpImgFile = tmp.createTemporaryFile(); - parse(tikaStream, tmpImgFile, xhtml, config); + parse(tikaStream, tmpImgFile, parseContext, xhtml, config); } finally { tmp.dispose(); } } - private void parse(TikaInputStream tikaInputStream, File tmpImgFile, XHTMLContentHandler xhtml, TesseractOCRConfig config) + /** + * This method is used to process the image to an OCR-friendly format. 
+ * @param streamingObject input image to be processed + * @param config TesseractOCRconfig class to get ImageMagick properties + * @throws IOException if an input error occurred + * @throws TikaException if an exception timed out + */ + private void processImage(File streamingObject, TesseractOCRConfig config) throws IOException, TikaException { + + // fetch rotation script from resources + InputStream in = getClass().getResourceAsStream("rotation.py"); + TemporaryResources tmp = new TemporaryResources(); + File rotationScript = tmp.createTemporaryFile(); + Files.copy(in, rotationScript.toPath(), StandardCopyOption.REPLACE_EXISTING); + + String cmd = "python " + rotationScript.getAbsolutePath() + " -f " + streamingObject.getAbsolutePath(); + String angle = "0"; + + DefaultExecutor executor = new DefaultExecutor(); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream); + executor.setStreamHandler(streamHandler); + + // determine the angle of rotation required to make the text horizontal + CommandLine cmdLine = CommandLine.parse(cmd); + if(hasPython()) { + try { + executor.execute(cmdLine); + angle = outputStream.toString("UTF-8").trim(); + } catch(Exception e) { + + } + } + + // process the image - parameter values can be set in TesseractOCRConfig.properties + String line = "convert -density " + config.getDensity() + " -depth " + config.getDepth() + + " -colorspace " + config.getColorspace() + " -filter " + config.getFilter() + + " -resize " + config.getResize() + "% -rotate "+ angle + " " + streamingObject.getAbsolutePath() + + " " + streamingObject.getAbsolutePath(); + cmdLine = CommandLine.parse(line); + try { + executor.execute(cmdLine); + } catch(Exception e) { + + } + + tmp.close(); + } + + private void parse(TikaInputStream tikaInputStream, File tmpImgFile, ParseContext parseContext, + XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, 
TikaException { File tmpTxtOutput = null; - try { File input = tikaInputStream.getFile(); long size = tikaInputStream.getLength(); if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) { - doOCR(input, tmpImgFile, config); + // copy the contents of the original input file into a temporary file + // which will be processed for OCR + TemporaryResources tmp = new TemporaryResources(); + File tmpFile = tmp.createTemporaryFile(); + FileUtils.copyFile(input, tmpFile); + + // Process image if ImageMagick Tool is present + if(config.isEnableImageProcessing() == 1 && hasImageMagick(config)) { + processImage(tmpFile,config); + } - // Tesseract appends .txt to output file name - tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt"); + doOCR(tmpFile, tmpImgFile, config); + + // Tesseract appends the output type (.txt or .hocr) to output file name + tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + "." + config.getOutputType()); if (tmpTxtOutput.exists()) { try (InputStream is = new FileInputStream(tmpTxtOutput)) { - extractOutput(is, xhtml); + if (config.getOutputType().equals(TesseractOCRConfig.OUTPUT_TYPE.HOCR)) { + extractHOCROutput(is, parseContext, xhtml); + } else { + extractOutput(is, xhtml); + } } } + tmp.close(); } } finally { @@ -240,6 +379,7 @@ public class TesseractOCRParser extends AbstractParser { } } + // TIKA-1445 workaround parser private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser(); private static class CompositeImageParser extends CompositeParser { @@ -268,7 +408,7 @@ public class TesseractOCRParser extends AbstractParser { */ private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException { String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l", - config.getLanguage(), "-psm", config.getPageSegMode() }; + config.getLanguage(), "-psm", config.getPageSegMode(), 
config.getOutputType().name().toLowerCase(Locale.US)}; ProcessBuilder pb = new ProcessBuilder(cmd); setEnv(config, pb); @@ -334,7 +474,17 @@ public class TesseractOCRParser extends AbstractParser { } } xhtml.endElement("div"); + } + private void extractHOCROutput(InputStream is, ParseContext parseContext, + XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException { + if (parseContext == null) { + parseContext = new ParseContext(); + } + SAXParser parser = parseContext.getSAXParser(); + xhtml.startElement("div", "class", "ocr"); + parser.parse(is, new OfflineContentHandler(new HOCRPassThroughHandler(xhtml))); + xhtml.endElement("div"); } /** @@ -367,4 +517,56 @@ public class TesseractOCRParser extends AbstractParser { return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract"; } + static String getImageMagickProg() { + return System.getProperty("os.name").startsWith("Windows") ? "convert.exe" : "convert"; + } + + + private static class HOCRPassThroughHandler extends DefaultHandler { + private final ContentHandler xhtml; + public static final Set<String> IGNORE = unmodifiableSet( + "html", "head", "title", "meta", "body"); + + public HOCRPassThroughHandler(ContentHandler xhtml) { + this.xhtml = xhtml; + } + + /** + * Starts the given element by forwarding it to the wrapped handler, + * unless its name is in {@link #IGNORE}. + */ + @Override + public void startElement( + String uri, String local, String name, Attributes attributes) + throws SAXException { + if (!IGNORE.contains(name)) { + xhtml.startElement(uri, local, name, attributes); + } + } + + /** + * Ends the given element by forwarding it to the wrapped handler, + * unless its name is in {@link #IGNORE}. 
+ */ + @Override + public void endElement(String uri, String local, String name) throws SAXException { + if (!IGNORE.contains(name)) { + xhtml.endElement(uri, local, name); + } + } + + /** + * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a> + */ + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + xhtml.characters(ch, start, length); + } + + private static Set<String> unmodifiableSet(String... elements) { + return Collections.unmodifiableSet( + new HashSet<String>(Arrays.asList(elements))); + } + } } + http://git-wip-us.apache.org/repos/asf/tika/blob/673533d0/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index 9ab958e..501364b 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -18,16 +18,8 @@ package org.apache.tika.parser.ocr; import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; import static org.junit.Assume.assumeTrue; -import static org.mockito.Matchers.any; -import static org.mockito.Matchers.eq; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; import java.io.InputStream; import 
java.util.List; @@ -42,14 +34,9 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.parser.image.ImageParser; -import org.apache.tika.parser.mail.RFC822Parser; +import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.sax.BasicContentHandlerFactory; -import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.sax.XHTMLContentHandler; -import org.junit.Ignore; import org.junit.Test; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; import org.xml.sax.helpers.DefaultHandler; public class TesseractOCRParserTest extends TikaTest { @@ -112,7 +99,6 @@ public class TesseractOCRParserTest extends TikaTest { } @Test - @Ignore("TODO: cyclic reference to pdf-module...maybe move these all to tika-app?") public void testPDFOCR() throws Exception { String resource = "/test-documents/testOCR.pdf"; String[] nonOCRContains = new String[0]; @@ -138,15 +124,51 @@ public class TesseractOCRParserTest extends TikaTest { testBasicOCR(resource, nonOCRContains, 3); } - private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception { + @Test + public void testOCROutputsHOCR() throws Exception { + assumeTrue(canRun()); + + String resource = "/test-documents/testOCR.pdf"; + + String[] nonOCRContains = new String[0]; + String contents = runOCR(resource, nonOCRContains, 2, + BasicContentHandlerFactory.HANDLER_TYPE.XML, + TesseractOCRConfig.OUTPUT_TYPE.HOCR); + + assertContains("<span class=\"ocrx_word\" id=\"word_1_1\"", contents); + assertContains("Happy</span>", contents); + + } + + private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception{ + String contents = runOCR(resource, nonOCRContains, numMetadatas, + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, TesseractOCRConfig.OUTPUT_TYPE.TXT); + if (canRun()) { + 
if(resource.substring(resource.lastIndexOf('.'), resource.length()).equals(".jpg")) { + assertTrue(contents.toString().contains("Apache")); + } else { + assertTrue(contents.toString().contains("Happy New Year 2003!")); + } + } + } + + private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, + BasicContentHandlerFactory.HANDLER_TYPE handlerType, + TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception { TesseractOCRConfig config = new TesseractOCRConfig(); + config.setOutputType(outputType); + Parser parser = new RecursiveParserWrapper(new AutoDetectParser(), new BasicContentHandlerFactory( - BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); + handlerType, -1)); + + PDFParserConfig pdfConfig = new PDFParserConfig(); + pdfConfig.setExtractInlineImages(true); ParseContext parseContext = new ParseContext(); parseContext.set(TesseractOCRConfig.class, config); parseContext.set(Parser.class, parser); + parseContext.set(PDFParserConfig.class, pdfConfig); try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) { parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext); @@ -158,9 +180,7 @@ public class TesseractOCRParserTest extends TikaTest { for (Metadata m : metadataList) { contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT)); } - if (canRun()) { - assertTrue(contents.toString().contains("Happy New Year 2003!")); - } + for (String needle : nonOCRContains) { assertContains(needle, contents.toString()); } @@ -168,6 +188,8 @@ public class TesseractOCRParserTest extends TikaTest { assertTrue(metadataList.get(1).names().length > 10); //test at least one value assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName")); + + return contents.toString(); } @Test @@ -178,6 +200,15 @@ public class TesseractOCRParserTest extends TikaTest { } @Test + public void testImageMagick() throws Exception { + InputStream stream = TesseractOCRConfig.class.getResourceAsStream( + 
"/test-properties/TesseractOCR.properties"); + TesseractOCRConfig config = new TesseractOCRConfig(stream); + String[] CheckCmd = {config.getImageMagickPath() + TesseractOCRParser.getImageMagickProg()}; + assumeTrue(ExternalParser.check(CheckCmd)); + } + + @Test public void getNormalMetadataToo() throws Exception { //this should be successful whether or not TesseractOCR is installed/active //If tesseract is installed, the internal metadata extraction parser should @@ -213,50 +244,4 @@ public class TesseractOCRParserTest extends TikaTest { assertEquals("75", m.get(Metadata.IMAGE_LENGTH)); assertEquals("72 dots per inch", m.get("Y Resolution")); } - - @Test - public void testMultipart() { - Parser parser = new RFC822Parser(); - Metadata metadata = new Metadata(); - InputStream stream = getStream("test-documents/testRFC822-multipart"); - ContentHandler handler = mock(XHTMLContentHandler.class); - - try { - parser.parse(stream, handler, metadata, new ParseContext()); - verify(handler).startDocument(); - int bodyExpectedTimes = 4, multipackExpectedTimes = 5; - // TIKA-1422. TesseractOCRParser interferes with the number of times the handler is invoked. - // But, different versions of Tesseract lead to a different number of invocations. So, we - // only verify the handler if Tesseract cannot run. 
- if (!TesseractOCRParserTest.canRun()) { - verify(handler, times(bodyExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class)); - verify(handler, times(bodyExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div"); - } - } catch (Exception e) { - fail("Exception thrown: " + e.getMessage()); - } - - //repeat, this time looking at content - parser = new RFC822Parser(); - metadata = new Metadata(); - stream = getStream("test-documents/testRFC822-multipart"); - handler = new BodyContentHandler(); - try { - parser.parse(stream, handler, metadata, new ParseContext()); - //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode - String bodyText = handler.toString(); - assertTrue(bodyText.contains("body 1")); - assertTrue(bodyText.contains("body 2")); - assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif - } catch (Exception e) { - fail("Exception thrown: " + e.getMessage()); - } - } - - private static InputStream getStream(String name) { - InputStream stream = Thread.currentThread().getContextClassLoader() - .getResourceAsStream(name); - assertNotNull("Test file not found " + name, stream); - return stream; - } }