[ https://issues.apache.org/jira/browse/TIKA-2385?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16273690#comment-16273690 ]
ASF GitHub Bot commented on TIKA-2385: -------------------------------------- dameikle closed pull request #183: fix for TIKA-2385 contributed by pmweiss5 URL: https://github.com/apache/tika/pull/183 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java index 624c97e39..c8c8bc93e 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java @@ -94,6 +94,9 @@ // whether or not to preserve interword spacing private boolean preserveInterwordSpacing = false; + // whether or not to apply rotation calculated by the rotation.py script + private boolean applyRotation = false; + /** * Default contructor. @@ -169,6 +172,8 @@ private void init(InputStream is) { getProp(props, "filter", getFilter())); setResize( getProp(props, "resize", getResize())); + setApplyRotation( + getProp(props, "applyRotation", getApplyRotation())); } @@ -472,6 +477,23 @@ public void setImageMagickPath(String ImageMagickPath) { this.ImageMagickPath = ImageMagickPath; } + /** + * @return Whether or not a rotation value should be calculated and passed to ImageMagick before performing OCR. + * (Requires that Python is installed). + */ + public boolean getApplyRotation() { + return this.applyRotation; + } + + /** + * Sets whether or not a rotation value should be calculated and passed to ImageMagick. + * + * @param true to calculate and apply rotation, false to skip. Default is false, true required Python installed. 
+ */ + public void setApplyRotation(boolean applyRotation) { + this.applyRotation = applyRotation; + } + /** * Get property from the properties file passed in. * diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 121e096e1..c28f6e1aa 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -170,7 +170,7 @@ private boolean hasImageMagick(TesseractOCRConfig config) { } - private static boolean hasPython() { + static boolean hasPython() { // check if python is installed and if the rotation program path has been specified correctly boolean hasPython = false; @@ -321,7 +321,7 @@ private void processImage(File streamingObject, TesseractOCRConfig config) throw // determine the angle of rotation required to make the text horizontal CommandLine cmdLine = CommandLine.parse(cmd); - if(hasPython()) { + if(config.getApplyRotation() && hasPython()) { try { executor.execute(cmdLine); angle = outputStream.toString("UTF-8").trim(); diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties index 26b6031bc..73c9083ce 100644 --- a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties @@ -32,4 +32,5 @@ density=300 depth=4 colorspace=gray filter=triangle -resize=900 \ No newline at end of file +resize=900 +applyRotation=false \ No newline at end of file diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py index fb391f1fe..c619325e3 100644 --- 
a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py @@ -46,12 +46,12 @@ def main(argv): filename = arg try: - from parabolic import parabolic + from parabolic import parabolic def argmax(x): - return parabolic(x, numpy.argmax(x))[0] - except ImportError: - from numpy import argmax + return parabolic(x, numpy.argmax(x))[0] + except ImportError: + from numpy import argmax # Load file, converting to grayscale I = asarray(Image.open(filename).convert('L')) @@ -69,4 +69,4 @@ def argmax(x): print('{:.2f}'.format(-(90-rotation))) if __name__ == "__main__": - main(sys.argv[1:]) \ No newline at end of file + main(sys.argv[1:]) diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java index adec5dbc8..8e22f21aa 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java @@ -47,6 +47,7 @@ public void testNoConfig() throws Exception { assertEquals("Invalid default colorpsace value", "gray" , config.getColorspace()); assertEquals("Invalid default filter value", "triangle" , config.getFilter()); assertEquals("Invalid default resize value", 900 , config.getResize()); + assertEquals("Invalid default applyRotation value", false, config.getApplyRotation()); } @Test @@ -68,6 +69,7 @@ public void testPartialConfig() throws Exception { assertEquals("Invalid overridden depth value", 8 , config.getDepth()); assertEquals("Invalid overridden filter value", "box" , config.getFilter()); assertEquals("Invalid overridden resize value", 300 , config.getResize()); + assertEquals("Invalid default applyRotation value", false, config.getApplyRotation()); } @Test @@ -91,6 +93,7 @@ public void testFullConfig() throws Exception { assertEquals("Invalid overridden 
depth value", 8 , config.getDepth()); assertEquals("Invalid overridden filter value", "box" , config.getFilter()); assertEquals("Invalid overridden resize value", 300 , config.getResize()); + assertEquals("Invalid overridden applyRotation value", true, config.getApplyRotation()); } @Test diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index 4c0ab76d7..4b210e10b 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -278,4 +278,20 @@ public void testInterwordSpacing() throws Exception { Matcher m = Pattern.compile("The\\s{5,20}quick").matcher(xml); assertTrue(m.find()); } + + @Test + public void testRotatedOCR() throws Exception { + if (TesseractOCRParser.hasPython()) { + + TesseractOCRConfig config = new TesseractOCRConfig(); + config.setApplyRotation(true); + config.setEnableImageProcessing(1); + ParseContext parseContext = new ParseContext(); + parseContext.set(TesseractOCRConfig.class, config); + assumeTrue(canRun(config)); + + String ocr = getText(getResourceAsStream("/test-documents/testRotated.png"), new AutoDetectParser(), parseContext); + assertContains("Its had resolving otherwise she contented therefore", ocr); + } + } } diff --git a/tika-parsers/src/test/resources/test-documents/testRotated.png b/tika-parsers/src/test/resources/test-documents/testRotated.png new file mode 100644 index 000000000..f535b5017 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testRotated.png differ diff --git a/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties b/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties index 3a96ef192..ddc54b99a 100644 --- 
a/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties +++ b/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties @@ -25,4 +25,5 @@ ImageMagickPath=/usr/local/bin density=200 depth=8 filter=box -resize=300 \ No newline at end of file +resize=300 +applyRotation=true \ No newline at end of file ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Tesseract OCR rotation.py not run > --------------------------------- > > Key: TIKA-2385 > URL: https://issues.apache.org/jira/browse/TIKA-2385 > Project: Tika > Issue Type: Bug > Components: ocr > Affects Versions: 1.15 > Reporter: Peter Weiss > Assignee: Dave Meikle > Fix For: 1.17 > > > It appears that even if Python is installed, the rotation.py that calculates the > rotation angle of the image does not run because of indentation/spacing > errors in the Python script. > Also recommend making this a configurable parameter since it does add time > and can produce unexpected results if the supplied image contains more than > just plain text. -- This message was sent by Atlassian JIRA (v6.4.14#64029)