This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit d1526d053f91497ac7bcd4509f1555f4347377d6 Author: Ewan Mellor <[email protected]> AuthorDate: Wed Feb 21 13:09:53 2018 -0800 Fix for TIKA-2582 contributed by ewanmellor. Tesseract 4.0 includes a change to use form feed characters to separate pages by default in its text output. Previous versions used no separator unless you specified the include_page_breaks option. This confuses any parser that is not expecting the FF. ODFParserTest.testOO2Metadata fails, because it is expecting the output of a blank image to be the empty string, but now the FF is there. I haven't seen any other failures, but I expect that user code will now see either FF or U+FFFD where they are not expecting it (SafeContentHandler replaces the FF with U+FFFD when converting text to XML). Fix this by setting Tesseract's page_separator option to the empty string. This will preserve the no-page-breaks behavior with both Tesseract 3.x and 4.0. Also, add an option TesseractOCRConfig.pageSeparator so that user code can request the FF or any other separator, if they want it. --- .../apache/tika/parser/ocr/TesseractOCRConfig.java | 22 ++++++++++++++++++++++ .../apache/tika/parser/ocr/TesseractOCRParser.java | 1 + 2 files changed, 23 insertions(+) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java index afe0a21..4139cd2 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java @@ -91,6 +91,9 @@ public class TesseractOCRConfig implements Serializable { // factor by which image is to be scaled. private int resize = 900; + // See setPageSeparator. 
+ private String pageSeparator = ""; + // whether or not to preserve interword spacing private boolean preserveInterwordSpacing = false; @@ -256,6 +259,25 @@ public class TesseractOCRConfig implements Serializable { } /** + * @see #setPageSeparator(String pageSeparator) + */ + public String getPageSeparator() { + return pageSeparator; + } + + /** + * The page separator to use in plain text output. This corresponds to Tesseract's page_separator config option. + * The default here is the empty string (i.e. no page separators). Note that this is also the default in + * Tesseract 3.x, but in Tesseract 4.0 the default is to use the form feed control character. We are overriding + * Tesseract 4.0's default here. + * + * @param pageSeparator + */ + public void setPageSeparator(String pageSeparator) { + this.pageSeparator = pageSeparator; + } + + /** * Whether or not to maintain interword spacing. Default is <code>false</code>. * * @param preserveInterwordSpacing diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 08847fd..3e15c44 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -468,6 +468,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l", config.getLanguage(), "-psm", config.getPageSegMode(), config.getOutputType().name().toLowerCase(Locale.US), + "-c", "page_separator=" + config.getPageSeparator(), "-c", (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"}; ProcessBuilder pb = new ProcessBuilder(cmd); -- To stop receiving notification emails like this one, please contact [email protected].
