This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 3c3e8e1 TIKA-2286
3c3e8e1 is described below
commit 3c3e8e17e3016c5d729743ef40db6b00c1c2591f
Author: tballison <[email protected]>
AuthorDate: Wed Mar 1 09:17:57 2017 -0500
TIKA-2286
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 2 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 10 +++++++++
.../apache/tika/parser/pdf/PDFParserConfig.java | 24 ++++++++++++++++++++--
.../org/apache/tika/parser/pdf/PDFParserTest.java | 4 +++-
.../apache/tika/parser/pdf/tika-inline-config.xml | 4 +++-
5 files changed, 39 insertions(+), 5 deletions(-)
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 944ae9c..e572d6a 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -330,7 +330,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
try (OutputStream os = Files.newOutputStream(tmpFile)) {
//TODO: get output format from TesseractConfig
ImageIOUtil.writeImage(image, config.getOcrImageFormatName(),
- os, config.getOcrDPI());
+ os, config.getOcrDPI(), config.getOcrImageQuality());
}
try (InputStream is = TikaInputStream.get(tmpFile)) {
tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index f48bea0..bdbea31 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -636,6 +636,16 @@ public class PDFParser extends AbstractParser {
}
@Field
+ void setOcrImageQuality(float imageQuality) {
+ defaultConfig.setOcrImageQuality(imageQuality);
+ }
+
+ @Field
+ void setOcrImageFormatName(String formatName) {
+ defaultConfig.setOcrImageFormatName(formatName);
+ }
+
+ @Field
void setExtractInlineImages(boolean extractInlineImages) {
defaultConfig.setExtractInlineImages(extractInlineImages);
}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 5ffffac..ce09fa7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -118,6 +118,7 @@ public class PDFParserConfig implements Serializable {
private int ocrDPI = 300;
private ImageType ocrImageType = ImageType.GRAY;
private String ocrImageFormatName = "png";
+ private float ocrImageQuality = 1.0f;
private AccessChecker accessChecker;
@@ -329,7 +330,6 @@ public class PDFParserConfig implements Serializable {
*/
public void setExtractUniqueInlineImagesOnly(boolean
extractUniqueInlineImagesOnly) {
this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly;
-
}
/**
@@ -566,7 +566,9 @@ public class PDFParserConfig implements Serializable {
}
/**
- * Dots per inche used to render the page image for OCR
+ * Dots per inch used to render the page image for OCR.
+ * This does not apply to all image formats.
+ *
* @param ocrDPI
*/
public void setOcrDPI(int ocrDPI) {
@@ -574,6 +576,24 @@ public class PDFParserConfig implements Serializable {
}
/**
+ * Image quality used to render the page image for OCR.
+ * This does not apply to all image formats.
+ * @return the image quality used when writing the page image for OCR
+ */
+ public float getOcrImageQuality() {
+ return ocrImageQuality;
+ }
+
+ /**
+ * Image quality used to render the page image for OCR.
+ * This does not apply to all image formats.
+ * @param ocrImageQuality the image quality to use when writing the page image for OCR
+ */
+ public void setOcrImageQuality(float ocrImageQuality) {
+ this.ocrImageQuality = ocrImageQuality;
+ }
+
+ /**
* Whether or not to extract PDActions from the file.
* Most Action types are handled inline; javascript macros
* are processed as embedded documents.
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 51fb9c9..30afc76 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1332,7 +1332,9 @@ public class PDFParserTest extends TikaTest {
assertEquals(new AccessChecker(true),
pdfParserConfig.getAccessChecker());
assertEquals(true, pdfParserConfig.getExtractInlineImages());
assertEquals(false,
pdfParserConfig.getExtractUniqueInlineImagesOnly());
- assertEquals(314159, pdfParserConfig.getOcrDPI());
+ assertEquals(314, pdfParserConfig.getOcrDPI());
+ assertEquals(2.1f, pdfParserConfig.getOcrImageQuality(), .01f);
+ assertEquals("jpeg", pdfParserConfig.getOcrImageFormatName());
assertEquals(false,
pdfParserConfig.getCatchIntermediateIOExceptions());
}
}
diff --git
a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
index 9436604..61373f7 100644
---
a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
+++
b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-inline-config.xml
@@ -11,7 +11,9 @@
<param name="catchIntermediateExceptions"
type="bool">false</param>
<param name="extractUniqueInlineImagesOnly"
type="bool">false</param>
<param name="catchIntermediateExceptions"
type="bool">false</param>
- <param name="ocrDPI" type="int">314159</param>
+ <param name="ocrDPI" type="int">314</param>
+ <param name="ocrImageQuality" type="float">2.1</param>
+ <param name="ocrImageFormatName" type="string">jpeg</param>
<!-- we really should throw an exception for this!! -->
<param name="someRandomThingOrOther" type="bool">true</param>
</params>
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].