This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 213747d7e TIKA-3794 -- Fix bug that prevented specification of
rendered image type via http header in the PDFParser.
213747d7e is described below
commit 213747d7e6f45f3e30cf40dcce8b2135f9d52bc2
Author: tallison <[email protected]>
AuthorDate: Mon Jun 20 15:14:34 2022 -0400
TIKA-3794 -- Fix bug that prevented specification of rendered image type
via http header in the PDFParser.
---
CHANGES.txt | 6 ++
.../java/org/apache/tika/parser/pdf/PDFParser.java | 6 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 2 +-
.../org/apache/tika/server/core/CXFTestBase.java | 22 +++++
.../tika/server/standard/UnpackerResourceTest.java | 96 +++++++++++++++++++++
.../test-documents/testColorRendering.pdf | Bin 0 -> 1794 bytes
6 files changed, 128 insertions(+), 4 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 8066d6ac6..cd390d441 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Release 2.4.2 - ???
+
+ * Fix bug that prevented specification of rendered image type
+ via http header in the PDFParser (TIKA-3794).
+
+
Release 2.4.1 - 06/14/2022
* Implement bulk upload in the OpenSearch emitter (TIKA-3791).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index b3233c35e..e790378ae 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -741,9 +741,9 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
}
//set a default renderer if nothing was defined
PDFBoxRenderer pdfBoxRenderer = new PDFBoxRenderer();
- pdfBoxRenderer.setDPI(defaultConfig.getOcrDPI());
- pdfBoxRenderer.setImageType(defaultConfig.getOcrImageType());
- pdfBoxRenderer.setImageFormatName(defaultConfig.getOcrImageFormatName());
+ pdfBoxRenderer.setDPI(config.getOcrDPI());
+ pdfBoxRenderer.setImageType(config.getOcrImageType());
+ pdfBoxRenderer.setImageFormatName(config.getOcrImageFormatName());
config.setRenderer(pdfBoxRenderer);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index cf477c697..acd57e47f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -637,7 +637,7 @@ public class PDFParserConfig implements Serializable {
* @see #setOcrImageType(ImageType)
*/
public void setOcrImageType(String ocrImageTypeString) {
- this.ocrImageType = parseImageType(ocrImageTypeString);
+ setOcrImageType(parseImageType(ocrImageTypeString));
}
/**
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
index 8d927cb7c..8bfc02023 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
@@ -184,6 +184,28 @@ public abstract class CXFTestBase {
return data;
}
+ /**
+  * Reads every entry of a zip archive fully into memory.
+  * <p>
+  * The stream is first spooled to a temporary file via
+  * {@code writeTemporaryArchiveFile}; that file is deleted before this
+  * method returns, whether or not reading succeeds.
+  *
+  * @param inputStream stream containing the zip archive
+  * @return map from entry name to that entry's raw bytes
+  * @throws IOException if the archive cannot be spooled, read or deleted
+  */
+ protected Map<String, byte[]> readZipArchiveBytes(InputStream inputStream) throws IOException {
+     Map<String, byte[]> data = new HashMap<>();
+     Path tempFile = null;
+     try {
+         tempFile = writeTemporaryArchiveFile(inputStream, "zip");
+         // try-with-resources: the archive is closed before the temp file is
+         // deleted in the finally block, even if reading an entry throws.
+         try (ZipFile zip = new ZipFile(tempFile.toFile())) {
+             Enumeration<ZipArchiveEntry> entries = zip.getEntries();
+             while (entries.hasMoreElements()) {
+                 ZipArchiveEntry entry = entries.nextElement();
+                 ByteArrayOutputStream bos = new ByteArrayOutputStream();
+                 // each entry stream is closed as soon as it has been copied
+                 try (InputStream entryStream = zip.getInputStream(entry)) {
+                     IOUtils.copy(entryStream, bos);
+                 }
+                 data.put(entry.getName(), bos.toByteArray());
+             }
+         }
+     } finally {
+         if (tempFile != null) {
+             Files.delete(tempFile);
+         }
+     }
+     return data;
+ }
+
protected String readArchiveText(InputStream inputStream) throws IOException {
Path tempFile = writeTemporaryArchiveFile(inputStream, "zip");
ZipFile zip = new ZipFile(tempFile.toFile());
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
index af157b916..e7b1bf86c 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
@@ -23,10 +23,13 @@ import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
+import javax.imageio.ImageIO;
import javax.ws.rs.core.Response;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
@@ -35,6 +38,8 @@ import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.junit.jupiter.api.Test;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.server.core.CXFTestBase;
import org.apache.tika.server.core.TikaServerParseExceptionMapper;
@@ -233,4 +238,95 @@ public class UnpackerResourceTest extends CXFTestBase {
String txt = readArchiveText((InputStream) response.getEntity());
CXFTestBase.assertContains("Happy New Year", txt);
}
+
+ /**
+  * Verifies that the {@code ocrImageType} PDF header is honored when pages
+  * are rendered (TIKA-3794): requests RGB rendering of a test PDF, then
+  * checks that the rendered PNG has a red, a green and a blue region.
+  */
+ @Test
+ public void testPDFPerPageRenderColor() throws Exception {
+
+     Response response = WebClient.create(CXFTestBase.endPoint + ALL_PATH)
+             .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "imageStrategy",
+                     "RenderPagesAtPageEnd")
+             .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageType", "rgb")
+             .accept("application/zip").put(ClassLoader.getSystemResourceAsStream(
+                     "test-documents/testColorRendering.pdf"));
+     Map<String, byte[]> results = readZipArchiveBytes((InputStream) response.getEntity());
+     byte[] renderedImage = null;
+     for (Map.Entry<String, byte[]> e : results.entrySet()) {
+         if (e.getKey().startsWith("tika-pdfbox-rendering")) {
+             renderedImage = e.getValue();
+             break;
+         }
+     }
+     // fail with a readable message instead of an NPE when no page was rendered
+     assertNotNull(renderedImage,
+             "no rendered page image found in archive; entries: " + results.keySet());
+     assertEquals("image/png",
+             TikaConfig.getDefaultConfig().getDetector()
+                     .detect(new ByteArrayInputStream(renderedImage), new Metadata()).toString()
+     );
+
+     try (InputStream is = new ByteArrayInputStream(renderedImage)) {
+         BufferedImage image = ImageIO.read(is);
+         // ImageIO.read returns null when no registered reader can decode the data
+         assertNotNull(image, "rendered page image could not be decoded");
+         //top left = red
+         AverageColor averageColor =
+                 getAverageColor(image, 0, image.getWidth() / 5, 0, image.getHeight() / 10);
+         assertTrue(averageColor.red > 250);
+         assertTrue(averageColor.green < 1);
+         assertTrue(averageColor.blue < 1);
+
+         //bottom left = green
+         averageColor = getAverageColor(image, 0, image.getWidth() / 5,
+                 image.getHeight() / 2 + image.getHeight() / 10,
+                 image.getHeight() / 2 + 2 * image.getHeight() / 10);
+
+         assertTrue(averageColor.red < 1);
+         assertTrue(averageColor.green > 250);
+         assertTrue(averageColor.blue < 1);
+
+         //bottom right = blue
+         averageColor = getAverageColor(image, image.getWidth() / 2 + image.getWidth() / 10,
+                 image.getWidth() / 2 + 2 * image.getWidth() / 10,
+                 image.getHeight() / 2 + image.getHeight() / 10,
+                 image.getHeight() / 2 + 2 * image.getHeight() / 10);
+
+         assertTrue(averageColor.red < 1);
+         assertTrue(averageColor.green < 1);
+         assertTrue(averageColor.blue > 250);
+     }
+ }
+
+ /**
+  * Computes the mean red, green and blue channel values over a rectangular
+  * region of an image.
+  *
+  * @param image image to sample
+  * @param minX first column of the region, inclusive
+  * @param maxX end column of the region, exclusive
+  * @param minY first row of the region, inclusive
+  * @param maxY end row of the region, exclusive
+  * @return the per-channel averages; each is NaN when the region is empty
+  */
+ private static AverageColor getAverageColor(BufferedImage image, int minX, int maxX, int minY,
+                                             int maxY) {
+     long redSum = 0;
+     long greenSum = 0;
+     long blueSum = 0;
+     int sampled = 0;
+     for (int col = minX; col < maxX; col++) {
+         for (int row = minY; row < maxY; row++) {
+             // packed ARGB: shift each channel down, then keep its low 8 bits
+             int argb = image.getRGB(col, row);
+             redSum += (argb >> 16) & 0xff;
+             greenSum += (argb >> 8) & 0xff;
+             blueSum += argb & 0xff;
+             sampled++;
+         }
+     }
+     double count = sampled;
+     return new AverageColor(redSum / count, greenSum / count, blueSum / count);
+ }
+
+ /**
+  * Immutable holder for the average red, green and blue channel values of
+  * an image region, as produced by {@code getAverageColor}.
+  */
+ public static class AverageColor {
+     // final: instances are plain value carriers and are never updated after construction
+     final double red;
+     final double green;
+     final double blue;
+
+     /**
+      * @param averageRed   mean red channel value
+      * @param averageGreen mean green channel value
+      * @param averageBlue  mean blue channel value
+      */
+     public AverageColor(double averageRed, double averageGreen, double averageBlue) {
+         this.red = averageRed;
+         this.green = averageGreen;
+         this.blue = averageBlue;
+     }
+
+     @Override
+     public String toString() {
+         return "AverageColor{" + "red=" + red + ", green=" + green + ", blue=" + blue + '}';
+     }
+ }
}
diff --git a/tika-server/tika-server-standard/src/test/resources/test-documents/testColorRendering.pdf b/tika-server/tika-server-standard/src/test/resources/test-documents/testColorRendering.pdf
new file mode 100644
index 000000000..4cf60720c
Binary files /dev/null and b/tika-server/tika-server-standard/src/test/resources/test-documents/testColorRendering.pdf differ