This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 99dc4885d9ec670396b932df4ee3a911a9c262f4 Author: tballison <[email protected]> AuthorDate: Fri May 6 14:14:33 2022 -0400 add VectorGraphicsOnlyPDFRenderer --- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 3 + .../apache/tika/parser/pdf/PDFParserConfig.java | 9 +- .../parser/pdf/VectorGraphicsOnlyPDFRenderer.java | 133 +++++++++++++++++++++ 3 files changed, 144 insertions(+), 1 deletion(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 344756dd0..f473e6f01 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -579,6 +579,9 @@ class AbstractPDF2XHTML extends PDFTextStripper { case TEXT_ONLY: renderer = new TextOnlyPDFRenderer(pdDocument); break; + case VECTOR_GRAPHICS_ONLY: + renderer = new VectorGraphicsOnlyPDFRenderer(pdDocument); + break; case ALL: renderer = new PDFRenderer(pdDocument); break; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index fb8a315ae..12b10e6c1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -930,7 +930,12 @@ public class PDFParserConfig implements Serializable { } public enum OCR_RENDERING_STRATEGY { - NO_TEXT, TEXT_ONLY, ALL; //AUTO? + + NO_TEXT, //includes vector graphics and image + TEXT_ONLY, //renders only glyphs + VECTOR_GRAPHICS_ONLY, //renders only vector graphics + ALL; + //TODO: add AUTO? private static OCR_RENDERING_STRATEGY parse(String s) { if (s == null) { @@ -938,6 +943,8 @@ public class PDFParserConfig implements Serializable { } String lc = s.toLowerCase(Locale.US); switch (lc) { + case "vector_graphics_only": + return VECTOR_GRAPHICS_ONLY; case "text_only": return TEXT_ONLY; case "no_text": diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/VectorGraphicsOnlyPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/VectorGraphicsOnlyPDFRenderer.java new file mode 100644 index 000000000..acd4b9485 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/VectorGraphicsOnlyPDFRenderer.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pdf; + +import java.awt.Graphics2D; +import java.io.IOException; + +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDType3Font; +import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup; +import org.apache.pdfbox.pdmodel.graphics.image.PDImage; +import org.apache.pdfbox.rendering.PDFRenderer; +import org.apache.pdfbox.rendering.PageDrawer; +import org.apache.pdfbox.rendering.PageDrawerParameters; +import org.apache.pdfbox.util.Matrix; +import org.apache.pdfbox.util.Vector; + +/** + * This class extends the PDFRenderer to render only the vector + * graphics on a page. Its PageDrawer overrides the text, glyph, image, + * transparency group and marked content callbacks with empty bodies + * and leaves every other PageDrawer method as inherited. + */ +public class VectorGraphicsOnlyPDFRenderer extends PDFRenderer { + + public VectorGraphicsOnlyPDFRenderer(PDDocument document) { + super(document); + } + + /** + * Returns a new PageDrawer instance, using the given parameters. May be overridden.
+ */ + protected PageDrawer createPageDrawer(PageDrawerParameters parameters) throws IOException { + PageDrawer pageDrawer = new VectorGraphicsOnlyDrawer(parameters); + pageDrawer.setAnnotationFilter(getAnnotationsFilter()); + return pageDrawer; + } + + private class VectorGraphicsOnlyDrawer extends PageDrawer { + public VectorGraphicsOnlyDrawer(PageDrawerParameters parameters) throws IOException { + super(parameters); + } + + + @Override + public void beginText() throws IOException { + } + + @Override + public void endText() throws IOException { + } + + @Override + protected void showFontGlyph(Matrix textRenderingMatrix, PDFont font, int code, + Vector displacement) throws IOException { + } + + @Override + protected void showType3Glyph(Matrix textRenderingMatrix, PDType3Font font, int code, + Vector displacement) throws IOException { + } + + @Override + public void drawImage(PDImage pdImage) throws IOException { + } + + @Override + protected void showTransparencyGroupOnGraphics(PDTransparencyGroup form, + Graphics2D graphics) throws IOException { + } + + @Override + public void beginMarkedContentSequence(COSName tag, COSDictionary properties) { + } + + @Override + public void endMarkedContentSequence() { + } + + + @Override + public void showTextString(byte[] string) throws IOException { + } + + @Override + public void showTextStrings(COSArray array) throws IOException { + } + + @Override + protected void applyTextAdjustment(float tx, float ty) throws IOException { + } + + @Override + protected void showText(byte[] string) throws IOException { + } + + @Override + protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode, + Vector displacement) throws IOException { + } + + @Override + protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, + Vector displacement) throws IOException { + } + + @Override + protected void showFontGlyph(Matrix textRenderingMatrix, PDFont font, int code, + String unicode, Vector
displacement) throws IOException { + } + + @Override + protected void showType3Glyph(Matrix textRenderingMatrix, PDType3Font font, int code, + String unicode, Vector displacement) throws IOException { + } + } +}
