This is an automated email from the ASF dual-hosted git repository. tilman pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 7c383171529f03ca5c6be8d3398c94a28bb53acf Author: Tilman Hausherr <[email protected]> AuthorDate: Sat Aug 12 20:06:39 2023 +0200 TIKA-4114: avoid methods that no longer exist in PDFBox 3.0 --- .../apache/tika/parser/pdf/PDFMarkedContent2XHTML.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java index a3a49a367..1bdbebc09 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java @@ -100,7 +100,9 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML { * * @param pdDocument PDF document * @param handler SAX content handler + * @param context * @param metadata PDF metadata + * @param config * @throws SAXException if the content handler fails to process SAX events * @throws TikaException if there was an exception outside of per page processing */ @@ -273,25 +275,29 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML { for (COSBase k : ((COSArray) kids)) { recurse(k, currentPageRef, depth, paragraphs, roleMap); } - } else if (kids instanceof COSObject) { - COSBase cosType = ((COSObject) kids).getItem(COSName.TYPE); + } else if (kids instanceof COSObject && + ((COSObject) kids).getObject() instanceof COSDictionary) { //TODO should be merged with COSDictionary segment below? + // and maybe dereference COSObject first, i.e. before the first "if"? 
+ COSDictionary dict = (COSDictionary) ((COSObject) kids).getObject(); + COSBase cosType = dict.getItem(COSName.TYPE); if (cosType != null && cosType instanceof COSName) { if ("OBJR".equals(((COSName) cosType).getName())) { - recurse(((COSObject) kids).getDictionaryObject(COSName.OBJ), currentPageRef, + recurse(dict.getDictionaryObject(COSName.OBJ), currentPageRef, depth + 1, paragraphs, roleMap); } } - COSBase n = ((COSObject) kids).getItem(COSName.S); + COSBase n = dict.getItem(COSName.S); String name = ""; if (n instanceof COSName) { name = ((COSName) n).getName(); } - COSBase grandkids = ((COSObject) kids).getItem(COSName.K); + COSBase grandkids = dict.getItem(COSName.K); if (grandkids == null) { return; } - COSBase pageBase = ((COSObject) kids).getItem(COSName.PG); + COSBase pageBase = dict.getItem(COSName.PG); if (pageBase != null && pageBase instanceof COSObject) { currentPageRef = new ObjectRef(((COSObject) pageBase).getObjectNumber(),
