This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 7c383171529f03ca5c6be8d3398c94a28bb53acf
Author: Tilman Hausherr <[email protected]>
AuthorDate: Sat Aug 12 20:06:39 2023 +0200

    TIKA-4114: avoid methods that no longer exist in PDFBox 3.0
---
 .../apache/tika/parser/pdf/PDFMarkedContent2XHTML.java | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index a3a49a367..1bdbebc09 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -100,7 +100,9 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
      *
      * @param pdDocument PDF document
      * @param handler    SAX content handler
+     * @param context
      * @param metadata   PDF metadata
+     * @param config
      * @throws SAXException  if the content handler fails to process SAX events
      * @throws TikaException if there was an exception outside of per page processing
      */
@@ -273,25 +275,29 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
             for (COSBase k : ((COSArray) kids)) {
                 recurse(k, currentPageRef, depth, paragraphs, roleMap);
             }
-        } else if (kids instanceof COSObject) {
-            COSBase cosType = ((COSObject) kids).getItem(COSName.TYPE);
+        } else if (kids instanceof COSObject && 
+                ((COSObject) kids).getObject() instanceof COSDictionary) {
+            //TODO should be merged with COSDictionary segment below?
+            // and maybe dereference COSObject first, i.e. before the first "if"?
+            COSDictionary dict = (COSDictionary) ((COSObject) 
kids).getObject();
+            COSBase cosType = dict.getItem(COSName.TYPE);
             if (cosType != null && cosType instanceof COSName) {
                 if ("OBJR".equals(((COSName) cosType).getName())) {
-                    recurse(((COSObject) 
kids).getDictionaryObject(COSName.OBJ), currentPageRef,
+                    recurse(dict.getDictionaryObject(COSName.OBJ), 
currentPageRef,
                             depth + 1, paragraphs, roleMap);
                 }
             }
 
-            COSBase n = ((COSObject) kids).getItem(COSName.S);
+            COSBase n = dict.getItem(COSName.S);
             String name = "";
             if (n instanceof COSName) {
                 name = ((COSName) n).getName();
             }
-            COSBase grandkids = ((COSObject) kids).getItem(COSName.K);
+            COSBase grandkids = dict.getItem(COSName.K);
             if (grandkids == null) {
                 return;
             }
-            COSBase pageBase = ((COSObject) kids).getItem(COSName.PG);
+            COSBase pageBase = dict.getItem(COSName.PG);
 
             if (pageBase != null && pageBase instanceof COSObject) {
                 currentPageRef = new ObjectRef(((COSObject) 
pageBase).getObjectNumber(),

Reply via email to