Author: tilman
Date: Wed Mar 12 15:23:19 2025
New Revision: 1924338
URL: http://svn.apache.org/viewvc?rev=1924338&view=rev
Log:
PDFBOX-5974: check that all MCIDs of a page content stream have an entry in the
ParentTree.
Modified:
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java
Modified:
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java?rev=1924338&r1=1924337&r2=1924338&view=diff
==============================================================================
---
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java
(original)
+++
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java
Wed Mar 12 15:23:19 2025
@@ -27,6 +27,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.TreeSet;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSArray;
@@ -44,9 +45,12 @@ import org.apache.pdfbox.pdmodel.PDResou
import org.apache.pdfbox.pdmodel.common.COSObjectable;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.common.PDNumberTreeNode;
+import
org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDMarkedContentReference;
+import
org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDParentTreeValue;
import
org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement;
import
org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureNode;
import
org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
+import
org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
@@ -62,6 +66,7 @@ import org.apache.pdfbox.pdmodel.interac
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFMarkedContentExtractor;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
@@ -648,6 +653,9 @@ class PDFMergerUtilityTest
/**
* PDFBOX-4408: Check that /StructParents values from pages and
/StructParent values from
* annotations are found in the /ParentTree.
+ * <p>
+ * Expanded in 2025 to check that all MCIDs of a page content stream have
an entry in the
+ * ParentTree.
*
* @param document
*/
@@ -673,11 +681,71 @@ class PDFMergerUtilityTest
}
}
}
- for (PDPage page : document.getPages())
+ PDPageTree pageTree = document.getPages();
+ for (PDPage page : pageTree)
{
+ int pageNum = pageTree.indexOf(page) + 1;
if (page.getStructParents() >= 0)
{
- assertTrue(keySet.contains(page.getStructParents()));
+ assertTrue(keySet.contains(page.getStructParents()),
"/StructParents " + page.getStructParents() + " from page " +
+ pageNum + " not found in /ParentTree");
+ PDParentTreeValue obj = (PDParentTreeValue)
numberTreeAsMap.get(page.getStructParents());
+ assertTrue(obj.getCOSObject() instanceof COSArray, "Expected
array in page " + pageNum + ", got " + obj.getClass());
+ COSArray array = (COSArray) obj.getCOSObject();
+
+ PDFMarkedContentExtractor markedContentExtractor = new
PDFMarkedContentExtractor();
+ markedContentExtractor.processPage(page);
+ List<PDMarkedContent> markedContents =
markedContentExtractor.getMarkedContents();
+ TreeSet<Integer> set = new TreeSet<>();
+ for (PDMarkedContent pdMarkedContent : markedContents)
+ {
+ COSDictionary pdmcProperties =
pdMarkedContent.getProperties();
+ if (pdmcProperties == null)
+ {
+ continue;
+ }
+ int mcid = pdMarkedContent.getMCID();
+ if (mcid >= 0)
+ {
+ // "For a page object (...), the value shall be an
array of references
+ // to the parent elements of those marked-content
sequences."
+ // this means that the /Pg entry doesn't have to match
the page
+ COSDictionary dict = (COSDictionary)
array.getObject(mcid);
+ assertNotNull(dict);
+ set.add(mcid);
+ PDStructureElement structureElemen =
(PDStructureElement) PDStructureNode.create(dict);
+ List<Object> kids = structureElemen.getKids();
+ boolean found = false;
+ for (Object kid : kids)
+ {
+ if (kid instanceof Integer && ((Integer) kid) ==
mcid)
+ {
+ found = true;
+ break;
+ }
+ if (kid instanceof PDMarkedContentReference)
+ {
+ PDMarkedContentReference mcr =
(PDMarkedContentReference) kid;
+ if (mcid == mcr.getMCID())
+ {
+ found = true;
+ if (mcr.getPage() != null)
+ {
+ assertEquals(page, mcr.getPage());
+ }
+ else
+ {
+ assertEquals(page,
structureElemen.getPage());
+ }
+ break;
+ }
+ }
+ }
+ assertTrue(found, "page: " + pageNum + ", mcid: " +
mcid + " not found");
+ }
+ }
+ // actual count may be larger if last element is null, e.g.
PDFBOX-4408
+ assertTrue(set.last() <= array.size() - 1);
}
for (PDAnnotation ann : page.getAnnotations())
{