include/vcl/filter/PDFiumLibrary.hxx                 |   29 +++
 vcl/qa/cppunit/PDFiumLibraryTest.cxx                 |  145 ++++++++++++++++
 vcl/qa/cppunit/data/StructureTreeExampleDocument.odt |binary
 vcl/qa/cppunit/data/StructureTreeExampleDocument.pdf |binary
 vcl/source/pdf/PDFiumLibrary.cxx                     |  168 +++++++++++++++++++
 5 files changed, 342 insertions(+)

New commits:
commit ae6aeece1ea3a4b6838f95daad0f266fc6777b96
Author:     Tomaž Vajngerl <[email protected]>
AuthorDate: Fri Jan 10 23:26:03 2025 +0900
Commit:     Miklos Vajna <[email protected]>
CommitDate: Fri Feb 7 09:23:33 2025 +0100

    pdfium: add support for reading the structure tree
    
    + add test for reading the tree
    
    Change-Id: I2f0e9d1852d20b3aa20ec0bcdd3ebc65370d15dd
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/180124
    Tested-by: Jenkins
    Reviewed-by: Tomaž Vajngerl <[email protected]>
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/180880
    Reviewed-by: Miklos Vajna <[email protected]>
    Tested-by: Jenkins CollaboraOffice <[email protected]>

diff --git a/include/vcl/filter/PDFiumLibrary.hxx b/include/vcl/filter/PDFiumLibrary.hxx
index 40895a0f402c..22d656e4c367 100644
--- a/include/vcl/filter/PDFiumLibrary.hxx
+++ b/include/vcl/filter/PDFiumLibrary.hxx
@@ -183,6 +183,34 @@ public:
     virtual basegfx::B2DRectangle getCharBox(int nIndex, double fPageHeight) = 0;
 };
 
+class VCL_DLLPUBLIC PDFiumStructureElement
+{
+public:
+    virtual ~PDFiumStructureElement() = default;
+
+    virtual OUString getAltText() = 0;
+    virtual OUString getActualText() = 0;
+    virtual OUString getID() = 0;
+    virtual OUString getLang() = 0;
+    virtual OUString getTitle() = 0;
+    virtual OUString getType() = 0;
+    virtual OUString getObjectType() = 0;
+
+    virtual int getNumberOfChildren() = 0;
+    virtual int getChildMarkedContentID(int nIndex) = 0;
+    virtual std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) = 0;
+    virtual std::unique_ptr<PDFiumStructureElement> getParent() = 0;
+};
+
+class VCL_DLLPUBLIC PDFiumStructureTree
+{
+public:
+    virtual ~PDFiumStructureTree() = default;
+
+    virtual int getNumberOfChildren() = 0;
+    virtual std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) = 0;
+};
+
 class VCL_DLLPUBLIC PDFiumPage
 {
 public:
@@ -197,6 +225,7 @@ public:
     virtual std::unique_ptr<PDFiumAnnotation> getAnnotation(int nIndex) = 0;
 
     virtual std::unique_ptr<PDFiumTextPage> getTextPage() = 0;
+    virtual std::unique_ptr<PDFiumStructureTree> getStructureTree() = 0;
 
     /// Get bitmap checksum of the page, without annotations/commenting.
     virtual BitmapChecksum getChecksum(int nMDPPerm) = 0;
diff --git a/vcl/qa/cppunit/PDFiumLibraryTest.cxx b/vcl/qa/cppunit/PDFiumLibraryTest.cxx
index 45c59b46d447..1db325dbe889 100644
--- a/vcl/qa/cppunit/PDFiumLibraryTest.cxx
+++ b/vcl/qa/cppunit/PDFiumLibraryTest.cxx
@@ -486,6 +486,151 @@ CPPUNIT_TEST_FIXTURE(PDFiumLibraryTest, testTools)
     CPPUNIT_ASSERT_EQUAL(false, bool(aDateTime.IsUTC));
 }
 
+CPPUNIT_TEST_FIXTURE(PDFiumLibraryTest, testStructureTree)
+{
+    OUString aURL = getFullUrl(u"StructureTreeExampleDocument.pdf");
+    SvFileStream aStream(aURL, StreamMode::READ);
+    GraphicFilter& rGraphicFilter = GraphicFilter::GetGraphicFilter();
+    Graphic aGraphic = rGraphicFilter.ImportUnloadedGraphic(aStream);
+    auto pVectorGraphicData = aGraphic.getVectorGraphicData();
+    CPPUNIT_ASSERT(pVectorGraphicData);
+    CPPUNIT_ASSERT_EQUAL(VectorGraphicDataType::Pdf, pVectorGraphicData->getType());
+    auto& rDataContainer = pVectorGraphicData->getBinaryDataContainer();
+
+    auto pPdfium = vcl::pdf::PDFiumLibrary::get();
+    CPPUNIT_ASSERT(pPdfium);
+
+    auto pDocument
+        = pPdfium->openDocument(rDataContainer.getData(), rDataContainer.getSize(), OString());
+    CPPUNIT_ASSERT(pDocument);
+
+    CPPUNIT_ASSERT_EQUAL(1, pDocument->getPageCount());
+
+    auto pPage = pDocument->openPage(0);
+    CPPUNIT_ASSERT(pPage);
+
+    auto pTree = pPage->getStructureTree();
+    CPPUNIT_ASSERT(pTree);
+    CPPUNIT_ASSERT_EQUAL(1, pTree->getNumberOfChildren());
+
+    // Check the structure
+    {
+        auto pChildDocument = pTree->getChild(0);
+        CPPUNIT_ASSERT(pChildDocument);
+        CPPUNIT_ASSERT_EQUAL(5, pChildDocument->getNumberOfChildren());
+
+        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getAltText());
+        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getActualText());
+        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getID());
+        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getLang());
+        CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getTitle());
+        CPPUNIT_ASSERT_EQUAL(u"Document"_ustr, pChildDocument->getType());
+        CPPUNIT_ASSERT_EQUAL(u"StructElem"_ustr, pChildDocument->getObjectType());
+
+        {
+            auto pThis = pChildDocument->getChild(0);
+            CPPUNIT_ASSERT(pThis);
+            CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType());
+            CPPUNIT_ASSERT_EQUAL(1, pThis->getNumberOfChildren());
+            CPPUNIT_ASSERT_EQUAL(0, pThis->getChildMarkedContentID(0));
+        }
+
+        {
+            auto pThis = pChildDocument->getChild(1);
+            CPPUNIT_ASSERT(pThis);
+            CPPUNIT_ASSERT_EQUAL(u"H1"_ustr, pThis->getType());
+            CPPUNIT_ASSERT_EQUAL(2, pThis->getNumberOfChildren());
+            CPPUNIT_ASSERT_EQUAL(1, pThis->getChildMarkedContentID(0));
+            CPPUNIT_ASSERT_EQUAL(2, pThis->getChildMarkedContentID(1));
+        }
+
+        {
+            auto pThis = pChildDocument->getChild(2);
+            CPPUNIT_ASSERT(pThis);
+            CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType());
+            CPPUNIT_ASSERT_EQUAL(13, pThis->getNumberOfChildren());
+            CPPUNIT_ASSERT_EQUAL(3, pThis->getChildMarkedContentID(0));
+            {
+                auto pChild = pThis->getChild(1);
+                CPPUNIT_ASSERT_EQUAL(u"Code"_ustr, pChild->getType());
+                CPPUNIT_ASSERT_EQUAL(4, pChild->getChildMarkedContentID(0));
+
+                // Check getParent
+                auto pThis2 = pChild->getParent();
+                CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis2->getType());
+                CPPUNIT_ASSERT_EQUAL(13, pThis2->getNumberOfChildren());
+            }
+            CPPUNIT_ASSERT_EQUAL(5, pThis->getChildMarkedContentID(2));
+            CPPUNIT_ASSERT_EQUAL(6, pThis->getChildMarkedContentID(3));
+            {
+                auto pChild = pThis->getChild(4);
+                CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType());
+                CPPUNIT_ASSERT_EQUAL(7, pChild->getChildMarkedContentID(0));
+            }
+            CPPUNIT_ASSERT_EQUAL(8, pThis->getChildMarkedContentID(5));
+            CPPUNIT_ASSERT_EQUAL(9, pThis->getChildMarkedContentID(6));
+            {
+                auto pChild = pThis->getChild(7);
+                CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType());
+                CPPUNIT_ASSERT_EQUAL(10, pChild->getChildMarkedContentID(0));
+            }
+            CPPUNIT_ASSERT_EQUAL(11, pThis->getChildMarkedContentID(8));
+            CPPUNIT_ASSERT_EQUAL(12, pThis->getChildMarkedContentID(9));
+            {
+                auto pChild = pThis->getChild(10);
+                CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType());
+                CPPUNIT_ASSERT_EQUAL(13, pChild->getChildMarkedContentID(0));
+            }
+            CPPUNIT_ASSERT_EQUAL(14, pThis->getChildMarkedContentID(11));
+            {
+                auto pChild = pThis->getChild(12);
+                CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType());
+                CPPUNIT_ASSERT_EQUAL(15, pChild->getChildMarkedContentID(0));
+            }
+        }
+
+        {
+            auto pThis = pChildDocument->getChild(3);
+            CPPUNIT_ASSERT(pThis);
+            CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType());
+            CPPUNIT_ASSERT_EQUAL(4, pThis->getNumberOfChildren());
+            CPPUNIT_ASSERT_EQUAL(16, pThis->getChildMarkedContentID(0));
+            {
+                auto pChild = pThis->getChild(1);
+                CPPUNIT_ASSERT_EQUAL(u"Quote"_ustr, pChild->getType());
+                CPPUNIT_ASSERT_EQUAL(17, pChild->getChildMarkedContentID(0));
+            }
+            CPPUNIT_ASSERT_EQUAL(18, pThis->getChildMarkedContentID(2));
+            {
+                auto pChild = pThis->getChild(3);
+                // Rectangle
+                CPPUNIT_ASSERT_EQUAL(u"Div"_ustr, pChild->getType());
+                CPPUNIT_ASSERT_EQUAL(u"Only Text! - The Alt Text!"_ustr, pChild->getAltText());
+                CPPUNIT_ASSERT_EQUAL(20, pChild->getChildMarkedContentID(0));
+                {
+                    // Text in rectangle
+                    auto pRectangleElement = pChild->getChild(1);
+                    CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pRectangleElement->getType());
+                    CPPUNIT_ASSERT_EQUAL(21, pRectangleElement->getChildMarkedContentID(0));
+                }
+            }
+        }
+
+        {
+            auto pThis = pChildDocument->getChild(4);
+            CPPUNIT_ASSERT(pThis);
+            CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType());
+            CPPUNIT_ASSERT_EQUAL(1, pThis->getNumberOfChildren());
+            CPPUNIT_ASSERT_EQUAL(19, pThis->getChildMarkedContentID(0));
+        }
+
+        {
+            auto pThis = pChildDocument->getChild(5);
+            CPPUNIT_ASSERT(!pThis);
+        }
+    }
+}
+
 CPPUNIT_PLUGIN_IMPLEMENT();
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/vcl/qa/cppunit/data/StructureTreeExampleDocument.odt b/vcl/qa/cppunit/data/StructureTreeExampleDocument.odt
new file mode 100644
index 000000000000..18631b2df570
Binary files /dev/null and b/vcl/qa/cppunit/data/StructureTreeExampleDocument.odt differ
diff --git a/vcl/qa/cppunit/data/StructureTreeExampleDocument.pdf b/vcl/qa/cppunit/data/StructureTreeExampleDocument.pdf
new file mode 100644
index 000000000000..a89fb067060a
Binary files /dev/null and b/vcl/qa/cppunit/data/StructureTreeExampleDocument.pdf differ
diff --git a/vcl/source/pdf/PDFiumLibrary.cxx b/vcl/source/pdf/PDFiumLibrary.cxx
index d56562f34b74..c23db7ebc84d 100644
--- a/vcl/source/pdf/PDFiumLibrary.cxx
+++ b/vcl/source/pdf/PDFiumLibrary.cxx
@@ -21,6 +21,7 @@
 #include <fpdf_signature.h>
 #include <fpdf_formfill.h>
 #include <fpdf_attachment.h>
+#include <fpdf_structtree.h>
 
 #include <osl/endian.h>
 #include <vcl/bitmap.hxx>
@@ -337,6 +338,47 @@ public:
     int getOptionCount(PDFiumDocument* pDoc) override;
 };
 
+class PDFiumStructureElementImpl final : public PDFiumStructureElement
+{
+private:
+    FPDF_STRUCTELEMENT mpStructureElement;
+
+    PDFiumStructureElementImpl(const PDFiumStructureElementImpl&) = delete;
+    PDFiumStructureElementImpl& operator=(const PDFiumStructureElementImpl&) = delete;
+
+public:
+    PDFiumStructureElementImpl(FPDF_STRUCTELEMENT pStructureElement);
+
+    OUString getAltText() override;
+    OUString getActualText() override;
+    OUString getID() override;
+    OUString getLang() override;
+    OUString getTitle() override;
+    OUString getType() override;
+    OUString getObjectType() override;
+
+    int getNumberOfChildren() override;
+    int getChildMarkedContentID(int nIndex) override;
+    std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) override;
+    std::unique_ptr<PDFiumStructureElement> getParent() override;
+};
+
+class PDFiumStructureTreeImpl final : public PDFiumStructureTree
+{
+private:
+    FPDF_STRUCTTREE mpStructureTree;
+
+    PDFiumStructureTreeImpl(const PDFiumStructureTreeImpl&) = delete;
+    PDFiumStructureTreeImpl& operator=(const PDFiumStructureTreeImpl&) = delete;
+
+public:
+    PDFiumStructureTreeImpl(FPDF_STRUCTTREE pStructureTree);
+    ~PDFiumStructureTreeImpl();
+
+    int getNumberOfChildren() override;
+    std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) override;
+};
+
 class PDFiumPageObjectImpl final : public PDFiumPageObject
 {
 private:
@@ -462,6 +504,8 @@ public:
 
     std::unique_ptr<PDFiumTextPage> getTextPage() override;
 
+    std::unique_ptr<PDFiumStructureTree> getStructureTree() override;
+
     BitmapChecksum getChecksum(int nMDPPerm) override;
 
     double getWidth() override;
@@ -910,6 +954,17 @@ std::unique_ptr<PDFiumTextPage> PDFiumPageImpl::getTextPage()
     return pPDFiumTextPage;
 }
 
+std::unique_ptr<PDFiumStructureTree> PDFiumPageImpl::getStructureTree()
+{
+    std::unique_ptr<PDFiumStructureTree> pPDFiumStructureTree;
+    FPDF_STRUCTTREE pStructTree = FPDF_StructTree_GetForPage(mpPage);
+    if (pStructTree)
+    {
+        pPDFiumStructureTree = std::make_unique<PDFiumStructureTreeImpl>(pStructTree);
+    }
+    return pPDFiumStructureTree;
+}
+
 bool PDFiumPageImpl::hasLinks()
 {
     // This could be a full iterator, but at the moment we just determine if the list is empty or
@@ -1546,6 +1601,119 @@ std::unique_ptr<PDFiumPageObject> PDFiumAnnotationImpl::getObject(int nIndex)
     return pPDFiumPageObject;
 }
 
+PDFiumStructureElementImpl::PDFiumStructureElementImpl(FPDF_STRUCTELEMENT pStructureElement)
+    : mpStructureElement(pStructureElement)
+{
+}
+
+OUString PDFiumStructureElementImpl::getAltText()
+{
+    return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) {
+        return FPDF_StructElement_GetAltText(mpStructureElement, buffer, length);
+    });
+}
+
+OUString PDFiumStructureElementImpl::getActualText()
+{
+    return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) {
+        return FPDF_StructElement_GetActualText(mpStructureElement, buffer, length);
+    });
+}
+
+OUString PDFiumStructureElementImpl::getID()
+{
+    return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) {
+        return FPDF_StructElement_GetID(mpStructureElement, buffer, length);
+    });
+}
+
+OUString PDFiumStructureElementImpl::getLang()
+{
+    return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) {
+        return FPDF_StructElement_GetLang(mpStructureElement, buffer, length);
+    });
+}
+
+OUString PDFiumStructureElementImpl::getType()
+{
+    return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) {
+        return FPDF_StructElement_GetType(mpStructureElement, buffer, length);
+    });
+}
+
+OUString PDFiumStructureElementImpl::getObjectType()
+{
+    return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) {
+        return FPDF_StructElement_GetObjType(mpStructureElement, buffer, length);
+    });
+}
+
+int PDFiumStructureElementImpl::getChildMarkedContentID(int nIndex)
+{
+    return FPDF_StructElement_GetChildMarkedContentID(mpStructureElement, nIndex);
+}
+
+OUString PDFiumStructureElementImpl::getTitle()
+{
+    return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) {
+        return FPDF_StructElement_GetTitle(mpStructureElement, buffer, length);
+    });
+}
+
+int PDFiumStructureElementImpl::getNumberOfChildren()
+{
+    return FPDF_StructElement_CountChildren(mpStructureElement);
+}
+
+std::unique_ptr<PDFiumStructureElement> PDFiumStructureElementImpl::getChild(int nIndex)
+{
+    std::unique_ptr<PDFiumStructureElement> pPDFiumStructureElement;
+    FPDF_STRUCTELEMENT pElement = FPDF_StructElement_GetChildAtIndex(mpStructureElement, nIndex);
+    if (pElement)
+    {
+        pPDFiumStructureElement = std::make_unique<PDFiumStructureElementImpl>(pElement);
+    }
+    return pPDFiumStructureElement;
+}
+
+std::unique_ptr<PDFiumStructureElement> PDFiumStructureElementImpl::getParent()
+{
+    std::unique_ptr<PDFiumStructureElement> pPDFiumStructureElement;
+    FPDF_STRUCTELEMENT pElement = FPDF_StructElement_GetParent(mpStructureElement);
+    if (pElement)
+    {
+        pPDFiumStructureElement = std::make_unique<PDFiumStructureElementImpl>(pElement);
+    }
+    return pPDFiumStructureElement;
+}
+
+PDFiumStructureTreeImpl::PDFiumStructureTreeImpl(FPDF_STRUCTTREE pStructureTree)
+    : mpStructureTree(pStructureTree)
+{
+}
+
+PDFiumStructureTreeImpl::~PDFiumStructureTreeImpl()
+{
+    if (mpStructureTree)
+        FPDF_StructTree_Close(mpStructureTree);
+}
+
+int PDFiumStructureTreeImpl::getNumberOfChildren()
+{
+    return FPDF_StructTree_CountChildren(mpStructureTree);
+}
+
+std::unique_ptr<PDFiumStructureElement> PDFiumStructureTreeImpl::getChild(int nIndex)
+{
+    std::unique_ptr<PDFiumStructureElement> pPDFiumStructureElement;
+    FPDF_STRUCTELEMENT pElement = FPDF_StructTree_GetChildAtIndex(mpStructureTree, nIndex);
+    if (pElement)
+    {
+        pPDFiumStructureElement = std::make_unique<PDFiumStructureElementImpl>(pElement);
+    }
+    return pPDFiumStructureElement;
+}
+
 PDFiumTextPageImpl::PDFiumTextPageImpl(FPDF_TEXTPAGE pTextPage)
     : mpTextPage(pTextPage)
 {

Reply via email to