 include/vcl/filter/PDFiumLibrary.hxx                 |  29 +++
 vcl/qa/cppunit/PDFiumLibraryTest.cxx                 | 145 ++++++++++++++++
 vcl/qa/cppunit/data/StructureTreeExampleDocument.odt |binary
 vcl/qa/cppunit/data/StructureTreeExampleDocument.pdf |binary
 vcl/source/pdf/PDFiumLibrary.cxx                     | 168 +++++++++++++++++++
 5 files changed, 342 insertions(+)
New commits:
commit ae6aeece1ea3a4b6838f95daad0f266fc6777b96
Author:     Tomaž Vajngerl <[email protected]>
AuthorDate: Fri Jan 10 23:26:03 2025 +0900
Commit:     Miklos Vajna <[email protected]>
CommitDate: Fri Feb 7 09:23:33 2025 +0100

    pdfium: add support for reading the structure tree

    + add test for reading the tree

    Change-Id: I2f0e9d1852d20b3aa20ec0bcdd3ebc65370d15dd
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/180124
    Tested-by: Jenkins
    Reviewed-by: Tomaž Vajngerl <[email protected]>
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/180880
    Reviewed-by: Miklos Vajna <[email protected]>
    Tested-by: Jenkins CollaboraOffice <[email protected]>

diff --git a/include/vcl/filter/PDFiumLibrary.hxx b/include/vcl/filter/PDFiumLibrary.hxx
index 40895a0f402c..22d656e4c367 100644
--- a/include/vcl/filter/PDFiumLibrary.hxx
+++ b/include/vcl/filter/PDFiumLibrary.hxx
@@ -183,6 +183,34 @@ public: virtual basegfx::B2DRectangle getCharBox(int nIndex, double fPageHeight) = 0; }; +class VCL_DLLPUBLIC PDFiumStructureElement +{ +public: + virtual ~PDFiumStructureElement() = default; + + virtual OUString getAltText() = 0; + virtual OUString getActualText() = 0; + virtual OUString getID() = 0; + virtual OUString getLang() = 0; + virtual OUString getTitle() = 0; + virtual OUString getType() = 0; + virtual OUString getObjectType() = 0; + + virtual int getNumberOfChildren() = 0; + virtual int getChildMarkedContentID(int nIndex) = 0; + virtual std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) = 0; + virtual std::unique_ptr<PDFiumStructureElement> getParent() = 0; +}; + +class VCL_DLLPUBLIC PDFiumStructureTree +{ +public: + virtual ~PDFiumStructureTree() = default; + + virtual int getNumberOfChildren() = 0; + virtual std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) = 0; +}; + class VCL_DLLPUBLIC PDFiumPage { public: @@ -197,6 +225,7 @@ public: virtual std::unique_ptr<PDFiumAnnotation> getAnnotation(int nIndex) = 0; virtual std::unique_ptr<PDFiumTextPage> 
getTextPage() = 0; + virtual std::unique_ptr<PDFiumStructureTree> getStructureTree() = 0; /// Get bitmap checksum of the page, without annotations/commenting. virtual BitmapChecksum getChecksum(int nMDPPerm) = 0; diff --git a/vcl/qa/cppunit/PDFiumLibraryTest.cxx b/vcl/qa/cppunit/PDFiumLibraryTest.cxx index 45c59b46d447..1db325dbe889 100644 --- a/vcl/qa/cppunit/PDFiumLibraryTest.cxx +++ b/vcl/qa/cppunit/PDFiumLibraryTest.cxx @@ -486,6 +486,151 @@ CPPUNIT_TEST_FIXTURE(PDFiumLibraryTest, testTools) CPPUNIT_ASSERT_EQUAL(false, bool(aDateTime.IsUTC)); } +CPPUNIT_TEST_FIXTURE(PDFiumLibraryTest, testStructureTree) +{ + OUString aURL = getFullUrl(u"StructureTreeExampleDocument.pdf"); + SvFileStream aStream(aURL, StreamMode::READ); + GraphicFilter& rGraphicFilter = GraphicFilter::GetGraphicFilter(); + Graphic aGraphic = rGraphicFilter.ImportUnloadedGraphic(aStream); + auto pVectorGraphicData = aGraphic.getVectorGraphicData(); + CPPUNIT_ASSERT(pVectorGraphicData); + CPPUNIT_ASSERT_EQUAL(VectorGraphicDataType::Pdf, pVectorGraphicData->getType()); + auto& rDataContainer = pVectorGraphicData->getBinaryDataContainer(); + + auto pPdfium = vcl::pdf::PDFiumLibrary::get(); + CPPUNIT_ASSERT(pPdfium); + + auto pDocument + = pPdfium->openDocument(rDataContainer.getData(), rDataContainer.getSize(), OString()); + CPPUNIT_ASSERT(pDocument); + + CPPUNIT_ASSERT_EQUAL(1, pDocument->getPageCount()); + + auto pPage = pDocument->openPage(0); + CPPUNIT_ASSERT(pPage); + + auto pTree = pPage->getStructureTree(); + CPPUNIT_ASSERT(pTree); + CPPUNIT_ASSERT_EQUAL(1, pTree->getNumberOfChildren()); + + // Check the structure + { + auto pChildDocument = pTree->getChild(0); + CPPUNIT_ASSERT(pChildDocument); + CPPUNIT_ASSERT_EQUAL(5, pChildDocument->getNumberOfChildren()); + + CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getAltText()); + CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getActualText()); + CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getID()); + CPPUNIT_ASSERT_EQUAL(u""_ustr, 
pChildDocument->getLang()); + CPPUNIT_ASSERT_EQUAL(u""_ustr, pChildDocument->getTitle()); + CPPUNIT_ASSERT_EQUAL(u"Document"_ustr, pChildDocument->getType()); + CPPUNIT_ASSERT_EQUAL(u"StructElem"_ustr, pChildDocument->getObjectType()); + + { + auto pThis = pChildDocument->getChild(0); + CPPUNIT_ASSERT(pThis); + CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType()); + CPPUNIT_ASSERT_EQUAL(1, pThis->getNumberOfChildren()); + CPPUNIT_ASSERT_EQUAL(0, pThis->getChildMarkedContentID(0)); + } + + { + auto pThis = pChildDocument->getChild(1); + CPPUNIT_ASSERT(pThis); + CPPUNIT_ASSERT_EQUAL(u"H1"_ustr, pThis->getType()); + CPPUNIT_ASSERT_EQUAL(2, pThis->getNumberOfChildren()); + CPPUNIT_ASSERT_EQUAL(1, pThis->getChildMarkedContentID(0)); + CPPUNIT_ASSERT_EQUAL(2, pThis->getChildMarkedContentID(1)); + } + + { + auto pThis = pChildDocument->getChild(2); + CPPUNIT_ASSERT(pThis); + CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType()); + CPPUNIT_ASSERT_EQUAL(13, pThis->getNumberOfChildren()); + CPPUNIT_ASSERT_EQUAL(3, pThis->getChildMarkedContentID(0)); + { + auto pChild = pThis->getChild(1); + CPPUNIT_ASSERT_EQUAL(u"Code"_ustr, pChild->getType()); + CPPUNIT_ASSERT_EQUAL(4, pChild->getChildMarkedContentID(0)); + + // Check getParent + auto pThis2 = pChild->getParent(); + CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis2->getType()); + CPPUNIT_ASSERT_EQUAL(13, pThis2->getNumberOfChildren()); + } + CPPUNIT_ASSERT_EQUAL(5, pThis->getChildMarkedContentID(2)); + CPPUNIT_ASSERT_EQUAL(6, pThis->getChildMarkedContentID(3)); + { + auto pChild = pThis->getChild(4); + CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType()); + CPPUNIT_ASSERT_EQUAL(7, pChild->getChildMarkedContentID(0)); + } + CPPUNIT_ASSERT_EQUAL(8, pThis->getChildMarkedContentID(5)); + CPPUNIT_ASSERT_EQUAL(9, pThis->getChildMarkedContentID(6)); + { + auto pChild = pThis->getChild(7); + CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType()); + CPPUNIT_ASSERT_EQUAL(10, pChild->getChildMarkedContentID(0)); + } + CPPUNIT_ASSERT_EQUAL(11, 
pThis->getChildMarkedContentID(8)); + CPPUNIT_ASSERT_EQUAL(12, pThis->getChildMarkedContentID(9)); + { + auto pChild = pThis->getChild(10); + CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType()); + CPPUNIT_ASSERT_EQUAL(13, pChild->getChildMarkedContentID(0)); + } + CPPUNIT_ASSERT_EQUAL(14, pThis->getChildMarkedContentID(11)); + { + auto pChild = pThis->getChild(12); + CPPUNIT_ASSERT_EQUAL(u"Span"_ustr, pChild->getType()); + CPPUNIT_ASSERT_EQUAL(15, pChild->getChildMarkedContentID(0)); + } + } + + { + auto pThis = pChildDocument->getChild(3); + CPPUNIT_ASSERT(pThis); + CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType()); + CPPUNIT_ASSERT_EQUAL(4, pThis->getNumberOfChildren()); + CPPUNIT_ASSERT_EQUAL(16, pThis->getChildMarkedContentID(0)); + { + auto pChild = pThis->getChild(1); + CPPUNIT_ASSERT_EQUAL(u"Quote"_ustr, pChild->getType()); + CPPUNIT_ASSERT_EQUAL(17, pChild->getChildMarkedContentID(0)); + } + CPPUNIT_ASSERT_EQUAL(18, pThis->getChildMarkedContentID(2)); + { + auto pChild = pThis->getChild(3); + // Rectangle + CPPUNIT_ASSERT_EQUAL(u"Div"_ustr, pChild->getType()); + CPPUNIT_ASSERT_EQUAL(u"Only Text! 
- The Alt Text!"_ustr, pChild->getAltText()); + CPPUNIT_ASSERT_EQUAL(20, pChild->getChildMarkedContentID(0)); + { + // Text in rectangle + auto pRectangleElement = pChild->getChild(1); + CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pRectangleElement->getType()); + CPPUNIT_ASSERT_EQUAL(21, pRectangleElement->getChildMarkedContentID(0)); + } + } + } + + { + auto pThis = pChildDocument->getChild(4); + CPPUNIT_ASSERT(pThis); + CPPUNIT_ASSERT_EQUAL(u"P"_ustr, pThis->getType()); + CPPUNIT_ASSERT_EQUAL(1, pThis->getNumberOfChildren()); + CPPUNIT_ASSERT_EQUAL(19, pThis->getChildMarkedContentID(0)); + } + + { + auto pThis = pChildDocument->getChild(5); + CPPUNIT_ASSERT(!pThis); + } + } +} + CPPUNIT_PLUGIN_IMPLEMENT(); /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/vcl/qa/cppunit/data/StructureTreeExampleDocument.odt b/vcl/qa/cppunit/data/StructureTreeExampleDocument.odt new file mode 100644 index 000000000000..18631b2df570 Binary files /dev/null and b/vcl/qa/cppunit/data/StructureTreeExampleDocument.odt differ diff --git a/vcl/qa/cppunit/data/StructureTreeExampleDocument.pdf b/vcl/qa/cppunit/data/StructureTreeExampleDocument.pdf new file mode 100644 index 000000000000..a89fb067060a Binary files /dev/null and b/vcl/qa/cppunit/data/StructureTreeExampleDocument.pdf differ diff --git a/vcl/source/pdf/PDFiumLibrary.cxx b/vcl/source/pdf/PDFiumLibrary.cxx index d56562f34b74..c23db7ebc84d 100644 --- a/vcl/source/pdf/PDFiumLibrary.cxx +++ b/vcl/source/pdf/PDFiumLibrary.cxx @@ -21,6 +21,7 @@ #include <fpdf_signature.h> #include <fpdf_formfill.h> #include <fpdf_attachment.h> +#include <fpdf_structtree.h> #include <osl/endian.h> #include <vcl/bitmap.hxx> @@ -337,6 +338,47 @@ public: int getOptionCount(PDFiumDocument* pDoc) override; }; +class PDFiumStructureElementImpl final : public PDFiumStructureElement +{ +private: + FPDF_STRUCTELEMENT mpStructureElement; + + PDFiumStructureElementImpl(const PDFiumStructureElementImpl&) = delete; + PDFiumStructureElementImpl& operator=(const 
PDFiumStructureElementImpl&) = delete; + +public: + PDFiumStructureElementImpl(FPDF_STRUCTELEMENT pStructureElement); + + OUString getAltText() override; + OUString getActualText() override; + OUString getID() override; + OUString getLang() override; + OUString getTitle() override; + OUString getType() override; + OUString getObjectType() override; + + int getNumberOfChildren() override; + int getChildMarkedContentID(int nIndex) override; + std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) override; + std::unique_ptr<PDFiumStructureElement> getParent() override; +}; + +class PDFiumStructureTreeImpl final : public PDFiumStructureTree +{ +private: + FPDF_STRUCTTREE mpStructureTree; + + PDFiumStructureTreeImpl(const PDFiumStructureTreeImpl&) = delete; + PDFiumStructureTreeImpl& operator=(const PDFiumStructureTreeImpl&) = delete; + +public: + PDFiumStructureTreeImpl(FPDF_STRUCTTREE pStructureTree); + ~PDFiumStructureTreeImpl(); + + int getNumberOfChildren() override; + std::unique_ptr<PDFiumStructureElement> getChild(int nIndex) override; +}; + class PDFiumPageObjectImpl final : public PDFiumPageObject { private: @@ -462,6 +504,8 @@ public: std::unique_ptr<PDFiumTextPage> getTextPage() override; + std::unique_ptr<PDFiumStructureTree> getStructureTree() override; + BitmapChecksum getChecksum(int nMDPPerm) override; double getWidth() override; @@ -910,6 +954,17 @@ std::unique_ptr<PDFiumTextPage> PDFiumPageImpl::getTextPage() return pPDFiumTextPage; } +std::unique_ptr<PDFiumStructureTree> PDFiumPageImpl::getStructureTree() +{ + std::unique_ptr<PDFiumStructureTree> pPDFiumStructureTree; + FPDF_STRUCTTREE pStructTree = FPDF_StructTree_GetForPage(mpPage); + if (pStructTree) + { + pPDFiumStructureTree = std::make_unique<PDFiumStructureTreeImpl>(pStructTree); + } + return pPDFiumStructureTree; +} + bool PDFiumPageImpl::hasLinks() { // This could be a full iterator, but at the moment we just determine if the list is empty or @@ -1546,6 +1601,119 @@ 
std::unique_ptr<PDFiumPageObject> PDFiumAnnotationImpl::getObject(int nIndex) return pPDFiumPageObject; } +PDFiumStructureElementImpl::PDFiumStructureElementImpl(FPDF_STRUCTELEMENT pStructureElement) + : mpStructureElement(pStructureElement) +{ +} + +OUString PDFiumStructureElementImpl::getAltText() +{ + return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) { + return FPDF_StructElement_GetAltText(mpStructureElement, buffer, length); + }); +} + +OUString PDFiumStructureElementImpl::getActualText() +{ + return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) { + return FPDF_StructElement_GetActualText(mpStructureElement, buffer, length); + }); +} + +OUString PDFiumStructureElementImpl::getID() +{ + return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) { + return FPDF_StructElement_GetID(mpStructureElement, buffer, length); + }); +} + +OUString PDFiumStructureElementImpl::getLang() +{ + return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) { + return FPDF_StructElement_GetLang(mpStructureElement, buffer, length); + }); +} + +OUString PDFiumStructureElementImpl::getType() +{ + return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) { + return FPDF_StructElement_GetType(mpStructureElement, buffer, length); + }); +} + +OUString PDFiumStructureElementImpl::getObjectType() +{ + return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) { + return FPDF_StructElement_GetObjType(mpStructureElement, buffer, length); + }); +} + +int PDFiumStructureElementImpl::getChildMarkedContentID(int nIndex) +{ + return FPDF_StructElement_GetChildMarkedContentID(mpStructureElement, nIndex); +} + +OUString PDFiumStructureElementImpl::getTitle() +{ + return getUnicodeString([this](FPDF_WCHAR* buffer, unsigned long length) { + return FPDF_StructElement_GetTitle(mpStructureElement, buffer, length); + }); +} + +int PDFiumStructureElementImpl::getNumberOfChildren() +{ + return 
FPDF_StructElement_CountChildren(mpStructureElement); +} + +std::unique_ptr<PDFiumStructureElement> PDFiumStructureElementImpl::getChild(int nIndex) +{ + std::unique_ptr<PDFiumStructureElement> pPDFiumStructureElement; + FPDF_STRUCTELEMENT pElement = FPDF_StructElement_GetChildAtIndex(mpStructureElement, nIndex); + if (pElement) + { + pPDFiumStructureElement = std::make_unique<PDFiumStructureElementImpl>(pElement); + } + return pPDFiumStructureElement; +} + +std::unique_ptr<PDFiumStructureElement> PDFiumStructureElementImpl::getParent() +{ + std::unique_ptr<PDFiumStructureElement> pPDFiumStructureElement; + FPDF_STRUCTELEMENT pElement = FPDF_StructElement_GetParent(mpStructureElement); + if (pElement) + { + pPDFiumStructureElement = std::make_unique<PDFiumStructureElementImpl>(pElement); + } + return pPDFiumStructureElement; +} + +PDFiumStructureTreeImpl::PDFiumStructureTreeImpl(FPDF_STRUCTTREE pStructureTree) + : mpStructureTree(pStructureTree) +{ +} + +PDFiumStructureTreeImpl::~PDFiumStructureTreeImpl() +{ + if (mpStructureTree) + FPDF_StructTree_Close(mpStructureTree); +} + +int PDFiumStructureTreeImpl::getNumberOfChildren() +{ + return FPDF_StructTree_CountChildren(mpStructureTree); +} + +std::unique_ptr<PDFiumStructureElement> PDFiumStructureTreeImpl::getChild(int nIndex) +{ + std::unique_ptr<PDFiumStructureElement> pPDFiumStructureElement; + FPDF_STRUCTELEMENT pElement = FPDF_StructTree_GetChildAtIndex(mpStructureTree, nIndex); + if (pElement) + { + pPDFiumStructureElement = std::make_unique<PDFiumStructureElementImpl>(pElement); + } + return pPDFiumStructureElement; +} + PDFiumTextPageImpl::PDFiumTextPageImpl(FPDF_TEXTPAGE pTextPage) : mpTextPage(pTextPage) {
