sd/qa/unit/data/pdf/ErrareHumanumEst.pdf |binary sd/qa/unit/export-tests.cxx | 25 ++++++++++ svx/source/svdraw/svdpdf.cxx | 6 ++ vcl/qa/cppunit/pdfexport/pdfexport2.cxx | 2 vcl/source/pdf/PDFiumLibrary.cxx | 74 ++++++++++++++++++++++++++++--- 5 files changed, 101 insertions(+), 6 deletions(-)
New commits: commit 184e53c833e199264e5f0fed5ea301eefcd3eeda Author: Caolán McNamara <[email protected]> AuthorDate: Wed Oct 8 21:23:57 2025 +0100 Commit: Caolán McNamara <[email protected]> CommitDate: Fri Oct 17 09:23:14 2025 +0200 use FPDFText_GetUnicode to get text instead of FPDFTextObj_GetText, which is returning 0x2 for some hyphens. If we use the slightly lower level apis we can get info as to substituted hyphens. Change-Id: I26efa9f1acb5ba819b63034399da4f1961373f13 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/192081 Tested-by: Jenkins CollaboraOffice <[email protected]> Reviewed-by: Miklos Vajna <[email protected]> (cherry picked from commit 25550b2daf29a4eb766dd22692c43b7be354a87c) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/192360 Reviewed-by: Caolán McNamara <[email protected]> Tested-by: Jenkins diff --git a/vcl/qa/cppunit/pdfexport/pdfexport2.cxx b/vcl/qa/cppunit/pdfexport/pdfexport2.cxx index 5b8398f1ef16..bf9c5f05a693 100644 --- a/vcl/qa/cppunit/pdfexport/pdfexport2.cxx +++ b/vcl/qa/cppunit/pdfexport/pdfexport2.cxx @@ -5577,7 +5577,7 @@ CPPUNIT_TEST_FIXTURE(PdfExportTest2, testTdf162194SoftHyphen) CPPUNIT_ASSERT_EQUAL(u"Waffle"_ustr, aText.at(0).trim()); CPPUNIT_ASSERT_EQUAL(u"AAA Waf"_ustr, aText.at(1).trim()); - CPPUNIT_ASSERT_EQUAL(u""_ustr, aText.at(2).trim()); + CPPUNIT_ASSERT_EQUAL(u"-"_ustr, aText.at(2).trim()); CPPUNIT_ASSERT_EQUAL(u"fle"_ustr, aText.at(3).trim()); } diff --git a/vcl/source/pdf/PDFiumLibrary.cxx b/vcl/source/pdf/PDFiumLibrary.cxx index 478d08e7797e..4b159141764a 100644 --- a/vcl/source/pdf/PDFiumLibrary.cxx +++ b/vcl/source/pdf/PDFiumLibrary.cxx @@ -465,8 +465,6 @@ public: PDFiumTextPageImpl(FPDF_TEXTPAGE pTextPage); ~PDFiumTextPageImpl(); - FPDF_TEXTPAGE getPointer() { return mpTextPage; } - int countChars() override; unsigned int getUnicode(int index) override; std::unique_ptr<PDFiumSearchHandle> findStart(const OUString& rFindWhat, PDFFindFlags nFlags, @@ -474,6 +472,73 @@ public: /// Returned rect is no 
longer upside down and is in mm100. basegfx::B2DRectangle getCharBox(int nIndex, double fPageHeight) override; + + OUString getText(FPDF_PAGEOBJECT pPageObject) + { + OUStringBuffer aResult; + + bool containsPreChar = false; + bool addLineFeed = false; + double posY(0), originX(0.0), originY(0.0); + + // FPDFTextObj_GetText also does a similar loop over the entire + // contents of the text page, this is intended to be the equivalent + // of that except for (currently) added recovery of hyphens. + int count = FPDFText_CountChars(mpTextPage); + for (int i = 0; i < count; ++i) + { + FPDF_PAGEOBJECT pOwner = FPDFText_GetTextObject(mpTextPage, i); + sal_Unicode cUnicode = FPDFText_GetUnicode(mpTextPage, i); + if (pOwner == pPageObject) + { + FPDFText_GetCharOrigin(mpTextPage, i, &originX, &originY); + + if (fabs(posY - originY) > 0 && !containsPreChar && addLineFeed) + { + posY = originY; + if (!aResult.isEmpty()) + aResult.append(" "); + } + containsPreChar = true; + addLineFeed = false; + + switch (cUnicode) + { + case 0: + SAL_INFO("vcl.filter", "PDFiumImpl: cannot get unicode for char"); + break; + default: + aResult.append(cUnicode); + break; + case 0x2: // oddly pdfium replaces some '-' with 2. 
+ { + int isHyphen = FPDFText_IsHyphen(mpTextPage, i); + if (isHyphen == 1) + aResult.append('-'); + else + { + SAL_WARN_IF(isHyphen == -1, "vcl.filter", + "PDFiumImpl: FPDFText_IsHyphen failure"); + aResult.append(cUnicode); + } + } + break; + } + } + else if (cUnicode == ' ' && containsPreChar) + { + aResult.append(' '); + containsPreChar = false; + addLineFeed = false; + } + else + { + containsPreChar = false; + addLineFeed = true; + } + } + return aResult.toString(); + } }; class PDFiumSignatureImpl final : public PDFiumSignature @@ -1090,9 +1155,8 @@ PDFiumPageObjectImpl::PDFiumPageObjectImpl(FPDF_PAGEOBJECT pPageObject) OUString PDFiumPageObjectImpl::getText(std::unique_ptr<PDFiumTextPage> const& rTextPage) { auto pTextPage = static_cast<PDFiumTextPageImpl*>(rTextPage.get()); - return getUnicodeString([this, pTextPage](FPDF_WCHAR* buffer, unsigned long length) { - return FPDFTextObj_GetText(mpPageObject, pTextPage->getPointer(), buffer, length); - }); + // FPDFTextObj_GetText may report some hyphens as 0x2 + return pTextPage->getText(mpPageObject); } PDFPageObjectType PDFiumPageObjectImpl::getType() commit 73c5466eb53e62c28df4a71341afea3815f169e2 Author: Caolán McNamara <[email protected]> AuthorDate: Mon Oct 6 12:16:22 2025 +0100 Commit: Caolán McNamara <[email protected]> CommitDate: Fri Oct 17 09:23:05 2025 +0200 font version needs to exist and be in a non-0 fractional format Change-Id: I72420866185a890b3b2af2acf2339bad3fe0080d Reviewed-on: https://gerrit.libreoffice.org/c/core/+/191961 Reviewed-by: Miklos Vajna <[email protected]> Tested-by: Jenkins CollaboraOffice <[email protected]> (cherry picked from commit 10a2e4ea3df5d1314de3af5c7e93c1eac96c31ed) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/192359 Tested-by: Jenkins Reviewed-by: Caolán McNamara <[email protected]> diff --git a/sd/qa/unit/data/pdf/ErrareHumanumEst.pdf b/sd/qa/unit/data/pdf/ErrareHumanumEst.pdf new file mode 100644 index 000000000000..f27ba2ed886e Binary files /dev/null and 
b/sd/qa/unit/data/pdf/ErrareHumanumEst.pdf differ diff --git a/sd/qa/unit/export-tests.cxx b/sd/qa/unit/export-tests.cxx index 0277d2127f3f..d6829ec1f9e6 100644 --- a/sd/qa/unit/export-tests.cxx +++ b/sd/qa/unit/export-tests.cxx @@ -1185,6 +1185,31 @@ CPPUNIT_TEST_FIXTURE(SdExportTest, testExplodedPdfGrayscaleImageUnderInvisibleTe CPPUNIT_ASSERT_MESSAGE("Shape should be Invisible", !bVisible); } +CPPUNIT_TEST_FIXTURE(SdExportTest, testExplodedPdfMissingFontVersion) +{ + auto pPdfium = vcl::pdf::PDFiumLibrary::get(); + if (!pPdfium) + return; + UsePdfium aGuard; + + loadFromFile(u"pdf/ErrareHumanumEst.pdf"); + + setFilterOptions("{\"DecomposePDF\":{\"type\":\"boolean\",\"value\":\"true\"}}"); + setImportFilterName(u"OpenDocument Drawing Flat XML"_ustr); + saveAndReload(u"OpenDocument Drawing Flat XML"_ustr); + + const SdrPage* pPage = GetPage(1); + + const SdrObject* pObj = pPage->GetObj(0); + CPPUNIT_ASSERT(pObj); + const SdrObjGroup* pObjGroup = dynamic_cast<const SdrObjGroup*>(pObj); + CPPUNIT_ASSERT(pObjGroup); + const SdrTextObj* pTextObj = DynCastSdrTextObj(pObjGroup->GetObj(0)); + OUString sText = pTextObj->GetOutlinerParaObject()->GetTextObject().GetText(0); + // Without fix this fails to import at all + CPPUNIT_ASSERT_EQUAL(u"Errare humanum est"_ustr, sText); +} + CPPUNIT_TEST_FIXTURE(SdExportTest, testEmbeddedText) { createSdDrawDoc("objectwithtext.fodg"); diff --git a/svx/source/svdraw/svdpdf.cxx b/svx/source/svdraw/svdpdf.cxx index 261aacfabfd8..9943ae8254b2 100644 --- a/svx/source/svdraw/svdpdf.cxx +++ b/svx/source/svdraw/svdpdf.cxx @@ -1116,6 +1116,12 @@ static bool toPfaCID(SubSetInfo& rSubSetInfo, const OUString& fileUrl, if (version.isEmpty()) version = CIDFontVersion; + if (version.isEmpty() || version.toDouble() == 0.0) + { + SAL_WARN("sd.filter", "Font version cannot be empty or 0.0"); + version = "0.001"_ostr; + } + if (!brokenFontName.isEmpty()) FontName = postScriptName.toUtf8();
