poppler/Lexer.cc                 |   27 +++++++++++-------
poppler/UTF.cc                   |    3 ++
poppler/UTF.h                    |   11 +++++++
qt5/tests/CMakeLists.txt         |    3 +-
qt5/tests/check_utf8document.cpp |   57 +++++++++++++++++++++++++++++++++++++++
qt6/tests/CMakeLists.txt         |    1 +
qt6/tests/check_utf8document.cpp |   57 +++++++++++++++++++++++++++++++++++++++
7 files changed, 148 insertions(+), 11 deletions(-)

New commits:
commit 9183da4fcb8d06360ed51f7f1131a14300008735
Author: Sune Vuorela <s...@vuorela.dk>
Date:   Tue Jun 13 22:24:24 2023 +0000

    Fix reading of utf8-with-bom files

diff --git a/poppler/Lexer.cc b/poppler/Lexer.cc
index ab25caf5..01548950 100644
--- a/poppler/Lexer.cc
+++ b/poppler/Lexer.cc
@@ -33,6 +33,7 @@
 #include <cctype>
 #include "Lexer.h"
 #include "Error.h"
+#include "UTF.h"
 #include "XRef.h"
 
 //------------------------------------------------------------------------
@@ -163,7 +164,7 @@ Object Lexer::getObj(int objNum)
     int xi;
     long long xll = 0;
     double xf = 0, scale;
-    GooString *s;
+    std::unique_ptr<GooString> s;
     int n, m;
 
     // skip whitespace and comments
@@ -389,7 +390,7 @@ Object Lexer::getObj(int objNum)
             if (c2 != EOF) {
                 if (n == tokBufSize) {
                     if (!s) {
-                        s = new GooString(tokBuf, tokBufSize);
+                        s = std::make_unique<GooString>(tokBuf, tokBufSize);
                     } else {
                         s->append(tokBuf, tokBufSize);
                     }
@@ -402,7 +403,7 @@ Object Lexer::getObj(int objNum)
                         if (newObjNum != objNum) {
                             error(errSyntaxError, getPos(), "Unterminated string");
                             done = true;
-                            delete s;
+                            s.reset();
                             n = -2;
                         }
                     }
@@ -413,11 +414,15 @@ Object Lexer::getObj(int objNum)
         } while (!done);
         if (n >= 0) {
             if (!s) {
-                s = new GooString(tokBuf, n);
+                s = std::make_unique<GooString>(tokBuf, n);
             } else {
                 s->append(tokBuf, n);
             }
-            return Object(s);
+            // Check utf8
+            if (isUtf8WithBom(s->toStr())) {
+                s = utf8ToUtf16WithBom(s->toStr());
+            }
+            return Object(s.release());
         } else {
             return Object(objEOF);
         }
@@ -464,7 +469,7 @@ Object Lexer::getObj(int objNum)
             } else if (n == tokBufSize) {
                 error(errSyntaxError, getPos(), "Warning: name token is longer than what the specification says it can be");
                 *p = c;
-                s = new GooString(tokBuf, n);
+                s = std::make_unique<GooString>(tokBuf, n);
             } else {
                 s->append((char)c);
             }
@@ -474,7 +479,6 @@ Object Lexer::getObj(int objNum)
             return Object(objName, tokBuf);
         } else {
             Object obj(objName, s->c_str());
-            delete s;
             return obj;
         }
         break;
@@ -525,7 +529,7 @@ Object Lexer::getObj(int objNum)
                     if (++m == 2) {
                         if (n == tokBufSize) {
                             if (!s) {
-                                s = new GooString(tokBuf, tokBufSize);
+                                s = std::make_unique<GooString>(tokBuf, tokBufSize);
                             } else {
                                 s->append(tokBuf, tokBufSize);
                             }
@@ -540,14 +544,17 @@ Object Lexer::getObj(int objNum)
                 }
             }
             if (!s) {
-                s = new GooString(tokBuf, n);
+                s = std::make_unique<GooString>(tokBuf, n);
             } else {
                 s->append(tokBuf, n);
             }
             if (m == 1) {
                 s->append((char)(c2 << 4));
             }
-            return Object(s);
+            if (isUtf8WithBom(s->toStr())) {
+                s = utf8ToUtf16WithBom(s->toStr());
+            }
+            return Object(s.release());
         }
         break;
 
diff --git a/poppler/UTF.cc b/poppler/UTF.cc
index 9b1bf954..2ea00895 100644
--- a/poppler/UTF.cc
+++ b/poppler/UTF.cc
@@ -356,6 +356,9 @@ int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8)
 // Allocate utf16 string and convert utf8 into it.
 uint16_t *utf8ToUtf16(const char *utf8, int *len)
 {
+    if (isUtf8WithBom(utf8)) {
+        utf8 += 3;
+    }
     int n = utf8CountUtf16CodeUnits(utf8);
     if (len) {
         *len = n;
diff --git a/poppler/UTF.h b/poppler/UTF.h
index 626c6862..312f231d 100644
--- a/poppler/UTF.h
+++ b/poppler/UTF.h
@@ -73,6 +73,17 @@ int POPPLER_PRIVATE_EXPORT utf8ToUtf16(const char *utf8, uint16_t *utf16, int ma
 // Allocate utf16 string and convert utf8 into it.
 uint16_t POPPLER_PRIVATE_EXPORT *utf8ToUtf16(const char *utf8, int *len = nullptr);
 
+inline bool isUtf8WithBom(std::string_view str)
+{
+    if (str.size() < 4) {
+        return false;
+    }
+    if (str[0] == '\xef' && str[1] == '\xbb' && str[2] == '\xbf') {
+        return true;
+    }
+    return false;
+}
+
 // Converts a UTF-8 string to a big endian UTF-16 string with BOM.
 // The caller owns the returned pointer.
 // utf8 - UTF-8 string to convert. An empty string is acceptable.
diff --git a/qt5/tests/CMakeLists.txt b/qt5/tests/CMakeLists.txt
index 0b1931ba..9de870ee 100644
--- a/qt5/tests/CMakeLists.txt
+++ b/qt5/tests/CMakeLists.txt
@@ -17,7 +17,7 @@ macro(QT5_ADD_SIMPLETEST exe source)
 endmacro(QT5_ADD_SIMPLETEST)
 
 macro(QT5_ADD_QTEST exe source)
-  if (Qt5Test_FOUND)
+  if (Qt5Test_FOUND)
     string(REPLACE "-" "" test_name ${exe})
     set(${test_name}_SOURCES
       ${source}
@@ -71,6 +71,7 @@ qt5_add_qtest(check_qt5_stroke_opacity check_stroke_opacity.cpp)
 qt5_add_qtest(check_qt5_utf_conversion check_utf_conversion.cpp)
 qt5_add_qtest(check_qt5_outline check_outline.cpp)
 qt5_add_qtest(check_qt5_signature_basics check_signature_basics.cpp)
+qt5_add_qtest(check_qt5_utf8document check_utf8document.cpp)
 qt5_add_qtest(check_qt5_distinguished_name_parser check_distinguished_name_parser.cpp)
 qt5_add_qtest(check_qt5_cidfontswidthsbuilder check_cidfontswidthsbuilder.cpp)
 if (NOT WIN32)
diff --git a/qt5/tests/check_utf8document.cpp b/qt5/tests/check_utf8document.cpp
new file mode 100644
index 00000000..ebeb22c6
--- /dev/null
+++ b/qt5/tests/check_utf8document.cpp
@@ -0,0 +1,57 @@
+#include <QtTest/QtTest>
+
+#include "PDFDoc.h"
+#include "GlobalParams.h"
+
+#include "Outline.h"
+#include "poppler-private.h"
+
+class TestUtf8Document : public QObject
+{
+    Q_OBJECT
+public:
+    explicit TestUtf8Document(QObject *parent = nullptr) : QObject(parent) { }
+private Q_SLOTS:
+    void checkStrings();
+};
+
+inline QString outlineItemTitle(OutlineItem *item)
+{
+    if (!item) {
+        return {};
+    }
+    return QString::fromUcs4(item->getTitle(), item->getTitleLength());
+}
+
+void TestUtf8Document::checkStrings()
+{
+
+    globalParams = std::make_unique<GlobalParams>();
+    auto doc = std::make_unique<PDFDoc>(std::make_unique<GooString>(TESTDATADIR "/unittestcases/pdf20-utf8-test.pdf"));
+    QVERIFY(doc);
+    QVERIFY(doc->isOk());
+
+    QVERIFY(doc->getOptContentConfig() && doc->getOptContentConfig()->hasOCGs());
+
+    QCOMPARE(Poppler::UnicodeParsedString(doc->getDocInfoTitle().get()), QStringLiteral("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀"));
+
+    QSet<QString> expectedNames { QStringLiteral("گچپژ"), QStringLiteral("Layer 1") };
+    QSet<QString> foundNames;
+
+    for (auto &[ref, group] : doc->getOptContentConfig()->getOCGs()) {
+        foundNames.insert(Poppler::UnicodeParsedString(group->getName()));
+    }
+    QCOMPARE(expectedNames, foundNames);
+
+    auto outlineItems = doc->getOutline()->getItems();
+    QVERIFY(outlineItems);
+    QCOMPARE(outlineItems->size(), 3);
+
+    QCOMPARE(outlineItemTitle(outlineItems->at(0)), QStringLiteral("PDF 2.0 with UTF-8 test file"));
+    QCOMPARE(outlineItemTitle(outlineItems->at(1)), QStringLiteral("\u202A\u202Atest\u202A"));
+    QCOMPARE(outlineItemTitle(outlineItems->at(2)), QStringLiteral("🌈️\n" /*emoji rainbow flag*/));
+}
+
+QTEST_GUILESS_MAIN(TestUtf8Document)
+
+#include "check_utf8document.moc"
diff --git a/qt6/tests/CMakeLists.txt b/qt6/tests/CMakeLists.txt
index da18b15d..577aad7f 100644
--- a/qt6/tests/CMakeLists.txt
+++ b/qt6/tests/CMakeLists.txt
@@ -63,6 +63,7 @@ qt6_add_qtest(check_qt6_stroke_opacity check_stroke_opacity.cpp)
 qt6_add_qtest(check_qt6_utf_conversion check_utf_conversion.cpp)
 qt6_add_qtest(check_qt6_outline check_outline.cpp)
 qt6_add_qtest(check_qt6_signature_basics check_signature_basics.cpp)
+qt6_add_qtest(check_qt6_utf8document check_utf8document.cpp)
 qt6_add_qtest(check_qt6_distinguished_name_parser check_distinguished_name_parser.cpp)
 qt6_add_qtest(check_qt6_cidfontswidthsbuilder check_cidfontswidthsbuilder.cpp)
 if (NOT WIN32)
diff --git a/qt6/tests/check_utf8document.cpp b/qt6/tests/check_utf8document.cpp
new file mode 100644
index 00000000..ebeb22c6
--- /dev/null
+++ b/qt6/tests/check_utf8document.cpp
@@ -0,0 +1,57 @@
+#include <QtTest/QtTest>
+
+#include "PDFDoc.h"
+#include "GlobalParams.h"
+
+#include "Outline.h"
+#include "poppler-private.h"
+
+class TestUtf8Document : public QObject
+{
+    Q_OBJECT
+public:
+    explicit TestUtf8Document(QObject *parent = nullptr) : QObject(parent) { }
+private Q_SLOTS:
+    void checkStrings();
+};
+
+inline QString outlineItemTitle(OutlineItem *item)
+{
+    if (!item) {
+        return {};
+    }
+    return QString::fromUcs4(item->getTitle(), item->getTitleLength());
+}
+
+void TestUtf8Document::checkStrings()
+{
+
+    globalParams = std::make_unique<GlobalParams>();
+    auto doc = std::make_unique<PDFDoc>(std::make_unique<GooString>(TESTDATADIR "/unittestcases/pdf20-utf8-test.pdf"));
+    QVERIFY(doc);
+    QVERIFY(doc->isOk());
+
+    QVERIFY(doc->getOptContentConfig() && doc->getOptContentConfig()->hasOCGs());
+
+    QCOMPARE(Poppler::UnicodeParsedString(doc->getDocInfoTitle().get()), QStringLiteral("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀"));
+
+    QSet<QString> expectedNames { QStringLiteral("گچپژ"), QStringLiteral("Layer 1") };
+    QSet<QString> foundNames;
+
+    for (auto &[ref, group] : doc->getOptContentConfig()->getOCGs()) {
+        foundNames.insert(Poppler::UnicodeParsedString(group->getName()));
+    }
+    QCOMPARE(expectedNames, foundNames);
+
+    auto outlineItems = doc->getOutline()->getItems();
+    QVERIFY(outlineItems);
+    QCOMPARE(outlineItems->size(), 3);
+
+    QCOMPARE(outlineItemTitle(outlineItems->at(0)), QStringLiteral("PDF 2.0 with UTF-8 test file"));
+    QCOMPARE(outlineItemTitle(outlineItems->at(1)), QStringLiteral("\u202A\u202Atest\u202A"));
+    QCOMPARE(outlineItemTitle(outlineItems->at(2)), QStringLiteral("🌈️\n" /*emoji rainbow flag*/));
+}
+
+QTEST_GUILESS_MAIN(TestUtf8Document)
+
+#include "check_utf8document.moc"