commit: 4ee291116bdea2eb537cac76f6aa3c1ca2e46ae0 Author: Andreas Sturmlechner <asturm <AT> gentoo <DOT> org> AuthorDate: Sun Mar 26 09:22:53 2023 +0000 Commit: Andreas Sturmlechner <asturm <AT> gentoo <DOT> org> CommitDate: Sun Mar 26 09:50:04 2023 +0000 URL: https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=4ee29111
kde-frameworks/baloo: Skip all unprintable characters (5.105 backport) See also: https://invent.kde.org/frameworks/baloo/-/merge_requests/87 Upstream commit 886aba423f3659ef591903f1f3dea87f8b4c6016 Bug: https://bugs.gentoo.org/899706 Signed-off-by: Andreas Sturmlechner <asturm <AT> gentoo.org> kde-frameworks/baloo/baloo-5.104.0-r1.ebuild | 37 ++++++++++++ .../baloo-5.104.0-skip-all-unprintable-chars.patch | 70 ++++++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/kde-frameworks/baloo/baloo-5.104.0-r1.ebuild b/kde-frameworks/baloo/baloo-5.104.0-r1.ebuild new file mode 100644 index 000000000000..ead3aed883ec --- /dev/null +++ b/kde-frameworks/baloo/baloo-5.104.0-r1.ebuild @@ -0,0 +1,37 @@ +# Copyright 1999-2023 Gentoo Authors +# Distributed under the terms of the GNU General Public License v2 + +EAPI=8 + +ECM_TEST="forceoptional" +PVCUT=$(ver_cut 1-2) +QTMIN=5.15.5 +VIRTUALX_REQUIRED="test" +inherit ecm frameworks.kde.org + +DESCRIPTION="Framework for searching and managing metadata" +LICENSE="LGPL-2+" +KEYWORDS="~amd64 ~arm ~arm64 ~loong ~ppc64 ~riscv ~x86" +IUSE="" + +RESTRICT="test" # bug 624250 + +DEPEND=" + >=dev-db/lmdb-0.9.17 + >=dev-qt/qtdbus-${QTMIN}:5 + >=dev-qt/qtdeclarative-${QTMIN}:5 + >=dev-qt/qtgui-${QTMIN}:5 + >=dev-qt/qtwidgets-${QTMIN}:5 + =kde-frameworks/kconfig-${PVCUT}*:5 + =kde-frameworks/kcoreaddons-${PVCUT}*:5 + =kde-frameworks/kcrash-${PVCUT}*:5 + =kde-frameworks/kdbusaddons-${PVCUT}*:5 + =kde-frameworks/kfilemetadata-${PVCUT}*:5 + =kde-frameworks/ki18n-${PVCUT}*:5 + =kde-frameworks/kidletime-${PVCUT}*:5 + =kde-frameworks/kio-${PVCUT}*:5 + =kde-frameworks/solid-${PVCUT}*:5 +" +RDEPEND="${DEPEND}" + +PATCHES=( "${FILESDIR}/${P}-skip-all-unprintable-chars.patch" ) diff --git a/kde-frameworks/baloo/files/baloo-5.104.0-skip-all-unprintable-chars.patch b/kde-frameworks/baloo/files/baloo-5.104.0-skip-all-unprintable-chars.patch new file mode 100644 index 000000000000..7e9eb0d74c42 --- /dev/null +++ 
b/kde-frameworks/baloo/files/baloo-5.104.0-skip-all-unprintable-chars.patch @@ -0,0 +1,70 @@ +From 886aba423f3659ef591903f1f3dea87f8b4c6016 Mon Sep 17 00:00:00 2001 +From: Igor Poboiko <[email protected]> +Date: Mon, 20 Mar 2023 13:20:33 +0000 +Subject: [PATCH] [TermGenerator] Skip all unprintable characters + +Some extractors can produce text which includes special unicode +control characters (e.g. Poppler can give us 0x0001 from some PDFs). +TermGenerator then generates proper (yet meaningless) terms out of those +characters, and they end up in the database. It should be safe to skip all +unprintable characters to avoid that (although surrogates are fine, they +are dealt with later via QString::normalize call). + +Character 0x0001 is the worst, as it is used internally in DocTermsCodec +for compactification. Such collision then leads to a corrupted database +(some terms from DocTermsDB are not present in PostingDB). + +The corruption is not hypothetical (although not critical), I've encountered a bunch of broken DB entries for some PDF files on my machine. 
+ + +(cherry picked from commit 492321e53a41762555ba6528e15cd0d0188ed153) +--- + autotests/unit/engine/termgeneratortest.cpp | 11 +++++++++++ + src/engine/termgenerator.cpp | 2 +- + 2 files changed, 12 insertions(+), 1 deletion(-) + +diff --git a/autotests/unit/engine/termgeneratortest.cpp b/autotests/unit/engine/termgeneratortest.cpp +index 361c4934c..69885c133 100644 +--- a/autotests/unit/engine/termgeneratortest.cpp ++++ b/autotests/unit/engine/termgeneratortest.cpp +@@ -31,6 +31,7 @@ private Q_SLOTS: + void testWordPositions(); + void testWordPositionsCJK(); + void testNumbers(); ++ void testControlCharacter(); + + QList<QByteArray> allWords(const QString& str) + { +@@ -213,6 +214,16 @@ void TermGeneratorTest::testNumbers() + QCOMPARE(words, expectedWords); + } + ++void TermGeneratorTest::testControlCharacter() ++{ ++ QString str = QString::fromUtf8("word1\u0001word2"); ++ ++ QList<QByteArray> words = allWords(str); ++ QList<QByteArray> expectedWords = { "word1", "word2" }; ++ ++ QCOMPARE(words, expectedWords); ++} ++ + QTEST_MAIN(TermGeneratorTest) + + #include "termgeneratortest.moc" +diff --git a/src/engine/termgenerator.cpp b/src/engine/termgenerator.cpp +index d98b28416..832962da1 100644 +--- a/src/engine/termgenerator.cpp ++++ b/src/engine/termgenerator.cpp +@@ -59,7 +59,7 @@ QByteArrayList TermGenerator::termList(const QString& text_) + int start = 0; + + auto isSkipChar = [] (const QChar& c) { +- return c.isPunct() || c.isMark() || c.isSpace(); ++ return c.isPunct() || c.isMark() || c.isSpace() || (!c.isPrint() && !c.isSurrogate()); + }; + + QByteArrayList list; +-- +GitLab +
