commit:     4ee291116bdea2eb537cac76f6aa3c1ca2e46ae0
Author:     Andreas Sturmlechner <asturm <AT> gentoo <DOT> org>
AuthorDate: Sun Mar 26 09:22:53 2023 +0000
Commit:     Andreas Sturmlechner <asturm <AT> gentoo <DOT> org>
CommitDate: Sun Mar 26 09:50:04 2023 +0000
URL:        https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=4ee29111

kde-frameworks/baloo: Skip all unprintable characters (5.105 backport)

See also: https://invent.kde.org/frameworks/baloo/-/merge_requests/87

Upstream commit 886aba423f3659ef591903f1f3dea87f8b4c6016

Bug: https://bugs.gentoo.org/899706
Signed-off-by: Andreas Sturmlechner <asturm <AT> gentoo.org>

 kde-frameworks/baloo/baloo-5.104.0-r1.ebuild       | 37 ++++++++++++
 .../baloo-5.104.0-skip-all-unprintable-chars.patch | 70 ++++++++++++++++++++++
 2 files changed, 107 insertions(+)

diff --git a/kde-frameworks/baloo/baloo-5.104.0-r1.ebuild b/kde-frameworks/baloo/baloo-5.104.0-r1.ebuild
new file mode 100644
index 000000000000..ead3aed883ec
--- /dev/null
+++ b/kde-frameworks/baloo/baloo-5.104.0-r1.ebuild
@@ -0,0 +1,37 @@
+# Copyright 1999-2023 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=8
+
+ECM_TEST="forceoptional"
+PVCUT=$(ver_cut 1-2)
+QTMIN=5.15.5
+VIRTUALX_REQUIRED="test"
+inherit ecm frameworks.kde.org
+
+DESCRIPTION="Framework for searching and managing metadata"
+LICENSE="LGPL-2+"
+KEYWORDS="~amd64 ~arm ~arm64 ~loong ~ppc64 ~riscv ~x86"
+IUSE=""
+
+RESTRICT="test" # bug 624250
+
+DEPEND="
+       >=dev-db/lmdb-0.9.17
+       >=dev-qt/qtdbus-${QTMIN}:5
+       >=dev-qt/qtdeclarative-${QTMIN}:5
+       >=dev-qt/qtgui-${QTMIN}:5
+       >=dev-qt/qtwidgets-${QTMIN}:5
+       =kde-frameworks/kconfig-${PVCUT}*:5
+       =kde-frameworks/kcoreaddons-${PVCUT}*:5
+       =kde-frameworks/kcrash-${PVCUT}*:5
+       =kde-frameworks/kdbusaddons-${PVCUT}*:5
+       =kde-frameworks/kfilemetadata-${PVCUT}*:5
+       =kde-frameworks/ki18n-${PVCUT}*:5
+       =kde-frameworks/kidletime-${PVCUT}*:5
+       =kde-frameworks/kio-${PVCUT}*:5
+       =kde-frameworks/solid-${PVCUT}*:5
+"
+RDEPEND="${DEPEND}"
+
+PATCHES=( "${FILESDIR}/${P}-skip-all-unprintable-chars.patch" )

diff --git a/kde-frameworks/baloo/files/baloo-5.104.0-skip-all-unprintable-chars.patch b/kde-frameworks/baloo/files/baloo-5.104.0-skip-all-unprintable-chars.patch
new file mode 100644
index 000000000000..7e9eb0d74c42
--- /dev/null
+++ b/kde-frameworks/baloo/files/baloo-5.104.0-skip-all-unprintable-chars.patch
@@ -0,0 +1,70 @@
+From 886aba423f3659ef591903f1f3dea87f8b4c6016 Mon Sep 17 00:00:00 2001
+From: Igor Poboiko <[email protected]>
+Date: Mon, 20 Mar 2023 13:20:33 +0000
+Subject: [PATCH] [TermGenerator] Skip all unprintable characters
+
+Some extractors can produce text which includes special unicode
+control characters (e.g. Poppler can give us 0x0001 from some PDFs).
+TermGenerator then generates proper (yet meaningless) terms out of those
+characters, and they end up in database. It should be safe to skip all
+unprintable characters to avoid that (although surrogates are fine, they
+are dealt with later via QString::normalize call).
+
+Character 0x0001 is the worst, as it is used internally in DocTermsCodec
+for compactification. Such collision then leads to the corrupted database
+(some terms from DocTermsDB are not present in PostingDB).
+
+The corruption is not hypothetical (although not critical), I've encountered bunch of broken DB entries for some PDF files on my machine.
+
+
+(cherry picked from commit 492321e53a41762555ba6528e15cd0d0188ed153)
+---
+ autotests/unit/engine/termgeneratortest.cpp | 11 +++++++++++
+ src/engine/termgenerator.cpp                |  2 +-
+ 2 files changed, 12 insertions(+), 1 deletion(-)
+
+diff --git a/autotests/unit/engine/termgeneratortest.cpp b/autotests/unit/engine/termgeneratortest.cpp
+index 361c4934c..69885c133 100644
+--- a/autotests/unit/engine/termgeneratortest.cpp
++++ b/autotests/unit/engine/termgeneratortest.cpp
+@@ -31,6 +31,7 @@ private Q_SLOTS:
+     void testWordPositions();
+     void testWordPositionsCJK();
+     void testNumbers();
++    void testControlCharacter();
+ 
+     QList<QByteArray> allWords(const QString& str)
+     {
+@@ -213,6 +214,16 @@ void TermGeneratorTest::testNumbers()
+     QCOMPARE(words, expectedWords);
+ }
+ 
++void TermGeneratorTest::testControlCharacter()
++{
++    QString str = QString::fromUtf8("word1\u0001word2");
++
++    QList<QByteArray> words = allWords(str);
++    QList<QByteArray> expectedWords = { "word1", "word2" };
++
++    QCOMPARE(words, expectedWords);
++}
++
+ QTEST_MAIN(TermGeneratorTest)
+ 
+ #include "termgeneratortest.moc"
+diff --git a/src/engine/termgenerator.cpp b/src/engine/termgenerator.cpp
+index d98b28416..832962da1 100644
+--- a/src/engine/termgenerator.cpp
++++ b/src/engine/termgenerator.cpp
+@@ -59,7 +59,7 @@ QByteArrayList TermGenerator::termList(const QString& text_)
+     int start = 0;
+ 
+     auto isSkipChar = [] (const QChar& c) {
+-        return c.isPunct() || c.isMark() || c.isSpace();
+        return c.isPunct() || c.isMark() || c.isSpace() || (!c.isPrint() && !c.isSurrogate());
+     };
+ 
+     QByteArrayList list;
+-- 
+GitLab
+

Reply via email to