editeng/source/editeng/impedit2.cxx | 22 ++++++++++++---------- i18nutil/source/utility/unicode.cxx | 12 +++++++----- include/i18nutil/unicode.hxx | 10 +++++----- 3 files changed, 24 insertions(+), 20 deletions(-)
New commits: commit 529dce3d5b695637a1ccc8b4b697d87c5db0d3a2 Author: Khaled Hosny <kha...@libreoffice.org> AuthorDate: Mon Jul 24 22:07:25 2023 +0300 Commit: خالد حسني <kha...@libreoffice.org> CommitDate: Tue Jul 25 02:15:05 2023 +0200 editeng: Call unicode::getUnicodeType() on UTF-32 code units By using OUString::iterateCodePoints() instead of accessing individual UTF-16 code units that might be surrogate pairs. Change-Id: I5e3e513a788f0c939f96b0521fed16fe4848a053 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/154875 Tested-by: Jenkins Reviewed-by: خالد حسني <kha...@libreoffice.org> diff --git a/editeng/source/editeng/impedit2.cxx b/editeng/source/editeng/impedit2.cxx index 664c8f5d18f0..c976ebf93cb2 100644 --- a/editeng/source/editeng/impedit2.cxx +++ b/editeng/source/editeng/impedit2.cxx @@ -1732,10 +1732,10 @@ void ImpEditEngine::InitScriptTypes( sal_Int32 nPara ) { if ( _xBI->getScriptType( aText, nPos - 1 ) == i18n::ScriptType::WEAK ) { - switch ( u_charType(aText.iterateCodePoints(&nPos, 0) ) ) { - case U_NON_SPACING_MARK: - case U_ENCLOSING_MARK: - case U_COMBINING_SPACING_MARK: + switch (unicode::getUnicodeType(aText.iterateCodePoints(&nPos, 0))) { + case css::i18n::UnicodeType::NON_SPACING_MARK: + case css::i18n::UnicodeType::ENCLOSING_MARK: + case css::i18n::UnicodeType::COMBINING_SPACING_MARK: --nPos; rTypes.back().nEndPos--; break; @@ -2761,7 +2761,9 @@ EditPaM ImpEditEngine::ImpInsertText(const EditSelection& aCurSel, const OUStrin sal_Int32 nPos = nMaxNewChars; while (nPos-- > 0 && (nMaxNewChars - nPos) <= 84) { - switch (unicode::getUnicodeType(aLine[nPos])) + auto nNextPos = nPos; + const auto c = aLine.iterateCodePoints(&nNextPos); + switch (unicode::getUnicodeType(c)) { case css::i18n::UnicodeType::UPPERCASE_LETTER: case css::i18n::UnicodeType::LOWERCASE_LETTER: @@ -2775,24 +2777,24 @@ EditPaM ImpEditEngine::ImpInsertText(const EditSelection& aCurSel, const OUStrin break; default: { - const sal_Unicode c = aLine[nPos]; // Ignore NO-BREAK 
spaces, NBSP, NNBSP, ZWNBSP. if (c == 0x00A0 || c == 0x202F || c == 0xFEFF) break; - if (c == '-' && nPos + 1 < nMaxNewChars) + const auto n = aLine.iterateCodePoints(&nNextPos, 0); + if (c == '-' && nNextPos < nMaxNewChars) { // Keep HYPHEN-MINUS with a number to the right. - const sal_Int16 t = unicode::getUnicodeType(aLine[nPos+1]); + const sal_Int16 t = unicode::getUnicodeType(n); if ( t == css::i18n::UnicodeType::DECIMAL_DIGIT_NUMBER || t == css::i18n::UnicodeType::LETTER_NUMBER || t == css::i18n::UnicodeType::OTHER_NUMBER) nMaxNewChars = nPos; // line break before else - nMaxNewChars = nPos + 1; // line break after + nMaxNewChars = nNextPos; // line break after } else { - nMaxNewChars = nPos + 1; // line break after + nMaxNewChars = nNextPos; // line break after } nPos = 0; // will break loop } commit 2e6e40b7453e2005d46ba7866feff2f2caa1f100 Author: Khaled Hosny <kha...@libreoffice.org> AuthorDate: Mon Jul 24 20:52:44 2023 +0300 Commit: خالد حسني <kha...@libreoffice.org> CommitDate: Tue Jul 25 02:14:56 2023 +0200 i18nutil: Make unicode::getUnicodeType() take a UTF-32 code point Still need to fix call sites to handle surrogate pairs. 
Change-Id: I3ba896714fc6a90596c041148a3c9d965f60f4a1 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/154874 Tested-by: Jenkins Reviewed-by: خالد حسني <kha...@libreoffice.org> diff --git a/i18nutil/source/utility/unicode.cxx b/i18nutil/source/utility/unicode.cxx index dbb81a8240f1..ae7b4c512aca 100644 --- a/i18nutil/source/utility/unicode.cxx +++ b/i18nutil/source/utility/unicode.cxx @@ -67,9 +67,10 @@ unicode::getUnicodeScriptEnd( UnicodeScript type) { } sal_Int16 -unicode::getUnicodeType( const sal_Unicode ch ) { - static sal_Unicode c = 0x00; - static sal_Int16 r = 0x00; +unicode::getUnicodeType(const sal_uInt32 ch) +{ + static sal_uInt32 c = 0x00; + static sal_uInt32 r = 0x00; if (ch == c) return r; else c = ch; @@ -213,7 +214,7 @@ sal_uInt32 unicode::GetMirroredChar(sal_uInt32 nChar) { bit(UnicodeType::PARAGRAPH_SEPARATOR) #define IsType(func, mask) \ -bool func( const sal_Unicode ch) {\ +bool func( const sal_uInt32 ch) {\ return (bit(getUnicodeType(ch)) & (mask)) != 0;\ } @@ -224,7 +225,8 @@ IsType(unicode::isSpace, SPACEMASK) #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\ bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f) -bool unicode::isWhiteSpace( const sal_Unicode ch) { +bool unicode::isWhiteSpace(const sal_uInt32 ch) +{ return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE))); } diff --git a/include/i18nutil/unicode.hxx b/include/i18nutil/unicode.hxx index 69e3e9d6e267..be08595e0b10 100644 --- a/include/i18nutil/unicode.hxx +++ b/include/i18nutil/unicode.hxx @@ -38,17 +38,17 @@ struct ScriptTypeList class I18NUTIL_DLLPUBLIC unicode { public: - static sal_Int16 getUnicodeType(const sal_Unicode ch); + static sal_Int16 getUnicodeType(const sal_uInt32 ch); static sal_Int16 getUnicodeScriptType(const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType = 0); static sal_Unicode getUnicodeScriptStart(css::i18n::UnicodeScript type); static sal_Unicode getUnicodeScriptEnd(css::i18n::UnicodeScript type); 
static sal_uInt8 getUnicodeDirection(const sal_Unicode ch); static sal_uInt32 GetMirroredChar(sal_uInt32); - static bool isControl(const sal_Unicode ch); - static bool isAlpha(const sal_Unicode ch); - static bool isSpace(const sal_Unicode ch); - static bool isWhiteSpace(const sal_Unicode ch); + static bool isControl(const sal_uInt32 ch); + static bool isAlpha(const sal_uInt32 ch); + static bool isSpace(const sal_uInt32 ch); + static bool isWhiteSpace(const sal_uInt32 ch); /** Check for Unicode variation sequence selectors