i18npool/inc/transliteration_Ignore.hxx                  |   19 +++
 i18npool/source/transliteration/ignoreDiacritics_CTL.cxx |   79 +++++++++++++--
 2 files changed, 87 insertions(+), 11 deletions(-)

New commits:
commit 18bc169b4727744f35227532078cbf7c9558bc9a
Author: Khaled Hosny <khaledho...@eglug.org>
Date:   Sun Jan 8 01:02:20 2017 +0200

    tdf#105170: Ignore diacritics on precomposed chars
    
    When ignoring diacritics, the old code just ignored combining marks, which
    works when using decomposed forms (NFD) but does not work for
    precomposed forms (NFC).
    
    Instead, we now decompose, strip marks, then recompose, and use a nice
    icu::Transliterator that does the hard work for us. As a bonus, we
    should now handle surrogate pairs fine (most of the time).
    
    The new code (in ignoreDiacritics_CTL::folding()) might not be as
    efficient as the old code that used transliteration_Ignore::folding(),
    but it is less ugly and easier to reason about, or so I hope.
    
    Change-Id: If48c8be30527580cdd68f20b40a6533c5f258d83
    Reviewed-on: https://gerrit.libreoffice.org/32826
    Tested-by: Jenkins <c...@libreoffice.org>
    Reviewed-by: Eike Rathke <er...@redhat.com>
    (cherry picked from commit 278eabab2b5bdc95a51d501fcdb46c216ded3baa)
    Reviewed-on: https://gerrit.libreoffice.org/33214

diff --git a/i18npool/inc/transliteration_Ignore.hxx b/i18npool/inc/transliteration_Ignore.hxx
index e6573b6..3b67d64 100644
--- a/i18npool/inc/transliteration_Ignore.hxx
+++ b/i18npool/inc/transliteration_Ignore.hxx
@@ -21,6 +21,7 @@
 
 #include <transliteration_commonclass.hxx>
 #include <i18nutil/oneToOneMapping.hxx>
+#include <unicode/translit.h>
 
 typedef sal_Unicode (*TransFunc)(const sal_Unicode);
 
@@ -91,9 +92,25 @@ TRANSLITERATION_IGNORE(Space_ja_JP)
 TRANSLITERATION_IGNORE(TraditionalKana_ja_JP)
 TRANSLITERATION_IGNORE(TraditionalKanji_ja_JP)
 TRANSLITERATION_IGNORE(ZiZu_ja_JP)
-TRANSLITERATION_IGNORE(Diacritics_CTL)
 TRANSLITERATION_IGNORE(Kashida_CTL)
 
+class ignoreDiacritics_CTL : public transliteration_Ignore // Diacritic-stripping transliteration; declared by hand (not via TRANSLITERATION_IGNORE) so it can hold an ICU transliterator and override folding().
+{
+    icu::Transliterator* m_transliterator; // Assigned in the constructor; nullptr when ICU creation failed. NOTE(review): raw pointer and no destructor is declared here -- confirm who releases it.
+
+public:
+    ignoreDiacritics_CTL();
+
+    OUString SAL_CALL // Returns the nCount units of rInStr starting at nStartPos with diacritics removed; rOffset is filled only when offsets are in use.
+    folding(const OUString& rInStr, sal_Int32 nStartPos, sal_Int32 nCount, 
css::uno::Sequence<sal_Int32>& rOffset)
+        throw(css::uno::RuntimeException, std::exception) override;
+
+    sal_Unicode SAL_CALL // Single-character counterpart of folding(); yields 0xffff when the character is to be skipped.
+    transliterateChar2Char(sal_Unicode nInChar)
+        throw(css::uno::RuntimeException,
+        css::i18n::MultipleCharsOutputException, std::exception) override;
+};
+
 #undef TRANSLITERATION_IGNORE
 
 #define TRANSLITERATION_IGNORE( name ) \
diff --git a/i18npool/source/transliteration/ignoreDiacritics_CTL.cxx b/i18npool/source/transliteration/ignoreDiacritics_CTL.cxx
index 8d6292c..c910b94 100644
--- a/i18npool/source/transliteration/ignoreDiacritics_CTL.cxx
+++ b/i18npool/source/transliteration/ignoreDiacritics_CTL.cxx
@@ -9,25 +9,84 @@
 
 #include <transliteration_Ignore.hxx>
 #include <unicode/uchar.h>
+#include <unicode/translit.h>
 
 namespace com { namespace sun { namespace star { namespace i18n {
 
-sal_Unicode
-ignoreDiacritics_CTL_translator (const sal_Unicode c)
-{
-    if(u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY) == U_NON_SPACING_MARK)
-        return 0xffff; // Skip this character
-
-    return c;
-}
-
 ignoreDiacritics_CTL::ignoreDiacritics_CTL() // Sets the transliteration names and builds the ICU transliterator used by folding() and transliterateChar2Char().
 {
-    func = ignoreDiacritics_CTL_translator;
+    func = nullptr; // No per-character callback any more; this class overrides folding() itself.
     table = nullptr;
     map = nullptr;
     transliterationName = "ignoreDiacritics_CTL";
     implementationName = 
"com.sun.star.i18n.Transliteration.ignoreDiacritics_CTL";
+
+    UErrorCode nStatus = U_ZERO_ERROR; // The rule chain created below: decompose (NFD), remove all marks ([:M:]), recompose (NFC).
+    m_transliterator = icu::Transliterator::createInstance("NFD; [:M:] Remove; 
NFC",
+            UTRANS_FORWARD, nStatus);
+    if (U_FAILURE(nStatus))
+        m_transliterator = nullptr; // Leave it null on failure; the transliterating methods throw RuntimeException when they see a null pointer.
+}
+
+sal_Unicode SAL_CALL // Runs a single sal_Unicode through the ICU transliterator and returns the result, 0xffff to skip it, or the input unchanged.
+ignoreDiacritics_CTL::transliterateChar2Char(sal_Unicode nInChar)
+    throw(css::uno::RuntimeException,
+        css::i18n::MultipleCharsOutputException, std::exception)
+{
+    if (!m_transliterator)
+        throw css::uno::RuntimeException(); // The constructor could not create the transliterator.
+
+    UnicodeString aChar(nInChar);
+    m_transliterator->transliterate(aChar); // Transforms aChar in place.
+
+    if (aChar.isEmpty()) // Nothing is left after stripping marks.
+        return 0xffff; // Skip this character.
+
+    if (aChar.length() > 1) // The result does not fit in a single return value.
+        return nInChar; // Don't know what to do here, return the original.
+
+    return aChar[0];
+}
+
+OUString SAL_CALL // Transliterates rInStr from nStartPos for nCount units, one code point at a time, and returns the concatenated results.
+ignoreDiacritics_CTL::folding(const OUString& rInStr, sal_Int32 nStartPos,
+    sal_Int32 nCount, css::uno::Sequence<sal_Int32>& rOffset)
+    throw(css::uno::RuntimeException, std::exception)
+{
+    if (!m_transliterator)
+        throw css::uno::RuntimeException(); // The constructor could not create the transliterator.
+
+    OUString aOutStr;
+
+    sal_Int32 nPosition = nStartPos; // Input index of the code point currently being processed.
+    sal_Int32 nOffset = 0; // Next free slot in rOffset.
+    if (useOffset)
+        rOffset.realloc(nCount); // Initial size of one entry per input unit; grown in the loop if needed and trimmed at the end.
+
+    while (nPosition < nStartPos + nCount)
+    {
+        sal_Int32 nIndex = nPosition;
+        UChar32 nChar = rInStr.iterateCodePoints(&nIndex); // Reads the code point at nIndex; nIndex then becomes the next loop position (see the assignment at the end of the loop body).
+        UnicodeString aChar(nChar);
+        m_transliterator->transliterate(aChar); // Transforms aChar in place; the result may be empty or longer than one unit.
+
+        if (useOffset && nOffset + aChar.length() > rOffset.getLength())
+            rOffset.realloc(rOffset.getLength() + aChar.length()); // Make room when this code point's output would overflow rOffset.
+
+        for (int32_t i = 0; i < aChar.length(); i++)
+        {
+            aOutStr += OUStringLiteral1(aChar[i]); // Output is appended one unit at a time.
+            if (useOffset)
+                rOffset[nOffset++] = nPosition; // Each output unit records the input index of its source code point.
+        }
+
+        nPosition = nIndex;
+    }
+
+    if (useOffset)
+        rOffset.realloc(aOutStr.getLength()); // Trim rOffset to exactly one entry per output unit.
+
+    return aOutStr;
 }
 
 } } } }
_______________________________________________
Libreoffice-commits mailing list
libreoffice-comm...@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits

Reply via email to