sw/inc/iodetect.hxx                     |    4 ++
 sw/qa/uitest/data/tdf106899.odt         |binary
 sw/qa/uitest/data/tdf106899.sdi         |    1 
 sw/qa/uitest/writer_tests5/tdf106899.py |   44 ++++++++++++++++++++++++++++++++
 sw/source/core/edit/edtox.cxx           |    6 +++-
 sw/source/filter/basflt/iodetect.cxx    |   17 ++++++++++++
 6 files changed, 71 insertions(+), 1 deletion(-)

New commits:
commit ddf9b2e23768a33041a3efe20840f1e11abff434
Author:     Andreas Heinisch <andreas.heini...@yahoo.de>
AuthorDate: Sun Jan 9 18:44:11 2022 +0100
Commit:     Mike Kaganski <mike.kagan...@collabora.com>
CommitDate: Mon Jan 31 13:51:20 2022 +0100

    tdf#106899 - Import concordance file using appropriate charset
    
    At the beginning of the import process for the various index entries,
    try to determine the correct character set for the tox concordance file.
    
    Change-Id: I3f48325a80ed08c2c06c295a24b2fc29ce1adf99
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/128194
    Reviewed-by: Mike Kaganski <mike.kagan...@collabora.com>
    Reviewed-by: Andreas Heinisch <andreas.heini...@yahoo.de>
    Tested-by: Jenkins
    Signed-off-by: Xisco Fauli <xiscofa...@libreoffice.org>
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/129206

diff --git a/sw/inc/iodetect.hxx b/sw/inc/iodetect.hxx
index 9069b01ffda0..ce24e0800134 100644
--- a/sw/inc/iodetect.hxx
+++ b/sw/inc/iodetect.hxx
@@ -28,6 +28,8 @@
 #include <tools/solar.h>
 #include "swdllapi.h"
 
+#define DETECT_ENCODING_BUFFER_SIZE 4096
+
 #define FILTER_RTF      "RTF"       ///< RTF filter
 #define sRtfWH          "WH_RTF"
 #define FILTER_TEXT     u"TEXT"      ///< text filter with default codeset
@@ -105,6 +107,8 @@ public:
     static bool IsValidStgFilter( SotStorage& , const SfxFilter& );
     static bool IsValidStgFilter( const css::uno::Reference < css::embed::XStorage >& rStg, const SfxFilter& rFilter);
 
+    // tdf#106899 - wrapper around IsDetectableText to retrieve the text encoding for a given stream
+    static rtl_TextEncoding GetTextEncoding(SvStream&);
     static bool IsDetectableText( const char* pBuf, sal_uLong &rLen,
             rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd, bool *pBom);
 
diff --git a/sw/qa/uitest/data/tdf106899.odt b/sw/qa/uitest/data/tdf106899.odt
new file mode 100644
index 000000000000..3f77c82655f3
Binary files /dev/null and b/sw/qa/uitest/data/tdf106899.odt differ
diff --git a/sw/qa/uitest/data/tdf106899.sdi b/sw/qa/uitest/data/tdf106899.sdi
new file mode 100644
index 000000000000..6b110b259661
--- /dev/null
+++ b/sw/qa/uitest/data/tdf106899.sdi
@@ -0,0 +1 @@
+Nguyễn Khánh
\ No newline at end of file
diff --git a/sw/qa/uitest/writer_tests5/tdf106899.py b/sw/qa/uitest/writer_tests5/tdf106899.py
new file mode 100644
index 000000000000..389552e24a89
--- /dev/null
+++ b/sw/qa/uitest/writer_tests5/tdf106899.py
@@ -0,0 +1,44 @@
+# -*- tab-width: 4; indent-tabs-mode: nil; py-indent-offset: 4 -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+import org.libreoffice.unotest
+from uitest.framework import UITestCase
+from uitest.uihelper.common import get_url_for_data_file
+import time
+
+class tdf106899(UITestCase):
+
+    def test_tdf106899_alphabetical_index_utf8(self):
+        # Copy concordance file containing a UTF-8 index entry
+        org.libreoffice.unotest.makeCopyFromTDOC("tdf106899.sdi")
+        with self.ui_test.load_file(get_url_for_data_file("tdf106899.odt")) as document:
+            xWriterDoc = self.xUITest.getTopFocusWindow()
+
+            # Update the alphabetical index and check if it contains the UTF-8 index entry
+            xDocumentIndexes = document.DocumentIndexes
+            self.assertEqual(xDocumentIndexes.getCount(), 1)
+            self.assertEqual(xDocumentIndexes.hasByName("Alphabetical Index1"), True)
+            xDocumentIndex = xDocumentIndexes.getByName("Alphabetical Index1")
+            xIndexAnchor = xDocumentIndex.getAnchor()
+            self.assertEqual("Nguyễn Khánh" in xIndexAnchor.getString(), False)
+
+            # TODO Bug Report - Refresh of the index only works using .uno:UpdateAllIndexes
+            # It does not work with xDocumentIndex.refresh() nor with xDocumentIndex.update()
+            self.xUITest.executeCommand(".uno:UpdateAllIndexes")
+
+            # TODO Bug Report - Retrieving the text of the updated index only works using the cursor
+            # It does not work with xIndexAnchor.getString()
+            xCursor = document.getText().createTextCursor()
+            xCursor.gotoRange(xDocumentIndex.getAnchor().getEnd(), False)
+            xCursor.gotoStartOfParagraph(True)
+
+            # Without the fix in place the index does not contain the UTF-8 index entry
+            self.assertEqual("Nguyễn Khánh" in xCursor.getString(), True)
+
+# vim: set shiftwidth=4 softtabstop=4 expandtab:
diff --git a/sw/source/core/edit/edtox.cxx b/sw/source/core/edit/edtox.cxx
index c10f36671cd4..df58e49cdbaa 100644
--- a/sw/source/core/edit/edtox.cxx
+++ b/sw/source/core/edit/edtox.cxx
@@ -42,6 +42,7 @@
 #include <docary.hxx>
 #include <mdiexp.hxx>
 #include <strings.hrc>
+#include <iodetect.hxx>
 
 using namespace ::com::sun::star;
 using namespace ::com::sun::star::i18n;
@@ -295,7 +296,10 @@ void SwEditShell::ApplyAutoMark()
         SfxMedium aMedium( sAutoMarkURL, StreamMode::STD_READ );
         SvStream& rStrm = *aMedium.GetInStream();
         Push();
-        rtl_TextEncoding eChrSet = ::osl_getThreadTextEncoding();
+        // tdf#106899 - import tox concordance file using the appropriate character set
+        rtl_TextEncoding eChrSet = SwIoSystem::GetTextEncoding(rStrm);
+        if (eChrSet == RTL_TEXTENCODING_DONTKNOW)
+            eChrSet = ::osl_getThreadTextEncoding();
 
         // SearchOptions to be used in loop below
         sal_Int32 const nLEV_Other    = 2;    //  -> changedChars;
diff --git a/sw/source/filter/basflt/iodetect.cxx b/sw/source/filter/basflt/iodetect.cxx
index e4d214391f2c..bc91a8460c19 100644
--- a/sw/source/filter/basflt/iodetect.cxx
+++ b/sw/source/filter/basflt/iodetect.cxx
@@ -20,6 +20,7 @@
 #include <iodetect.hxx>
 #include <memory>
 #include <osl/endian.h>
+#include <osl/thread.h>
 #include <sot/storage.hxx>
 #include <tools/urlobj.hxx>
 #include <unotools/moduleoptions.hxx>
@@ -238,6 +239,22 @@ std::shared_ptr<const SfxFilter> SwIoSystem::GetFileFilter(const OUString& rFile
     return SwIoSystem::GetFilterOfFormat(FILTER_TEXT);
 }
 
+rtl_TextEncoding SwIoSystem::GetTextEncoding(SvStream& rStrm)
+{
+    sal_Size nLen, nOrig;
+    char aBuf[DETECT_ENCODING_BUFFER_SIZE];
+    nOrig = nLen = rStrm.ReadBytes(aBuf, DETECT_ENCODING_BUFFER_SIZE);
+
+    rtl_TextEncoding eCharSet;
+    const bool bRet = SwIoSystem::IsDetectableText(aBuf, nLen, &eCharSet, nullptr, nullptr, nullptr);
+    if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
+        rStrm.SeekRel(-(tools::Long(nLen)));
+    else
+        rStrm.SeekRel(-(tools::Long(nOrig)));
+
+    return eCharSet;
+}
+
 bool SwIoSystem::IsDetectableText(const char* pBuf, sal_uLong &rLen,
     rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd, bool *pBom)
 {

Reply via email to