Script 'mail_helper' called by obssrc

Hello community,

here is the log from the commit of package nuspell for openSUSE:Factory
checked in at 2021-01-18 11:26:07

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/nuspell (Old)
 and      /work/SRC/openSUSE:Factory/.nuspell.new.28504 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "nuspell" Mon Jan 18 11:26:07 2021 rev:4 rq:862717 version:4.2.0 Changes: -------- --- /work/SRC/openSUSE:Factory/nuspell/nuspell.changes 2020-11-29 12:19:08.777438895 +0100 +++ /work/SRC/openSUSE:Factory/.nuspell.new.28504/nuspell.changes 2021-01-18 11:26:12.988398614 +0100 @@ -1,0 +2,9 @@ +Tue Jan 12 21:07:52 UTC 2021 - andy great <andythe_gr...@pm.me> + +- Update to version 4.2.0. + * Deprecate functions that allowed non-Unicode encoding. In + particular, Dictionary::imbue() and Dictionary::imbue_utf8(). + * Completely remove dependency on Boost. The CLI tools were + refactored to use ICU directly. + +------------------------------------------------------------------- Old: ---- nuspell-4.1.0.tar.gz New: ---- nuspell-4.2.0.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ nuspell.spec ++++++ --- /var/tmp/diff_new_pack.6vyn1t/_old 2021-01-18 11:26:13.744400356 +0100 +++ /var/tmp/diff_new_pack.6vyn1t/_new 2021-01-18 11:26:13.748400365 +0100 @@ -1,7 +1,7 @@ # # spec file for package nuspell # -# Copyright (c) 2020 SUSE LLC +# Copyright (c) 2021 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -19,7 +19,7 @@ %define sonum 4 %define libname libnuspell Name: nuspell -Version: 4.1.0 +Version: 4.2.0 Release: 0 Summary: A spell checker library and command-line tool License: LGPL-3.0-or-later @@ -30,7 +30,6 @@ BuildRequires: cmake BuildRequires: doxygen BuildRequires: gcc-c++ -BuildRequires: libboost_locale-devel BuildRequires: libicu-devel BuildRequires: rubygem(%{rb_default_ruby_abi}:ronn) Requires: hunspell @@ -107,7 +106,7 @@ %files -n %{libname}%{sonum} %doc README.md CHANGELOG.md AUTHORS %license COPYING.LESSER COPYING -%{_libdir}/%{libname}.so.%{sonum}.1.0 +%{_libdir}/%{libname}.so.%{version} %exclude %{_datadir}/doc/nuspell/README.md %files devel ++++++ nuspell-4.1.0.tar.gz -> nuspell-4.2.0.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/nuspell-4.1.0/.travis.yml new/nuspell-4.2.0/.travis.yml --- old/nuspell-4.1.0/.travis.yml 2020-11-19 11:56:49.000000000 +0100 +++ new/nuspell-4.2.0/.travis.yml 2020-12-12 23:40:04.000000000 +0100 @@ -1,9 +1,9 @@ language: cpp -dist: bionic -os: -- linux -- osx +os: [ linux, osx ] + +dist: bionic +osx_image: xcode11.3 env: global: @@ -13,30 +13,25 @@ - CXXFLAGS=-fsanitize=address - CXXFLAGS=-fsanitize=undefined - CXXFLAGS="-Wall -Wextra -Werror" - - CONFIGURE_ARGS="-DBUILD_SHARED_LIBS=1 -DCMAKE_BUILD_TYPE=Release" + - CONFIGURE_ARGS="-DCMAKE_BUILD_TYPE=Release" matrix: include: -# set B2_ARGS to non-empty to use custom build Boost. 
B2_ARGS=" " is sufficient - os: linux - env: CXXFLAGS="-D_GLIBCXX_DEBUG -O1" B2_ARGS=define=_GLIBCXX_DEBUG + env: CXXFLAGS="-D_GLIBCXX_DEBUG -O1" - os: windows env: CTEST_ARGS="-C Debug" - os: windows - env: CONFIGURE_ARGS="-DBUILD_SHARED_LIBS=1" BUILD_ARGS="--config Release" CTEST_ARGS="-C Release" + env: BUILD_ARGS="--config Release" CTEST_ARGS="-C Release" + +addons: + apt: + packages: [ libicu-dev ] + homebrew: + packages: [ icu4c ] before_install: - | - if [ "$B2_ARGS" ]; then - cd ~ && - wget https://dl.bintray.com/boostorg/release/1.64.0/source/boost_1_64_0.tar.bz2 -O - | tar -xj && - cd - - fi -- | - if [ -z "$B2_ARGS" ] && [ "$TRAVIS_OS_NAME" = linux ]; then - sudo apt-get install -y libboost-locale-dev - fi -- | if [ "$TRAVIS_OS_NAME" = windows ]; then # This should be added to PATH as on local Windows Git Bash install, # but it isn't on travis. xz is there @@ -45,16 +40,6 @@ fi install: -# if we update to boost 1.67 we can add define=BOOST_LOCALE_HIDE_AUTO_PTR to b2 -- | - if [ "$B2_ARGS" ]; then - cd ~/boost_1_64_0 && - ./bootstrap.sh --with-libraries=locale --with-toolset="$CC" && - ./b2 -d0 -j 4 cxxflags=-std=c++14 $B2_ARGS && - cd - && - export BOOST_ROOT=~/boost_1_64_0 && - export LD_LIBRARY_PATH=$BOOST_ROOT/stage/lib - fi - | if [ "$TRAVIS_OS_NAME" = linux ]; then export CMAKE_BUILD_PARALLEL_LEVEL=4 @@ -62,7 +47,7 @@ - | if [ "$TRAVIS_OS_NAME" = osx ]; then export ICU_ROOT=$(brew --prefix icu4c) - BUILD_ARGS+=" -- -j 4" #parallel build + export CMAKE_BUILD_PARALLEL_LEVEL=4 fi - | if [ "$TRAVIS_OS_NAME" = windows ]; then @@ -77,13 +62,3 @@ - ctest $CTEST_ARGS after_failure: cat Testing/Temporary/LastTest.log - -addons: -# apt: -# packages: -# - libicu-dev -# - libboost-locale-dev - homebrew: - packages: - - icu4c - - boost diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/nuspell-4.1.0/CHANGELOG.md new/nuspell-4.2.0/CHANGELOG.md --- old/nuspell-4.1.0/CHANGELOG.md 2020-11-19 11:56:49.000000000 +0100 +++ new/nuspell-4.2.0/CHANGELOG.md 2020-12-12 23:40:04.000000000 +0100 @@ -6,16 +6,25 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [4.2.0] - 2020-12-12 +### Deprecated +- Deprecate functions that allowed non-Unicode encoding. In particular, + `Dictionary::imbue()` and `Dictionary::imbue_utf8()`. + +### Removed +- Completely remove dependency on Boost. The CLI tools were refactored to use + ICU directly. + ## [4.1.0] - 2020-11-19 -## Added +### Added - Add new API for finding dictionaries on the file-system. It is a set of free functions located in the file finder.hxx. -## Fixed +### Fixed - Improve searching for dictionaries on the file-system. Fix finding them on Fedora. Fixes #94. -## Deprecated +### Deprecated - Deprecate the old API for finding dictionaries, i.e. the class `Finder` in the file finder.hxx. 
@@ -175,7 +184,9 @@ - Spelling error detection (checking) is closely matching Hunspell - Support for spelling error correction (suggestions) -[Unreleased]: https://github.com/nuspell/nuspell/compare/v4.0.1...HEAD +[Unreleased]: https://github.com/nuspell/nuspell/compare/v4.2.0...HEAD +[4.2.0]: https://github.com/nuspell/nuspell/compare/v4.1.0...v4.2.0 +[4.1.0]: https://github.com/nuspell/nuspell/compare/v4.0.1...v4.1.0 [4.0.1]: https://github.com/nuspell/nuspell/compare/v4.0.0...v4.0.1 [4.0.0]: https://github.com/nuspell/nuspell/compare/v3.1.2...v4.0.0 [3.1.2]: https://github.com/nuspell/nuspell/compare/v3.1.1...v3.1.2 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/nuspell-4.1.0/CMakeLists.txt new/nuspell-4.2.0/CMakeLists.txt --- old/nuspell-4.1.0/CMakeLists.txt 2020-11-19 11:56:49.000000000 +0100 +++ new/nuspell-4.2.0/CMakeLists.txt 2020-12-12 23:40:04.000000000 +0100 @@ -1,14 +1,13 @@ cmake_minimum_required(VERSION 3.8) -project(nuspell VERSION 4.1.0) +project(nuspell VERSION 4.2.0) set(PROJECT_HOMEPAGE_URL "https://nuspell.github.io/") -option(BUILD_SHARED_LIBS "Buils as shared library" ON) +option(BUILD_SHARED_LIBS "Build as shared library" ON) include(GNUInstallDirs) include(CMakePackageConfigHelpers) find_package(ICU REQUIRED COMPONENTS uc data) -find_package(Boost 1.48.0 REQUIRED COMPONENTS locale) get_directory_property(subproject PARENT_DIRECTORY) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/nuspell-4.1.0/README.md new/nuspell-4.2.0/README.md --- old/nuspell-4.1.0/README.md 2020-11-19 11:56:49.000000000 +0100 +++ new/nuspell-4.2.0/README.md 2020-12-12 23:40:04.000000000 +0100 @@ -34,7 +34,6 @@ Run-time (and build-time) dependencies: - ICU4C - - Boost Locale >= v1.48 (needed only for the CLI tool, not the library) Recommended tools for developers: qtcreator, ninja, clang-format, gdb, vim, doxygen. @@ -47,7 +46,7 @@ For Ubuntu and Debian: ```bash -sudo apt install git cmake libboost-locale-dev libicu-dev +sudo apt install git cmake libicu-dev ``` Then run the following commands inside the Nuspell directory: @@ -81,7 +80,7 @@ <!-- end list --> ```bash -brew install cmake icu4c boost +brew install cmake icu4c export ICU_ROOT=$(brew --prefix icu4c) ``` @@ -102,7 +101,7 @@ Visual Studio Build Tools. 2. Install Git for Windows and Cmake. 3. Install vcpkg in some folder, e.g. in `c:\vcpkg`. -4. With vcpkg install: icu, boost-locale\[icu\]. +4. With vcpkg install: icu. 5. Run the commands bellow. <!-- end list --> @@ -119,7 +118,7 @@ Download MSYS2, update everything and install the following packages: ```bash -pacman -S base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-boost \ +pacman -S base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-icu \ mingw-w64-x86_64-cmake ``` @@ -144,7 +143,7 @@ Install the following required packages ```bash -pkg cmake icu boost-libs catch +pkg cmake icu catch ``` Then run the standard cmake and make as on Linux. See above. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/nuspell-4.1.0/docs/Third-party_licenses new/nuspell-4.2.0/docs/Third-party_licenses --- old/nuspell-4.1.0/docs/Third-party_licenses 2020-11-19 11:56:49.000000000 +0100 +++ new/nuspell-4.2.0/docs/Third-party_licenses 2020-12-12 23:40:04.000000000 +0100 @@ -93,7 +93,7 @@ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-Boost license: +Catch2 license: Boost Software License - Version 1.0 - August 17th, 2003 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/nuspell-4.1.0/src/nuspell/CMakeLists.txt new/nuspell-4.2.0/src/nuspell/CMakeLists.txt --- old/nuspell-4.1.0/src/nuspell/CMakeLists.txt 2020-11-19 11:56:49.000000000 +0100 +++ new/nuspell-4.2.0/src/nuspell/CMakeLists.txt 2020-12-12 23:40:04.000000000 +0100 @@ -33,7 +33,7 @@ RUNTIME_OUTPUT_NAME nuspell) target_compile_definitions(nuspell-bin PRIVATE PROJECT_VERSION=\"${PROJECT_VERSION}\") -target_link_libraries(nuspell-bin nuspell Boost::locale) +target_link_libraries(nuspell-bin nuspell) if (BUILD_SHARED_LIBS AND WIN32) # This should be PRE_LINK (or PRE_BUILD), so Vcpkg's POST_BUILD # step (see VCPKG_APPLOCAL_DEPS) that copies dll can pick up nuspell.dll diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/nuspell-4.1.0/src/nuspell/dictionary.cxx new/nuspell-4.2.0/src/nuspell/dictionary.cxx --- old/nuspell-4.1.0/src/nuspell/dictionary.cxx 2020-11-19 11:56:49.000000000 +0100 +++ new/nuspell-4.2.0/src/nuspell/dictionary.cxx 2020-12-12 23:40:04.000000000 +0100 @@ -3107,6 +3107,11 @@ * convert the strings from the external locale to UTF-32 on non-Windows * platforms, and to UTF-16 on Windows. * + * @deprecated You should always feed dictionary with words encoded in UTF-8, + * and you should not use this function to set other encodings. The recommened + * way to get words out of text is via the algorithm known as Unicode text + * segmentation which only works on text encoded in Unicode anyway. + * * @param loc locale object with valid codecvt<wchar_t, char, mbstate_t> */ auto Dictionary::imbue(const locale& loc) -> void @@ -3119,6 +3124,8 @@ * @brief Sets external (public API) encoding to UTF-8 * * Call this only if you used imbue() and want to revert it to UTF-8. 
+ * + * @deprecated see imbue() */ auto Dictionary::imbue_utf8() -> void { external_locale_known_utf8 = true; } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/nuspell-4.1.0/src/nuspell/dictionary.hxx new/nuspell-4.2.0/src/nuspell/dictionary.hxx --- old/nuspell-4.1.0/src/nuspell/dictionary.hxx 2020-11-19 11:56:49.000000000 +0100 +++ new/nuspell-4.2.0/src/nuspell/dictionary.hxx 2020-12-12 23:40:04.000000000 +0100 @@ -407,8 +407,8 @@ -> Dictionary; auto static load_from_path( const std::string& file_path_without_extension) -> Dictionary; - auto imbue(const std::locale& loc) -> void; - auto imbue_utf8() -> void; + [[deprecated]] auto imbue(const std::locale& loc) -> void; + [[deprecated]] auto imbue_utf8() -> void; auto spell(std::string_view word) const -> bool; auto suggest(std::string_view word, std::vector<std::string>& out) const -> void; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/nuspell-4.1.0/src/nuspell/main.cxx new/nuspell-4.2.0/src/nuspell/main.cxx --- old/nuspell-4.1.0/src/nuspell/main.cxx 2020-11-19 11:56:49.000000000 +0100 +++ new/nuspell-4.2.0/src/nuspell/main.cxx 2020-12-12 23:40:04.000000000 +0100 @@ -19,16 +19,27 @@ #include "dictionary.hxx" #include "finder.hxx" -#include <boost/locale.hpp> +#include <cassert> #include <fstream> #include <iomanip> #include <iostream> +#include <unicode/brkiter.h> +#include <unicode/ucnv.h> #if defined(__MINGW32__) || defined(__unix__) || defined(__unix) || \ (defined(__APPLE__) && defined(__MACH__)) #include <getopt.h> #include <unistd.h> #endif +#ifdef _POSIX_VERSION +#include <langinfo.h> +#endif +#ifdef _WIN32 +#include <io.h> +#define NOMINMAX +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#endif // manually define if not supplied by the build system #ifndef PROJECT_VERSION @@ -280,13 +291,57 @@ } } -auto process_word( - Mode mode, const Dictionary& dic, const string& line, - string::const_iterator b, string::const_iterator c, string& word, - vector<pair<string::const_iterator, string::const_iterator>>& wrong_words, - vector<string>& suggestions, ostream& out) +auto to_utf8(string_view source, string& dest, UConverter* ucnv, + UErrorCode& uerr) +{ + dest.resize(dest.capacity()); + auto len = ucnv_toAlgorithmic(UCNV_UTF8, ucnv, dest.data(), dest.size(), + source.data(), source.size(), &uerr); + dest.resize(len); + if (uerr == U_BUFFER_OVERFLOW_ERROR) { + uerr = U_ZERO_ERROR; + ucnv_toAlgorithmic(UCNV_UTF8, ucnv, dest.data(), dest.size(), + source.data(), source.size(), &uerr); + } +} + +auto from_utf8(string_view source, string& dest, UConverter* ucnv, + UErrorCode& uerr) +{ + dest.resize(dest.capacity()); + auto len = + ucnv_fromAlgorithmic(ucnv, UCNV_UTF8, dest.data(), dest.size(), + source.data(), source.size(), &uerr); + dest.resize(len); + if (uerr == U_BUFFER_OVERFLOW_ERROR) { + uerr = U_ZERO_ERROR; + ucnv_fromAlgorithmic(ucnv, UCNV_UTF8, dest.data(), dest.size(), + source.data(), source.size(), &uerr); + } +} + +auto to_unicode_string(string_view source, icu::UnicodeString& dest, + UConverter* ucnv, UErrorCode& uerr) +{ + auto buf = dest.getBuffer(-1); + auto len = ucnv_toUChars(ucnv, buf, dest.getCapacity(), source.data(), + source.size(), &uerr); + if (uerr == U_BUFFER_OVERFLOW_ERROR) { + uerr = U_ZERO_ERROR; + dest.releaseBuffer(0); + buf = dest.getBuffer(len); + if (!buf) + throw bad_alloc(); + len = ucnv_toUChars(ucnv, buf, dest.getCapacity(), + source.data(), source.size(), &uerr); + } + dest.releaseBuffer(len); +} + +auto 
process_word(Mode mode, const Dictionary& dic, string_view word, + size_t pos_word, vector<string_view>& wrong_words, + vector<string>& suggestions, ostream& out) { - word.assign(b, c); auto correct = dic.spell(word); switch (mode) { case DEFAULT_MODE: { @@ -295,7 +350,6 @@ break; } dic.suggest(word, suggestions); - auto pos_word = b - begin(line); if (suggestions.empty()) { out << "# " << word << ' ' << pos_word << '\n'; break; @@ -319,18 +373,66 @@ case MISSPELLED_LINES_MODE: case CORRECT_LINES_MODE: if (!correct) - wrong_words.emplace_back(b, c); + wrong_words.push_back(word); break; default: break; } } -auto process_line( - Mode mode, const string& line, - const vector<pair<string::const_iterator, string::const_iterator>>& - wrong_words, - ostream& out) +auto process_word_other_encoding(Mode mode, const Dictionary& dic, + string_view word, string_view u8word, + size_t pos_word, + vector<string_view>& wrong_words, + vector<string>& suggestions, ostream& out, + UConverter* ucnv, UErrorCode& uerr) +{ + auto correct = dic.spell(u8word); + switch (mode) { + case DEFAULT_MODE: { + if (correct) { + out << "*\n"; + break; + } + dic.suggest(u8word, suggestions); + if (suggestions.empty()) { + out << "# " << word << ' ' << pos_word << '\n'; + break; + } + out << "& " << word << ' ' << suggestions.size() << ' ' + << pos_word << ": "; + auto sug_in_encoding = string(); + from_utf8(suggestions[0], sug_in_encoding, ucnv, uerr); + out << sug_in_encoding; + for_each(begin(suggestions) + 1, end(suggestions), + [&](const string& u8sug) { + out << ", "; + from_utf8(u8sug, sug_in_encoding, ucnv, uerr); + out << sug_in_encoding; + }); + out << '\n'; + break; + } + case MISSPELLED_WORDS_MODE: + if (!correct) + out << word << '\n'; + break; + case CORRECT_WORDS_MODE: + if (correct) + out << word << '\n'; + break; + case MISSPELLED_LINES_MODE: + case CORRECT_LINES_MODE: + if (!correct) + wrong_words.push_back(word); + break; + default: + break; + } +} + +auto finish_line(Mode mode, const string& line, + const vector<string_view>& wrong_words, ostream& out) { switch (mode) { case DEFAULT_MODE: @@ -350,81 +452,140 @@ } auto whitespace_segmentation_loop(istream& in, ostream& out, - const Dictionary& dic, Mode mode) + const Dictionary& dic, Mode mode, + UConverter* ucnv, UErrorCode& uerr) { auto line = string(); - auto word = string(); auto suggestions = vector<string>(); - using Str_Iter = string::const_iterator; - auto wrong_words = vector<pair<Str_Iter, Str_Iter>>(); + auto wrong_words = vector<string_view>(); auto loc = in.getloc(); - auto line_num = size_t(0); auto& facet = use_facet<ctype<char>>(loc); auto isspace = [&](char c) { return facet.is(facet.space, c); }; + auto u8word = string(); + auto is_utf8 = ucnv_getType(ucnv) == UCNV_UTF8; + while (getline(in, line)) { - ++line_num; wrong_words.clear(); for (auto a = begin(line); a != end(line);) { - auto b = find_if_not(a, end(line), isspace); - if (b == end(line)) + a = find_if_not(a, end(line), isspace); + if (a == end(line)) break; - auto c = find_if(b, end(line), isspace); - - process_word(mode, dic, line, b, c, word, wrong_words, - suggestions, out); - - a = c; + auto b = find_if(a, end(line), isspace); + auto word = string_view(&*a, distance(a, b)); + auto pos_word = distance(begin(line), a); + if (is_utf8) { + process_word(mode, dic, word, pos_word, + wrong_words, suggestions, out); + } + else { + to_utf8(word, u8word, ucnv, uerr); + process_word_other_encoding( + mode, dic, word, u8word, pos_word, + wrong_words, suggestions, out, ucnv, uerr); + } + 
a = b; } - process_line(mode, line, wrong_words, out); + finish_line(mode, line, wrong_words, out); } } -auto unicode_segentation_loop(istream& in, ostream& out, const Dictionary& dic, - Mode mode) +auto is_word_break(int32_t typ) { - namespace b = boost::locale::boundary; - auto line = string(); - auto word = string(); - auto suggestions = vector<string>(); - using Str_Iter = string::const_iterator; - auto wrong_words = vector<pair<Str_Iter, Str_Iter>>(); - auto loc = in.getloc(); - auto line_num = size_t(0); - auto index = b::ssegment_index(); - index.rule(b::word_any); - auto line_stream = istringstream(); - while (getline(in, line)) { - ++line_num; - index.map(b::word, begin(line), end(line), loc); - wrong_words.clear(); - auto a = cbegin(line); - for (auto& segment : index) { - auto b = begin(segment); - auto c = end(segment); + return (UBRK_WORD_NUMBER <= typ && typ < UBRK_WORD_NUMBER_LIMIT) || + (UBRK_WORD_LETTER <= typ && typ < UBRK_WORD_LETTER_LIMIT) || + (UBRK_WORD_KANA <= typ && typ < UBRK_WORD_KANA_LIMIT) || + (UBRK_WORD_IDEO <= typ && typ < UBRK_WORD_IDEO_LIMIT); +} - process_word(mode, dic, line, b, c, word, wrong_words, +auto segment_line_utf8(Mode mode, const Dictionary& dic, const string& line, + UText* utext, icu::BreakIterator* ubrkiter, + UErrorCode& uerr, vector<string>& suggestions, + vector<string_view>& wrong_words, ostream& out) +{ + utext_openUTF8(utext, line.data(), line.size(), &uerr); + ubrkiter->setText(utext, uerr); + for (auto i = ubrkiter->first(), prev = 0; i != ubrkiter->DONE; + prev = i, i = ubrkiter->next()) { + auto typ = ubrkiter->getRuleStatus(); + if (is_word_break(typ)) { + auto word = string_view(line).substr(prev, i - prev); + process_word(mode, dic, word, prev, wrong_words, suggestions, out); - - a = c; } - process_line(mode, line, wrong_words, out); } + finish_line(mode, line, wrong_words, out); + assert(U_SUCCESS(uerr)); } -namespace std { -ostream& operator<<(ostream& out, const locale& loc) +auto segment_line_generic(Mode mode, const Dictionary& dic, const string& line, + icu::UnicodeString& uline, UConverter* ucnv, + icu::BreakIterator* ubrkiter, UErrorCode& uerr, + string& u8word, vector<string>& suggestions, + vector<string_view>& wrong_words, ostream& out) { - if (has_facet<boost::locale::info>(loc)) { - auto& f = use_facet<boost::locale::info>(loc); - out << "name=" << f.name() << ", lang=" << f.language() - << ", country=" << f.country() << ", enc=" << f.encoding(); + to_unicode_string(line, uline, ucnv, uerr); + ubrkiter->setText(uline); + size_t orig_prev = 0, orig_i = 0; + auto src = line.c_str(); + auto src_end = src + line.size(); + + ucnv_resetToUnicode(ucnv); + for (auto i = ubrkiter->first(), prev = 0; i != ubrkiter->DONE; + prev = i, i = ubrkiter->next(), orig_prev = orig_i) { + + for (auto j = prev; j != i; ++j) { + auto cp = ucnv_getNextUChar(ucnv, &src, src_end, &uerr); + + // U_IS_SURROGATE(uline[j]) or + // U_IS_LEAD(uline[j]) can work too + j += !U_IS_BMP(cp); + } + orig_i = distance(line.c_str(), src); + + auto typ = ubrkiter->getRuleStatus(); + if (is_word_break(typ)) { + auto uword = uline.tempSubStringBetween(prev, i); + u8word.clear(); + uword.toUTF8String(u8word); + auto word = string_view(line).substr( + orig_prev, orig_i - orig_prev); + process_word_other_encoding( + mode, dic, word, u8word, orig_prev, wrong_words, + suggestions, out, ucnv, uerr); + } } - else { - out << loc.name(); + finish_line(mode, line, wrong_words, out); + assert(U_SUCCESS(uerr)); +} + +auto unicode_segentation_loop(istream& in, ostream& 
out, const Dictionary& dic, + Mode mode, UConverter* ucnv, UErrorCode& uerr) +{ + auto line = string(); + auto suggestions = vector<string>(); + auto wrong_words = vector<string_view>(); + + // TODO: try to use Locale constructed from dictionary name. + auto ubrkiter = unique_ptr<icu::BreakIterator>( + icu::BreakIterator::createWordInstance(icu::Locale(), uerr)); + auto utext = icu::LocalUTextPointer( + utext_openUTF8(nullptr, line.data(), line.size(), &uerr)); + auto uline = icu::UnicodeString(); + auto u8word = string(); + auto is_utf8 = ucnv_getType(ucnv) == UCNV_UTF8; + + while (getline(in, line)) { + wrong_words.clear(); + if (is_utf8) + segment_line_utf8(mode, dic, line, utext.getAlias(), + ubrkiter.get(), uerr, suggestions, + wrong_words, out); + else + segment_line_generic(mode, dic, line, uline, ucnv, + ubrkiter.get(), uerr, u8word, + suggestions, wrong_words, out); } - return out; } -} // namespace std int main(int argc, char* argv[]) { @@ -432,30 +593,6 @@ ios_base::sync_with_stdio(false); auto args = Args_t(argc, argv); - if (args.mode == ERROR_MODE) { - cerr << "Invalid (combination of) arguments, try '" - << args.program_name << " --help' for more information\n"; - return 1; - } - boost::locale::generator gen; - auto loc = std::locale(); - try { - if (args.encoding.empty()) - loc = gen(""); - else - loc = gen("en_US." + args.encoding); - } - catch (const boost::locale::conv::invalid_charset_error& e) { - cerr << e.what() << '\n'; -#ifdef _POSIX_VERSION - cerr << "Nuspell error: see `locale -m` for supported " - "encodings.\n"; -#endif - return 1; - } - cin.imbue(loc); - cout.imbue(loc); - switch (args.mode) { case HELP_MODE: print_help(args.program_name); @@ -463,26 +600,64 @@ case VERSION_MODE: print_version(); return 0; + case ERROR_MODE: + cerr << "Invalid (combination of) arguments, try '" + << args.program_name << " --help' for more information\n"; + return 1; default: break; } - clog << "INFO: I/O locale " << loc << '\n'; - auto f = Dict_Finder_For_CLI_Tool(); - if (args.mode == LIST_DICTIONARIES_MODE) { list_dictionaries(f); return 0; } + char* loc_str = nullptr; +#ifdef _WIN32 + loc_str = setlocale(LC_CTYPE, nullptr); // will return "C" + + /* On Windows, the console is a buggy thing. If the default C locale is + active, then the encoding of the strings gotten from C or C++ stdio + (fgets, scanf, cin) is GetConsoleCP(). Stdout accessed via standard + functions (printf, cout) expects encoding of GetConsoleOutputCP() which + is the same as GetConsoleCP() unless manually changed. By default both + are the active OEM encoding, unless changed with the command chcp, or by + calling the Set functions. + + If we call setlocale(LC_CTYPE, ""), or let's say setlocale(LC_CTYPE, + ".1251"), then stdin will still return in the encoding GetConsoleCP(), + but stdout functions like printf now will expect a different encoding, + the one set via setlocale. Because of this mess don't change locale with + setlocale on Windows. + + When stdin or stout are redirected from/to file or another terminal like + the one in MSYS2, they are read/written as-is. Then we will assume UTF-8 + encoding. 
*/ +#else + loc_str = setlocale(LC_CTYPE, ""); + if (!loc_str) { + clog << "WARNING: Invalid locale string, fall back to \"C\".\n"; + loc_str = setlocale(LC_CTYPE, nullptr); // will return "C" + } +#endif + auto loc_str_sv = string_view(loc_str); + if (args.encoding.empty()) { +#if _POSIX_VERSION + auto enc_str = nl_langinfo(CODESET); + args.encoding = enc_str; +#elif _WIN32 + if (_isatty(_fileno(stdin)) || _isatty(_fileno(stdout))) + args.encoding = "cp" + to_string(GetConsoleCP()); + else + args.encoding = "UTF-8"; +#endif + } + clog << "INFO: Locale LC_CTYPE=" << loc_str_sv + << ", Used encoding=" << args.encoding << '\n'; if (args.dictionary.empty()) { // infer dictionary from locale - auto& info = use_facet<boost::locale::info>(loc); - args.dictionary = info.language(); - auto c = info.country(); - if (!c.empty()) { - args.dictionary += '_'; - args.dictionary += c; - } + auto idx = min(loc_str_sv.find('.'), loc_str_sv.find('@')); + args.dictionary = loc_str_sv.substr(0, idx); } if (args.dictionary.empty()) { cerr << "No dictionary provided and can not infer from OS " @@ -502,13 +677,31 @@ cerr << e.what() << '\n'; return 1; } - if (!use_facet<boost::locale::info>(loc).utf8()) - dic.imbue(loc); + // ICU reports all types of errors, logic errors and runtime errors + // using this enum. We should not check for logic errors, they should + // not happend. Optionally, only assert that they are not there can be + // used. We should check for runtime errors. + // The encoding conversion is a common case where runtime error can + // happen, but by default ICU uses Unicode replacement character on + // errors and reprots success. This can be changed, but there is no need + // for that. + auto uerr = U_ZERO_ERROR; + auto enc_cstr = args.encoding.c_str(); + if (args.encoding.empty()) { + enc_cstr = nullptr; + clog << "WARNING: using default ICU encoding converter for IO" + << endl; + } + auto ucnv = icu::LocalUConverterPointer(ucnv_open(enc_cstr, &uerr)); + if (U_FAILURE(uerr)) { + cerr << "ERROR: Invalid encoding " << args.encoding << ".\n"; + return 1; + } auto loop_function = unicode_segentation_loop; if (args.whitespace_segmentation) loop_function = whitespace_segmentation_loop; if (args.files.empty()) { - loop_function(cin, cout, dic, args.mode); + loop_function(cin, cout, dic, args.mode, ucnv.getAlias(), uerr); } else { for (auto& file_name : args.files) { @@ -517,8 +710,8 @@ cerr << "Can't open " << file_name << '\n'; return 1; } - in.imbue(loc); - loop_function(in, cout, dic, args.mode); + loop_function(in, cout, dic, args.mode, ucnv.getAlias(), + uerr); } } return 0; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/nuspell-4.1.0/tests/CMakeLists.txt new/nuspell-4.2.0/tests/CMakeLists.txt --- old/nuspell-4.1.0/tests/CMakeLists.txt 2020-11-19 11:56:49.000000000 +0100 +++ new/nuspell-4.2.0/tests/CMakeLists.txt 2020-12-12 23:40:04.000000000 +0100 @@ -4,7 +4,7 @@ structures_test.cxx utils_test.cxx catch_main.cxx) -target_link_libraries(unit_test nuspell Catch2::Catch2 Boost::locale) +target_link_libraries(unit_test nuspell Catch2::Catch2) if (MSVC) target_compile_options(unit_test PRIVATE "/utf-8") # Consider doing this for all the other targets by setting this flag @@ -15,7 +15,7 @@ target_link_libraries(legacy_test nuspell) add_executable(verify verify.cxx) -target_link_libraries(verify nuspell hunspell Boost::locale) +target_link_libraries(verify nuspell hunspell) if (BUILD_SHARED_LIBS AND WIN32) add_custom_command(TARGET verify PRE_LINK diff 
-urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/nuspell-4.1.0/tests/utils_test.cxx new/nuspell-4.2.0/tests/utils_test.cxx --- old/nuspell-4.1.0/tests/utils_test.cxx 2020-11-19 11:56:49.000000000 +0100 +++ new/nuspell-4.2.0/tests/utils_test.cxx 2020-12-12 23:40:04.000000000 +0100 @@ -18,7 +18,6 @@ #include <nuspell/utils.hxx> -#include <boost/locale.hpp> #include <catch2/catch.hpp> using namespace std; @@ -118,44 +117,19 @@ TEST_CASE("to_wide", "[locale_utils]") { - auto gen = boost::locale::generator(); - gen.characters(boost::locale::wchar_t_facet); - gen.categories(boost::locale::codepage_facet); - auto loc = gen("en_US.UTF-8"); - auto in = string("\U0010FFFF ??"); - CHECK(L"\U0010FFFF ??" == to_wide(in, loc)); - - in = "\U00011D59\U00011D59\U00011D59\U00011D59\U00011D59"; - auto out = wstring(); - auto exp = L"\U00011D59\U00011D59\U00011D59\U00011D59\U00011D59"; - CHECK(true == to_wide(in, loc, out)); - CHECK(exp == out); - - loc = locale(locale::classic(), new latin1_codecvt()); - in = "abcd\xDF"; + auto loc = locale(locale::classic(), new latin1_codecvt()); + auto in = "abcd\xDF"; CHECK(L"abcd??" == to_wide(in, loc)); } TEST_CASE("to_narrow", "[locale_utils]") { - auto gen = boost::locale::generator(); - gen.characters(boost::locale::wchar_t_facet); - gen.categories(boost::locale::codepage_facet); - auto loc = gen("en_US.UTF-8"); - auto in = wstring(L"\U0010FFFF ??"); - CHECK("\U0010FFFF ??" == to_narrow(in, loc)); - - in = L"\U00011D59\U00011D59\U00011D59\U00011D59\U00011D59"; - auto out = string(); - CHECK(true == to_narrow(in, out, loc)); - CHECK("\U00011D59\U00011D59\U00011D59\U00011D59\U00011D59" == out); - - loc = locale(locale::classic(), new latin1_codecvt()); - in = L"abcd??"; + auto loc = locale(locale::classic(), new latin1_codecvt()); + auto in = L"abcd??"; CHECK("abcd\xDF" == to_narrow(in, loc)); in = L"\U00011D59\U00011D59\U00011D59\U00011D59\U00011D59"; - out = string(); + auto out = string(); CHECK(false == to_narrow(in, out, loc)); CHECK(all_of(begin(out), end(out), [](auto c) { return c == '?'; })); } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/nuspell-4.1.0/tests/verify.cxx new/nuspell-4.2.0/tests/verify.cxx --- old/nuspell-4.1.0/tests/verify.cxx 2020-11-19 11:56:49.000000000 +0100 +++ new/nuspell-4.2.0/tests/verify.cxx 2020-12-12 23:40:04.000000000 +0100 @@ -16,16 +16,15 @@ * along with Nuspell. If not, see <http://www.gnu.org/licenses/>. 
*/ -#include <boost/locale.hpp> #include <hunspell/hunspell.hxx> #include <nuspell/dictionary.hxx> #include <nuspell/finder.hxx> -#include <nuspell/utils.hxx> #include <chrono> #include <fstream> #include <iomanip> #include <iostream> +#include <unicode/ucnv.h> #if defined(__MINGW32__) || defined(__unix__) || defined(__unix) || \ (defined(__APPLE__) && defined(__MACH__)) @@ -33,6 +32,7 @@ #include <unistd.h> #endif #ifdef _POSIX_VERSION +#include <langinfo.h> #include <sys/resource.h> #include <sys/time.h> #endif @@ -205,31 +205,84 @@ #endif } -auto normal_loop(istream& in, ostream& out, Dictionary& dic, Hunspell& hun, - locale& hloc, bool print_false = false, bool test_sugs = false) +auto to_utf8(string_view source, string& dest, UConverter* ucnv, + UErrorCode& uerr) { + dest.resize(dest.capacity()); + auto len = ucnv_toAlgorithmic(UCNV_UTF8, ucnv, dest.data(), dest.size(), + source.data(), source.size(), &uerr); + dest.resize(len); + if (uerr == U_BUFFER_OVERFLOW_ERROR) { + uerr = U_ZERO_ERROR; + ucnv_toAlgorithmic(UCNV_UTF8, ucnv, dest.data(), dest.size(), + source.data(), source.size(), &uerr); + } +} + +auto from_utf8(string_view source, string& dest, UConverter* ucnv, + UErrorCode& uerr) +{ + dest.resize(dest.capacity()); + auto len = + ucnv_fromAlgorithmic(ucnv, UCNV_UTF8, dest.data(), dest.size(), + source.data(), source.size(), &uerr); + dest.resize(len); + if (uerr == U_BUFFER_OVERFLOW_ERROR) { + uerr = U_ZERO_ERROR; + ucnv_fromAlgorithmic(ucnv, UCNV_UTF8, dest.data(), dest.size(), + source.data(), source.size(), &uerr); + } +} + +auto normal_loop(const Args_t& args, const Dictionary& dic, Hunspell& hun, + istream& in, ostream& out) +{ + auto print_false = args.print_false; + auto test_sugs = args.sugs; auto word = string(); - auto wide_word = wstring(); - auto narrow_word = string(); - // total number of words + auto u8_buffer = string(); + auto hun_word = string(); auto total = 0; - // total number of words with identical spelling correctness auto true_pos = 0; auto true_neg = 0; auto false_pos = 0; auto false_neg = 0; - // store cpu time for Hunspell and Nuspell auto duration_hun = chrono::high_resolution_clock::duration(); auto duration_nu = duration_hun; auto in_loc = in.getloc(); + + auto uerr = U_ZERO_ERROR; + auto io_cnv = icu::LocalUConverterPointer( + ucnv_open(args.encoding.c_str(), &uerr)); + if (U_FAILURE(uerr)) + throw runtime_error("Invalid io encoding"); + auto hun_enc = + nuspell::Encoding(hun.get_dict_encoding()).value_or_default(); + auto hun_cnv = + icu::LocalUConverterPointer(ucnv_open(hun_enc.c_str(), &uerr)); + if (U_FAILURE(uerr)) + throw runtime_error("Invalid hun encoding"); + auto io_is_utf8 = ucnv_getType(io_cnv.getAlias()) == UCNV_UTF8; + auto hun_is_utf8 = ucnv_getType(hun_cnv.getAlias()) == UCNV_UTF8; + // need to take entine line here, not `in >> word` while (getline(in, word)) { + auto u8_word = string_view(); auto tick_a = chrono::high_resolution_clock::now(); - auto res_nu = dic.spell(word); + if (io_is_utf8) { + u8_word = word; + } + else { + to_utf8(word, u8_buffer, io_cnv.getAlias(), uerr); + u8_word = u8_buffer; + } + auto res_nu = dic.spell(u8_word); auto tick_b = chrono::high_resolution_clock::now(); - to_wide(word, in_loc, wide_word); - to_narrow(wide_word, narrow_word, hloc); - auto res_hun = hun.spell(narrow_word); + if (hun_is_utf8) + hun_word = u8_word; + else + from_utf8(u8_word, hun_word, hun_cnv.getAlias(), uerr); + auto res_hun = hun.spell(hun_word); auto tick_c = chrono::high_resolution_clock::now(); duration_nu += tick_b - 
tick_a; duration_hun += tick_c - tick_b; @@ -260,7 +313,7 @@ auto nus_sugs = vector<string>(); auto hun_sugs = vector<string>(); dic.suggest(word, nus_sugs); - hun.suggest(narrow_word); + hun.suggest(hun_word); } } out << "Total Words " << total << '\n'; @@ -281,51 +334,12 @@ out << "Speedup Rate " << speedup << '\n'; } -namespace std { -ostream& operator<<(ostream& out, const locale& loc) -{ - if (has_facet<boost::locale::info>(loc)) { - auto& f = use_facet<boost::locale::info>(loc); - out << "name=" << f.name() << ", lang=" << f.language() - << ", country=" << f.country() << ", enc=" << f.encoding(); - } - else { - out << loc.name(); - } - return out; -} -} // namespace std - int main(int argc, char* argv[]) { // May speed up I/O. After this, don't use C printf, scanf etc. ios_base::sync_with_stdio(false); auto args = Args_t(argc, argv); - if (args.mode == ERROR_MODE) { - cerr << "Invalid (combination of) arguments, try '" - << args.program_name << " --help' for more information\n"; - return 1; - } - boost::locale::generator gen; - auto loc = std::locale(); - try { - if (args.encoding.empty()) - loc = gen(""); - - else - loc = gen("en_US." + args.encoding); - } - catch (const boost::locale::conv::invalid_charset_error& e) { - cerr << e.what() << '\n'; -#ifdef _POSIX_VERSION - cerr << "Nuspell error: see `locale -m` for supported " - "encodings.\n"; -#endif - return 1; - } - cin.imbue(loc); - cout.imbue(loc); switch (args.mode) { case HELP_MODE: @@ -334,22 +348,34 @@ case VERSION_MODE: print_version(); return 0; + case ERROR_MODE: + cerr << "Invalid (combination of) arguments, try '" + << args.program_name << " --help' for more information\n"; + return 1; default: break; } - clog << "INFO: I/O locale " << loc << '\n'; - auto f = Dict_Finder_For_CLI_Tool(); + auto loc_str = setlocale(LC_CTYPE, ""); + if (!loc_str) { + clog << "WARNING: Invalid locale string, fall back to \"C\".\n"; + loc_str = setlocale(LC_CTYPE, nullptr); // will return "C" + } + auto loc_str_sv = string_view(loc_str); + if (args.encoding.empty()) { +#if _POSIX_VERSION + auto enc_str = nl_langinfo(CODESET); + args.encoding = enc_str; +#elif _WIN32 +#endif + } + clog << "INFO: Locale LC_CTYPE=" << loc_str_sv + << ", Used encoding=" << args.encoding << '\n'; if (args.dictionary.empty()) { // infer dictionary from locale - auto& info = use_facet<boost::locale::info>(loc); - args.dictionary = info.language(); - auto c = info.country(); - if (!c.empty()) { - args.dictionary += '_'; - args.dictionary += c; - } + auto idx = min(loc_str_sv.find('.'), loc_str_sv.find('@')); + args.dictionary = loc_str_sv.substr(0, idx); } if (args.dictionary.empty()) { cerr << "No dictionary provided and can not infer from OS " @@ -361,8 +387,8 @@ return 1; } clog << "INFO: Pointed dictionary " << filename << ".{dic,aff}\n"; - auto dic = Dictionary(); auto peak_ram_a = get_peak_ram_usage(); + auto dic = Dictionary(); try { dic = Dictionary::load_from_path(filename); } @@ -370,22 +396,16 @@ cerr << e.what() << '\n'; return 1; } - if (!use_facet<boost::locale::info>(loc).utf8()) - dic.imbue(loc); auto nuspell_ram = get_peak_ram_usage() - peak_ram_a; - auto aff_name = filename + ".aff"; auto dic_name = filename + ".dic"; peak_ram_a = get_peak_ram_usage(); Hunspell hun(aff_name.c_str(), dic_name.c_str()); auto hunspell_ram = get_peak_ram_usage() - peak_ram_a; - auto hun_loc = gen( - "en_US." 
+ Encoding(hun.get_dict_encoding()).value_or_default()); cout << "Nuspell peak RAM usage: " << nuspell_ram << "kB\n" << "Hunspell peak RAM usage: " << hunspell_ram << "kB\n"; if (args.files.empty()) { - normal_loop(cin, cout, dic, hun, hun_loc, args.print_false, - args.sugs); + normal_loop(args, dic, hun, cin, cout); } else { for (auto& file_name : args.files) { @@ -395,8 +415,7 @@ return 1; } in.imbue(cin.getloc()); - normal_loop(in, cout, dic, hun, hun_loc, - args.print_false); + normal_loop(args, dic, hun, in, cout); } } return 0;
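
For downstream users adjusting to this release: with Dictionary::imbue() and
Dictionary::imbue_utf8() deprecated, the refactored CLI above converts any
non-UTF-8 input to UTF-8 with ICU before calling the library. Below is a
minimal, self-contained sketch of that pattern, not part of Nuspell's public
API: the to_utf8() helper mirrors the one added to src/nuspell/main.cxx, while
the "ISO-8859-1" converter name, the sample word, and the dictionary path
/usr/share/hunspell/en_US are placeholders chosen for illustration only.

    // Sketch: spell-check Latin-1 input against a UTF-8-only Nuspell dictionary.
    #include <nuspell/dictionary.hxx>
    #include <unicode/ucnv.h>

    #include <iostream>
    #include <string>
    #include <string_view>
    #include <vector>

    // Convert `source` (in the encoding of `ucnv`) to UTF-8. Two-pass pattern:
    // if the first conversion overflows the buffer, resize to the reported
    // length and convert again, as in the helper introduced in main.cxx.
    static void to_utf8(std::string_view source, std::string& dest,
                        UConverter* ucnv, UErrorCode& uerr)
    {
            dest.resize(dest.capacity());
            auto len = ucnv_toAlgorithmic(UCNV_UTF8, ucnv, dest.data(),
                                          dest.size(), source.data(),
                                          source.size(), &uerr);
            dest.resize(len);
            if (uerr == U_BUFFER_OVERFLOW_ERROR) {
                    uerr = U_ZERO_ERROR;
                    ucnv_toAlgorithmic(UCNV_UTF8, ucnv, dest.data(),
                                       dest.size(), source.data(),
                                       source.size(), &uerr);
            }
    }

    int main()
    {
            auto uerr = U_ZERO_ERROR;
            // Converter for the legacy input encoding (placeholder name).
            icu::LocalUConverterPointer cnv(ucnv_open("ISO-8859-1", &uerr));
            if (U_FAILURE(uerr))
                    return 1;

            // load_from_path() may throw on a missing dictionary; error
            // handling is omitted to keep the sketch short.
            auto dic = nuspell::Dictionary::load_from_path(
                "/usr/share/hunspell/en_US");

            auto latin1_word = std::string("caf\xE9"); // "café" in Latin-1
            auto utf8_word = std::string();
            to_utf8(latin1_word, utf8_word, cnv.getAlias(), uerr);

            if (dic.spell(utf8_word)) {
                    std::cout << "correct\n";
            }
            else {
                    auto sugs = std::vector<std::string>();
                    dic.suggest(utf8_word, sugs);
                    std::cout << sugs.size() << " suggestions\n";
            }
            return 0;
    }

Building the sketch needs ICU4C and the installed nuspell headers/library; a
plausible (distribution-dependent) command line is
g++ -std=c++17 example.cxx $(pkg-config --cflags --libs nuspell icu-uc), where
the pkg-config module names are assumptions and may differ on your system.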