Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package lttoolbox for openSUSE:Factory checked in at 2023-12-28 23:03:08 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/lttoolbox (Old) and /work/SRC/openSUSE:Factory/.lttoolbox.new.28375 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "lttoolbox" Thu Dec 28 23:03:08 2023 rev:6 rq:1135408 version:3.7.6 Changes: -------- --- /work/SRC/openSUSE:Factory/lttoolbox/lttoolbox.changes 2022-11-01 13:43:48.440291591 +0100 +++ /work/SRC/openSUSE:Factory/.lttoolbox.new.28375/lttoolbox.changes 2023-12-28 23:04:51.581890035 +0100 @@ -1,0 +2,6 @@ +Thu Dec 28 02:43:38 UTC 2023 - Jan Engelhardt <jeng...@inai.de> + +- Update to release 3.7.6 + * Add option to set compound_max_elements in lt-proc + +------------------------------------------------------------------- Old: ---- v3.7.1.tar.gz New: ---- v3.7.6.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ lttoolbox.spec ++++++ --- /var/tmp/diff_new_pack.JwFSkN/_old 2023-12-28 23:04:52.005905531 +0100 +++ /var/tmp/diff_new_pack.JwFSkN/_new 2023-12-28 23:04:52.005905531 +0100 @@ -1,7 +1,7 @@ # # spec file for package lttoolbox # -# Copyright (c) 2022 SUSE LLC +# Copyright (c) 2023 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -19,7 +19,7 @@ Name: lttoolbox %define lname liblttoolbox3 Summary: Toolbox for lexical processing and morphological analysis -Version: 3.7.1 +Version: 3.7.6 Release: 0 License: GPL-2.0-or-later Group: Productivity/Scientific/Other ++++++ v3.7.1.tar.gz -> v3.7.6.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/CMakeLists.txt new/lttoolbox-3.7.6/CMakeLists.txt --- old/lttoolbox-3.7.1/CMakeLists.txt 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100 @@ -1,149 +0,0 @@ -cmake_minimum_required(VERSION 3.0 FATAL_ERROR) -cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}) -project(lttoolbox - VERSION 3.7.0 - LANGUAGES CXX C - ) -set(VERSION ${PROJECT_VERSION}) -set(VERSION_ABI 3) -set(PACKAGE_BUGREPORT "apertium-st...@lists.sourceforge.net") - -add_definitions("-DPACKAGE_VERSION=\"${PROJECT_VERSION}\"") - -set(MASTER_PROJECT OFF) -if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) - set(MASTER_PROJECT ON) -endif () - -# Release or Debug -if(MASTER_PROJECT AND NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release") -endif() - -set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(CMAKE_MACOSX_RPATH ON) - -include(GNUInstallDirs) - -option(BUILD_SHARED_LIBS "Set to OFF to use static library" ON) -option(BUILD_TESTING "Set to OFF to disable tests" ON) -option(ENABLE_PYTHON_BINDINGS "Set to ON to build the Python wrapper" OFF) - -if(MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8 /std:c++latest /Zc:__cplusplus /permissive- /W4 /MP") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /O2") - set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG") - set(CMAKE_C_FLAGS ${CMAKE_CXX_FLAGS}) - set(CMAKE_C_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) -else() - set(_FLAGS_COMMON "-Wall -Wextra -Wno-missing-field-initializers -Wno-deprecated -Wno-unused-parameter -fPIC") - - include(CheckCCompilerFlag) - include(CheckCXXCompilerFlag) - - foreach(flag "-Wno-unused-result" "-flto") - string(REGEX REPLACE "[^A-Za-z0-9]" "-" _flag ${flag}) - CHECK_CXX_COMPILER_FLAG(${flag} COMPILER_SUPPORTS_${_flag}) - if(COMPILER_SUPPORTS_${_flag}) - set(_FLAGS_COMMON "${_FLAGS_COMMON} ${flag}") - endif() - endforeach() - if(COMPILER_SUPPORTS_flto) - set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} -flto") - endif() - - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_FLAGS_COMMON} -fvisibility-inlines-hidden") - - # Enable latest possible C standard - foreach(flag "-std=c2x" "-std=c11" "-std=c1x" "-std=c99") - string(REGEX REPLACE "[^a-z0-9]" "-" _flag ${flag}) - CHECK_C_COMPILER_FLAG(${flag} COMPILER_SUPPORTS_${_flag}) - if(COMPILER_SUPPORTS_${_flag}) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${flag}") - break() - endif() - endforeach() - - # Require latest possible C++ standard - foreach(flag "-std=c++23" "-std=c++2b" "-std=c++20" "-std=c++2a" "-std=c++17") - string(REGEX REPLACE "[^a-z0-9]" "-" _flag ${flag}) - CHECK_CXX_COMPILER_FLAG(${flag} COMPILER_SUPPORTS_${_flag}) - if(COMPILER_SUPPORTS_${_flag}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${flag}") - set(_ENABLED_CXX ${flag}) - break() - endif() - endforeach() - if(NOT _ENABLED_CXX) - message(FATAL_ERROR "Could not enable at least C++17 - upgrade your compiler") - endif() - - # Generate pkg-config file - set(prefix ${CMAKE_INSTALL_PREFIX}) - set(exec_prefix "\${prefix}") - set(libdir "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}") - set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") - configure_file(lttoolbox.pc.in lttoolbox.pc @ONLY) - install(FILES "${CMAKE_CURRENT_BINARY_DIR}/lttoolbox.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") -endif() - -try_compile(SIZET_NOT_CSTDINT ${CMAKE_CURRENT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/lttoolbox/check-cstdint.cc) -if(SIZET_NOT_CSTDINT) - add_definitions(-DSIZET_NOT_CSTDINT) -endif() - -find_package(LibXml2 REQUIRED) -include_directories(${LIBXML2_INCLUDE_DIR}) - -if(WIN32) - add_definitions(-D_SECURE_SCL=0 -D_ITERATOR_DEBUG_LEVEL=0 -D_CRT_SECURE_NO_DEPRECATE -DWIN32_LEAN_AND_MEAN -DVC_EXTRALEAN -DNOMINMAX) - add_definitions(-DSTDC_HEADERS -DREGEX_MALLOC) - include_directories("lttoolbox/win32") -else() - add_definitions(-D_POSIX_C_SOURCE=200112 -D_GNU_SOURCE) -endif() - -if(NOT APPLE) - find_package(Threads REQUIRED) -endif() - -# Unlocked I/O functions -include(CheckSymbolExists) -set(CMAKE_REQUIRED_DEFINITIONS -D_POSIX_C_SOURCE=200112 -D_GNU_SOURCE) -foreach(func fread_unlocked fwrite_unlocked fgetc_unlocked fputc_unlocked fputs_unlocked) - string(TOUPPER ${func} _uc) - CHECK_SYMBOL_EXISTS(${func} "stdio.h" HAVE_DECL_${_uc}) - if(HAVE_DECL_${_uc}) - add_definitions(-DHAVE_DECL_${_uc}) - endif() -endforeach() -unset(CMAKE_REQUIRED_DEFINITIONS) - -# getopt -find_path(GETOPT_INCLUDE getopt.h) -include_directories(${GETOPT_INCLUDE}) -if(VCPKG_TOOLCHAIN) - find_library(GETOPT_LIB NAMES getopt) - add_definitions(-DHAVE_GETOPT_LONG) -else() - set(GETOPT_LIB) -endif() - -# ICU -find_package(ICU COMPONENTS i18n io uc REQUIRED) - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -if(BUILD_TESTING) - enable_testing() - find_package(PythonInterp 3.5 REQUIRED) - set(ENV{CTEST_OUTPUT_ON_FAILURE} 1) - set(CMAKE_CTEST_ARGUMENTS "-VV") -endif() - -add_subdirectory(lttoolbox) - -if(ENABLE_PYTHON_BINDINGS) - add_subdirectory(python) -endif() diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/cmake.sh new/lttoolbox-3.7.6/cmake.sh --- old/lttoolbox-3.7.1/cmake.sh 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/cmake.sh 1970-01-01 01:00:00.000000000 +0100 @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -set -e -args=() - -while [[ $# > 0 ]]; -do - case "$1" in - --prefix) - args+=("-DCMAKE_INSTALL_PREFIX=$2") - shift 2 - ;; - --prefix=*) - args+=("-DCMAKE_INSTALL_PREFIX=${1#*=}") - shift - ;; - *) - args+=("$1") - shift - ;; - esac -done - -set -- "${args[@]}" - -echo "- rm -rf CMake caches" -rm -rf install_manifest.txt CMakeCache.txt *.cmake CMakeFiles lttoolbox/CMakeFiles lttoolbox/*.cmake _CPack_Packages Testing -echo "- cmake " "$@" "." -cmake "$@" . -echo "- You may now perform: make -j3" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/configure.ac new/lttoolbox-3.7.6/configure.ac --- old/lttoolbox-3.7.1/configure.ac 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/configure.ac 2023-12-27 21:15:14.000000000 +0100 @@ -2,7 +2,7 @@ m4_define([PKG_VERSION_MAJOR], [3]) m4_define([PKG_VERSION_MINOR], [7]) -m4_define([PKG_VERSION_PATCH], [1]) +m4_define([PKG_VERSION_PATCH], [6]) # Bump if the ABI (not API) changed in a backwards-incompatible manner m4_define([PKG_VERSION_ABI], [3]) @@ -61,8 +61,7 @@ # Checks for library functions. AC_FUNC_ERROR_AT_LINE -AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, \ -fputc_unlocked, fputs_unlocked]) +AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked, fmemopen]) AC_CHECK_FUNCS([setlocale strdup getopt_long]) @@ -74,7 +73,7 @@ version_flag="-std=c++${version}" AX_CHECK_COMPILE_FLAG([${version_flag}], [break], [version_flag=none]) done -AS_IF([test "$version_flag" == none], [ +AS_IF([test "$version_flag" = none], [ AC_MSG_ERROR([Could not enable at least C++17 - upgrade your compiler]) ]) CXXFLAGS="$CXXFLAGS ${version_flag}" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/CMakeLists.txt new/lttoolbox-3.7.6/lttoolbox/CMakeLists.txt --- old/lttoolbox-3.7.1/lttoolbox/CMakeLists.txt 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100 @@ -1,128 +0,0 @@ -set(LIBLTTOOLBOX_HEADERS - acx.h - alphabet.h - att_compiler.h - buffer.h - cli.h - compiler.h - compression.h - deserialiser.h - entry_token.h - exception.h - expander.h - file_utils.h - fst_processor.h - input_file.h - lt_locale.h - match_exe.h - match_node.h - match_state.h - my_stdio.h - node.h - pattern_list.h - regexp_compiler.h - serialiser.h - sorted_vector.h - sorted_vector.hpp - state.h - string_utils.h - tmx_compiler.h - transducer.h - trans_exe.h - ustring.h - xml_parse_util.h - xml_walk_util.h - ) -set(LIBLTTOOLBOX_SOURCES - acx.cc - alphabet.cc - att_compiler.cc - cli.cc - compiler.cc - compression.cc - entry_token.cc - expander.cc - file_utils.cc - fst_processor.cc - input_file.cc - lt_locale.cc - match_exe.cc - match_node.cc - match_state.cc - node.cc - pattern_list.cc - regexp_compiler.cc - sorted_vector.cc - state.cc - string_utils.cc - tmx_compiler.cc - transducer.cc - trans_exe.cc - ustring.cc - xml_parse_util.cc - xml_walk_util.cc - ${LIBLTTOOLBOX_HEADERS} - ) -if(WIN32) - set(LIBLTTOOLBOX_SOURCES - win32/libgen.c - win32/libgen.h - win32/regex.c - win32/regex.h - win32/unistd.h - ${LIBLTTOOLBOX_SOURCES} - ) - if(NOT VCPKG_TOOLCHAIN) - set(LIBLTTOOLBOX_SOURCES - win32/getopt.c - win32/getopt.h - ${LIBLTTOOLBOX_SOURCES} - ) - endif() -else() - set(GETOPT) -endif() - -add_library(lttoolbox ${LIBLTTOOLBOX_SOURCES}) -target_compile_definitions(lttoolbox PRIVATE LTTOOLBOX_EXPORTS) -set_target_properties(lttoolbox PROPERTIES SOVERSION ${VERSION_ABI}) -target_link_libraries(lttoolbox ${LIBXML2_LIBRARIES} ${ICU_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) - -add_executable(lt-print lt_print.cc) -target_link_libraries(lt-print lttoolbox ${GETOPT_LIB}) - -add_executable(lt-trim lt_trim.cc) -target_link_libraries(lt-trim lttoolbox ${GETOPT_LIB}) - -add_executable(lt-comp lt_comp.cc) -target_link_libraries(lt-comp lttoolbox ${GETOPT_LIB}) - -add_executable(lt-proc lt_proc.cc) -target_link_libraries(lt-proc lttoolbox ${GETOPT_LIB}) - -add_executable(lt-expand lt_expand.cc) -target_link_libraries(lt-expand lttoolbox ${GETOPT_LIB}) - -add_executable(lt-tmxcomp lt_tmxcomp.cc) -target_link_libraries(lt-tmxcomp lttoolbox ${GETOPT_LIB}) - -add_executable(lt-tmxproc lt_tmxproc.cc) -target_link_libraries(lt-tmxproc lttoolbox ${GETOPT_LIB}) - -if(BUILD_TESTING) - add_test(NAME tests COMMAND ${PYTHON_EXECUTABLE} "${CMAKE_SOURCE_DIR}/tests/run_tests.py" $<TARGET_FILE_DIR:lt-comp>) - set_tests_properties(tests PROPERTIES FAIL_REGULAR_EXPRESSION "FAILED") -endif() - -install(TARGETS lttoolbox - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) -install(FILES ${LIBLTTOOLBOX_HEADERS} - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/lttoolbox) -install(TARGETS lt-print lt-trim lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) - -install(FILES dix.dtd dix.rng dix.rnc acx.rng xsd/dix.xsd xsd/acx.xsd - DESTINATION ${CMAKE_INSTALL_DATADIR}/lttoolbox) - -install(FILES lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 - DESTINATION ${CMAKE_INSTALL_MANDIR}/man1) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/Makefile.am new/lttoolbox-3.7.6/lttoolbox/Makefile.am --- old/lttoolbox-3.7.1/lttoolbox/Makefile.am 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/Makefile.am 2023-12-27 21:15:14.000000000 +0100 @@ -2,13 +2,13 @@ h_sources = acx.h alphabet.h att_compiler.h buffer.h cli.h compiler.h compression.h \ deserialiser.h entry_token.h expander.h file_utils.h fst_processor.h input_file.h lt_locale.h \ match_exe.h match_node.h match_state.h my_stdio.h node.h \ - pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h \ + pattern_list.h regexp_compiler.h serialiser.h sorted_vector.h state.h string_utils.h symbol_iter.h \ transducer.h trans_exe.h xml_parse_util.h xml_walk_util.h exception.h tmx_compiler.h \ ustring.h sorted_vector.hpp cc_sources = acx.cc alphabet.cc att_compiler.cc cli.cc compiler.cc compression.cc entry_token.cc \ expander.cc file_utils.cc fst_processor.cc input_file.cc lt_locale.cc match_exe.cc \ match_node.cc match_state.cc node.cc pattern_list.cc \ - regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc transducer.cc \ + regexp_compiler.cc sorted_vector.cc state.cc string_utils.cc symbol_iter.cc transducer.cc \ trans_exe.cc xml_parse_util.cc xml_walk_util.cc tmx_compiler.cc ustring.cc library_includedir = $(includedir)/$(PACKAGE_NAME) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/acx.h new/lttoolbox-3.7.6/lttoolbox/acx.h --- old/lttoolbox-3.7.1/lttoolbox/acx.h 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/acx.h 2023-12-27 21:15:14.000000000 +0100 @@ -18,6 +18,7 @@ #define _ACXPARSEUTIL_ #include <lttoolbox/sorted_vector.hpp> +#include <cstdint> #include <map> std::map<int32_t, sorted_vector<int32_t>> readACX(const char* file); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/alphabet.cc new/lttoolbox-3.7.6/lttoolbox/alphabet.cc --- old/lttoolbox-3.7.1/lttoolbox/alphabet.cc 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/alphabet.cc 2023-12-27 21:15:14.000000000 +0100 @@ -19,6 +19,7 @@ #include <lttoolbox/my_stdio.h> #include <lttoolbox/serialiser.h> #include <lttoolbox/deserialiser.h> +#include <lttoolbox/symbol_iter.h> #include <cctype> #include <cstdlib> @@ -311,24 +312,9 @@ Alphabet::tokenize(UStringView str) const { std::vector<int32_t> ret; - size_t end = str.size(); - size_t i = 0; - UChar32 c; - while (i < end) { - U16_NEXT(str.data(), i, end, c); - if (c == '\\') { - } else if (c == '<') { - size_t j = i; - while (c != '>' && j < end) { - U16_NEXT(str.data(), j, end, c); - } - if (c == '>') { - ret.push_back(operator()(str.substr(i-1, j-i+1))); - i = j; - } - } else { - ret.push_back(static_cast<int32_t>(c)); - } + for (auto sym : symbol_iter(str)) { + if (sym.size() > 1) ret.push_back(operator()(sym)); + else ret.push_back(static_cast<int32_t>(sym[0])); } return ret; } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/cli.cc new/lttoolbox-3.7.6/lttoolbox/cli.cc --- old/lttoolbox-3.7.1/lttoolbox/cli.cc 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/cli.cc 2023-12-27 21:15:14.000000000 +0100 @@ -63,15 +63,15 @@ epilog = e; } -void CLI::print_usage() +void CLI::print_usage(std::ostream& out) { if (!prog_name.empty()) { - std::cout << prog_name; + out << prog_name; if (!version.empty()) { - std::cout << " v" << version; + out << " v" << version; } - std::cout << ": " << description << std::endl; - std::cout << "USAGE: " << prog_name; + out << ": " << description << std::endl; + out << "USAGE: " << prog_name; std::string bargs; std::string sargs; for (auto& it : options) { @@ -86,34 +86,34 @@ } } if (!bargs.empty()) { - std::cout << " [-" << bargs << "]"; + out << " [-" << bargs << "]"; } - std::cout << sargs; + out << sargs; int depth = 0; for (auto& it : file_args) { - std::cout << ' '; + out << ' '; if (it.second) { - std::cout << '['; + out << '['; depth += 1; } - std::cout << it.first; + out << it.first; } - while (depth-- > 0) std::cout << "]"; - std::cout << std::endl; + while (depth-- > 0) out << "]"; + out << std::endl; for (auto& it : options) { - std::cout << " -" << it.short_opt; + out << " -" << it.short_opt; #if HAVE_GETOPT_LONG - std::cout << ", --" << it.long_opt << ':'; + out << ", --" << it.long_opt << ':'; for (size_t i = it.long_opt.size(); i < 20; i++) { - std::cout << ' '; + out << ' '; } #else - std::cout << ": "; + out << ": "; #endif - std::cout << it.desc << std::endl; + out << it.desc << std::endl; } if (!epilog.empty()) { - std::cout << epilog << std::endl; + out << epilog << std::endl; } } exit(EXIT_FAILURE); @@ -162,8 +162,11 @@ break; } } - if (!found || cnt == 'h') { - print_usage(); + if (!found) { + print_usage(std::cerr); + } + else if (cnt == 'h') { + print_usage(std::cout); } } while (optind < argc) { diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/cli.h new/lttoolbox-3.7.6/lttoolbox/cli.h --- old/lttoolbox-3.7.1/lttoolbox/cli.h 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/cli.h 2023-12-27 21:15:14.000000000 +0100 @@ -18,6 +18,7 @@ #include <string> #include <vector> #include <map> +#include <iostream> class CLI { private: @@ -52,7 +53,7 @@ void add_bool_arg(char short_flag, std::string long_flag, std::string desc); void add_file_arg(std::string name, bool optional = true); void set_epilog(std::string e); - void print_usage(); + void print_usage(std::ostream& out = std::cerr); void parse_args(int argc, char* argv[]); std::map<std::string, std::vector<std::string>>& get_strs(); std::map<std::string, bool>& get_bools(); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/fst_processor.cc new/lttoolbox-3.7.6/lttoolbox/fst_processor.cc --- old/lttoolbox-3.7.1/lttoolbox/fst_processor.cc 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/fst_processor.cc 2023-12-27 21:15:14.000000000 +0100 @@ -20,6 +20,7 @@ #include <lttoolbox/xml_parse_util.h> #include <lttoolbox/file_utils.h> #include <lttoolbox/string_utils.h> +#include <lttoolbox/symbol_iter.h> #include <iostream> #include <cerrno> @@ -1806,10 +1807,38 @@ } } +bool +FSTProcessor::step_biltrans(UStringView word, UString& result, UString& queue, + bool delim, bool mark) +{ + State current_state = initial_state; + bool firstupper = u_isupper(word[0]); + bool uppercase = firstupper && u_isupper(word[1]); + for (auto symbol : symbol_iter(word)) { + int32_t val = (symbol.size() == 1 ? symbol[0] : alphabet(symbol)); + if (current_state.size() != 0) { + current_state.step(val, beCaseSensitive(current_state)); + } + if (current_state.isFinal(all_finals)) { + result.clear(); + if (delim) result += '^'; + if (mark) result += '='; + result += current_state.filterFinals(all_finals, alphabet, + escaped_chars, + displayWeightsMode, maxAnalyses, maxWeightClasses, + uppercase, firstupper, 0).substr(1); + } + if (current_state.size() == 0) { + if (!result.empty()) queue.append(symbol); + else return false; + } + } + return !result.empty(); +} + UString FSTProcessor::biltransfull(UStringView input_word, bool with_delim) { - State current_state = initial_state; UString result; unsigned int start_point = 1; unsigned int end_point = input_word.size()-2; @@ -1833,83 +1862,11 @@ mark = true; } - bool firstupper = u_isupper(input_word[start_point]); - bool uppercase = firstupper && u_isupper(input_word[start_point+1]); - - for(unsigned int i = start_point; i <= end_point; i++) - { - int val; - UString symbol; - - if(input_word[i] == '\\') - { - i++; - val = static_cast<int32_t>(input_word[i]); - } - else if(input_word[i] == '<') - { - symbol = '<'; - for(unsigned int j = i + 1; j <= end_point; j++) - { - symbol += input_word[j]; - if(input_word[j] == '>') - { - i = j; - break; - } - } - val = alphabet(symbol); - } - else - { - val = static_cast<int32_t>(input_word[i]); - } - if(current_state.size() != 0) - { - if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } - } - if(current_state.isFinal(all_finals)) - { - result.clear(); - if(with_delim) { - result += '^'; - } - if(mark) { - result += '='; - } - result += current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0).substr(1); - } - - if(current_state.size() == 0) - { - if(!symbol.empty() && !result.empty()) - { - queue.append(symbol); - } - else - { - // word is not present - if(with_delim) - { - result = "^@"_u + US(input_word.substr(1)); - } - else - { - result = "@"_u + US(input_word); - } - return result; - } - } + auto word = input_word.substr(start_point, end_point-start_point); + bool exists = step_biltrans(word, result, queue, with_delim, mark); + if (!exists) { + if (with_delim) return "^@"_u + US(input_word.substr(1)); + else return "@"_u + US(input_word); } if(start_point < (end_point - 3)) @@ -1920,27 +1877,7 @@ if(!queue.empty()) { - UString result_with_queue; - for(unsigned int i = 0, limit = result.size(); i != limit; i++) - { - switch(result[i]) - { - case '\\': - result_with_queue += '\\'; - i++; - break; - - case '/': - result_with_queue.append(queue); - break; - - default: - break; - } - result_with_queue += result[i]; - } - result_with_queue.append(queue); - + UString result_with_queue = compose(result, queue); if(with_delim) { result_with_queue += '$'; @@ -1986,110 +1923,18 @@ mark = true; } - bool firstupper = u_isupper(input_word[start_point]); - bool uppercase = firstupper && u_isupper(input_word[start_point+1]); - - for(unsigned int i = start_point; i <= end_point; i++) - { - int val; - UString symbol; - - if(input_word[i] == '\\') - { - i++; - val = static_cast<int32_t>(input_word[i]); - } - else if(input_word[i] == '<') - { - symbol = '<'; - for(unsigned int j = i + 1; j <= end_point; j++) - { - symbol += input_word[j]; - if(input_word[j] == '>') - { - i = j; - break; - } - } - val = alphabet(symbol); - } - else - { - val = static_cast<int32_t>(input_word[i]); - } - if(current_state.size() != 0) - { - if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } - } - if(current_state.isFinal(all_finals)) - { - result.clear(); - if (with_delim) { - result += '^'; - } - if (mark) { - result += '='; - } - result += current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0).substr(1); - } - - if(current_state.size() == 0) - { - if(!symbol.empty() && !result.empty()) - { - queue.append(symbol); - } - else - { - // word is not present - if(with_delim) - { - result = "^@"_u + US(input_word.substr(1)); - } - else - { - result = "@"_u + US(input_word); - } - return result; - } - } + UStringView word = input_word.substr(start_point, end_point-start_point); + bool exists = step_biltrans(word, result, queue, with_delim, mark); + if (!exists) { + if (with_delim) return "^@"_u + US(input_word.substr(1)); + else return "@"_u + US(input_word); } // attach unmatched queue automatically if(!queue.empty()) { - UString result_with_queue; - for(unsigned int i = 0, limit = result.size(); i != limit; i++) - { - switch(result[i]) - { - case '\\': - result_with_queue += '\\'; - i++; - break; - - case '/': - result_with_queue.append(queue); - break; - - default: - break; - } - result_with_queue += result[i]; - } - result_with_queue.append(queue); - + UString result_with_queue = compose(result, queue); if(with_delim) { result_with_queue += '$'; @@ -2345,45 +2190,18 @@ bool firstupper = u_isupper(input_word[start_point]); bool uppercase = firstupper && u_isupper(input_word[start_point+1]); - for(unsigned int i = start_point; i <= end_point; i++) - { - int val = 0; - UString symbol; - - if(input_word[i] == '\\') - { - i++; - val = input_word[i]; - } - else if(input_word[i] == '<') - { - seentags = true; - symbol = '<'; - for(unsigned int j = i + 1; j <= end_point; j++) - { - symbol += input_word[j]; - if(input_word[j] == '>') - { - i = j; - break; - } - } + UStringView word = input_word.substr(start_point, end_point-start_point); + for (auto symbol : symbol_iter(word)) { + int32_t val; + if (symbol.size() == 1) { + val = symbol[0]; + } else { val = alphabet(symbol); - } - else - { - val = input_word[i]; + seentags = true; } if(current_state.size() != 0) { - if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } + current_state.step_case(val, beCaseSensitive(current_state)); } if(current_state.isFinal(all_finals)) { @@ -2445,27 +2263,7 @@ if(!queue.empty()) { - UString result_with_queue; - for(unsigned int i = 0, limit = result.size(); i != limit; i++) - { - switch(result[i]) - { - case '\\': - result_with_queue += '\\'; - i++; - break; - - case '/': - result_with_queue.append(queue); - break; - - default: - break; - } - result_with_queue += result[i]; - } - result_with_queue.append(queue); - + UString result_with_queue = compose(result, queue); if(with_delim) { result_with_queue += '$'; @@ -2508,79 +2306,12 @@ mark = true; } - bool firstupper = u_isupper(input_word[start_point]); - bool uppercase = firstupper && u_isupper(input_word[start_point+1]); - - for(unsigned int i = start_point; i <= end_point; i++) - { - int val; - UString symbol; - - if(input_word[i] == '\\') - { - i++; - val = static_cast<int32_t>(input_word[i]); - } - else if(input_word[i] == '<') - { - symbol = '<'; - for(unsigned int j = i + 1; j <= end_point; j++) - { - symbol += input_word[j]; - if(input_word[j] == '>') - { - i = j; - break; - } - } - val = alphabet(symbol); - } - else - { - val = static_cast<int32_t>(input_word[i]); - } - if(current_state.size() != 0) - { - if(!alphabet.isTag(val) && u_isupper(val) && !beCaseSensitive(current_state)) - { - current_state.step(val, u_tolower(val)); - } - else - { - current_state.step(val); - } - } - if(current_state.isFinal(all_finals)) - { - result.clear(); - if (with_delim) { - result += '^'; - } - if (mark) { - result += '='; - } - result += current_state.filterFinals(all_finals, alphabet, - escaped_chars, - displayWeightsMode, maxAnalyses, maxWeightClasses, - uppercase, firstupper, 0).substr(1); - } - - if(current_state.size() == 0) - { - if(symbol.empty()) - { - // word is not present - if(with_delim) - { - result = "^@"_u + US(input_word.substr(1)); - } - else - { - result = "@"_u + US(input_word); - } - return result; - } - } + auto word = input_word.substr(start_point, end_point-start_point); + UString queue; + bool exists = step_biltrans(word, result, queue, with_delim, mark); + if (!exists || !queue.empty()) { + if (with_delim) return "^@"_u + US(input_word.substr(1)); + else return "@"_u + US(input_word); } if(with_delim) @@ -2896,6 +2627,12 @@ maxWeightClasses = value; } +void +FSTProcessor::setCompoundMaxElements(int value) +{ + compound_max_elements = value; +} + bool FSTProcessor::getDecompoundingMode() { diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/fst_processor.h new/lttoolbox-3.7.6/lttoolbox/fst_processor.h --- old/lttoolbox-3.7.1/lttoolbox/fst_processor.h 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/fst_processor.h 2023-12-27 21:15:14.000000000 +0100 @@ -430,6 +430,8 @@ void generation_wrapper_null_flush(InputFile& input, UFILE *output, GenerationMode mode); UString compose(UStringView lexforms, UStringView queue) const; + bool step_biltrans(UStringView word, UString& result, UString& queue, + bool delim, bool mark); void procNodeICX(); void procNodeRCX(); @@ -440,6 +442,7 @@ xmlTextReaderPtr reader; static constexpr size_t max_case_insensitive_state_size = 65536; + bool max_case_insensitive_state_size_warned = false; /* * Including lowercased versions for every character can potentially create very large states * (See https://github.com/apertium/lttoolbox/issues/167 ). As a sanity-check we don't do @@ -448,7 +451,20 @@ * @return running with --case-sensitive or state size exceeds max */ bool beCaseSensitive(const State& state) { - return caseSensitive || state.size() >= max_case_insensitive_state_size; + if(caseSensitive) { + return true; + } + else if(state.size() < max_case_insensitive_state_size) { + return false; // ie. do case-folding + } + else { + if(!max_case_insensitive_state_size_warned) { + max_case_insensitive_state_size_warned = true; // only warn once + UFILE* err_out = u_finit(stderr, NULL, NULL); + u_fprintf(err_out, "Warning: matching case-sensitively since processor state size >= %d\n", max_case_insensitive_state_size); + } + return true; + } } public: @@ -505,6 +521,7 @@ void setDisplayWeightsMode(bool value); void setMaxAnalysesValue(int value); void setMaxWeightClassesValue(int value); + void setCompoundMaxElements(int value); bool getNullFlush(); bool getDecompoundingMode(); }; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/input_file.cc new/lttoolbox-3.7.6/lttoolbox/input_file.cc --- old/lttoolbox-3.7.1/lttoolbox/input_file.cc 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/input_file.cc 2023-12-27 21:15:14.000000000 +0100 @@ -44,6 +44,16 @@ return (infile != nullptr); } +#if HAVE_DECL_FMEMOPEN +bool +InputFile::open_in_memory(char *input_buffer) +{ + close(); + infile = fmemopen(input_buffer, strlen(input_buffer), "rb"); + return (infile != nullptr); +} +#endif + void InputFile::open_or_exit(const char* fname) { diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/input_file.h new/lttoolbox-3.7.6/lttoolbox/input_file.h --- old/lttoolbox-3.7.1/lttoolbox/input_file.h 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/input_file.h 2023-12-27 21:15:14.000000000 +0100 @@ -34,6 +34,9 @@ InputFile(); ~InputFile(); bool open(const char* fname = nullptr); +#if HAVE_DECL_FMEMOPEN + bool open_in_memory(char* input_buffer); +#endif void open_or_exit(const char* fname = nullptr); void close(); void wrap(FILE* newinfile); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/lt_comp.cc new/lttoolbox-3.7.6/lttoolbox/lt_comp.cc --- old/lttoolbox-3.7.1/lttoolbox/lt_comp.cc 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/lt_comp.cc 2023-12-27 21:15:14.000000000 +0100 @@ -125,7 +125,7 @@ if(opc == "lr") { if (have_vl) { - std::cout << "Error: -l specified, but mode is lr" << std::endl; + std::cerr << "Error: -l specified, but mode is lr" << std::endl; cli.print_usage(); } if(ttype == 'a') @@ -144,7 +144,7 @@ else if(opc == "rl") { if (have_vr) { - std::cout << "Error: -r specified, but mode is rl" << std::endl; + std::cerr << "Error: -r specified, but mode is rl" << std::endl; cli.print_usage(); } if(ttype == 'a') diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/lt_paradigm.cc new/lttoolbox-3.7.6/lttoolbox/lt_paradigm.cc --- old/lttoolbox-3.7.1/lttoolbox/lt_paradigm.cc 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/lt_paradigm.cc 2023-12-27 21:15:14.000000000 +0100 @@ -21,6 +21,8 @@ #include <lttoolbox/state.h> #include <lttoolbox/trans_exe.h> #include <lttoolbox/cli.h> +#include <lttoolbox/symbol_iter.h> +#include <lttoolbox/string_utils.h> #include <queue> @@ -55,17 +57,33 @@ } } +sorted_vector<int32_t> split_tag(UStringView sym, Alphabet& alpha, int prefix, + UChar32 sep) +{ + sorted_vector<int32_t> ret; + auto names = StringUtils::split_escaped(sym.substr(prefix+1, sym.size()-prefix-2), sep); + for (auto& tg : names) { + UString tag; + tag += '<'; + tag += tg; + tag += '>'; + ret.insert(alpha(tag)); + } + return ret; +} + void process(UStringView pattern, std::map<UString, Transducer>& trans, Alphabet& alpha, - const std::set<UChar32>& letters, const std::set<int32_t>& tags, + const std::set<UChar32>& letters, + const sorted_vector<int32_t>& tags, UFILE* output, bool sort) { int32_t any_char = static_cast<int32_t>('*'); int32_t any_tag = alpha(u"<*>"); - std::vector<int32_t> pat = alpha.tokenize(pattern); Transducer other; int state = other.getInitial(); - for (auto& it : pat) { + for (auto sym : symbol_iter(pattern)) { + int32_t it = (sym.size() == 1 ? sym[0] : alpha(sym)); if (it == any_char) { state = other.insertNewSingleTransduction(0, state); for (auto& sym : letters) { @@ -76,6 +94,30 @@ for (auto& sym : tags) { other.linkStates(state, state, alpha(sym, sym)); } + } else if (it == 0 && StringUtils::startswith(sym, "<*|"_u)) { + auto or_tags = split_tag(sym, alpha, 2, '|'); + state = other.insertNewSingleTransduction(0, state); + for (auto& t : or_tags) { + other.linkStates(state, state, alpha(t, t)); + } + } else if (it == 0 && StringUtils::startswith(sym, "<*"_u)) { + auto del_tags = split_tag(sym, alpha, 1, '-'); + state = other.insertNewSingleTransduction(0, state); + for (auto& t : tags) { + if (del_tags.find(t) == del_tags.end()) { + other.linkStates(state, state, alpha(t, t)); + } + } + } else if (it == 0 && StringUtils::startswith(sym, "<|"_u)) { + auto or_tags = split_tag(sym, alpha, 1, '|'); + auto old_state = state; + for (auto& t : or_tags) { + if (old_state == state) { + state = other.insertNewSingleTransduction(alpha(t, t), state); + } else { + other.linkStates(old_state, state, alpha(t, t)); + } + } } else { state = other.insertNewSingleTransduction(alpha(it, it), state); } @@ -128,7 +170,7 @@ fclose(fst); alpha.includeSymbol(u"<*>"); - std::set<int32_t> tags; + sorted_vector<int32_t> tags; for (int32_t i = 1; i <= alpha.size(); i++) { if (!skip_tags.empty()) { UString t; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/lt_proc.cc new/lttoolbox-3.7.6/lttoolbox/lt_proc.cc --- old/lttoolbox-3.7.1/lttoolbox/lt_proc.cc 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/lt_proc.cc 2023-12-27 21:15:14.000000000 +0100 @@ -59,6 +59,7 @@ cli.add_bool_arg('W', "show-weights", "Print final analysis weights (if any)"); cli.add_str_arg('N', "analyses", "Output no more than N analyses (if the transducer is weighted, the N best analyses)", "N"); cli.add_str_arg('L', "weight-classes", "Output no more than N best weight classes (where analyses with equal weight constitute a class)", "N"); + cli.add_str_arg('M', "compound-max-elements", "Set compound max elements", "N"); cli.add_bool_arg('h', "help", "show this help"); cli.parse_args(argc, argv); @@ -157,6 +158,14 @@ } fstp.setMaxWeightClassesValue(n); } + if (strs.find("compound-max-elements") != strs.end()) { // Test + int n = atoi(strs["compound-max-elements"].back().c_str()); + if (n < 1) { + std::cerr << "Invalid or no argument for compound max elements" << std::endl; + exit(EXIT_FAILURE); + } + fstp.setCompoundMaxElements(n); + } FILE* in = openInBinFile(cli.get_files()[0]); fstp.load(in); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/sorted_vector.hpp new/lttoolbox-3.7.6/lttoolbox/sorted_vector.hpp --- old/lttoolbox-3.7.1/lttoolbox/sorted_vector.hpp 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/sorted_vector.hpp 2023-12-27 21:15:14.000000000 +0100 @@ -22,6 +22,7 @@ #include <vector> #include <algorithm> #include <functional> +#include <iterator> namespace detail { template<typename ForwardIt, typename Comp> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/state.cc new/lttoolbox-3.7.6/lttoolbox/state.cc --- old/lttoolbox-3.7.1/lttoolbox/state.cc 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/state.cc 2023-12-27 21:15:14.000000000 +0100 @@ -431,6 +431,19 @@ } } +void +State::step_optional(UChar32 val) +{ + if (val == 0) return; + std::vector<TNodeState> new_state; + for (size_t i = 0; i < state.size(); i++) { + apply_into(&new_state, val, i, false); + } + new_state.swap(state); + epsilonClosure(); + new_state.swap(state); + state.insert(state.end(), new_state.begin(), new_state.end()); +} bool State::isFinal(std::map<Node *, double> const &finals) const @@ -946,3 +959,14 @@ retval += ']'; return retval; } + +void +State::merge(const State& other) +{ + for (auto& it : other.state) { + std::vector<std::pair<int, double>>* tmp = new std::vector<std::pair<int, double>>(); + *tmp = *(it.sequence); + TNodeState ns(it.where, tmp, it.dirty); + this->state.push_back(std::move(ns)); + } +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/state.h new/lttoolbox-3.7.6/lttoolbox/state.h --- old/lttoolbox-3.7.1/lttoolbox/state.h 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/state.h 2023-12-27 21:15:14.000000000 +0100 @@ -201,6 +201,10 @@ void step_case_override(const int val, const bool caseSensitive); + void step_optional(UChar32 val); + + void closure(const sorted_vector<int32_t>& symbols); + /** * Init the state with the initial node and empty output * @param initial the initial node of the transducer @@ -223,6 +227,12 @@ void pruneStatesWithForbiddenSymbol(int forbiddenSymbol); /** + * Remove states not containing a particular symbol + * @param symbol the symbol that is required + */ + void requireSymbol(int32_t symbol); + + /** * Whether any of the analyses contains a certain symbol * @param requiredSymbol the symbol we're looking for */ @@ -343,6 +353,11 @@ std::queue<UString> &blanks, std::vector<UString> &numbers) const; + /** + * Add all paths in other to self + */ + void merge(const State& other); + }; #endif diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/string_utils.cc new/lttoolbox-3.7.6/lttoolbox/string_utils.cc --- old/lttoolbox-3.7.1/lttoolbox/string_utils.cc 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/string_utils.cc 2023-12-27 21:15:14.000000000 +0100 @@ -67,6 +67,29 @@ return result; } +std::vector<UString> +StringUtils::split_escaped(UStringView str, UChar delim) +{ + std::vector<UString> result; + size_t start = 0; + for (size_t i = 0; i < str.size(); i++) { + if (str[i] == '\\') { + i++; + continue; + } + if (str[i] == delim) { + if (i > start) { + result.push_back(US(str.substr(start, i-start))); + } + start = i+1; + } + } + if (start < str.size()) { + result.push_back(US(str.substr(start))); + } + return result; +} + UString StringUtils::join(const std::vector<UString>& vec, UStringView delim) { diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/string_utils.h new/lttoolbox-3.7.6/lttoolbox/string_utils.h --- old/lttoolbox-3.7.1/lttoolbox/string_utils.h 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/string_utils.h 2023-12-27 21:15:14.000000000 +0100 @@ -12,6 +12,9 @@ // split string on delimiter static std::vector<UString> split(UStringView str, UStringView delim=u" "); + // split but respect \ escapes + static std::vector<UString> split_escaped(UStringView str, UChar delim); + // inverse of split static UString join(const std::vector<UString>& vec, UStringView delim); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/symbol_iter.cc new/lttoolbox-3.7.6/lttoolbox/symbol_iter.cc --- old/lttoolbox-3.7.1/lttoolbox/symbol_iter.cc 1970-01-01 01:00:00.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/symbol_iter.cc 2023-12-27 21:15:14.000000000 +0100 @@ -0,0 +1,58 @@ +#include <lttoolbox/symbol_iter.h> +#include <unicode/uchar.h> + +symbol_iter::iterator::iterator(UStringView s) : str(s) +{ + ++*this; +} + +symbol_iter::iterator::iterator(const symbol_iter::iterator& other) + : str(other.str), sloc(other.sloc), eloc(other.eloc) {} + +symbol_iter::iterator::~iterator() {} + +UStringView symbol_iter::iterator::operator*() const { + return str.substr(sloc, eloc-sloc); +} + +symbol_iter::iterator& symbol_iter::iterator::operator++() +{ + if (sloc < str.size()) { + sloc = eloc; + UChar32 c; + U16_NEXT(str.data(), eloc, str.size(), c); + if (c == '\\') { + sloc++; + U16_NEXT(str.data(), eloc, str.size(), c); + } else if (c == '<') { + auto i = eloc; + while (c != '>' && i < str.size()) U16_NEXT(str.data(), i, str.size(), c); + if (c == '>') eloc = i; + } + if (eloc > str.size()) eloc = str.size(); + } + return *this; +} + +bool symbol_iter::iterator::operator!=(const symbol_iter::iterator& o) const +{ + return str != o.str || sloc != o.sloc || eloc != o.eloc; +} + +bool symbol_iter::iterator::operator==(const symbol_iter::iterator& o) const +{ + return str == o.str && sloc == o.sloc && eloc == o.eloc; +} + +symbol_iter::iterator symbol_iter::begin() const +{ + return symbol_iter::iterator(str); +} + +symbol_iter::iterator symbol_iter::end() const +{ + symbol_iter::iterator ret(str); + ret.sloc = str.size(); + ret.eloc = str.size(); + return ret; +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/lttoolbox/symbol_iter.h new/lttoolbox-3.7.6/lttoolbox/symbol_iter.h --- old/lttoolbox-3.7.1/lttoolbox/symbol_iter.h 1970-01-01 01:00:00.000000000 +0100 +++ new/lttoolbox-3.7.6/lttoolbox/symbol_iter.h 2023-12-27 21:15:14.000000000 +0100 @@ -0,0 +1,33 @@ +#ifndef __LT_SYMBOL_ITER_H__ +#define __LT_SYMBOL_ITER_H__ + +#include <ustring.h> + +class symbol_iter +{ +private: + UStringView str; +public: + symbol_iter(UStringView s) : str(s) {} + ~symbol_iter() {} + class iterator + { + friend symbol_iter; + private: + UStringView str; + UStringView::size_type sloc = 0; + UStringView::size_type eloc = 0; + public: + iterator(UStringView s); + iterator(const iterator& other); + ~iterator(); + UStringView operator*() const; + iterator& operator++(); + bool operator!=(const symbol_iter::iterator& other) const; + bool operator==(const symbol_iter::iterator& other) const; + }; + iterator begin() const; + iterator end() const; +}; + +#endif diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/python/CMakeLists.txt new/lttoolbox-3.7.6/python/CMakeLists.txt --- old/lttoolbox-3.7.1/python/CMakeLists.txt 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/python/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100 @@ -1,33 +0,0 @@ -find_package(SWIG 3.0 REQUIRED) -find_package(PythonInterp 3.5 REQUIRED) - -get_directory_property(_defs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMPILE_DEFINITIONS) -string(REPLACE ";" " -D" defs "-D${_defs}") - -set(PYTHON_FILE "lttoolbox.py") -set(CPP_WRAP_FILE "lttoolbox_wrap.cpp") -set(top_srcdir ${CMAKE_SOURCE_DIR}) -set(CXXFLAGS "${CMAKE_CXX_FLAGS} ${defs}") -set(PACKAGE ${PROJECT_NAME}) -set(PACKAGE_NAME ${PROJECT_NAME}) -set(PACKAGE_VERSION ${PROJECT_VERSION}) - -configure_file(lttoolbox.i.in lttoolbox.i @ONLY) -configure_file(setup.py.in setup.py @ONLY) - -add_custom_command(OUTPUT ${CPP_WRAP_FILE} ${PYTHON_FILE} - COMMAND ${PYTHON_EXECUTABLE} setup.py build - COMMENT "Building ${PYTHON_FILE}" -) - -add_custom_target(wrapper ALL - DEPENDS ${CPP_WRAP_FILE} ${PYTHON_FILE} - VERBATIM -) - -if(NOT PYTHON_INSTALL_PARAMS) - set(PYTHON_INSTALL_PARAMS "--prefix=${CMAKE_INSTALL_PREFIX} --root=\$ENV{DESTDIR}/") -endif() - -set(INSTALL_WRAPPER "${PYTHON_EXECUTABLE} setup.py install ${PYTHON_INSTALL_PARAMS}") -install(CODE "execute_process(COMMAND ${INSTALL_WRAPPER} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})") diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/lttoolbox-3.7.1/tests/lt_paradigm/__init__.py new/lttoolbox-3.7.6/tests/lt_paradigm/__init__.py --- old/lttoolbox-3.7.1/tests/lt_paradigm/__init__.py 2022-11-01 09:36:47.000000000 +0100 +++ new/lttoolbox-3.7.6/tests/lt_paradigm/__init__.py 2023-12-27 21:15:14.000000000 +0100 @@ -37,3 +37,34 @@ inputs = ['*<n><*>'] expectedOutputs = ['ab<n><def>:abc\nab<n><ind>:ab\nn<n><ind>:n\ny<n><ind>:y'] sortoutput = False + +class ExcludeSingleTest(ParadigmTest): + procdix = 'data/unbalanced-epsilons-mono.dix' + inputs = ['*<vblex><*>', '*<vblex><*-pres>', '*<vblex><*-inf-pret>'] + expectedOutputs = [ + 're<vblex><inf>:re\nre<vblex><pres>:rer\nre<vblex><pres>:res\nre<vblex><pret>:ret', + 're<vblex><inf>:re\nre<vblex><pret>:ret', + 're<vblex><pres>:rer\nre<vblex><pres>:res' + ] + +class OrTagTest(ParadigmTest): + procdix = 'data/unbalanced-epsilons-mono.dix' + inputs = ['re<vblex><|pres|pret>', 're<vblex><|inf>', 're<vblex><|xqz>'] + expectedOutputs = [ + 're<vblex><pres>:rer\nre<vblex><pres>:res\nre<vblex><pret>:ret', + 're<vblex><inf>:re\nre<vblex><pret>:ret', + '' + ] + +class OrTagRepeatTest(ParadigmTest): + procdix = 'data/unbalanced-epsilons-mono.dix' + inputs = [ + 're<*|vblex|pres|pret>', + 're<*|inf|vblex>', + 're<*|n|adj|vblex|inf>' + ] + expectedOutputs = [ + 're<vblex><pres>:rer\nre<vblex><pres>:res\nre<vblex><pret>:ret', + 're<vblex><inf>:re\nre<vblex><pret>:ret', + 're<vblex><inf>:re\nre<vblex><pret>:ret', + ]