Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package kf6-kcodecs for openSUSE:Factory checked in at 2025-12-16 15:50:28 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/kf6-kcodecs (Old) and /work/SRC/openSUSE:Factory/.kf6-kcodecs.new.1939 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "kf6-kcodecs" Tue Dec 16 15:50:28 2025 rev:22 rq:1322697 version:6.21.0 Changes: -------- --- /work/SRC/openSUSE:Factory/kf6-kcodecs/kf6-kcodecs.changes 2025-11-17 12:14:44.375912457 +0100 +++ /work/SRC/openSUSE:Factory/.kf6-kcodecs.new.1939/kf6-kcodecs.changes 2025-12-16 15:55:12.092188263 +0100 @@ -1,0 +2,15 @@ +Fri Dec 12 20:17:20 UTC 2025 - Christophe Marin <[email protected]> + +- Update to 6.21.0 + * New feature release + * For more details please see: + * https://kde.org/announcements/frameworks/6/6.21.0 +- Changes since 6.20.0: + * Update dependency version to 6.21.0 + * [KEncodingProber] Some more tests for UTF-8 + * [KEncodingProber] Make UTF-8 state machine RFC3629 compliant + * [KEncodingProber] Add unit tests for UTF-8/UTF-16 + * [KEncodingProber] Remove DEBUG_PROBE from public header file + * Update version to 6.21.0 + +------------------------------------------------------------------- Old: ---- kcodecs-6.20.0.tar.xz kcodecs-6.20.0.tar.xz.sig New: ---- kcodecs-6.21.0.tar.xz kcodecs-6.21.0.tar.xz.sig ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ kf6-kcodecs.spec ++++++ --- /var/tmp/diff_new_pack.8W9VVT/_old 2025-12-16 15:55:13.364242129 +0100 +++ /var/tmp/diff_new_pack.8W9VVT/_new 2025-12-16 15:55:13.364242129 +0100 @@ -19,11 +19,11 @@ %define qt6_version 6.8.0 %define rname kcodecs -# Full KF6 version (e.g. 6.20.0) +# Full KF6 version (e.g. 
6.21.0) %{!?_kf6_version: %global _kf6_version %{version}} %bcond_without released Name: kf6-kcodecs -Version: 6.20.0 +Version: 6.21.0 Release: 0 Summary: Method collection to manipulate strings using various encodings License: LGPL-2.1-or-later ++++++ kcodecs-6.20.0.tar.xz -> kcodecs-6.21.0.tar.xz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/kcodecs-6.20.0/CMakeLists.txt new/kcodecs-6.21.0/CMakeLists.txt --- old/kcodecs-6.20.0/CMakeLists.txt 2025-11-07 19:58:55.000000000 +0100 +++ new/kcodecs-6.21.0/CMakeLists.txt 2025-12-05 14:19:12.000000000 +0100 @@ -1,10 +1,10 @@ cmake_minimum_required(VERSION 3.16) -set(KF_VERSION "6.20.0") # handled by release scripts +set(KF_VERSION "6.21.0") # handled by release scripts project(KCodecs VERSION ${KF_VERSION}) include(FeatureSummary) -find_package(ECM 6.20.0 NO_MODULE) +find_package(ECM 6.21.0 NO_MODULE) set_package_properties(ECM PROPERTIES TYPE REQUIRED DESCRIPTION "Extra CMake Modules." URL "https://commits.kde.org/extra-cmake-modules") feature_summary(WHAT REQUIRED_PACKAGES_NOT_FOUND FATAL_ON_MISSING_REQUIRED_PACKAGES) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/kcodecs-6.20.0/autotests/CMakeLists.txt new/kcodecs-6.21.0/autotests/CMakeLists.txt --- old/kcodecs-6.20.0/autotests/CMakeLists.txt 2025-11-07 19:58:55.000000000 +0100 +++ new/kcodecs-6.21.0/autotests/CMakeLists.txt 2025-12-05 14:19:12.000000000 +0100 @@ -21,6 +21,14 @@ LINK_LIBRARIES KF6::Codecs Qt6::Test ) +ecm_add_test( + TEST_NAME kencodingproberunittest + kencodingproberunittest.cpp + ../src/probers/nsMBCSSM.cpp + LINK_LIBRARIES Qt6::Test +) +target_include_directories(kencodingproberunittest PRIVATE ${CMAKE_BINARY_DIR}/src) + # Benchmark, compiled, but not run automatically with ctest add_executable(base64benchmark base64benchmark.cpp) target_link_libraries(base64benchmark KF6::Codecs Qt6::Test) diff -urN '--exclude=CVS' '--exclude=.cvsignore' 
'--exclude=.svn' '--exclude=.svnignore' old/kcodecs-6.20.0/autotests/kencodingproberunittest.cpp new/kcodecs-6.21.0/autotests/kencodingproberunittest.cpp --- old/kcodecs-6.20.0/autotests/kencodingproberunittest.cpp 1970-01-01 01:00:00.000000000 +0100 +++ new/kcodecs-6.21.0/autotests/kencodingproberunittest.cpp 2025-12-05 14:19:12.000000000 +0100 @@ -0,0 +1,226 @@ +/* + SPDX-FileCopyrightText: 2025 Stefan Brüns <[email protected]> + + SPDX-License-Identifier: GPL-2.0-or-later +*/ + +#include <QTest> + +#include "../src/probers/nsCodingStateMachine.h" + +class KEncodingProberUnitTest : public QObject +{ + Q_OBJECT + +private Q_SLOTS: + void testUtf8(); + void testUtf8_data(); + void testUtf16BE(); + void testUtf16BE_data(); + void testUtf16LE(); + void testUtf16LE_data(); + void testUtf16_common_data(); +}; + +void KEncodingProberUnitTest::testUtf8() +{ + QFETCH(QByteArray, data); + QFETCH(bool, utf8Valid); + + using namespace kencodingprober; + + nsCodingStateMachine stateMachine{&UTF8SMModel}; + nsSMState state = eStart; + + for (auto b : data) { + state = stateMachine.NextState(b); + } + + if (utf8Valid) { + QVERIFY(state != eError); + } else { + QVERIFY(state == eError); + } +} + +void KEncodingProberUnitTest::testUtf8_data() +{ + using namespace Qt::StringLiterals; + + QTest::addColumn<QByteArray>("data"); + QTest::addColumn<bool>("utf8Valid"); + + QTest::addRow("UTF-8 Latin1") << "abcdxyzABCDXYZ 0129;,"_ba << true; + QTest::addRow("BOM UTF-8") << "\xef\xbb\xbfZ"_ba << true; // "<UTF-8 BOM>Z" + + // multibyte sequences - length 2 + QTest::addRow("UTF-8 Latin1 Supplement") // + << "Latin1 Text \xC3\xA4\xC3\xB6\xC3\xBC\xC3\x9F"_ba // "Latin1 Text äöüß" + << true; + QTest::addRow("UTF-8 len 2") << "Text \xC3\xA4 "_ba << true; + QTest::addRow("UTF-8 len 2 short") << "Text \xC3 "_ba << false; + QTest::addRow("UTF-8 len 2 invalid range") << "Text \xC0\x90 "_ba << false; + + // multibyte sequences - length 3 + QTest::addRow("UTF-8 CJK") // + << 
QByteArray::fromHex("e998bfe5b094e58d91e696afe5b1b1e88489") // 阿尔卑斯山脉 + << true; + QTest::addRow("UTF-8 len 3 a") << "Text \xE2\x80\x90 "_ba << true; // "‐" (HYPHEN) + QTest::addRow("UTF-8 len 3-1 short") << "Text \xE2\x80 "_ba << false; + QTest::addRow("UTF-8 len 3-2 short") << "Text \xE2 "_ba << false; + + QTest::addRow("UTF-8 len 3 b") << "Text \xE0\xbf\xbf "_ba << true; // U+0FFF, the highest code point encoded with lead byte 0xE0 + QTest::addRow("UTF-8 len 3 invalid range") << "Text \xE0\x9f\x90 "_ba << false; + + QTest::addRow("UTF-8 len 3 c") << "Text \xED\x80\x80 "_ba << true; // "퀀" (HANGUL SYLLABLE KWEON) + QTest::addRow("UTF-8 invalid CESU") << "Text \xED\xbf\x80 "_ba << false; + + // multibyte sequences - length 4 + QTest::addRow("UTF-8 SMP Symbols") << "\xF0\x9F\x82\xA1 "_ba << true; // "🂡 " (ACE OF SPADES) + QTest::addRow("UTF-8 len 4-1 short") << "\xF0\x9F\x82 "_ba << false; + QTest::addRow("UTF-8 len 4-2 short") << "\xF0\x9F "_ba << false; + QTest::addRow("UTF-8 len 4-3 short") << "\xF0 "_ba << false; + QTest::addRow("UTF-8 len 4 invalid long") << "\xF0\x8F\x90\x90 "_ba << false; + QTest::addRow("UTF-8 len 4 invalid range") << "\xF5\x90\x90\x90 "_ba << false; + + // multibyte sequences - length 5/6 (invalid) + QTest::addRow("UTF-8 len 5 invalid") << "\xF8\x90\x90\x90\x90 "_ba << false; + QTest::addRow("UTF-8 len 6 invalid") << "\xFC\x90\x90\x90\x90\x90 "_ba << false; + + QTest::addRow("UTF-8 0xFE invalid") << "\xFE "_ba << false; + QTest::addRow("UTF-8 0xFF invalid") << "\xFF "_ba << false; + + // continuation without leading 2/3/4 byte start byte + QTest::addRow("UTF-8 invalid isolate high 0x80") << "\x80 "_ba << false; + QTest::addRow("UTF-8 invalid isolate high 0x92") << "\x92 "_ba << false; + QTest::addRow("UTF-8 invalid isolate high 0xAA") << "\xAA "_ba << false; + QTest::addRow("UTF-8 invalid isolate high 0xBF") << "\xBF "_ba << false; + + // Either Windows-1252/-1254/-1255 (binary identical) + // "One pound, i.e. ½ a kilogramm of butter costs 2 £." 
+ QTest::addRow("Windows-125x English") << "One pound, i.e. \xAF a kilogramm of butter costs 2 \xA3."_ba << false; + // Example texts with Windows-125x encoding which are definitely not UTF-8 -- see Wikipedia "Pangram" + // "Příliš žluťoučký kůň úpěl ďábelské ódy" - "A horse that was too yellow moaned devilish odes" + QTest::addRow("Windows-1250 Czech") << // + "P\xf8\xedli\x9a \x9elu\x9dou\xe8k\xfd k\xf9\xf2 \xfap\xecl \xef\xe1\x62\x65lsk\xe9 \xf3\x64y."_ba << false; + // "Под южно дърво, цъфтящо в синьо, бягаше малко пухкаво зайче" - "Under a southern tree, blooming in blue, ran a little fluffy bunny" + QTest::addRow("Windows-1251 Bulgarian") << QByteArray::fromHex( // + "cfeee420fee6edee20e4faf0e2ee2c20f6faf4f2fff9ee20" + "e220f1e8edfcee2c20e1ffe3e0f8e520ece0ebeaee20eff3" + "f5eae0e2ee20e7e0e9f7e5") << false; + // "Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich." - "Victor chases twelve boxers across the Great Levee of Sylt" + QTest::addRow("Windows-1252 German") << // + "Victor jagt zw\xf6lf Boxk\xe4mpfer quer \xfc\x62\x65r den gro\xdf\x65n Sylter Deich."_ba << false; + // "שפן אכל קצת גזר בטעם חסה, ודי" - "A bunny ate some lettuce-flavored carrots, and he had enough" + QTest::addRow("Windows-1255 Hebrew") // + << QByteArray::fromHex("f9f4ef20e0ebec20f7f6fa20e2e6f820e1e8f2ed20e7f1e42c20e5e3e9") << false; +} + +void KEncodingProberUnitTest::testUtf16BE() +{ + QFETCH(QByteArray, data); + QFETCH(bool, utf16BEValid); + + using namespace kencodingprober; + + nsCodingStateMachine stateMachine{&UCS2BESMModel}; + nsSMState state = eStart; + + QEXPECT_FAIL("UTF16 Interpunctuation little", "valid codepoint rejected", Abort); + QEXPECT_FAIL("UTF16 Math supplement big", "valid codepoint rejected", Abort); + QEXPECT_FAIL("UTF16 ZWNBSP little", "zero width no-break space rejected", Abort); + for (auto b : data) { + state = stateMachine.NextState(b); + } + + if (utf16BEValid) { + QVERIFY(state != eError); + } else { + QVERIFY(state == eError); + } +} + +void 
KEncodingProberUnitTest::testUtf16LE() +{ + QFETCH(QByteArray, data); + QFETCH(bool, utf16LEValid); + + using namespace kencodingprober; + + nsCodingStateMachine stateMachine{&UCS2LESMModel}; + nsSMState state = eStart; + + QEXPECT_FAIL("UTF16 Interpunctuation big", "valid codepoint rejected", Abort); + QEXPECT_FAIL("UTF16 Math supplement little", "valid codepoint rejected", Abort); + QEXPECT_FAIL("UTF16 ZWNBSP big", "zero width no-break space rejected", Abort); + for (auto b : data) { + state = stateMachine.NextState(b); + } + + if (utf16LEValid) { + QVERIFY(state != eError); + } else { + QVERIFY(state == eError); + } +} + +void KEncodingProberUnitTest::testUtf16_common_data() +{ + QTest::addColumn<QByteArray>("data"); + QTest::addColumn<bool>("utf16BEValid"); + QTest::addColumn<bool>("utf16LEValid"); + + QTest::addRow("empty") << QByteArray() << true << true; + // BOM must be detected + QTest::addRow("BE BOM") << QByteArray("\xFE\xFF") << true << false; + QTest::addRow("LE BOM") << QByteArray("\xFF\xFE") << false << true; + // swapped endianness does not cause an error, as the codepoint is still valid + QTest::addRow("BE HS+LS") << QByteArray("\xDC\x00\xD8\x00") << true << true; + QTest::addRow("LE HS+LS") << QByteArray("\x00\xDC\x00\xD8") << true << true; + + struct Utf16TestData { + const char *name; + const std::span<const char16_t> data; + bool validBig; + bool validLittle; + }; + using namespace std::string_view_literals; + constexpr std::array<Utf16TestData, 7> utf16TestData = { + // syntactically correct even with wrong endianness + Utf16TestData{"UTF16 XY", u"XY"sv, true, true}, + Utf16TestData{"UTF16 ab", u"ab"sv, true, true}, + Utf16TestData{"UTF16 äöü", u"äöü"sv, true, true}, + Utf16TestData{"UTF16 BOM", u"\xFEFF"sv, true, false}, + // "‛" or "ᬠ" (U+1B20 BALINESE LETTER DA MURDA MAHAPRANA) + Utf16TestData{"UTF16 Interpunctuation", u"\x201B"sv, true, true}, + // "⨯" or "⼪" (U+2F2A KANGXI RADICAL LAME) + Utf16TestData{"UTF16 Math supplement", u"\x2A2F"sv, 
true, true}, + // ZWNBSP aka BOM inside the document is deprecated, but valid + Utf16TestData{"UTF16 ZWNBSP", u" \xFEFF"sv, true, true}, + }; + + for (const auto &tc : utf16TestData) { + QByteArray data; + data.resize(tc.data.size() * 2); + + qToBigEndian<quint16>(tc.data.data(), tc.data.size(), data.data()); + QTest::addRow("%s big", tc.name) << data << tc.validBig << tc.validLittle; + + qToLittleEndian<quint16>(tc.data.data(), tc.data.size(), data.data()); + QTest::addRow("%s little", tc.name) << data << tc.validLittle << tc.validBig; + } +} + +void KEncodingProberUnitTest::testUtf16BE_data() +{ + testUtf16_common_data(); +} + +void KEncodingProberUnitTest::testUtf16LE_data() +{ + testUtf16_common_data(); +} + +QTEST_MAIN(KEncodingProberUnitTest) + +#include "kencodingproberunittest.moc" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/kcodecs-6.20.0/src/CMakeLists.txt new/kcodecs-6.21.0/src/CMakeLists.txt --- old/kcodecs-6.20.0/src/CMakeLists.txt 2025-11-07 19:58:55.000000000 +0100 +++ new/kcodecs-6.21.0/src/CMakeLists.txt 2025-12-05 14:19:12.000000000 +0100 @@ -26,6 +26,7 @@ kemailaddress.h kencodingprober.cpp kencodingprober.h + kencodingprober_p.h probers/CharDistribution.cpp probers/CharDistribution.h probers/ChineseGroupProber.cpp diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/kcodecs-6.20.0/src/kencodingprober.cpp new/kcodecs-6.21.0/src/kencodingprober.cpp --- old/kcodecs-6.20.0/src/kencodingprober.cpp 2025-11-07 19:58:55.000000000 +0100 +++ new/kcodecs-6.21.0/src/kencodingprober.cpp 2025-12-05 14:19:12.000000000 +0100 @@ -7,6 +7,7 @@ */ #include "kencodingprober.h" +#include "kencodingprober_p.h" #include "probers/ChineseGroupProber.h" #include "probers/JapaneseGroupProber.h" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/kcodecs-6.20.0/src/kencodingprober.h new/kcodecs-6.21.0/src/kencodingprober.h --- 
old/kcodecs-6.20.0/src/kencodingprober.h 2025-11-07 19:58:55.000000000 +0100 +++ new/kcodecs-6.21.0/src/kencodingprober.h 2025-12-05 14:19:12.000000000 +0100 @@ -8,15 +8,8 @@ #ifndef KENCODINGPROBER_H #define KENCODINGPROBER_H -// enable debug of private probers -// #define DEBUG_PROBE - #include <kcodecs_export.h> -#ifdef DEBUG_PROBE -#include <QDebug> -#endif - #include <QCoreApplication> #include <QString> #include <memory> diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/kcodecs-6.20.0/src/kencodingprober_p.h new/kcodecs-6.21.0/src/kencodingprober_p.h --- old/kcodecs-6.20.0/src/kencodingprober_p.h 1970-01-01 01:00:00.000000000 +0100 +++ new/kcodecs-6.21.0/src/kencodingprober_p.h 2025-12-05 14:19:12.000000000 +0100 @@ -0,0 +1,18 @@ +/* + This file is part of the KDE libraries + + SPDX-FileCopyrightText: 2008 Wang Hoi <[email protected]> + + SPDX-License-Identifier: LGPL-2.0-or-later +*/ +#ifndef KENCODINGPROBER_P_H +#define KENCODINGPROBER_P_H + +// enable debug of private probers +// #define DEBUG_PROBE + +#ifdef DEBUG_PROBE +#include <QDebug> +#endif + +#endif diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/kcodecs-6.20.0/src/probers/nsCharSetProber.h new/kcodecs-6.21.0/src/probers/nsCharSetProber.h --- old/kcodecs-6.20.0/src/probers/nsCharSetProber.h 2025-11-07 19:58:55.000000000 +0100 +++ new/kcodecs-6.21.0/src/probers/nsCharSetProber.h 2025-12-05 14:19:12.000000000 +0100 @@ -7,7 +7,9 @@ #ifndef nsCharSetProber_h__ #define nsCharSetProber_h__ -#include "kencodingprober.h" +#include <kcodecs_export.h> + +#include "../kencodingprober_p.h" namespace kencodingprober { @@ -32,7 +34,7 @@ virtual float GetConfidence(void) = 0; #ifdef DEBUG_PROBE - void DumpStatus() override + virtual void DumpStatus() { } #endif diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/kcodecs-6.20.0/src/probers/nsCodingStateMachine.h 
new/kcodecs-6.21.0/src/probers/nsCodingStateMachine.h --- old/kcodecs-6.20.0/src/probers/nsCodingStateMachine.h 2025-11-07 19:58:55.000000000 +0100 +++ new/kcodecs-6.21.0/src/probers/nsCodingStateMachine.h 2025-12-05 14:19:12.000000000 +0100 @@ -7,7 +7,7 @@ #ifndef nsCodingStateMachine_h__ #define nsCodingStateMachine_h__ -#include "kencodingprober.h" +#include "../kencodingprober_p.h" #include "kcodecs_export.h" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/kcodecs-6.20.0/src/probers/nsMBCSSM.cpp new/kcodecs-6.21.0/src/probers/nsMBCSSM.cpp --- old/kcodecs-6.20.0/src/probers/nsMBCSSM.cpp 2025-11-07 19:58:55.000000000 +0100 +++ new/kcodecs-6.21.0/src/probers/nsMBCSSM.cpp 2025-12-05 14:19:12.000000000 +0100 @@ -471,58 +471,56 @@ PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 68 - 6f PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 70 - 77 PCK4BITS(1, 1, 1, 1, 1, 1, 1, 1), // 78 - 7f - PCK4BITS(2, 2, 2, 2, 3, 3, 3, 3), // 80 - 87 - PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 88 - 8f - PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 90 - 97 - PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // 98 - 9f - PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // a0 - a7 - PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // a8 - af - PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // b0 - b7 - PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // b8 - bf - PCK4BITS(0, 0, 6, 6, 6, 6, 6, 6), // c0 - c7 - PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // c8 - cf - PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d0 - d7 - PCK4BITS(6, 6, 6, 6, 6, 6, 6, 6), // d8 - df - PCK4BITS(7, 8, 8, 8, 8, 8, 8, 8), // e0 - e7 - PCK4BITS(8, 8, 8, 8, 8, 9, 8, 8), // e8 - ef - PCK4BITS(10, 11, 11, 11, 11, 11, 11, 11), // f0 - f7 - PCK4BITS(12, 13, 13, 13, 14, 15, 0, 0) // f8 - ff + PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 80 - 87 + PCK4BITS(2, 2, 2, 2, 2, 2, 2, 2), // 88 - 8f + PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 90 - 97 + PCK4BITS(3, 3, 3, 3, 3, 3, 3, 3), // 98 - 9f + PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // a0 - a7 + PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // a8 - af + PCK4BITS(4, 4, 4, 4, 4, 4, 4, 
4), // b0 - b7 + PCK4BITS(4, 4, 4, 4, 4, 4, 4, 4), // b8 - bf + PCK4BITS(0, 0, 5, 5, 5, 5, 5, 5), // c0 - c7 + PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // c8 - cf + PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // d0 - d7 + PCK4BITS(5, 5, 5, 5, 5, 5, 5, 5), // d8 - df + PCK4BITS(6, 7, 7, 7, 7, 7, 7, 7), // e0 - e7 + PCK4BITS(7, 7, 7, 7, 7, 8, 7, 7), // e8 - ef + PCK4BITS(9, 10, 10, 10, 11, 0, 0, 0), // f0 - f7 + PCK4BITS(0, 0, 0, 0, 0, 0, 0, 0) // f8 - ff }; -static const unsigned int UTF8_st[26] = { - PCK4BITS(eError, eStart, eError, eError, eError, eError, 12, 10), // 00-07 - PCK4BITS(9, 11, 8, 7, 6, 5, 4, 3), // 08-0f - PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 10-17 - PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 18-1f - PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe), // 20-27 - PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe), // 28-2f - PCK4BITS(eError, eError, 5, 5, 5, 5, eError, eError), // 30-37 - PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 38-3f - PCK4BITS(eError, eError, eError, 5, 5, 5, eError, eError), // 40-47 - PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 48-4f - PCK4BITS(eError, eError, 7, 7, 7, 7, eError, eError), // 50-57 - PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 58-5f - PCK4BITS(eError, eError, eError, eError, 7, 7, eError, eError), // 60-67 - PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 68-6f - PCK4BITS(eError, eError, 9, 9, 9, 9, eError, eError), // 70-77 - PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 78-7f - PCK4BITS(eError, eError, eError, eError, eError, 9, eError, eError), // 80-87 - PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 88-8f - PCK4BITS(eError, eError, 12, 12, 12, 12, eError, eError), // 90-97 - PCK4BITS(eError, eError, eError, eError, eError, 
eError, eError, eError), // 98-9f - PCK4BITS(eError, eError, eError, eError, eError, 12, eError, eError), // a0-a7 - PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // a8-af - PCK4BITS(eError, eError, 12, 12, 12, eError, eError, eError), // b0-b7 - PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // b8-bf - PCK4BITS(eError, eError, eStart, eStart, eStart, eStart, eError, eError), // c0-c7 - PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError) // c8-cf +static const unsigned int UTF8_st[10 * 12 / 8] = { + // clang-format off + // byteclass: 0 1 2 3 4 5 6 7 // State + // 8 9 10 11 | 0 1 2 3 + // 4 5 6 7 8 9 10 11 + PCK4BITS(eError, eStart, eError, eError, eError, 3, 4, 5), // eStart + PCK4BITS( 6, 7, 8, 9, eError, eError, eError, eError), // eStart | eError + PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // eError + + PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe, eItsMe), // eItsMe + PCK4BITS(eItsMe, eItsMe, eItsMe, eItsMe, eError, eError, eStart, eStart), // eItsMe | 3 + PCK4BITS(eStart, eError, eError, eError, eError, eError, eError, eError), // 3 + + PCK4BITS(eError, eError, eError, eError, 3, eError, eError, eError), // 4 + PCK4BITS(eError, eError, eError, eError, eError, eError, 3, 3), // 4 | 5 + PCK4BITS( 3, eError, eError, eError, eError, eError, eError, eError), // 5 + + PCK4BITS(eError, eError, 3, 3, eError, eError, eError, eError), // 6 + PCK4BITS(eError, eError, eError, eError, eError, eError, eError, 5), // 6 | 7 + PCK4BITS( 5, eError, eError, eError, eError, eError, eError, eError), // 7 + + PCK4BITS(eError, eError, 5, 5, 5, eError, eError, eError), // 8 + PCK4BITS(eError, eError, eError, eError, eError, eError, 5, eError), // 8 | 9 + PCK4BITS(eError, eError, eError, eError, eError, eError, eError, eError), // 9 + // clang-format on }; -static const unsigned int UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6}; 
+static const unsigned int UTF8CharLenTable[] = {0, 1, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4}; const SMModel UTF8SMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls}, - 16, + 12, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st}, UTF8CharLenTable, "UTF-8",
