On Thu, 8 Jan 2026 at 18:10, Keith Packard <[email protected]> wrote: > > The C++ standard does not require that each std::ctype_base mask value > have only one bit set; in fact the expository values presented in the > text explicitly include two multi-bit values: > > // numeric values are for exposition only. > static const mask space = 1 << 0; > static const mask print = 1 << 1; > static const mask cntrl = 1 << 2; > static const mask upper = 1 << 3; > static const mask lower = 1 << 4; > static const mask alpha = 1 << 5; > static const mask digit = 1 << 6; > static const mask punct = 1 << 7; > static const mask xdigit = 1 << 8; > static const mask blank = 1 << 9; > static const mask alnum = alpha | digit; > static const mask graph = alnum | punct; > > Adapt the code to handle this possibility in three places.
This will fix at least part of https://gcc.gnu.org/PR51772 (maybe all of it, I need to review it properly and refresh my memory of that bug). > > 1. In _M_initialize_ctype use explicit assignments from the > std::ctype_base mask values into _M_bit array instead of > assuming that iterating over values from 1<<0 to 1<<15 will > match all of the possible std::ctype_base mask values. > > 2. In do_is(mask, char_type) change the condition testing __m against > _M_bit[__bitcur] to require *all* bits in _M_bit[__bitcur] to be > present in __m before calling iswctype with the matching _M_wmask > value. > > For a C library where > alpha = (upper|lower) > > when > __m is upper > > the previous version would have done > iswctype(*__lo, wctype("alpha")) > iswctype(*__lo, wctype("lower")) > > the new version will do > iswctype(*__lo, wctype("upper")) > > 3. In do_is(const wchar_t *, const wchar_t *, mask *), replace the > code setting _M_bit values when iswctype returns true to code > which clears _M_bit values when iswctype returns false. > > For a C library where > alpha = (upper|lower), when > > when > iswctype(wctype("alpha")) returns true > iswctype(wctype("upper")) returns false > iswctype(wctype("lower")) returns true > > the old version would have done > __m = 0 > __m |= alpha > __m |= lower > > return value = (upper|lower) > > the new version will do > __m = upper|lower > __m &= ~upper > > return value = (lower) > > These changes should not introduce significant additional complexity > into these computations nor affect results with C libraries which use > independent values for the std::ctype_base masks. These changes do > reduce the range of mask values used in iterations from 16 to 12, so > there may actually be a tiny performance improvement. > > Signed-off-by: Keith Packard <[email protected]> > --- > .../config/locale/generic/ctype_members.cc | 81 +++++++++++++++---- > 1 file changed, 64 insertions(+), 17 deletions(-) > > diff --git a/libstdc++-v3/config/locale/generic/ctype_members.cc > b/libstdc++-v3/config/locale/generic/ctype_members.cc > index 2d61347feb1..21c25a68915 100644 > --- a/libstdc++-v3/config/locale/generic/ctype_members.cc > +++ b/libstdc++-v3/config/locale/generic/ctype_members.cc > @@ -53,6 +53,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > { } > > #ifdef _GLIBCXX_USE_WCHAR_T > + > + // Values which index into _M_bit and _M_wmask for each classifier > + enum { > + _id_alnum, _id_alpha, _id_blank, _id_cntrl, > + _id_digit, _id_graph, _id_lower, _id_print, > + _id_punct, _id_space, _id_upper, _id_xdigit, > + _id_num, > + }; > + > ctype<wchar_t>::__wmask_type > ctype<wchar_t>::_M_convert_to_wmask(const mask __m) const throw() > { > @@ -138,11 +147,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > do_is(mask __m, char_type __c) const > { > bool __ret = false; > - // Generically, 15 (instead of 11) since we don't know the numerical > - // encoding of the various categories in /usr/include/ctype.h. > - const size_t __bitmasksize = 15; > - for (size_t __bitcur = 0; __bitcur <= __bitmasksize; ++__bitcur) > - if (__m & _M_bit[__bitcur] > + > + // Each _M_bit may have more than one bit set. Only check tests for which > + // all of the matching _M_bit values are set. This skips tests with some > + // bits which are not set. For example: > + // > + // alpha = lower|upper > + // __c = 'a' > + // __m = upper > + // > + // We evaluate iswupper (which returns false), but we skip iswalpha > (which > + // would return true) because __m is missing the lower bit, so the return > + // value is false. > + for (size_t __bitcur = 0; __bitcur <= _id_num; ++__bitcur) > + if ((__m & _M_bit[__bitcur]) == _M_bit[__bitcur] > && iswctype(__c, _M_wmask[__bitcur])) > { > __ret = true; > @@ -157,13 +175,29 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > { > for (;__lo < __hi; ++__vec, ++__lo) > { > - // Generically, 15 (instead of 11) since we don't know the numerical > - // encoding of the various categories in /usr/include/ctype.h. > - const size_t __bitmasksize = 15; > - mask __m = 0; > - for (size_t __bitcur = 0; __bitcur <= __bitmasksize; ++__bitcur) > - if (iswctype(*__lo, _M_wmask[__bitcur])) > - __m |= _M_bit[__bitcur]; > + > + // Start with all bits set. Iterate over all wctypes, clearing bits > for > + // non-matching values. For example: > + // > + // alpha == lower|upper > + // *__lo == 'a' > + // > + // __m = <all bits> > + // iswalpha and iswlower match > + // __m left alone > + // iswupper does not match > + // __m &= ~upper > + // > + // The return value will satisfy the following conditions: > + // > + // (__m & alpha) != 0 > + // (__m & lower) != 0 > + // (__m & upper) == 0 > + mask __m = (alnum|alpha|blank|cntrl|digit|graph| > + lower|print|punct|space|upper|xdigit); > + for (size_t __bitcur = 0; __bitcur < _id_num; ++__bitcur) > + if (!iswctype(*__lo, _M_wmask[__bitcur])) > + __m &= ~_M_bit[__bitcur]; > *__vec = __m; > } > return __hi; > @@ -264,11 +298,24 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > __i < sizeof(_M_widen) / sizeof(wint_t); ++__i) > _M_widen[__i] = btowc(__i); > > - for (size_t __i = 0; __i <= 15; ++__i) > - { > - _M_bit[__i] = static_cast<mask>(1 << __i); > - _M_wmask[__i] = _M_convert_to_wmask(_M_bit[__i]); > - } > + // Set these directly from the OS provided values. > + // We can't simply iterate over bit values because > + // they may have more than one bit set > + _M_bit[_id_alnum] = alnum; > + _M_bit[_id_alpha] = alpha; > + _M_bit[_id_blank] = blank; > + _M_bit[_id_cntrl] = cntrl; > + _M_bit[_id_digit] = digit; > + _M_bit[_id_graph] = graph; > + _M_bit[_id_lower] = lower; > + _M_bit[_id_print] = print; > + _M_bit[_id_punct] = punct; > + _M_bit[_id_space] = space; > + _M_bit[_id_upper] = upper; > + _M_bit[_id_xdigit] = xdigit; > + > + for (size_t __i = 0; __i <= _id_num; __i++) > + _M_wmask[__i] = _M_convert_to_wmask(_M_bit[__i]); > } > #endif // _GLIBCXX_USE_WCHAR_T > > -- > 2.51.0 >
