https://github.com/ahatanak created 
https://github.com/llvm/llvm-project/pull/203621

std::__find routes a search to the libc wmemchr (via
__constexpr_wmemchr) whenever the element type has the same size as
wchar_t and at least its alignment. That is wrong under -fshort-wchar on
a platform whose native wchar_t is 4 bytes: wchar_t shrinks to 2 bytes,
so _Tp=char16_t satisfies that condition, and the search is routed to
wmemchr, which still reads 4-byte elements.

Only take the wmemchr path when wchar_t is still its native type, i.e.,
unmodified by -fshort-wchar. The check uses the new __native_wchar_t
alias in <cwchar> (from __WCHAR_NATIVE_TYPE__, falling back to wchar_t
on older compilers). Normal builds keep the wmemchr fast path unchanged.

Fixes https://github.com/llvm/llvm-project/issues/195149

rdar://175090927

From 7f9f3322618a25f337fd059ace8b51197b187052 Mon Sep 17 00:00:00 2001
From: Akira Hatanaka <[email protected]>
Date: Mon, 8 Jun 2026 18:52:13 -0700
Subject: [PATCH 1/2] [clang] Add __WCHAR_NATIVE_TYPE__ predefined macro

Define a new predefined macro __WCHAR_NATIVE_TYPE__ that expands to the
platform's native type for wchar_t, i.e., the type wchar_t would have
without -fshort-wchar. It matches __WCHAR_TYPE__ unless -fshort-wchar is
in effect.

This lets code detect when wchar_t is different from its native type,
for example to decide whether dispatching to a wchar_t-based runtime
function such as wmemchr is safe (see llvm/llvm-project#195149).

rdar://175090927
---
 clang/docs/LanguageExtensions.rst       |  8 ++++++++
 clang/docs/ReleaseNotes.rst             |  7 +++++++
 clang/include/clang/Basic/TargetInfo.h  | 12 ++++++++++++
 clang/lib/Basic/TargetInfo.cpp          |  8 ++++++++
 clang/lib/Frontend/InitPreprocessor.cpp |  1 +
 clang/test/Preprocessor/init-aarch64.c  |  1 +
 clang/test/Preprocessor/init.c          |  1 +
 7 files changed, 38 insertions(+)

diff --git a/clang/docs/LanguageExtensions.rst 
b/clang/docs/LanguageExtensions.rst
index fbb9947f39d3e..c0beb6ddecd02 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -393,6 +393,14 @@ Builtin Macros
   Defined to an integral value that is the include depth of the file currently
   being translated.  For the main file, this value is zero.
 
+``__WCHAR_NATIVE_TYPE__``
+  clang-specific extension defined to the platform's native type for
+  ``wchar_t``, i.e., the type ``wchar_t`` would have without ``-fshort-wchar``.
+  This matches ``__WCHAR_TYPE__`` unless ``-fshort-wchar`` is in effect. This
+  lets code detect when ``wchar_t`` is different from its native type,
+  e.g., to decide whether dispatching to a ``wchar_t``-based runtime function
+  such as ``wmemchr`` is safe.
+
 ``__TIMESTAMP__``
   Defined to the date and time of the last modification of the current source
   file.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index cf4826f50e5a5..12d09cb361825 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -312,6 +312,13 @@ Non-comprehensive list of changes in this release
 - ``typeid`` on references and pointers of ``final`` types no longer emits a
   vtable lookup at runtime.
 
+- Added a new predefined macro ``__WCHAR_NATIVE_TYPE__``, expanding to the
+  platform's native type for ``wchar_t`` (the type ``wchar_t`` would have
+  without ``-fshort-wchar``). It matches ``__WCHAR_TYPE__`` unless
+  ``-fshort-wchar`` is in effect, letting code detect when ``wchar_t`` is
+  different from its native type.
+
+
 - Updated support for Unicode from 15.1 to 18.0.
 
 New Compiler Flags
diff --git a/clang/include/clang/Basic/TargetInfo.h 
b/clang/include/clang/Basic/TargetInfo.h
index cc226403877e2..8615da60803f7 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -163,6 +163,11 @@ struct TransferrableTargetInfo {
       Char16Type, Char32Type, Int64Type, Int16Type, SigAtomicType,
       ProcessIDType;
 
+  /// The platform's native type for wchar_t, i.e., the type wchar_t would have
+  /// without -fshort-wchar. This matches WCharType unless -fshort-wchar is in
+  /// effect.
+  IntType WideCharNativeType;
+
   /// Whether Objective-C's built-in boolean type should be signed char.
   ///
   /// Otherwise, when this flag is not set, the normal built-in boolean type is
@@ -417,6 +422,13 @@ class TargetInfo : public TransferrableTargetInfo,
     return getCorrespondingUnsignedType(IntPtrType);
   }
   IntType getWCharType() const { return WCharType; }
+
+  /// Return the platform's native type for wchar_t, i.e., the type wchar_t
+  /// would have without -fshort-wchar.
+  IntType getWideCharNativeType() const {
+    return WideCharNativeType == NoInt ? WCharType : WideCharNativeType;
+  }
+
   IntType getWIntType() const { return WIntType; }
   IntType getChar16Type() const { return Char16Type; }
   IntType getChar32Type() const { return Char32Type; }
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 854d23cadaea2..62fb6c8175484 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -137,6 +137,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
   IntMaxType = SignedLongLong;
   IntPtrType = SignedLong;
   WCharType = SignedInt;
+  WideCharNativeType = NoInt;
   WIntType = SignedInt;
   Char16Type = UnsignedShort;
   Char32Type = UnsignedInt;
@@ -423,6 +424,13 @@ void TargetInfo::adjust(DiagnosticsEngine &Diags, 
LangOptions &Opts,
   if (Opts.NoBitFieldTypeAlign)
     UseBitFieldTypeAlignment = false;
 
+  // Capture the platform-native wchar_t before -fshort-wchar can override
+  // WCharType below. adjust() may run more than once on the same target, so
+  // only record it the first time, while WCharType still holds the target
+  // default.
+  if (WideCharNativeType == NoInt)
+    WideCharNativeType = WCharType;
+
   switch (Opts.WCharSize) {
   default: llvm_unreachable("invalid wchar_t width");
   case 0: break;
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp 
b/clang/lib/Frontend/InitPreprocessor.cpp
index 3f0468a938149..f516c5159dba7 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1168,6 +1168,7 @@ static void InitializePredefinedMacros(const TargetInfo 
&TI,
   DefineType("__SIZE_TYPE__", TI.getSizeType(), Builder);
   DefineFmt(LangOpts, "__SIZE", TI.getSizeType(), TI, Builder);
   DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder);
+  DefineType("__WCHAR_NATIVE_TYPE__", TI.getWideCharNativeType(), Builder);
   DefineType("__WINT_TYPE__", TI.getWIntType(), Builder);
   DefineTypeSizeAndWidth("__SIG_ATOMIC", TI.getSigAtomicType(), TI, Builder);
   if (LangOpts.C23)
diff --git a/clang/test/Preprocessor/init-aarch64.c 
b/clang/test/Preprocessor/init-aarch64.c
index 09e3fc926a309..3ec78a7651480 100644
--- a/clang/test/Preprocessor/init-aarch64.c
+++ b/clang/test/Preprocessor/init-aarch64.c
@@ -393,6 +393,7 @@
 // AARCH64-NEXT: #define __USER_LABEL_PREFIX__
 // AARCH64-NEXT: #define __VERSION__ "{{.*}}"
 // AARCH64-NEXT: #define __WCHAR_MAX__ 4294967295U
+// AARCH64-NEXT: #define __WCHAR_NATIVE_TYPE__ unsigned int
 // AARCH64-NEXT: #define __WCHAR_TYPE__ unsigned int
 // AARCH64-NEXT: #define __WCHAR_UNSIGNED__ 1
 // AARCH64-NEXT: #define __WCHAR_WIDTH__ 32
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index 80b7a6399e5f4..cc67db4fa068e 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -2076,6 +2076,7 @@
 // WEBASSEMBLY-NEXT:#define __USER_LABEL_PREFIX__
 // WEBASSEMBLY-NEXT:#define __VERSION__ "{{.*}}"
 // WEBASSEMBLY-NEXT:#define __WCHAR_MAX__ 2147483647
+// WEBASSEMBLY-NEXT:#define __WCHAR_NATIVE_TYPE__ int
 // WEBASSEMBLY-NEXT:#define __WCHAR_TYPE__ int
 // WEBASSEMBLY-NOT:#define __WCHAR_UNSIGNED__
 // WEBASSEMBLY-NEXT:#define __WCHAR_WIDTH__ 32

From 142f7248cbe6113545a2ec73941f8913d3a62d66 Mon Sep 17 00:00:00 2001
From: Akira Hatanaka <[email protected]>
Date: Fri, 12 Jun 2026 12:41:42 -0700
Subject: [PATCH 2/2] [libc++] Don't dispatch find to wmemchr under
 -fshort-wchar

std::__find routes a search to the libc wmemchr (via
__constexpr_wmemchr) whenever the element type has the same size as
wchar_t and at least its alignment. That is wrong under -fshort-wchar on
a platform whose native wchar_t is 4 bytes: wchar_t shrinks to 2 bytes,
so _Tp=char16_t satisfies that condition, and the search is routed to
wmemchr, which still reads 4-byte elements.

Only take the wmemchr path when wchar_t is still its native type, i.e.,
unmodified by -fshort-wchar. The check uses the new __native_wchar_t
alias in <cwchar> (from __WCHAR_NATIVE_TYPE__, falling back to wchar_t
on older compilers). Normal builds keep the wmemchr fast path unchanged.

Fixes llvm/llvm-project#195149

rdar://175090927
---
 libcxx/include/__algorithm/find.h             |  8 +++-
 libcxx/include/cwchar                         |  9 ++++
 .../string_find/short_wchar.pass.cpp          | 42 +++++++++++++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 
libcxx/test/libcxx/strings/basic.string/string.ops/string_find/short_wchar.pass.cpp

diff --git a/libcxx/include/__algorithm/find.h 
b/libcxx/include/__algorithm/find.h
index f677fb2c7392d..66657a9056537 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -127,7 +127,13 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* 
__find(_Tp* __first, _T
     return __last;
   }
 #  if _LIBCPP_HAS_WIDE_CHARACTERS
-  else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= 
_LIBCPP_ALIGNOF(wchar_t)) {
+  // __builtin_wmemchr lowers to a libc call that walks native-sized wchar_t
+  // elements. Only take this path when wchar_t still has its platform-native
+  // size and alignment. Otherwise (e.g., under -fshort-wchar) fall through to 
the
+  // vectorized integral path, which honors the current wchar_t size.
+  else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= 
_LIBCPP_ALIGNOF(wchar_t) &&
+                     sizeof(wchar_t) == sizeof(__native_wchar_t) && 
_LIBCPP_ALIGNOF(wchar_t) ==
+                         _LIBCPP_ALIGNOF(__native_wchar_t)) {
     if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - 
__first))
       return __ret;
     return __last;
diff --git a/libcxx/include/cwchar b/libcxx/include/cwchar
index e2534977a7a3c..cc7e7fd8128bd 100644
--- a/libcxx/include/cwchar
+++ b/libcxx/include/cwchar
@@ -197,6 +197,15 @@ using ::putwchar _LIBCPP_USING_IF_EXISTS;
 using ::vwprintf _LIBCPP_USING_IF_EXISTS;
 using ::wprintf _LIBCPP_USING_IF_EXISTS;
 
+// Names the platform-native wchar_t (the type wchar_t would have without
+// -fshort-wchar). Falls back to wchar_t on compilers that predate
+// __WCHAR_NATIVE_TYPE__ (Clang < 23), preserving prior behavior.
+#  ifdef __WCHAR_NATIVE_TYPE__
+using __native_wchar_t = __WCHAR_NATIVE_TYPE__;
+#  else
+using __native_wchar_t = wchar_t;
+#  endif
+
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 size_t 
__constexpr_wcslen(const wchar_t* __str) {
 #  if __has_builtin(__builtin_wcslen)
   return __builtin_wcslen(__str);
diff --git 
a/libcxx/test/libcxx/strings/basic.string/string.ops/string_find/short_wchar.pass.cpp
 
b/libcxx/test/libcxx/strings/basic.string/string.ops/string_find/short_wchar.pass.cpp
new file mode 100644
index 0000000000000..a261f7fd30c9c
--- /dev/null
+++ 
b/libcxx/test/libcxx/strings/basic.string/string.ops/string_find/short_wchar.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Regression test for llvm/llvm-project#195149: u16string::find used to
+// dispatch through __builtin_wmemchr when sizeof(char16_t) == sizeof(wchar_t).
+// Under -fshort-wchar on a platform whose native wchar_t is 4 bytes
+// (e.g., Linux/Darwin), the libc wmemchr keeps walking 4-byte elements, so the
+// search returned wrong results. __find now gates the wmemchr fast path on the
+// platform-native wchar_t size (via __WCHAR_NATIVE_TYPE__) so the runtime
+// libcall is taken only when it is binary-compatible with what wmemchr 
expects.
+//
+// Only meaningful where the platform-native wchar_t differs from 2 bytes; on
+// Windows (native 2-byte wchar_t) the optimization is always safe.
+
+// ADDITIONAL_COMPILE_FLAGS: -fshort-wchar
+
+#include <cassert>
+#include <string>
+
+#include "test_macros.h"
+
+TEST_CONSTEXPR_CXX20 bool test() {
+  std::u16string s = u"hello";
+  std::u16string t = u"goodbye";
+  assert(s.find(u'o') == 4);
+  assert(t.find(u'b') == 4);
+  assert(s.find(u'z') == std::u16string::npos);
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 20
+  static_assert(test());
+#endif
+  return 0;
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to