Reported to the PCRE2 project [1], noting that GNU grep is also affected.
[1] https://github.com/PCRE2Project/pcre2/issues/185
From c2d4a43b5b15df7c8853d591bf6ae872c602ed14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= <[email protected]>
Date: Fri, 6 Jan 2023 19:34:56 -0800
Subject: [PATCH] pcre: use UCP in UTF mode
* src/pcresearch.c (Pcompile): Set PCRE2_UCP together with PCRE2_UTF.
* tests/pcre-utf8-w: New test.
* tests/Makefile.am (TESTS): Add it.
---
src/pcresearch.c | 2 +-
tests/Makefile.am | 1 +
tests/pcre-utf8-w | 28 ++++++++++++++++++++++++++++
3 files changed, 30 insertions(+), 1 deletion(-)
create mode 100755 tests/pcre-utf8-w
diff --git a/src/pcresearch.c b/src/pcresearch.c
index a107f4d..45b67ee 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -149,7 +149,7 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
{
if (! localeinfo.using_utf8)
die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
- flags |= PCRE2_UTF;
+ flags |= (PCRE2_UTF | PCRE2_UCP);
#if 0
/* Do not match individual code units but only UTF-8. */
flags |= PCRE2_NEVER_BACKSLASH_C;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index e0b0503..a47cf5c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -147,6 +147,7 @@ TESTS = \
pcre-jitstack \
pcre-o \
pcre-utf8 \
+ pcre-utf8-w \
pcre-w \
pcre-wx-backref \
pcre-z \
diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w
new file mode 100755
index 0000000..431685c
--- /dev/null
+++ b/tests/pcre-utf8-w
@@ -0,0 +1,28 @@
+#!/bin/sh
+# UTF-8 characters are correctly identified as part of a word
+#
+# Copyright (C) 2023-2023 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_en_utf8_locale_
+LC_ALL=en_US.UTF-8
+export LC_ALL
+require_pcre_
+
+fail=0
+
+echo 'Perú' > in || framework_failure_
+# If ú counts as a word character, '.\b' can match only the final character.
+echo 'ú' > exp || framework_failure_
+grep -Po '.\b' in > out || fail=1
+compare exp out || fail=1
+# '\w' must match the non-ASCII letter ú, so grep has to succeed here.
+echo 'rú' > exp || framework_failure_
+grep -Po 'r\w' in > out || fail=1
+compare exp out || fail=1
+
+Exit $fail
--
2.30.2