From 52fb5a68a7bf8063039176160f4578fe61670f09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= <carenas@gmail.com>
Date: Fri, 6 Jan 2023 19:34:56 -0800
Subject: [PATCH] pcre: use UCP in UTF mode

This fixes a serious bug affecting word-boundary and word-constituent regular
expressions when the desired match involves non-ASCII UTF-8 characters.
* src/pcresearch.c (Pcompile): Set PCRE2_UCP together with PCRE2_UTF.
* tests/pcre-utf8-w: New file.
* tests/Makefile.am (TESTS): Add it.
* NEWS (Bug fixes): Mention this.
Reported by Gro-Tsen https://twitter.com/gro_tsen/status/1610972356972875777
This bug has been present since grep-2.5, when --perl-regexp (-P) support was added.
---
 NEWS              |  6 ++++++
 src/pcresearch.c  |  2 +-
 tests/Makefile.am |  1 +
 tests/pcre-utf8-w | 28 ++++++++++++++++++++++++++++
 4 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100755 tests/pcre-utf8-w

diff --git a/NEWS b/NEWS
index b404708..a865941 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,12 @@ GNU grep NEWS                                    -*- outline -*-

 ** Bug fixes

+  With -P, some non-ASCII UTF-8 characters were not recognized as
+  word-constituent because grep did not set the PCRE2_UCP flag.  E.g.,
+  given f(){ echo Perú|LC_ALL=en_US.UTF-8 grep -Po "$1"; } and
+  this command, echo $(f 'r\w'):$(f '.\b'), before it would print ":r".
+  After the fix, it prints the correct results: "rú:ú".
+
   When given multiple patterns the last of which has a back-reference,
   grep no longer sometimes mistakenly matches lines in some cases.
   [Bug#36148#13 introduced in grep 3.4]
diff --git a/src/pcresearch.c b/src/pcresearch.c
index a107f4d..45b67ee 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -149,7 +149,7 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
     {
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
-      flags |= PCRE2_UTF;
+      flags |= (PCRE2_UTF | PCRE2_UCP);
 #if 0
       /* Do not match individual code units but only UTF-8.  */
       flags |= PCRE2_NEVER_BACKSLASH_C;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index e0b0503..a47cf5c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -147,6 +147,7 @@ TESTS =						\
   pcre-jitstack					\
   pcre-o					\
   pcre-utf8					\
+  pcre-utf8-w					\
   pcre-w					\
   pcre-wx-backref				\
   pcre-z					\
diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w
new file mode 100755
index 0000000..4cd7db6
--- /dev/null
+++ b/tests/pcre-utf8-w
@@ -0,0 +1,28 @@
+#!/bin/sh
+# Ensure non-ASCII UTF-8 characters are correctly identified as word-constituent
+#
+# Copyright (C) 2023 Free Software Foundation, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_en_utf8_locale_
+LC_ALL=en_US.UTF-8
+export LC_ALL
+require_pcre_
+
+fail=0
+
+echo 'Perú'> in || framework_failure_
+
+echo 'ú' > exp || framework_failure_
+grep -Po '.\b' in > out || fail=1
+compare exp out || fail=1
+
+echo 'rú' > exp || framework_failure_
+grep -Po 'r\w' in > out || fail=1
+compare exp out || fail=1
+
+Exit $fail
-- 
2.39.0.132.g8a4e8f6a67

