From eff1848fd4adb064673f3c3b35816617e218de89 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@meta.com>
Date: Sun, 12 Apr 2026 12:36:45 -0700
Subject: [PATCH] regex: fix missed short match with backrefs

With a backref pattern like ^(.?)(.?).?\2\1 (no $), the engine
could miss valid short matches.  For example, "ab" should match
via all-empty groups, yet regexec returned no-match because
set_regs failed at the longest structural match (match_last=2)
and never retried at a shorter match_last.
* lib/regexec.c (re_search_internal): When set_regs fails for a
backref pattern, retry prune_impossible_nodes and set_regs at
progressively shorter match lengths.  Save a copy of state_log
before pruning so shorter retries can re-sift from the original
states.
* m4/regex.m4: Also reject system regex with this bug.
* tests/test-regex.c (main): Add a test for this bug.
Reported by Ed Morton in https://bugs.gnu.org/68725
Co-authored-by: Claude <noreply@anthropic.com>
---
 ChangeLog          |  17 +++++++
 lib/regexec.c      | 113 +++++++++++++++++++++++++++++++++++++++------
 m4/regex.m4        |  24 ++++++++--
 tests/test-regex.c |  22 +++++++--
 4 files changed, 155 insertions(+), 21 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index c2446f5f16..b1bd9d66eb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2026-04-12  Jim Meyering  <meyering@meta.com>
+
+	regex: fix missed short match with backrefs
+	With a backref pattern like ^(.?)(.?).?\2\1 (no $), the engine
+	could miss valid short matches.  For example, "ab" should match
+	via all-empty groups, yet regexec returned no-match because
+	set_regs failed at the longest structural match (match_last=2)
+	and never retried at a shorter match_last.
+	* lib/regexec.c (re_search_internal): When set_regs fails for a
+	backref pattern, retry prune_impossible_nodes and set_regs at
+	progressively shorter match lengths.  Save a copy of state_log
+	before pruning so shorter retries can re-sift from the original
+	states.
+	* m4/regex.m4: Also reject system regex with this bug.
+	* tests/test-regex.c (main): Add a test for this bug.
+	Reported by Ed Morton in https://bugs.gnu.org/68725
+
 2026-04-12  Collin Funk  <collin.funk1@gmail.com>

 	doc: update documentation about fopen with the 'e' mode character
diff --git a/lib/regexec.c b/lib/regexec.c
index e09fc7698e..76ef80ab07 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -678,6 +678,8 @@ re_search_internal (const regex_t *preg, const char *string, Idx length,
 	| (t != NULL ? 1 : 0))
      : 8);

+  re_dfastate_t **save_state_log = NULL;
+
   for (;; match_first += incr)
     {
       err = REG_NOMATCH;
@@ -802,11 +804,32 @@ re_search_internal (const regex_t *preg, const char *string, Idx length,
 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 		  || dfa->nbackref)
 		{
+		  /* Save state_log before pruning, in case set_regs
+		     later fails and we need to retry with a shorter
+		     match.  */
+		  re_free (save_state_log);
+		  save_state_log = NULL;
+		  if (!preg->no_sub && nmatch > 1 && dfa->nbackref)
+		    {
+		      save_state_log
+			= re_malloc (re_dfastate_t *,
+				     mctx.match_last + 1);
+		      if (__glibc_unlikely (save_state_log == NULL))
+			{
+			  err = REG_ESPACE;
+			  goto free_return;
+			}
+		      memcpy (save_state_log, mctx.state_log,
+			      sizeof (re_dfastate_t *)
+			      * (mctx.match_last + 1));
+		    }
 		  err = prune_impossible_nodes (&mctx);
 		  if (err == REG_NOERROR)
 		    break;
 		  if (__glibc_unlikely (err != REG_NOMATCH))
 		    goto free_return;
+		  re_free (save_state_log);
+		  save_state_log = NULL;
 		  match_last = -1;
 		}
 	      else
@@ -825,24 +848,87 @@ re_search_internal (const regex_t *preg, const char *string, Idx length,
     {
       Idx reg_idx;

-      /* Initialize registers.  */
-      for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
-	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
-
-      /* Set the points where matching start/end.  */
-      pmatch[0].rm_so = 0;
-      pmatch[0].rm_eo = mctx.match_last;
-      /* FIXME: This function should fail if mctx.match_last exceeds
-	 the maximum possible regoff_t value.  We need a new error
-	 code REG_OVERFLOW.  */
-
       if (!preg->no_sub && nmatch > 1)
 	{
-	  err = set_regs (preg, &mctx, nmatch, pmatch,
-			  dfa->has_plural_match && dfa->nbackref > 0);
+	  /* When set_regs fails for a backref pattern, the structural
+	     match at match_last has no valid register assignment.  Try
+	     shorter match lengths, since a valid shorter match may
+	     exist (e.g., all groups matching empty).  */
+	  for (;;)
+	    {
+	      /* Initialize registers.  */
+	      for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
+		pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
+	      pmatch[0].rm_so = 0;
+	      pmatch[0].rm_eo = mctx.match_last;
+
+	      err = set_regs (preg, &mctx, nmatch, pmatch,
+			      dfa->has_plural_match && dfa->nbackref > 0);
+	      if (__glibc_likely (err == REG_NOERROR)
+		  || save_state_log == NULL
+		  || err != REG_NOMATCH)
+		break;
+
+	      /* set_regs failed; try a shorter match_last.  */
+	      Idx ml = mctx.match_last;
+	      re_free (mctx.state_log);
+	      do
+		{
+		  --ml;
+		  if (ml < 0)
+		    break;
+		}
+	      while (save_state_log[ml] == NULL
+		     || !save_state_log[ml]->halt
+		     || !check_halt_state_context
+			  (&mctx, save_state_log[ml], ml));
+	      if (ml < 0)
+		{
+		  err = REG_NOMATCH;
+		  mctx.state_log = save_state_log;
+		  save_state_log = NULL;
+		  break;
+		}
+	      mctx.state_log
+		= re_malloc (re_dfastate_t *, ml + 1);
+	      if (__glibc_unlikely (mctx.state_log == NULL))
+		{
+		  mctx.state_log = save_state_log;
+		  save_state_log = NULL;
+		  err = REG_ESPACE;
+		  break;
+		}
+	      memcpy (mctx.state_log, save_state_log,
+		      sizeof (re_dfastate_t *) * (ml + 1));
+	      mctx.match_last = ml;
+	      mctx.last_node
+		= check_halt_state_context
+		    (&mctx, save_state_log[ml], ml);
+	      err = prune_impossible_nodes (&mctx);
+	      if (__glibc_unlikely (err != REG_NOERROR))
+		{
+		  if (err == REG_NOMATCH)
+		    {
+		      re_free (mctx.state_log);
+		      mctx.state_log = save_state_log;
+		      save_state_log = NULL;
+		    }
+		  break;
+		}
+	    }
+	  re_free (save_state_log);
+	  save_state_log = NULL;
 	  if (__glibc_unlikely (err != REG_NOERROR))
 	    goto free_return;
 	}
+      else
+	{
+	  /* Initialize registers.  */
+	  for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
+	    pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
+	  pmatch[0].rm_so = 0;
+	  pmatch[0].rm_eo = mctx.match_last;
+	}

       /* At last, add the offset to each register, since we slid
 	 the buffers so that we could assume that the matching starts
@@ -882,6 +968,7 @@ re_search_internal (const regex_t *preg, const char *string, Idx length,
     }

  free_return:
+  re_free (save_state_log);
   re_free (mctx.state_log);
   if (dfa->nbackref)
     match_ctx_free (&mctx);
diff --git a/m4/regex.m4 b/m4/regex.m4
index c36de81011..4a7257d892 100644
--- a/m4/regex.m4
+++ b/m4/regex.m4
@@ -318,9 +318,9 @@ AC_DEFUN([gl_REGEX],
                 free (regs.end);
               }

-            /* This test is derived from bug#68725, reported by Ed Morton.
-               The regex uses backrefs to detect palindromes and "ab"
-               is not a palindrome, so this should not match.  */
+            /* These tests are derived from bug#68725, reported by
+               Ed Morton.  The regex uses backrefs with optional groups
+               to detect palindromes.  */
             {
               regex_t re68725;
               i = regcomp (&re68725,
@@ -330,8 +330,22 @@ AC_DEFUN([gl_REGEX],
                 result |= 64;
               else
                 {
-                  regmatch_t pm;
-                  if (regexec (&re68725, "ab", 1, &pm, 0) == 0)
+                  regmatch_t pm[3];
+                  /* "ab" is not a palindrome, so must not match
+                     with $.  */
+                  if (regexec (&re68725, "ab", 1, pm, 0) == 0)
+                    result |= 64;
+                  /* Without $, a shorter match (e.g., empty or "a")
+                     is valid at position 0.  Ensure the matcher
+                     retries at a shorter match_last when set_regs
+                     fails at the longest structural match.  */
+                  regfree (&re68725);
+                  i = regcomp (&re68725,
+                               "^(.?)(.?).?\\\\2\\\\1",
+                               REG_EXTENDED);
+                  if (i)
+                    result |= 64;
+                  else if (regexec (&re68725, "ab", 3, pm, 0) != 0)
                     result |= 64;
                   regfree (&re68725);
                 }
diff --git a/tests/test-regex.c b/tests/test-regex.c
index 87c03834f5..d747eefdc7 100644
--- a/tests/test-regex.c
+++ b/tests/test-regex.c
@@ -473,8 +473,7 @@ main (void)
     report_error ("%s: %s", pat_badback, s);

   /* bug#68725, reported by Ed Morton.
-     The regex uses backrefs to detect palindromes and "ab"
-     is not a palindrome, so this should not match.  */
+     The regex uses backrefs with optional groups to detect palindromes.  */
   {
     regex_t re68725;
     int ret = regcomp (&re68725, "^(.?)(.?).?\\2\\1$", REG_EXTENDED);
@@ -483,8 +482,25 @@ main (void)
     else
       {
         regmatch_t pm;
+        /* "ab" is not a palindrome, so must not match with $.  */
         if (regexec (&re68725, "ab", 1, &pm, 0) == 0)
-          report_error ("regexec bug#68725: \"ab\" matched, should not");
+          report_error ("regexec bug#68725: \"ab\" matched with $,"
+                        " should not");
+        regfree (&re68725);
+      }
+
+    /* Without $, "ab" should match: the engine must retry with a
+       shorter match_last when set_regs fails at the longest
+       structural match.  */
+    ret = regcomp (&re68725, "^(.?)(.?).?\\2\\1", REG_EXTENDED);
+    if (ret)
+      report_error ("regcomp bug#68725 (no $) failed (%d)", ret);
+    else
+      {
+        regmatch_t pm[3];
+        if (regexec (&re68725, "ab", 3, pm, 0) != 0)
+          report_error ("regexec bug#68725: \"ab\" should match"
+                        " without $");
         regfree (&re68725);
       }
   }
-- 
2.54.0.rc1.65.g8c9303b1ff

