In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/c45df5a16bb5a26a06275cc63f2c3e6b1d708184?hp=3b9b32c914622129d88bd352d8269d046fa5efe5>

- Log -----------------------------------------------------------------
commit c45df5a16bb5a26a06275cc63f2c3e6b1d708184
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 8 23:19:16 2011 -0700

    regcomp.c: Rmv unused parameter
    
    This silences a compiler warning

M       embed.fnc
M       proto.h
M       regcomp.c

commit b8953805dfeee53cd2300f61834ba32ccaaefaa8
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 8 23:14:45 2011 -0700

    re/pat.t: Remove TODO message on passing tests
    
    A previous commit fixed these.

M       t/re/pat.t

commit aa19b56b2f07e9eabf57540f00d312d8093e9d28
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 8 23:13:59 2011 -0700

    regcomp.c: Rmv unused parameter
    
    This silences a compiler warning

M       embed.fnc
M       proto.h
M       regcomp.c

commit 2f88b8574d3a767b1b37edf9927413bbc8ffd0a4
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 8 23:08:16 2011 -0700

    regcomp.c: Rmv unused parameter
    
    This silences a compiler warning

M       embed.fnc
M       proto.h
M       regcomp.c

commit 1411dba431b74256819ba8c07e7a61e2aa0b1742
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 8 22:56:02 2011 -0700

    PATCH: [perl #85528], add initialization
    
    Commit 137165a601b852a9679983cdfe8d35be29f0939c omitted
    required initialization for the synthetic start class.  Adding it
    exposed other bugs in cl_and() and cl_or(), which have been fixed
    by a previous commit.

M       regcomp.c
M       t/re/re_tests

commit c6b765375213e9d6dce25829a367fe4ef37da1a4
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 8 22:35:36 2011 -0700

    regcomp.c: revamp cl_and() and cl_or()
    
    These two routines have not kept pace with the changes in the ANYOF
    flags.  And, I believe there were issues even before them.  I did a
    systematic re-thinking of what their behaviors should be.

M       regcomp.c

commit ace6b0e469777649cb9a908e00e8780b3af366d0
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 8 21:57:24 2011 -0700

    regcomp.h: #define of ANYOF flags immune from inversion

M       regcomp.h

commit c613755a4b4fc8e64a77639d47d7e208fee68edc
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 8 17:06:47 2011 -0700

    regex: /l in combo with others in syn start class
    
    Now that regexes can be combinations of different charset modifiers,
    a synthetic start class can match locale and non-locale both.  locale
    should generally match only things in the bitmap for code points < 256.
    But a synthetic start class with a non-locale component can match such
    code points.  This patch makes an exception for synthetic nodes that
    will be resolved if it passes and is matched again for real.

M       regcomp.c
M       regcomp.h
M       regexec.c

commit f0c16e54b3b5efbb4380952c7ba5e8d7626d7cae
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 8 16:30:00 2011 -0700

    regcomp.c: UTF /l should not use tries
    
    It's unclear if tries will work under /l.  I haven't seen any failures,
    but there have been under /d.  As a precaution, until more testing is
    done, disable tries under anything but /u and UTF.

M       regcomp.c

commit 1051e1c4d07fec1c36934f253d2baa8842339cbf
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 8 16:20:52 2011 -0700

    regcomp.c: Merge identical functions
    
    These two functions now have identical code, so merge them, but use
    a macro in case they ever need to diverge again.

M       embed.fnc
M       embed.h
M       proto.h
M       regcomp.c

commit cf34198ebe3dd876d67c10caa9acf491ad2a0c51
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 8 15:28:05 2011 -0700

    regcomp.c: Change start class init for /l
    
    Before /l was added, locale only applied to regular expressions as a
    whole.  Now it can be subsections, so the flag for allowing it
    should be treated as any other flag.

M       regcomp.c

commit 58b5ba03346c70dc37751766fe464485278999a8
Author: Karl Williamson <[email protected]>
Date:   Tue Mar 8 15:25:27 2011 -0700

    regcomp.c: clarify comments

M       regcomp.c

commit c8d3cd88811d23a268c37b61d1c0641a6d42d995
Author: Karl Williamson <[email protected]>
Date:   Sun Mar 6 09:00:52 2011 -0700

    regcomp.c: Move #defines to be in bit order

M       regcomp.h
-----------------------------------------------------------------------

Summary of changes:
 embed.fnc     |   11 +--
 embed.h       |    1 -
 proto.h       |   27 ++-----
 regcomp.c     |  226 ++++++++++++++++++++++++++++++++++-----------------------
 regcomp.h     |   38 ++++++----
 regexec.c     |   11 ++-
 t/re/pat.t    |    1 -
 t/re/re_tests |    3 +
 8 files changed, 182 insertions(+), 136 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index 7dcb82e..d5273ea 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1803,17 +1803,12 @@ Es      |bool   |reg_skipcomment|NN struct RExC_state_t 
*pRExC_state
 Es     |void   |scan_commit    |NN const struct RExC_state_t *pRExC_state \
                                |NN struct scan_data_t *data|NN I32 *minlenp \
                                |int is_inf
-Esn    |void   |cl_anything    |NN const struct RExC_state_t *pRExC_state \
-                               |NN struct regnode_charclass_class *cl
+Esn    |void   |cl_anything    |NN struct regnode_charclass_class *cl
 EsRn   |int    |cl_is_anything |NN const struct regnode_charclass_class *cl
-Esn    |void   |cl_init        |NN const struct RExC_state_t *pRExC_state \
-                               |NN struct regnode_charclass_class *cl
-Esn    |void   |cl_init_zero   |NN const struct RExC_state_t *pRExC_state \
-                               |NN struct regnode_charclass_class *cl
+Esn    |void   |cl_init        |NN struct regnode_charclass_class *cl
 Esn    |void   |cl_and         |NN struct regnode_charclass_class *cl \
                                |NN const struct regnode_charclass_class 
*and_with
-Esn    |void   |cl_or          |NN const struct RExC_state_t *pRExC_state \
-                               |NN struct regnode_charclass_class *cl \
+Esn    |void   |cl_or          |NN struct regnode_charclass_class *cl \
                                |NN const struct regnode_charclass_class 
*or_with
 Es     |I32    |study_chunk    |NN struct RExC_state_t *pRExC_state \
                                |NN regnode **scanp|NN I32 *minlenp \
diff --git a/embed.h b/embed.h
index 743eb46..1ae431d 100644
--- a/embed.h
+++ b/embed.h
@@ -874,7 +874,6 @@
 #define cl_and                 S_cl_and
 #define cl_anything            S_cl_anything
 #define cl_init                        S_cl_init
-#define cl_init_zero           S_cl_init_zero
 #define cl_is_anything         S_cl_is_anything
 #define cl_or                  S_cl_or
 #define invlist_array(a)       S_invlist_array(aTHX_ a)
diff --git a/proto.h b/proto.h
index 80f3bc0..5f8daeb 100644
--- a/proto.h
+++ b/proto.h
@@ -5988,23 +5988,15 @@ STATIC void     S_cl_and(struct regnode_charclass_class 
*cl, const struct regnode_ch
 #define PERL_ARGS_ASSERT_CL_AND        \
        assert(cl); assert(and_with)
 
-STATIC void    S_cl_anything(const struct RExC_state_t *pRExC_state, struct 
regnode_charclass_class *cl)
-                       __attribute__nonnull__(1)
-                       __attribute__nonnull__(2);
+STATIC void    S_cl_anything(struct regnode_charclass_class *cl)
+                       __attribute__nonnull__(1);
 #define PERL_ARGS_ASSERT_CL_ANYTHING   \
-       assert(pRExC_state); assert(cl)
+       assert(cl)
 
-STATIC void    S_cl_init(const struct RExC_state_t *pRExC_state, struct 
regnode_charclass_class *cl)
-                       __attribute__nonnull__(1)
-                       __attribute__nonnull__(2);
+STATIC void    S_cl_init(struct regnode_charclass_class *cl)
+                       __attribute__nonnull__(1);
 #define PERL_ARGS_ASSERT_CL_INIT       \
-       assert(pRExC_state); assert(cl)
-
-STATIC void    S_cl_init_zero(const struct RExC_state_t *pRExC_state, struct 
regnode_charclass_class *cl)
-                       __attribute__nonnull__(1)
-                       __attribute__nonnull__(2);
-#define PERL_ARGS_ASSERT_CL_INIT_ZERO  \
-       assert(pRExC_state); assert(cl)
+       assert(cl)
 
 STATIC int     S_cl_is_anything(const struct regnode_charclass_class *cl)
                        __attribute__warn_unused_result__
@@ -6012,12 +6004,11 @@ STATIC int      S_cl_is_anything(const struct 
regnode_charclass_class *cl)
 #define PERL_ARGS_ASSERT_CL_IS_ANYTHING        \
        assert(cl)
 
-STATIC void    S_cl_or(const struct RExC_state_t *pRExC_state, struct 
regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
+STATIC void    S_cl_or(struct regnode_charclass_class *cl, const struct 
regnode_charclass_class *or_with)
                        __attribute__nonnull__(1)
-                       __attribute__nonnull__(2)
-                       __attribute__nonnull__(3);
+                       __attribute__nonnull__(2);
 #define PERL_ARGS_ASSERT_CL_OR \
-       assert(pRExC_state); assert(cl); assert(or_with)
+       assert(cl); assert(or_with)
 
 PERL_STATIC_INLINE UV* S_invlist_array(pTHX_ HV* const invlist)
                        __attribute__warn_unused_result__
diff --git a/regcomp.c b/regcomp.c
index 9357a78..b7a6939 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -720,15 +720,13 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, 
scan_data_t *data, I32 *min
 
 /* Can match anything (initialization) */
 STATIC void
-S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
+S_cl_anything(struct regnode_charclass_class *cl)
 {
     PERL_ARGS_ASSERT_CL_ANYTHING;
 
     ANYOF_CLASS_ZERO(cl);
     ANYOF_BITMAP_SETALL(cl);
-    cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
-    if (LOC)
-       cl->flags |= ANYOF_LOCALE;
+    cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_LOCALE;
 }
 
 /* Can match anything (initialization) */
@@ -751,29 +749,21 @@ S_cl_is_anything(const struct regnode_charclass_class *cl)
 
 /* Can match anything (initialization) */
 STATIC void
-S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
+S_cl_init(struct regnode_charclass_class *cl)
 {
     PERL_ARGS_ASSERT_CL_INIT;
 
     Zero(cl, 1, struct regnode_charclass_class);
     cl->type = ANYOF;
-    cl_anything(pRExC_state, cl);
+    cl_anything(cl);
+    ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
 }
 
-STATIC void
-S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class 
*cl)
-{
-    PERL_ARGS_ASSERT_CL_INIT_ZERO;
-
-    Zero(cl, 1, struct regnode_charclass_class);
-    cl->type = ANYOF;
-    cl_anything(pRExC_state, cl);
-    if (LOC)
-       cl->flags |= ANYOF_LOCALE;
-}
+/* These two functions currently do the exact same thing */
+#define cl_init_zero           S_cl_init
 
 /* 'And' a given class with another one.  Can create false positives */
-/* We assume that cl is not inverted */
+/* cl should not be inverted */
 STATIC void
 S_cl_and(struct regnode_charclass_class *cl,
        const struct regnode_charclass_class *and_with)
@@ -782,6 +772,7 @@ S_cl_and(struct regnode_charclass_class *cl,
 
     assert(and_with->type == ANYOF);
 
+    /* I (khw) am not sure all these restrictions are necessary XXX */
     if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
        && !(ANYOF_CLASS_TEST_ANY_SET(cl))
        && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
@@ -796,46 +787,86 @@ S_cl_and(struct regnode_charclass_class *cl,
            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
                cl->bitmap[i] &= and_with->bitmap[i];
     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
-    if (!(and_with->flags & ANYOF_EOS))
-       cl->flags &= ~ANYOF_EOS;
 
-    if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
-       cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
-    if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
-       cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
+    if (and_with->flags & ANYOF_INVERT) {
 
-    if (cl->flags & ANYOF_UNICODE_ALL
-       && ANYOF_NONBITMAP(and_with)
-       && !(and_with->flags & ANYOF_INVERT))
-    {
-       if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
+        /* Here, the and'ed node is inverted.  Get the AND of the flags that
+         * aren't affected by the inversion.  Those that are affected are
+         * handled individually below */
+       U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
+       cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
+       cl->flags |= affected_flags;
+
+        /* We currently don't know how to deal with things that aren't in the
+         * bitmap, but we know that the intersection is no greater than what
+         * is already in cl, so let there be false positives that get sorted
+         * out after the synthetic start class succeeds, and the node is
+         * matched for real. */
+
+        /* The inversion of these two flags indicate that the resulting
+         * intersection doesn't have them */
+       if (and_with->flags & ANYOF_UNICODE_ALL) {
            cl->flags &= ~ANYOF_UNICODE_ALL;
        }
-       else {
-
-           /* The intersection of all unicode with something that isn't all
-            * unicode is that something */
-           ARG_SET(cl, ARG(and_with));
+       if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
+           cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
        }
     }
-    if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
-       !(and_with->flags & ANYOF_INVERT))
-    {
-       cl->flags &= ~ANYOF_UNICODE_ALL;
+    else {   /* and'd node is not inverted */
        if (! ANYOF_NONBITMAP(and_with)) {
-           ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
+
+            /* Here 'and_with' doesn't match anything outside the bitmap
+             * (except possibly ANYOF_UNICODE_ALL), which means the
+             * intersection can't either, except for ANYOF_UNICODE_ALL, in
+             * which case we don't know what the intersection is, but it's no
+             * greater than what cl already has, so can just leave it alone,
+             * with possible false positives */
+            if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
+                ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
+            }
+       }
+       else if (! ANYOF_NONBITMAP(cl)) {
+
+           /* Here, 'and_with' does match something outside the bitmap, and cl
+            * doesn't have a list of things to match outside the bitmap.  If
+             * cl can match all code points above 255, the intersection will
+             * be those above-255 code points that 'and_with' matches.  There
+             * may be false positives from code points in 'and_with' that are
+             * outside the bitmap but below 256, but those get sorted out
+             * after the synthetic start class succeeds).  If cl can't match
+             * all Unicode code points, it means here that it can't match *
+             * anything outside the bitmap, so we leave the bitmap empty */
+           if (cl->flags & ANYOF_UNICODE_ALL) {
+               ARG_SET(cl, ARG(and_with));
+           }
        }
+       else {
+            /* Here, both 'and_with' and cl match something outside the
+             * bitmap.  Currently we do not do the intersection, so just match
+             * whatever cl had at the beginning.  */
+       }
+
+
+        /* Take the intersection of the two sets of flags */
+       cl->flags &= and_with->flags;
     }
 }
 
 /* 'OR' a given class with another one.  Can create false positives */
-/* We assume that cl is not inverted */
+/* cl should not be inverted */
 STATIC void
-S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, 
const struct regnode_charclass_class *or_with)
+S_cl_or(struct regnode_charclass_class *cl, const struct 
regnode_charclass_class *or_with)
 {
     PERL_ARGS_ASSERT_CL_OR;
 
     if (or_with->flags & ANYOF_INVERT) {
+
+        /* Here, the or'd node is to be inverted.  This means we take the
+         * complement of everything not in the bitmap, but currently we don't
+         * know what that is, so give up and match anything */
+       if (ANYOF_NONBITMAP(or_with)) {
+           cl_anything(cl);
+       }
        /* We do not use
         * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
         *   <= (B1 | !B2) | (CL1 | !CL2)
@@ -845,7 +876,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct 
regnode_charclass_class *cl, con
         *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
         *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
         */
-       if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
+       else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
             && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
             && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
            int i;
@@ -854,9 +885,23 @@ S_cl_or(const RExC_state_t *pRExC_state, struct 
regnode_charclass_class *cl, con
                cl->bitmap[i] |= ~or_with->bitmap[i];
        } /* XXXX: logic is complicated otherwise */
        else {
-           cl_anything(pRExC_state, cl);
+           cl_anything(cl);
        }
-    } else {
+
+        /* And, we can just take the union of the flags that aren't affected
+         * by the inversion */
+       cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
+
+        /* For the remaining flags:
+            ANYOF_UNICODE_ALL and inverted means to not match anything above
+                    255, which means that the union with cl should just be
+                    what cl has in it, so can ignore this flag
+            ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
+                    is 127-255 to match them, but then invert that, so the
+                    union with cl should just be what cl has in it, so can
+                    ignore this flag
+         */
+    } else {    /* 'or_with' is not inverted */
        /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
        if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
             && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
@@ -873,27 +918,27 @@ S_cl_or(const RExC_state_t *pRExC_state, struct 
regnode_charclass_class *cl, con
            }
        }
        else { /* XXXX: logic is complicated, leave it along for a moment. */
-           cl_anything(pRExC_state, cl);
+           cl_anything(cl);
        }
-    }
-    if (or_with->flags & ANYOF_EOS)
-       cl->flags |= ANYOF_EOS;
-    if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
-       cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
 
-    if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
-       cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
+        /* Take the union */
+       cl->flags |= or_with->flags;
 
-    /* If both nodes match something outside the bitmap, but what they match
-     * outside is not the same pointer, and hence not easily compared, give up
-     * and allow the start class to match everything outside the bitmap */
-    if (ANYOF_NONBITMAP(cl) && ANYOF_NONBITMAP(or_with) &&
-       ARG(cl) != ARG(or_with)) {
-       cl->flags |= ANYOF_UNICODE_ALL;
-    }
+       if (ANYOF_NONBITMAP(or_with)) {
 
-    if (or_with->flags & ANYOF_UNICODE_ALL) {
-       cl->flags |= ANYOF_UNICODE_ALL;
+           /* Use the added node's outside-the-bit-map match if there isn't a
+            * conflict.  If there is a conflict (both nodes match something
+            * outside the bitmap, but what they match outside is not the same
+            * pointer, and hence not easily compared until XXX we extend
+            * inversion lists this far), give up and allow the start class to
+            * match everything outside the bitmap */
+           if (! ANYOF_NONBITMAP(cl)) {
+               ARG_SET(cl, ARG(or_with));
+           }
+           else if (ARG(cl) != ARG(or_with)) {
+               cl->flags |= ANYOF_UNICODE_ALL;
+           }
+       }
     }
 }
 
@@ -2703,7 +2748,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                if (flags & SCF_DO_SUBSTR)
                    SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge 
strings after this. */
                if (flags & SCF_DO_STCLASS)
-                   cl_init_zero(pRExC_state, &accum);
+                   cl_init_zero(&accum);
 
                while (OP(scan) == code) {
                    I32 deltanext, minnext, f = 0, fake;
@@ -2724,7 +2769,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                    if (code != BRANCH)
                        scan = NEXTOPER(scan);
                    if (flags & SCF_DO_STCLASS) {
-                       cl_init(pRExC_state, &this_class);
+                       cl_init(&this_class);
                        data_fake.start_class = &this_class;
                        f = SCF_DO_STCLASS_AND;
                    }
@@ -2757,7 +2802,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                        data->whilem_c = data_fake.whilem_c;
                    }
                    if (flags & SCF_DO_STCLASS)
-                       cl_or(pRExC_state, &accum, &this_class);
+                       cl_or(&accum, &this_class);
                }
                if (code == IFTHEN && num < 2) /* Empty ELSE branch */
                    min1 = 0;
@@ -2770,7 +2815,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                min += min1;
                delta += max1 - min1;
                if (flags & SCF_DO_STCLASS_OR) {
-                   cl_or(pRExC_state, data->start_class, &accum);
+                   cl_or(data->start_class, &accum);
                    if (min1) {
                        cl_and(data->start_class, and_withp);
                        flags &= ~SCF_DO_STCLASS;
@@ -2950,10 +2995,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
     If/when this is fixed the following define can be swapped
     in below to fully enable trie logic.
 
+    XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
+    not /aa
+
 #define TRIE_TYPE_IS_SAFE 1
 
 */
-#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
+#define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
 
                                 if ( last && TRIE_TYPE_IS_SAFE ) {
                                     make_trie( pRExC_state, 
@@ -3042,7 +3090,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                     }
                     is_inf = is_inf_internal = 1;
                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
-                        cl_anything(pRExC_state, data->start_class);
+                        cl_anything(data->start_class);
                     flags &= ~SCF_DO_STCLASS;
                }
             } else {
@@ -3303,7 +3351,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                        data->flags |= SF_IS_INF;
                }
                if (flags & SCF_DO_STCLASS) {
-                   cl_init(pRExC_state, &this_class);
+                   cl_init(&this_class);
                    oclass = data->start_class;
                    data->start_class = &this_class;
                    f |= SCF_DO_STCLASS_AND;
@@ -3331,7 +3379,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                    data->start_class = oclass;
                if (mincount == 0 || minnext == 0) {
                    if (flags & SCF_DO_STCLASS_OR) {
-                       cl_or(pRExC_state, data->start_class, &this_class);
+                       cl_or(data->start_class, &this_class);
                    }
                    else if (flags & SCF_DO_STCLASS_AND) {
                        /* Switch to OR mode: cache the old value of
@@ -3347,7 +3395,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                    }
                } else {                /* Non-zero len */
                    if (flags & SCF_DO_STCLASS_OR) {
-                       cl_or(pRExC_state, data->start_class, &this_class);
+                       cl_or(data->start_class, &this_class);
                        cl_and(data->start_class, and_withp);
                    }
                    else if (flags & SCF_DO_STCLASS_AND)
@@ -3597,7 +3645,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                }
                is_inf = is_inf_internal = 1;
                if (flags & SCF_DO_STCLASS_OR)
-                   cl_anything(pRExC_state, data->start_class);
+                   cl_anything(data->start_class);
                flags &= ~SCF_DO_STCLASS;
                break;
            }
@@ -3660,7 +3708,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                  do_default:
                    /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode 
%d", OP(scan)); */
                    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
-                       cl_anything(pRExC_state, data->start_class);
+                       cl_anything(data->start_class);
                    break;
                case REG_ANY:
                    if (OP(scan) == SANY)
@@ -3668,7 +3716,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                    if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
                        value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
                                 || 
ANYOF_CLASS_TEST_ANY_SET(data->start_class));
-                       cl_anything(pRExC_state, data->start_class);
+                       cl_anything(data->start_class);
                    }
                    if (flags & SCF_DO_STCLASS_AND || !value)
                        ANYOF_BITMAP_CLEAR(data->start_class,'\n');
@@ -3678,7 +3726,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                        cl_and(data->start_class,
                               (struct regnode_charclass_class*)scan);
                    else
-                       cl_or(pRExC_state, data->start_class,
+                       cl_or(data->start_class,
                              (struct regnode_charclass_class*)scan);
                    break;
                case ALNUM:
@@ -3907,7 +3955,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                data_fake.pos_delta = delta;
                 if ( flags & SCF_DO_STCLASS && !scan->flags
                      && OP(scan) == IFMATCH ) { /* Lookahead */
-                    cl_init(pRExC_state, &intrnl);
+                    cl_init(&intrnl);
                     data_fake.start_class = &intrnl;
                     f |= SCF_DO_STCLASS_AND;
                }
@@ -3941,7 +3989,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                         * *** HACK *** for now just treat as "no information".
                         * See [perl #56690].
                         */
-                       cl_init(pRExC_state, data->start_class);
+                       cl_init(data->start_class);
                    }  else {
                        /* AND before and after: combine and continue */
                        const int was = (data->start_class->flags & ANYOF_EOS);
@@ -3992,7 +4040,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                    data_fake.flags |= SF_IS_INF;
                 if ( flags & SCF_DO_STCLASS && !scan->flags
                      && OP(scan) == IFMATCH ) { /* Lookahead */
-                    cl_init(pRExC_state, &intrnl);
+                    cl_init(&intrnl);
                     data_fake.start_class = &intrnl;
                     f |= SCF_DO_STCLASS_AND;
                 }
@@ -4094,7 +4142,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                }
                is_inf = is_inf_internal = 1;
                if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
-                   cl_anything(pRExC_state, data->start_class);
+                   cl_anything(data->start_class);
                flags &= ~SCF_DO_STCLASS;
        }
        else if (OP(scan) == GPOS) {
@@ -4125,7 +4173,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
                 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge 
strings after this. */
             if (flags & SCF_DO_STCLASS)
-                cl_init_zero(pRExC_state, &accum);
+                cl_init_zero(&accum);
                 
             if (!trie->jump) {
                 min1= trie->minlen;
@@ -4148,7 +4196,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                         data_fake.last_closep = &fake;
                    data_fake.pos_delta = delta;
                     if (flags & SCF_DO_STCLASS) {
-                        cl_init(pRExC_state, &this_class);
+                        cl_init(&this_class);
                         data_fake.start_class = &this_class;
                         f = SCF_DO_STCLASS_AND;
                     }
@@ -4192,7 +4240,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                         data->whilem_c = data_fake.whilem_c;
                     }
                     if (flags & SCF_DO_STCLASS)
-                        cl_or(pRExC_state, &accum, &this_class);
+                        cl_or(&accum, &this_class);
                 }
             }
             if (flags & SCF_DO_SUBSTR) {
@@ -4204,7 +4252,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
             min += min1;
             delta += max1 - min1;
             if (flags & SCF_DO_STCLASS_OR) {
-                cl_or(pRExC_state, data->start_class, &accum);
+                cl_or(data->start_class, &accum);
                 if (min1) {
                     cl_and(data->start_class, and_withp);
                     flags &= ~SCF_DO_STCLASS;
@@ -4900,7 +4948,7 @@ reStudy:
        data.longest = &(data.longest_fixed);
        first = scan;
        if (!ri->regstclass) {
-           cl_init(pRExC_state, &ch_class);
+           cl_init(&ch_class);
            data.start_class = &ch_class;
            stclass_flag = SCF_DO_STCLASS_AND;
        } else                          /* XXXX Check for BOUND? */
@@ -5022,14 +5070,13 @@ reStudy:
            && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
            ri->regstclass = NULL;
 
-       /* If the synthetic start class were to ever be used when EOS is set,
-        * that bit would have to be cleared, as it is shared with another */
        if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
            && stclass_flag
            && !(data.start_class->flags & ANYOF_EOS)
            && !cl_is_anything(data.start_class))
        {
            const U32 n = add_data(pRExC_state, 1, "f");
+           data.start_class->flags |= ANYOF_IS_SYNTHETIC;
 
            Newx(RExC_rxi->data->data[n], 1,
                struct regnode_charclass_class);
@@ -5084,7 +5131,7 @@ reStudy:
        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
 
        scan = ri->program + 1;
-       cl_init(pRExC_state, &ch_class);
+       cl_init(&ch_class);
        data.start_class = &ch_class;
        data.last_closep = &last_close;
 
@@ -5097,12 +5144,11 @@ reStudy:
        r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
                = r->float_substr = r->float_utf8 = NULL;
 
-       /* If the synthetic start class were to ever be used when EOS is set,
-        * that bit would have to be cleared, as it is shared with another */
        if (!(data.start_class->flags & ANYOF_EOS)
            && !cl_is_anything(data.start_class))
        {
            const U32 n = add_data(pRExC_state, 1, "f");
+           data.start_class->flags |= ANYOF_IS_SYNTHETIC;
 
            Newx(RExC_rxi->data->data[n], 1,
                struct regnode_charclass_class);
diff --git a/regcomp.h b/regcomp.h
index 8e96b75..cc6708b 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -331,26 +331,24 @@ struct regnode_charclass_class {
 
 #define ANYOF_INVERT            0x04
 
-/* EOS, meaning that it can match an empty string too, is used for the
- * synthetic start class (ssc) only.  It looks like it could share the INVERT
- * bit, as the ssc is never inverted.  But doing that caused this reges to
- * not match:
- * 'foo/file.fob' =~ m,^(?=[^\.])[^/]* /(?=[^\.])[^/]*\.fo[^/]$,;
- * (except the space between the * and the / above shouldn't be there; it was
- * inserted to make this comment continue on.)
- * Rather than try to figure out what was going on in the optimizer, I (khw)
- * found a way to save a different bit.  But my original line of reasoning was
- * "The bit just needs to be turned off before regexec.c gets a hold of it so
- * that regexec.c doesn't think it's inverted, but this happens automatically,
- * as if the ssc can match an EOS, the ssc is discarded, and never passed to
- * regexec.c" */
-#define ANYOF_EOS              0x10
-
 /* CLASS is never set unless LOCALE is too: has runtime \d, \w, [:posix:], ...
  * The non-locale ones are resolved at compile-time */
 #define ANYOF_CLASS     0x08
 #define ANYOF_LARGE      ANYOF_CLASS    /* Same; name retained for back compat */
 
+/* EOS, meaning that it can match an empty string too, is used for the
+ * synthetic start class only. */
+#define ANYOF_EOS              0x10
+
+/* ? Is this node the synthetic start class (ssc).  This bit is shared with
+ * ANYOF_EOS, as the latter is used only for the ssc, and then not used by
+ * regexec.c.  And, the code is structured so that if it is set, the ssc is
+ * not used, so it is guaranteed to be 0 for the ssc by the time regexec.c
+ * gets executed, and 0 for a non-ssc ANYOF node, as it only ever gets set for
+ * a potential ssc candidate.  Thus setting it to 1 after it has been
+ * determined that the ssc will be used is not ambiguous */
+#define ANYOF_IS_SYNTHETIC     ANYOF_EOS
+
 /* Can match something outside the bitmap that isn't in utf8 */
 #define ANYOF_NONBITMAP_NON_UTF8 0x20
 
@@ -363,6 +361,16 @@ struct regnode_charclass_class {
 
 #define ANYOF_FLAGS_ALL                0xff
 
+/* These are the flags that ANYOF_INVERT being set or not doesn't affect
+ * whether they are operative or not.  e.g., the node still has LOCALE
+ * regardless of being inverted; whereas ANYOF_UNICODE_ALL means something
+ * different if inverted */
+#define INVERSION_UNAFFECTED_FLAGS (ANYOF_LOCALE                        \
+                                  |ANYOF_LOC_NONBITMAP_FOLD            \
+                                  |ANYOF_CLASS                         \
+                                  |ANYOF_EOS                           \
+                                  |ANYOF_NONBITMAP_NON_UTF8)
+
 /* Character classes for node->classflags of ANYOF */
 /* Should be synchronized with a table in regprop() */
 /* 2n should pair with 2n+1 */
diff --git a/regexec.c b/regexec.c
index 739eba6..76784ee 100644
--- a/regexec.c
+++ b/regexec.c
@@ -6587,16 +6587,21 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
     /* If the bitmap didn't (or couldn't) match, and something outside the
      * bitmap could match, try that.  Locale nodes specifiy completely the
      * behavior of code points in the bit map (otherwise, a utf8 target would
-     * cause them to be treated as Unicode and not locale), except XXX in
+     * cause them to be treated as Unicode and not locale), except in
      * the very unlikely event when this node is a synthetic start class, which
-     * could be a combination of locale and non-locale nodes */
+     * could be a combination of locale and non-locale nodes.  So allow locale
+     * to match for the synthetic start class, which will give a false
+     * positive that will be resolved when the match is done again as not part
+     * of the synthetic start class */
     if (!match) {
        if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
            match = TRUE;       /* Everything above 255 matches */
        }
        else if ((flags & ANYOF_NONBITMAP_NON_UTF8
                  || (utf8_target && ANYOF_NONBITMAP(n)
-                     && (c >=256 || ! (flags & ANYOF_LOCALE)))))
+                     && (c >=256
+                         || (! (flags & ANYOF_LOCALE))
+                         || (flags & ANYOF_IS_SYNTHETIC)))))
        {
            AV *av;
            SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
diff --git a/t/re/pat.t b/t/re/pat.t
index a14cb4f..66ce5ea 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -1030,7 +1030,6 @@ sub run_tests {
 
         my $message = '\p property after empty * match';
         {
-            local $::TODO = "Bug 77414";
             like("1", qr/\s*\pN/, $message);
             like("-", qr/\s*\p{Dash}/, $message);
             like(" ", qr/\w*\p{Blank}/, $message);
diff --git a/t/re/re_tests b/t/re/re_tests
index 924434c..b44fb73 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1493,4 +1493,7 @@ abc\N{def -       c       -       \\N{NAME} must be resolved by the lexer
 
 (?:(?:)foo|bar|zot|rt78356)    foo     y       $&      foo
 /\xe0\pL/i     \xc0a   y       $&      \xc0a
+
+# RT #85528
+(?{})[\x{100}] \x{100} y       $&      \x{100}
 # vim: softtabstop=0 noexpandtab

--
Perl5 Master Repository

Reply via email to