Patch 7.3.1011
Problem:    New regexp engine is inefficient with multi-byte characters.
Solution:   Handle a character at a time instead of a byte at a time.  Also
            make \Z partly work.
Files:      src/regexp_nfa.c, src/testdir/test95.in, src/testdir/test95.ok


*** ../vim-7.3.1010/src/regexp_nfa.c    2013-05-24 20:25:28.000000000 +0200
--- src/regexp_nfa.c    2013-05-24 21:49:43.000000000 +0200
***************
*** 46,54 ****
      NFA_NCLOSE,                           /* End of subexpr. marked with \%( ... \) */
      NFA_START_INVISIBLE,
      NFA_END_INVISIBLE,
-     NFA_MULTIBYTE,                /* Next nodes in NFA are part of the same
-                                      multibyte char */
-     NFA_END_MULTIBYTE,                    /* End of multibyte char in the NFA */
      NFA_COMPOSING,                /* Next nodes in NFA are part of the
                                       composing multibyte char */
      NFA_END_COMPOSING,                    /* End of a composing char in the NFA */
--- 46,51 ----
***************
*** 195,220 ****
                    *post_ptr++ = c;            \
                } while (0)
  
- #define EMIT_MBYTE(c)                                     \
-                       len = (*mb_char2bytes)(c, buf);     \
-                       EMIT(buf[0]);                       \
-                       for (i = 1; i < len; i++)           \
-                       {                                   \
-                           EMIT(buf[i]);                   \
-                           EMIT(NFA_CONCAT);               \
-                       }                                   \
-                       EMIT(NFA_MULTIBYTE);
- 
- #define EMIT_COMPOSING_UTF(input)                         \
-                       len = utfc_ptr2len(input);          \
-                       EMIT(input[0]);                     \
-                       for (i = 1; i < len; i++)           \
-                       {                                   \
-                           EMIT(input[i]);                 \
-                           EMIT(NFA_CONCAT);               \
-                       }                                   \
-                       EMIT(NFA_COMPOSING);
- 
  /*
   * Initialize internal variables before NFA compilation.
   * Return OK on success, FAIL otherwise.
--- 192,197 ----
***************
*** 611,618 ****
  #ifdef FEAT_MBYTE
      char_u    *old_regparse = regparse;
      int               clen;
-     int               len;
-     static char_u     buf[30];
      int               i;
  #endif
      int               extra = 0;
--- 588,593 ----
***************
*** 845,858 ****
                    return FAIL;
  
                    c = coll_get_char();
! #ifdef FEAT_MBYTE
!                   if ((*mb_char2len)(c) > 1)
!                   {
!                       EMIT_MBYTE(c);
!                   }
!                   else
! #endif
!                       EMIT(c);
                    break;
  
                /* Catch \%^ and \%$ regardless of where they appear in the
--- 820,826 ----
                    return FAIL;
  
                    c = coll_get_char();
!                   EMIT(c);
                    break;
  
                /* Catch \%^ and \%$ regardless of where they appear in the
***************
*** 1135,1146 ****
                             * skip it. */
                            for (c = startc + 1; c <= endc; c++)
                            {
!                               if ((*mb_char2len)(c) > 1)
!                               {
!                                   EMIT_MBYTE(c);
!                               }
!                               else
!                                   EMIT(c);
                                TRY_NEG();
                                EMIT_GLUE();
                            }
--- 1103,1109 ----
                             * skip it. */
                            for (c = startc + 1; c <= endc; c++)
                            {
!                               EMIT(c);
                                TRY_NEG();
                                EMIT_GLUE();
                            }
***************
*** 1187,1200 ****
                        if (got_coll_char == TRUE && startc == 0)
                            EMIT(0x0a);
                        else
! #ifdef FEAT_MBYTE
!                           if ((*mb_char2len)(startc) > 1)
!                           {
!                               EMIT_MBYTE(startc);
!                           }
!                           else
! #endif
!                               EMIT(startc);
                        TRY_NEG();
                        EMIT_GLUE();
                    }
--- 1150,1156 ----
                        if (got_coll_char == TRUE && startc == 0)
                            EMIT(0x0a);
                        else
!                           EMIT(startc);
                        TRY_NEG();
                        EMIT_GLUE();
                    }
***************
*** 1242,1271 ****
                int     plen;
  
  nfa_do_multibyte:
!               /* length of current char, with composing chars,
!                * from pointer */
!               plen = (*mb_ptr2len)(old_regparse);
!               if (enc_utf8 && clen != plen)
!               {
!                   /* A composing character is always handled as a
!                    * separate atom, surrounded by NFA_COMPOSING and
!                    * NFA_END_COMPOSING. Note that right now we are
                     * building the postfix form, not the NFA itself;
                     * a composing char could be: a, b, c, NFA_COMPOSING
!                    * where 'a', 'b', 'c' are chars with codes > 256.
!                    */
!                   EMIT_COMPOSING_UTF(old_regparse);
                    regparse = old_regparse + plen;
                }
                else
-                   /* A multi-byte character is always handled as a
-                    * separate atom, surrounded by NFA_MULTIBYTE and
-                    * NFA_END_MULTIBYTE */
-                   if (plen > 1)
-                   {
-                       EMIT_MBYTE(c);
-                   }
-                   else
  #endif
                {
                    c = no_Magic(c);
--- 1198,1227 ----
                int     plen;
  
  nfa_do_multibyte:
!               /* Length of current char with composing chars. */
!               if (enc_utf8 && clen != (plen = (*mb_ptr2len)(old_regparse)))
!               {
!                   /* A base character plus composing characters.
!                    * This requires creating a separate atom as if enclosing
!                    * the characters in (), where NFA_COMPOSING is the ( and
!                    * NFA_END_COMPOSING is the ). Note that right now we are
                     * building the postfix form, not the NFA itself;
                     * a composing char could be: a, b, c, NFA_COMPOSING
!                    * where 'b' and 'c' are chars with codes > 256. */
!                   i = 0;      /* byte offset into old_regparse */
!                   for (;;)
!                   {
!                       EMIT(c);
!                       if (i > 0)
!                           EMIT(NFA_CONCAT);
!                       if ((i += utf_char2len(c)) >= plen)
!                           break;      /* consumed all plen bytes */
!                       c = utf_ptr2char(old_regparse + i);
!                   }
!                   EMIT(NFA_COMPOSING);
                    regparse = old_regparse + plen;
                }
                else
  #endif
                {
                    c = no_Magic(c);
***************
*** 1702,1710 ****
        case NFA_START_INVISIBLE:   STRCPY(code, "NFA_START_INVISIBLE"); break;
        case NFA_END_INVISIBLE:     STRCPY(code, "NFA_END_INVISIBLE"); break;
  
-       case NFA_MULTIBYTE:         STRCPY(code, "NFA_MULTIBYTE"); break;
-       case NFA_END_MULTIBYTE:     STRCPY(code, "NFA_END_MULTIBYTE"); break;
- 
        case NFA_COMPOSING:         STRCPY(code, "NFA_COMPOSING"); break;
        case NFA_END_COMPOSING:     STRCPY(code, "NFA_END_COMPOSING"); break;
  
--- 1658,1663 ----
***************
*** 2194,2200 ****
            }
            e1 = POP();
            e1.start->negated = TRUE;
!           if (e1.start->c == NFA_MULTIBYTE || e1.start->c == NFA_COMPOSING)
                e1.start->out1->negated = TRUE;
            PUSH(e1);
            break;
--- 2147,2153 ----
            }
            e1 = POP();
            e1.start->negated = TRUE;
!           if (e1.start->c == NFA_COMPOSING)
                e1.start->out1->negated = TRUE;
            PUSH(e1);
            break;
***************
*** 2311,2316 ****
--- 2264,2279 ----
            PUSH(frag(s, list1(&s1->out)));
            break;
  
+       case NFA_COMPOSING:     /* char with composing char */
+ #if 0
+           /* TODO */
+           if (regflags & RF_ICOMBINE)
+           {
+               goto normalchar;
+           }
+ #endif
+           /* FALLTHROUGH */
+ 
        case NFA_MOPEN + 0:     /* Submatch */
        case NFA_MOPEN + 1:
        case NFA_MOPEN + 2:
***************
*** 2322,2329 ****
        case NFA_MOPEN + 8:
        case NFA_MOPEN + 9:
        case NFA_NOPEN:         /* \%( "Invisible Submatch" */
-       case NFA_MULTIBYTE:     /* mbyte char */
-       case NFA_COMPOSING:     /* composing char */
            if (nfa_calc_size == TRUE)
            {
                nstate += 2;
--- 2285,2290 ----
***************
*** 2336,2344 ****
                case NFA_NOPEN:
                    mclose = NFA_NCLOSE;
                    break;
-               case NFA_MULTIBYTE:
-                   mclose = NFA_END_MULTIBYTE;
-                   break;
                case NFA_COMPOSING:
                    mclose = NFA_END_COMPOSING;
                    break;
--- 2297,2302 ----
***************
*** 2377,2385 ****
                goto theend;
            patch(e.out, s1);
  
!           if (mopen == NFA_MULTIBYTE || mopen == NFA_COMPOSING)
!               /* MULTIBYTE->out1 = END_MULTIBYTE
!               * COMPOSING->out1 = END_COMPOSING */
                patch(list1(&s->out1), s1);
  
            PUSH(frag(s, list1(&s1->out)));
--- 2335,2342 ----
                goto theend;
            patch(e.out, s1);
  
!           if (mopen == NFA_COMPOSING)
!               /* COMPOSING->out1 = END_COMPOSING */
                patch(list1(&s->out1), s1);
  
            PUSH(frag(s, list1(&s1->out)));
***************
*** 2540,2556 ****
        case NFA_COMPOSING:
            /* nfa_regmatch() will match all the bytes of this composing char. */
            break;
- 
-       case NFA_MULTIBYTE:
-           /* nfa_regmatch() will match all the bytes of this multibyte char. */
-           break;
  #endif
  
-       case NFA_END_MULTIBYTE:
-           /* Successfully matched this mbyte char */
-           addstate(l, state->out, m, off, lid, match);
-           break;
- 
        case NFA_NOPEN:
        case NFA_NCLOSE:
            addstate(l, state->out, m, off, lid, match);
--- 2497,2504 ----
***************
*** 2841,2847 ****
      regsub_T          *submatch;
      regsub_T          *m;
  {
!     int               c = -1;
      int               n;
      int               i = 0;
      int               result;
--- 2789,2795 ----
      regsub_T          *submatch;
      regsub_T          *m;
  {
!     int               c;
      int               n;
      int               i = 0;
      int               result;
***************
*** 2859,2865 ****
      List      *listtbl[2][2];
      List      *ll;
      int               listid = 1;
-     int               endnode;
      List      *thislist;
      List      *nextlist;
      List      *neglist;
--- 2807,2812 ----
***************
*** 3190,3222 ****
                break;
            }
  
!           case NFA_MULTIBYTE:
            case NFA_COMPOSING:
!               endnode = t->state->c + 1;
                result = OK;
                sta = t->state->out;
!               len = 1;
!               while (sta->c != endnode && len <= n)
                {
!                   if (reginput[len-1] != sta->c)
!                   {
!                       result = FAIL;
                        break;
!                   }
!                   len++;
                    sta = sta->out;
                }
  
                /* if input char length doesn't match regexp char length */
!               if (len -1 < n || sta->c != endnode)
                    result = FAIL;
!               end = t->state->out1;       /* NFA_END_MULTIBYTE or
!                                              NFA_END_COMPOSING */
                /* If \Z was present, then ignore composing characters */
!               if (ireg_icombine && endnode == NFA_END_COMPOSING)
                    result = 1 ^ sta->negated;
                ADD_POS_NEG_STATE(end);
                break;
  
            case NFA_NEWL:
                if (!reg_line_lbr && REG_MULTI
--- 3137,3171 ----
                break;
            }
  
! #ifdef FEAT_MBYTE
            case NFA_COMPOSING:
!           {
!               int mc = c;
! 
                result = OK;
                sta = t->state->out;
!               len = 0;
!               while (sta->c != NFA_END_COMPOSING && len < n)
                {
!                   if (len > 0)
!                       mc = mb_ptr2char(reginput + len);
!                   if (mc != sta->c)
                        break;
!                   len += mb_char2len(mc);
                    sta = sta->out;
                }
  
                /* if input char length doesn't match regexp char length */
!               if (len < n || sta->c != NFA_END_COMPOSING)
                    result = FAIL;
!               end = t->state->out1;       /* NFA_END_COMPOSING */
                /* If \Z was present, then ignore composing characters */
!               if (ireg_icombine)
                    result = 1 ^ sta->negated;
                ADD_POS_NEG_STATE(end);
                break;
+           }
+ #endif
  
            case NFA_NEWL:
                if (!reg_line_lbr && REG_MULTI
***************
*** 3425,3430 ****
--- 3374,3387 ----
                if (!result)
                    result = ireg_ic == TRUE
                                && MB_TOLOWER(t->state->c) == MB_TOLOWER(c);
+ #ifdef FEAT_MBYTE
+               /* If there is a composing character which is not being
+                * ignored there can be no match. Match with composing
+                * character uses NFA_COMPOSING above. */
+               if (result && enc_utf8 && !ireg_icombine
+                                                     && n != utf_char2len(c))
+                   result = FALSE;
+ #endif
                ADD_POS_NEG_STATE(t->state);
                break;
            }
*** ../vim-7.3.1010/src/testdir/test95.in       2013-05-24 20:25:28.000000000 +0200
--- src/testdir/test95.in       2013-05-24 20:45:08.000000000 +0200
***************
*** 35,40 ****
--- 35,44 ----
  :call add(tl, ['\f\+', '&*Ÿfname ', 'fname'])
  :call add(tl, ['\%#=1\f\+', '&*Ÿfname ', 'fname'])
  
+ :"""" Test composing character matching
+ :call add(tl, ['.ม', 'xม่x yมy', 'yม'])
+ :call add(tl, ['.ม่', 'xม่x yมy', 'xม่'])
+ 
  :"""" Test \Z
  :call add(tl, ['ú\Z', 'x'])
  
*** ../vim-7.3.1010/src/testdir/test95.ok       2013-05-24 20:25:28.000000000 +0200
--- src/testdir/test95.ok       2013-05-24 20:44:41.000000000 +0200
***************
*** 9,13 ****
--- 9,15 ----
  OK - \%#=1\i\+
  OK - \f\+
  OK - \%#=1\f\+
+ OK - .ม
+ OK - .ม่
  OK - ú\Z
  OK - [^[=a=]]\+
*** ../vim-7.3.1010/src/version.c       2013-05-24 20:25:28.000000000 +0200
--- src/version.c       2013-05-24 21:56:02.000000000 +0200
***************
*** 730,731 ****
--- 730,733 ----
  {   /* Add new patch number below this line */
+ /**/
+     1011,
  /**/

-- 
If you had to identify, in one word, the reason why the
human race has not achieved, and never will achieve, its
full potential, that word would be "meetings."

 /// Bram Moolenaar -- [email protected] -- http://www.Moolenaar.net   \\\
///        sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
\\\  an exciting new programming language -- http://www.Zimbu.org        ///
 \\\            help me help AIDS victims -- http://ICCF-Holland.org    ///

-- 
-- 
You received this message from the "vim_dev" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php

--- 
You received this message because you are subscribed to the Google Groups 
"vim_dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.


Raspunde prin e-mail lui