jenkins-bot has submitted this change and it was merged.

Change subject: Add frontier pattern (%f[set]) to ustring
......................................................................


Add frontier pattern (%f[set]) to ustring

The "%f[set]" frontier pattern has been in Lua 5.1 since the beginning,
but was undocumented until Lua 5.2, and its code is unchanged from 5.1.0
to 5.2.1. So there's no reason not to implement it in ustring too.

Note that the changes to UstringLibrary.php are somewhat large, because
this change splits the "convert a Lua bracketed charset to PCRE" code
into a separate function and changes the handling of the 'init'
parameter of mw.ustring.find and mw.ustring.match from "substring, match
from 0, then add back on $init" to "use preg_match's $offset and use \G
instead of ^ where this matters". Both of these are necessary to
properly support %f.

This also fixes a bug in the pure-Lua code (not used in Scribunto),
exposed by the unit tests for %f, where %z was matching '\1' rather than
'\0' and %Z was matching everything except '\1' rather than everything
except '\0'.

Bug: 48331
Change-Id: Ie0b95ef5b734db53d6adc9de5dae4874f8944c08
---
M engines/LuaCommon/UstringLibrary.php
M engines/LuaCommon/lualib/ustring/charsets.lua
M engines/LuaCommon/lualib/ustring/make-tables.php
M engines/LuaCommon/lualib/ustring/ustring.lua
M tests/engines/LuaCommon/UstringLibraryTests.lua
5 files changed, 127 insertions(+), 64 deletions(-)

Approvals:
  Aaron Schulz: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/engines/LuaCommon/UstringLibrary.php 
b/engines/LuaCommon/UstringLibrary.php
index b253e0b..5c96a9a 100644
--- a/engines/LuaCommon/UstringLibrary.php
+++ b/engines/LuaCommon/UstringLibrary.php
@@ -232,7 +232,7 @@
        }
 
        /* Convert a Lua pattern into a PCRE regex */
-       private function patternToRegex( $pattern, $noAnchor = false ) {
+       private function patternToRegex( $pattern, $anchor ) {
                $pat = preg_split( '//us', $pattern, null, PREG_SPLIT_NO_EMPTY 
);
 
                static $charsets = null, $brcharsets = null;
@@ -295,7 +295,7 @@
                        switch ( $pat[$i] ) {
                        case '^':
                                $q = $i;
-                               $re .= ( $noAnchor || $q ) ? '\\^' : '^';
+                               $re .= ( $anchor === false || $q ) ? '\\^' : 
$anchor;
                                break;
 
                        case '$':
@@ -345,6 +345,19 @@
                                                $bct++;
                                                $re .= 
"(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)";
                                        }
+                               } elseif ( $pat[$i] === 'f' ) {
+                                       if ( $i + 1 >= $len || $pat[++$i] !== 
'[' ) {
+                                               throw new Scribunto_LuaError( 
"missing '[' after %f in pattern at pattern character $ii" );
+                                       }
+                                       list( $i, $re2 ) = 
$this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
+                                       // Because %f considers the beginning 
and end of the string
+                                       // to be \0, determine if $re2 matches 
that and take it
+                                       // into account with "^" and "$".
+                                       if ( preg_match( "/$re2/us", "\0" ) ) {
+                                               $re .= 
"(?<!^)(?<!$re2)(?=$re2|$)";
+                                       } else {
+                                               $re .= "(?<!$re2)(?=$re2)";
+                                       }
                                } elseif ( $pat[$i] >= '0' && $pat[$i] <= '9' ) 
{
                                        $n = ord( $pat[$i] ) - 0x30;
                                        if ( $n === 0 || $n > count( $capt ) || 
in_array( $n, $opencapt ) ) {
@@ -358,34 +371,8 @@
                                break;
 
                        case '[':
-                               $re .= '[';
-                               $i++;
-                               if ( $i < $len && $pat[$i] === '^' ) {
-                                       $re .= '^';
-                                       $i++;
-                               }
-                               for ( ; $i < $len && $pat[$i] !== ']'; $i++ ) {
-                                       if ( $pat[$i] === '%' ) {
-                                               $i++;
-                                               if ( $i >= $len ) {
-                                                       break;
-                                               }
-                                               if ( isset( 
$brcharsets[$pat[$i]] ) ) {
-                                                       $re .= 
$brcharsets[$pat[$i]];
-                                               } else {
-                                                       $re .= preg_quote( 
$pat[$i], '/' );
-                                               }
-                                       } elseif( $i + 2 < $len && $pat[$i + 1] 
=== '-' && $pat[$i + 2] !== ']' ) {
-                                               $re .= preg_quote( $pat[$i], 
'/' ) . '-' . preg_quote( $pat[$i+2], '/' );
-                                               $i += 2;
-                                       } else {
-                                               $re .= preg_quote( $pat[$i], 
'/' );
-                                       }
-                               }
-                               if ( $i >= $len ) {
-                                       throw new Scribunto_LuaError( "Missing 
close-bracket for character set beginning at pattern character $ii" );
-                               }
-                               $re .= ']';
+                               list( $i, $re2 ) = 
$this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
+                               $re .= $re2;
                                $q = true;
                                break;
 
@@ -424,11 +411,44 @@
                return array( $re, $capt, $anypos );
        }
 
-       private function addCapturesFromMatch( $arr, $s, $m, $capt, $offset, 
$m0_if_no_captures ) {
+       private function bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ){
+               $ii = $i + 1;
+               $re = '[';
+               $i++;
+               if ( $i < $len && $pat[$i] === '^' ) {
+                       $re .= '^';
+                       $i++;
+               }
+               for ( ; $i < $len && $pat[$i] !== ']'; $i++ ) {
+                       if ( $pat[$i] === '%' ) {
+                               $i++;
+                               if ( $i >= $len ) {
+                                       break;
+                               }
+                               if ( isset( $brcharsets[$pat[$i]] ) ) {
+                                       $re .= $brcharsets[$pat[$i]];
+                               } else {
+                                       $re .= preg_quote( $pat[$i], '/' );
+                               }
+                       } elseif( $i + 2 < $len && $pat[$i + 1] === '-' && 
$pat[$i + 2] !== ']' ) {
+                               $re .= preg_quote( $pat[$i], '/' ) . '-' . 
preg_quote( $pat[$i+2], '/' );
+                               $i += 2;
+                       } else {
+                               $re .= preg_quote( $pat[$i], '/' );
+                       }
+               }
+               if ( $i >= $len ) {
+                       throw new Scribunto_LuaError( "Missing close-bracket 
for character set beginning at pattern character $ii" );
+               }
+               $re .= ']';
+               return array( $i, $re );
+       }
+
+       private function addCapturesFromMatch( $arr, $s, $m, $capt, 
$m0_if_no_captures ) {
                if ( count( $capt ) ) {
                        foreach ( $capt as $n => $pos ) {
                                if ( $pos ) {
-                                       $o = mb_strlen( substr( $s, 0, 
$m["m$n"][1] ), 'UTF-8' ) + $offset;
+                                       $o = mb_strlen( substr( $s, 0, 
$m["m$n"][1] ), 'UTF-8' ) + 1;
                                        $arr[] = $o;
                                } else {
                                        $arr[] = $m["m$n"][0];
@@ -454,31 +474,32 @@
                }
 
                if ( $init > 1 ) {
-                       $s = mb_substr( $s, $init - 1, $len - $init + 1, 
'UTF-8' );
+                       $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' 
) );
                } else {
                        $init = 1;
+                       $offset = 0;
                }
 
                if ( $plain ) {
                        if ( $pattern !== '' ) {
-                               $ret = mb_strpos( $s, $pattern, 0, 'UTF-8' );
+                               $ret = mb_strpos( $s, $pattern, $init - 1, 
'UTF-8' );
                        } else {
-                               $ret = 0;
+                               $ret = $init - 1;
                        }
                        if ( $ret === false ) {
                                return array( null );
                        } else {
-                               return array( $ret + $init, $ret + $init + 
mb_strlen( $pattern ) - 1 );
+                               return array( $ret + 1, $ret + mb_strlen( 
$pattern ) );
                        }
                }
 
-               list( $re, $capt ) = $this->patternToRegex( $pattern );
-               if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE ) ) {
+               list( $re, $capt ) = $this->patternToRegex( $pattern, '\G' );
+               if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) 
{
                        return array( null );
                }
-               $o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' ) + $init;
-               $ret = array( $o, $o + mb_strlen( $m[0][0], 'UTF-8' ) - 1 );
-               return $this->addCapturesFromMatch( $ret, $s, $m, $capt, $init, 
false );
+               $o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' );
+               $ret = array( $o + 1, $o + mb_strlen( $m[0][0], 'UTF-8' ) );
+               return $this->addCapturesFromMatch( $ret, $s, $m, $capt, false 
);
        }
 
        public function ustringMatch( $s, $pattern, $init = 1 ) {
@@ -493,23 +514,23 @@
                        $init = $len + 1;
                }
                if ( $init > 1 ) {
-                       $s = mb_substr( $s, $init - 1, $len - $init + 1, 
'UTF-8' );
+                       $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' 
) );
                } else {
-                       $init = 1;
+                       $offset = 0;
                }
 
-               list( $re, $capt ) = $this->patternToRegex( $pattern );
-               if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE ) ) {
+               list( $re, $capt ) = $this->patternToRegex( $pattern, '\G' );
+               if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) 
{
                        return array( null );
                }
-               return $this->addCapturesFromMatch( array(), $s, $m, $capt, 
$init, true );
+               return $this->addCapturesFromMatch( array(), $s, $m, $capt, 
true );
        }
 
        public function ustringGmatchInit( $s, $pattern ) {
                $this->checkString( 'gmatch', $s );
                $this->checkPattern( 'gmatch', $pattern );
 
-               list( $re, $capt ) = $this->patternToRegex( $pattern, true );
+               list( $re, $capt ) = $this->patternToRegex( $pattern, false );
                return array( $re, $capt );
        }
 
@@ -518,7 +539,7 @@
                        return array( $pos, array() );
                }
                $pos = $m[0][1] + strlen( $m[0][0] );
-               return array( $pos, $this->addCapturesFromMatch( array( null ), 
$s, $m, $capt, 1, true ) );
+               return array( $pos, $this->addCapturesFromMatch( array( null ), 
$s, $m, $capt, true ) );
        }
 
        public function ustringGsub( $s, $pattern, $repl, $n = null ) {
@@ -532,7 +553,7 @@
                        $n = 0;
                }
 
-               list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern );
+               list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, 
'^' );
                $captures = array();
 
                if ( $anypos ) {
@@ -547,7 +568,7 @@
                        for ( $i = 0; $i < $ct; $i++ ) {
                                $m = $mm[$i];
                                $c = array( $m[0][0] );
-                               foreach ( $this->addCapturesFromMatch( array(), 
$s, $m, $capt, 1, false ) as $k => $v ) {
+                               foreach ( $this->addCapturesFromMatch( array(), 
$s, $m, $capt, false ) as $k => $v ) {
                                        $k++;
                                        $c["m$k"] = $v;
                                }
diff --git a/engines/LuaCommon/lualib/ustring/charsets.lua 
b/engines/LuaCommon/lualib/ustring/charsets.lua
index 1903876..5f0f48c 100644
--- a/engines/LuaCommon/lualib/ustring/charsets.lua
+++ b/engines/LuaCommon/lualib/ustring/charsets.lua
@@ -2779,7 +2779,7 @@
                [0x00ff46] = 1,
        },
        [0x7a] = {
-               1,
+               [0x000000] = 1,
        },
        [0x41] = {},
        [0x43] = {},
diff --git a/engines/LuaCommon/lualib/ustring/make-tables.php 
b/engines/LuaCommon/lualib/ustring/make-tables.php
index 9727dce..4caff53 100755
--- a/engines/LuaCommon/lualib/ustring/make-tables.php
+++ b/engines/LuaCommon/lualib/ustring/make-tables.php
@@ -62,18 +62,14 @@
 );
 
 $ranges = array();
-function addRange( $k, $start, $end, $arr ) {
+function addRange( $k, $start, $end ) {
        global $X, $ranges;
        // Speed/memory tradeoff
        if ( !( $start >= 0x20 && $start < 0x7f ) && $end - $start >= 10 ) {
                $ranges[$k][] = sprintf( "c >= 0x%06x and c < 0x%06x", $start, 
$end );
        } else {
                for ( $i = $start; $i < $end; $i++ ) {
-                       if ( $arr ) {
-                               fprintf( $X, "\t\t1,\n" );
-                       } else {
-                               fprintf( $X, "\t\t[0x%06x] = 1,\n", $i );
-                       }
+                       fprintf( $X, "\t\t[0x%06x] = 1,\n", $i );
                }
        }
 }
@@ -98,7 +94,6 @@
        }
 
        fprintf( $X, "\t[0x%02x] = {\n", ord( $k ) );
-       $arr = true;
        $rstart = null;
        foreach ( $chars as $i => $c ) {
                if ( preg_match( "/^$re$/u", $c ) && !preg_match( "/^$re2$/u", 
$c ) ) {
@@ -107,14 +102,13 @@
                        }
                } else {
                        if ( $rstart !== null ) {
-                               addRange( $k, $rstart, $i, $arr );
+                               addRange( $k, $rstart, $i );
                                $rstart = null;
                        }
-                       $arr = false;
                }
        }
        if ( $rstart !== null ) {
-               addRange( $k, $rstart, 0x110000, $arr );
+               addRange( $k, $rstart, 0x110000 );
        }
        fprintf( $X, "\t},\n" );
 }
diff --git a/engines/LuaCommon/lualib/ustring/ustring.lua 
b/engines/LuaCommon/lualib/ustring/ustring.lua
index 87f3b4a..9b090ae 100644
--- a/engines/LuaCommon/lualib/ustring/ustring.lua
+++ b/engines/LuaCommon/lualib/ustring/ustring.lua
@@ -476,8 +476,8 @@
                        if charsets[c] then -- A character set like '%a'
                                return match_charset( sp, pp + 2, charsets[c] )
                        elseif c == 0x62 then -- '%b': balanced delimiter match
-                               d1 = pattern.codepoints[pp + 2]
-                               d2 = pattern.codepoints[pp + 3]
+                               local d1 = pattern.codepoints[pp + 2]
+                               local d2 = pattern.codepoints[pp + 3]
                                if not d1 or not d2 then
                                        error( 'malformed pattern (missing 
arguments to \'%b\')', 3 )
                                end
@@ -499,6 +499,18 @@
                                        elseif c == d1 then
                                                ct = ct + 1
                                        end
+                               end
+                       elseif c == 0x66 then -- '%f': frontier pattern match
+                               if pattern.codepoints[pp + 2] ~= 0x5b then
+                                       error( 'missing \'[\' after %f in 
pattern at pattern character ' .. pp, 3 )
+                               end
+                               local pp, charset = parse_charset( pp + 2 )
+                               local c1 = cps.codepoints[sp - 1] or 0
+                               local c2 = cps.codepoints[sp] or 0
+                               if not charset[c1] and charset[c2] then
+                                       return match( sp, pp )
+                               else
+                                       return nil
                                end
                        elseif c >= 0x30 and c <= 0x39 then -- '%0' to '%9': 
backreference
                                local m, l = getcapt( c - 0x30, 'invalid 
capture index %' .. c .. ' at pattern character ' .. pp, 3 )
@@ -702,7 +714,7 @@
                        return sp, ep - 1, unpack( captures )
                end
                sp = sp + 1
-       until anchor or sp > cps.len
+       until anchor or sp > cps.len + 1
        return nil
 end
 
diff --git a/tests/engines/LuaCommon/UstringLibraryTests.lua 
b/tests/engines/LuaCommon/UstringLibraryTests.lua
index bc16642..c5a4059 100644
--- a/tests/engines/LuaCommon/UstringLibraryTests.lua
+++ b/tests/engines/LuaCommon/UstringLibraryTests.lua
@@ -253,6 +253,42 @@
          args = { "bar ¡foo¡foo¡ bar", '%b¡¡' },
          expect = { 5, 9 }
        },
+       { name = 'find: (%f)', func = mw.ustring.find,
+         args = { "foo ¡foobar ¡foo bar baz", '¡.-%f[%s]' },
+         expect = { 5, 11 }
+       },
+       { name = 'find: (%f 2)', func = mw.ustring.find,
+         args = { "foo ¡foobar ¡foo bar baz", '¡foo%f[%s]' },
+         expect = { 13, 16 }
+       },
+       { name = 'find: (%f 3)', func = mw.ustring.find,
+         args = { "foo foo¡foobar ¡foo bar baz", '%f[%S]¡.-%f[%s]' },
+         expect = { 16, 19 }
+       },
+       { name = 'find: (%f 4)', func = mw.ustring.find,
+         args = { "foo foo¡foobar ¡foo bar baz", '%f[%S]¡.-%f[%s]', 16 },
+         expect = { 16, 19 }
+       },
+       { name = 'find: (%f 5)', func = mw.ustring.find,
+         args = { "foo ¡bar baz", '%f[%Z]' },
+         expect = { 1, 0 }
+       },
+       { name = 'find: (%f 6)', func = mw.ustring.find,
+         args = { "foo ¡bar baz", '%f[%z]' },
+         expect = { 13, 12 }
+       },
+       { name = 'find: (%f 7)', func = mw.ustring.find,
+         args = { "foo ¡b\0r baz", '%f[%Z]', 2 },
+         expect = { 8, 7 }
+       },
+       { name = 'find: (%f 8)', func = mw.ustring.find,
+         args = { "\0foo ¡b\0r baz", '%f[%z]' },
+         expect = { 8, 7 }
+       },
+       { name = 'find: (%f 9)', func = mw.ustring.find,
+         args = { "\0foo ¡b\0r baz", '%f[%Z]' },
+         expect = { 2, 1 }
+       },
        { name = 'find: (%A)', func = mw.ustring.find,
          args = { "fóó? bar", '%A+' },
          expect = { 4, 5 }

-- 
To view, visit https://gerrit.wikimedia.org/r/63381
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ie0b95ef5b734db53d6adc9de5dae4874f8944c08
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Scribunto
Gerrit-Branch: master
Gerrit-Owner: Anomie <bjor...@wikimedia.org>
Gerrit-Reviewer: Aaron Schulz <asch...@wikimedia.org>
Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to