jenkins-bot has submitted this change and it was merged.

Change subject: Add frontier pattern (%f[set]) to ustring
......................................................................
Add frontier pattern (%f[set]) to ustring

The "%f[set]" frontier pattern has been present in Lua 5.1 since its initial release, but was undocumented until Lua 5.2, and its implementation is unchanged from 5.1.0 to 5.2.1. So there's no reason not to implement it in ustring too.

Note that the changes to UstringLibrary.php are somewhat large, because this change splits the "convert a Lua bracketed charset to PCRE" code into a separate function and changes the handling of the 'init' parameter of mw.ustring.find and mw.ustring.match from "substring, match from 0, then add back on $init" to "use preg_match's $offset and use \G instead of ^ where this matters". Both of these are necessary to properly support %f.

This also fixes a bug in the pure-Lua code (not used in Scribunto), exposed by the unit tests for %f, where %z was matching '\1' rather than '\0' and %Z was matching everything except '\1' instead of everything except '\0'.

Bug: 48331
Change-Id: Ie0b95ef5b734db53d6adc9de5dae4874f8944c08
--- M engines/LuaCommon/UstringLibrary.php M engines/LuaCommon/lualib/ustring/charsets.lua M engines/LuaCommon/lualib/ustring/make-tables.php M engines/LuaCommon/lualib/ustring/ustring.lua M tests/engines/LuaCommon/UstringLibraryTests.lua 5 files changed, 127 insertions(+), 64 deletions(-) Approvals: Aaron Schulz: Looks good to me, approved jenkins-bot: Verified diff --git a/engines/LuaCommon/UstringLibrary.php b/engines/LuaCommon/UstringLibrary.php index b253e0b..5c96a9a 100644 --- a/engines/LuaCommon/UstringLibrary.php +++ b/engines/LuaCommon/UstringLibrary.php @@ -232,7 +232,7 @@ } /* Convert a Lua pattern into a PCRE regex */ - private function patternToRegex( $pattern, $noAnchor = false ) { + private function patternToRegex( $pattern, $anchor ) { $pat = preg_split( '//us', $pattern, null, PREG_SPLIT_NO_EMPTY ); static $charsets = null, $brcharsets = null; @@ -295,7 +295,7 @@ switch ( $pat[$i] ) { case '^': $q = $i; - $re .= ( $noAnchor || $q ) ? '\\^' : '^'; + $re .= ( $anchor === false || $q ) ? 
'\\^' : $anchor; break; case '$': @@ -345,6 +345,19 @@ $bct++; $re .= "(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)"; } + } elseif ( $pat[$i] === 'f' ) { + if ( $i + 1 >= $len || $pat[++$i] !== '[' ) { + throw new Scribunto_LuaError( "missing '[' after %f in pattern at pattern character $ii" ); + } + list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ); + // Because %f considers the beginning and end of the string + // to be \0, determine if $re2 matches that and take it + // into account with "^" and "$". + if ( preg_match( "/$re2/us", "\0" ) ) { + $re .= "(?<!^)(?<!$re2)(?=$re2|$)"; + } else { + $re .= "(?<!$re2)(?=$re2)"; + } } elseif ( $pat[$i] >= '0' && $pat[$i] <= '9' ) { $n = ord( $pat[$i] ) - 0x30; if ( $n === 0 || $n > count( $capt ) || in_array( $n, $opencapt ) ) { @@ -358,34 +371,8 @@ break; case '[': - $re .= '['; - $i++; - if ( $i < $len && $pat[$i] === '^' ) { - $re .= '^'; - $i++; - } - for ( ; $i < $len && $pat[$i] !== ']'; $i++ ) { - if ( $pat[$i] === '%' ) { - $i++; - if ( $i >= $len ) { - break; - } - if ( isset( $brcharsets[$pat[$i]] ) ) { - $re .= $brcharsets[$pat[$i]]; - } else { - $re .= preg_quote( $pat[$i], '/' ); - } - } elseif( $i + 2 < $len && $pat[$i + 1] === '-' && $pat[$i + 2] !== ']' ) { - $re .= preg_quote( $pat[$i], '/' ) . '-' . 
preg_quote( $pat[$i+2], '/' ); - $i += 2; - } else { - $re .= preg_quote( $pat[$i], '/' ); - } - } - if ( $i >= $len ) { - throw new Scribunto_LuaError( "Missing close-bracket for character set beginning at pattern character $ii" ); - } - $re .= ']'; + list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ); + $re .= $re2; $q = true; break; @@ -424,11 +411,44 @@ return array( $re, $capt, $anypos ); } - private function addCapturesFromMatch( $arr, $s, $m, $capt, $offset, $m0_if_no_captures ) { + private function bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ){ + $ii = $i + 1; + $re = '['; + $i++; + if ( $i < $len && $pat[$i] === '^' ) { + $re .= '^'; + $i++; + } + for ( ; $i < $len && $pat[$i] !== ']'; $i++ ) { + if ( $pat[$i] === '%' ) { + $i++; + if ( $i >= $len ) { + break; + } + if ( isset( $brcharsets[$pat[$i]] ) ) { + $re .= $brcharsets[$pat[$i]]; + } else { + $re .= preg_quote( $pat[$i], '/' ); + } + } elseif( $i + 2 < $len && $pat[$i + 1] === '-' && $pat[$i + 2] !== ']' ) { + $re .= preg_quote( $pat[$i], '/' ) . '-' . 
preg_quote( $pat[$i+2], '/' ); + $i += 2; + } else { + $re .= preg_quote( $pat[$i], '/' ); + } + } + if ( $i >= $len ) { + throw new Scribunto_LuaError( "Missing close-bracket for character set beginning at pattern character $ii" ); + } + $re .= ']'; + return array( $i, $re ); + } + + private function addCapturesFromMatch( $arr, $s, $m, $capt, $m0_if_no_captures ) { if ( count( $capt ) ) { foreach ( $capt as $n => $pos ) { if ( $pos ) { - $o = mb_strlen( substr( $s, 0, $m["m$n"][1] ), 'UTF-8' ) + $offset; + $o = mb_strlen( substr( $s, 0, $m["m$n"][1] ), 'UTF-8' ) + 1; $arr[] = $o; } else { $arr[] = $m["m$n"][0]; @@ -454,31 +474,32 @@ } if ( $init > 1 ) { - $s = mb_substr( $s, $init - 1, $len - $init + 1, 'UTF-8' ); + $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) ); } else { $init = 1; + $offset = 0; } if ( $plain ) { if ( $pattern !== '' ) { - $ret = mb_strpos( $s, $pattern, 0, 'UTF-8' ); + $ret = mb_strpos( $s, $pattern, $init - 1, 'UTF-8' ); } else { - $ret = 0; + $ret = $init - 1; } if ( $ret === false ) { return array( null ); } else { - return array( $ret + $init, $ret + $init + mb_strlen( $pattern ) - 1 ); + return array( $ret + 1, $ret + mb_strlen( $pattern ) ); } } - list( $re, $capt ) = $this->patternToRegex( $pattern ); - if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE ) ) { + list( $re, $capt ) = $this->patternToRegex( $pattern, '\G' ); + if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) { return array( null ); } - $o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' ) + $init; - $ret = array( $o, $o + mb_strlen( $m[0][0], 'UTF-8' ) - 1 ); - return $this->addCapturesFromMatch( $ret, $s, $m, $capt, $init, false ); + $o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' ); + $ret = array( $o + 1, $o + mb_strlen( $m[0][0], 'UTF-8' ) ); + return $this->addCapturesFromMatch( $ret, $s, $m, $capt, false ); } public function ustringMatch( $s, $pattern, $init = 1 ) { @@ -493,23 +514,23 @@ $init = $len + 1; } if ( $init > 1 ) { - $s = 
mb_substr( $s, $init - 1, $len - $init + 1, 'UTF-8' ); + $offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) ); } else { - $init = 1; + $offset = 0; } - list( $re, $capt ) = $this->patternToRegex( $pattern ); - if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE ) ) { + list( $re, $capt ) = $this->patternToRegex( $pattern, '\G' ); + if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) { return array( null ); } - return $this->addCapturesFromMatch( array(), $s, $m, $capt, $init, true ); + return $this->addCapturesFromMatch( array(), $s, $m, $capt, true ); } public function ustringGmatchInit( $s, $pattern ) { $this->checkString( 'gmatch', $s ); $this->checkPattern( 'gmatch', $pattern ); - list( $re, $capt ) = $this->patternToRegex( $pattern, true ); + list( $re, $capt ) = $this->patternToRegex( $pattern, false ); return array( $re, $capt ); } @@ -518,7 +539,7 @@ return array( $pos, array() ); } $pos = $m[0][1] + strlen( $m[0][0] ); - return array( $pos, $this->addCapturesFromMatch( array( null ), $s, $m, $capt, 1, true ) ); + return array( $pos, $this->addCapturesFromMatch( array( null ), $s, $m, $capt, true ) ); } public function ustringGsub( $s, $pattern, $repl, $n = null ) { @@ -532,7 +553,7 @@ $n = 0; } - list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern ); + list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^' ); $captures = array(); if ( $anypos ) { @@ -547,7 +568,7 @@ for ( $i = 0; $i < $ct; $i++ ) { $m = $mm[$i]; $c = array( $m[0][0] ); - foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, 1, false ) as $k => $v ) { + foreach ( $this->addCapturesFromMatch( array(), $s, $m, $capt, false ) as $k => $v ) { $k++; $c["m$k"] = $v; } diff --git a/engines/LuaCommon/lualib/ustring/charsets.lua b/engines/LuaCommon/lualib/ustring/charsets.lua index 1903876..5f0f48c 100644 --- a/engines/LuaCommon/lualib/ustring/charsets.lua +++ b/engines/LuaCommon/lualib/ustring/charsets.lua @@ -2779,7 +2779,7 @@ [0x00ff46] = 1, }, 
[0x7a] = { - 1, + [0x000000] = 1, }, [0x41] = {}, [0x43] = {}, diff --git a/engines/LuaCommon/lualib/ustring/make-tables.php b/engines/LuaCommon/lualib/ustring/make-tables.php index 9727dce..4caff53 100755 --- a/engines/LuaCommon/lualib/ustring/make-tables.php +++ b/engines/LuaCommon/lualib/ustring/make-tables.php @@ -62,18 +62,14 @@ ); $ranges = array(); -function addRange( $k, $start, $end, $arr ) { +function addRange( $k, $start, $end ) { global $X, $ranges; // Speed/memory tradeoff if ( !( $start >= 0x20 && $start < 0x7f ) && $end - $start >= 10 ) { $ranges[$k][] = sprintf( "c >= 0x%06x and c < 0x%06x", $start, $end ); } else { for ( $i = $start; $i < $end; $i++ ) { - if ( $arr ) { - fprintf( $X, "\t\t1,\n" ); - } else { - fprintf( $X, "\t\t[0x%06x] = 1,\n", $i ); - } + fprintf( $X, "\t\t[0x%06x] = 1,\n", $i ); } } } @@ -98,7 +94,6 @@ } fprintf( $X, "\t[0x%02x] = {\n", ord( $k ) ); - $arr = true; $rstart = null; foreach ( $chars as $i => $c ) { if ( preg_match( "/^$re$/u", $c ) && !preg_match( "/^$re2$/u", $c ) ) { @@ -107,14 +102,13 @@ } } else { if ( $rstart !== null ) { - addRange( $k, $rstart, $i, $arr ); + addRange( $k, $rstart, $i ); $rstart = null; } - $arr = false; } } if ( $rstart !== null ) { - addRange( $k, $rstart, 0x110000, $arr ); + addRange( $k, $rstart, 0x110000 ); } fprintf( $X, "\t},\n" ); } diff --git a/engines/LuaCommon/lualib/ustring/ustring.lua b/engines/LuaCommon/lualib/ustring/ustring.lua index 87f3b4a..9b090ae 100644 --- a/engines/LuaCommon/lualib/ustring/ustring.lua +++ b/engines/LuaCommon/lualib/ustring/ustring.lua @@ -476,8 +476,8 @@ if charsets[c] then -- A character set like '%a' return match_charset( sp, pp + 2, charsets[c] ) elseif c == 0x62 then -- '%b': balanced delimiter match - d1 = pattern.codepoints[pp + 2] - d2 = pattern.codepoints[pp + 3] + local d1 = pattern.codepoints[pp + 2] + local d2 = pattern.codepoints[pp + 3] if not d1 or not d2 then error( 'malformed pattern (missing arguments to \'%b\')', 3 ) end @@ -499,6 
+499,18 @@ elseif c == d1 then ct = ct + 1 end + end + elseif c == 0x66 then -- '%f': frontier pattern match + if pattern.codepoints[pp + 2] ~= 0x5b then + error( 'missing \'[\' after %f in pattern at pattern character ' .. pp, 3 ) + end + local pp, charset = parse_charset( pp + 2 ) + local c1 = cps.codepoints[sp - 1] or 0 + local c2 = cps.codepoints[sp] or 0 + if not charset[c1] and charset[c2] then + return match( sp, pp ) + else + return nil end elseif c >= 0x30 and c <= 0x39 then -- '%0' to '%9': backreference local m, l = getcapt( c - 0x30, 'invalid capture index %' .. c .. ' at pattern character ' .. pp, 3 ) @@ -702,7 +714,7 @@ return sp, ep - 1, unpack( captures ) end sp = sp + 1 - until anchor or sp > cps.len + until anchor or sp > cps.len + 1 return nil end diff --git a/tests/engines/LuaCommon/UstringLibraryTests.lua b/tests/engines/LuaCommon/UstringLibraryTests.lua index bc16642..c5a4059 100644 --- a/tests/engines/LuaCommon/UstringLibraryTests.lua +++ b/tests/engines/LuaCommon/UstringLibraryTests.lua @@ -253,6 +253,42 @@ args = { "bar ¡foo¡foo¡ bar", '%b¡¡' }, expect = { 5, 9 } }, + { name = 'find: (%f)', func = mw.ustring.find, + args = { "foo ¡foobar ¡foo bar baz", '¡.-%f[%s]' }, + expect = { 5, 11 } + }, + { name = 'find: (%f 2)', func = mw.ustring.find, + args = { "foo ¡foobar ¡foo bar baz", '¡foo%f[%s]' }, + expect = { 13, 16 } + }, + { name = 'find: (%f 3)', func = mw.ustring.find, + args = { "foo foo¡foobar ¡foo bar baz", '%f[%S]¡.-%f[%s]' }, + expect = { 16, 19 } + }, + { name = 'find: (%f 4)', func = mw.ustring.find, + args = { "foo foo¡foobar ¡foo bar baz", '%f[%S]¡.-%f[%s]', 16 }, + expect = { 16, 19 } + }, + { name = 'find: (%f 5)', func = mw.ustring.find, + args = { "foo ¡bar baz", '%f[%Z]' }, + expect = { 1, 0 } + }, + { name = 'find: (%f 6)', func = mw.ustring.find, + args = { "foo ¡bar baz", '%f[%z]' }, + expect = { 13, 12 } + }, + { name = 'find: (%f 7)', func = mw.ustring.find, + args = { "foo ¡b\0r baz", '%f[%Z]', 2 }, + expect = { 8, 7 
} + }, + { name = 'find: (%f 8)', func = mw.ustring.find, + args = { "\0foo ¡b\0r baz", '%f[%z]' }, + expect = { 8, 7 } + }, + { name = 'find: (%f 9)', func = mw.ustring.find, + args = { "\0foo ¡b\0r baz", '%f[%Z]' }, + expect = { 2, 1 } + }, { name = 'find: (%A)', func = mw.ustring.find, args = { "fóó? bar", '%A+' }, expect = { 4, 5 } -- To view, visit https://gerrit.wikimedia.org/r/63381 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ie0b95ef5b734db53d6adc9de5dae4874f8944c08 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Scribunto Gerrit-Branch: master Gerrit-Owner: Anomie <bjor...@wikimedia.org> Gerrit-Reviewer: Aaron Schulz <asch...@wikimedia.org> Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org> Gerrit-Reviewer: jenkins-bot _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits