andrei Mon Aug 28 20:36:50 2006 UTC
Modified files:
/php-src/ext/pcre php_pcre.c php_pcre.h
/php-src/ext/spl spl_iterators.c
Log:
Add Unicode support for preg_match[_all]
http://cvs.php.net/viewvc.cgi/php-src/ext/pcre/php_pcre.c?r1=1.191&r2=1.192&diff_format=u
Index: php-src/ext/pcre/php_pcre.c
diff -u php-src/ext/pcre/php_pcre.c:1.191 php-src/ext/pcre/php_pcre.c:1.192
--- php-src/ext/pcre/php_pcre.c:1.191 Thu Jul 20 22:40:44 2006
+++ php-src/ext/pcre/php_pcre.c Mon Aug 28 20:36:50 2006
@@ -16,7 +16,18 @@
+----------------------------------------------------------------------+
*/
-/* $Id: php_pcre.c,v 1.191 2006/07/20 22:40:44 rrichards Exp $ */
+/* $Id: php_pcre.c,v 1.192 2006/08/28 20:36:50 andrei Exp $ */
+
+/* UTODO
+ * - PCRE_NO_UTF8_CHECK option for Unicode strings
+ *
+ * php_pcre_match_all():
+ * - start_offset needs to count codepoints, probably via U8_FWD_1()
+ * - need to return matched substrings in the type matching the arguments
+ *
+ * php_pcre_split_impl():
+ * - Avoid the /./ bump for Unicode strings with U8_FWD_1()
+ */
#ifdef HAVE_CONFIG_H
#include "config.h"
@@ -174,7 +185,7 @@
/* {{{ pcre_get_compiled_regex_cache
*/
-PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int
regex_len TSRMLS_DC)
+PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int
regex_len, zend_bool from_unicode TSRMLS_DC)
{
pcre *re = NULL;
pcre_extra *extra;
@@ -198,7 +209,6 @@
/* Try to lookup the cached regex entry, and if successful, just pass
back the compiled pattern, otherwise go on and compile it. */
- regex_len = strlen(regex);
if (zend_hash_find(&PCRE_G(pcre_cache), regex, regex_len+1, (void
**)&pce) == SUCCESS) {
/*
* We use a quick pcre_info() check to see whether cache is
corrupted, and if it
@@ -208,13 +218,15 @@
zend_hash_clean(&PCRE_G(pcre_cache));
} else {
#if HAVE_SETLOCALE
- if (!strcmp(pce->locale, locale)) {
-#endif
+ if (!strcmp(pce->locale, locale) && from_unicode ==
pce->from_unicode) {
+ return pce;
+ }
+#else
+ if (from_unicode == pce->from_unicode) {
return pce;
-#if HAVE_SETLOCALE
}
- }
#endif
+ }
}
p = regex;
@@ -315,6 +327,12 @@
}
}
+ /* force UTF-8 mode for strings known to have been converted from
Unicode
+ (UTF-16) */
+ if (from_unicode) {
+ coptions |= PCRE_UTF8;
+ }
+
#if HAVE_SETLOCALE
if (strcmp(locale, "C"))
tables = pcre_maketables();
@@ -367,6 +385,7 @@
new_entry.extra = extra;
new_entry.preg_options = poptions;
new_entry.compile_options = coptions;
+ new_entry.from_unicode = from_unicode;
#if HAVE_SETLOCALE
new_entry.locale = pestrdup(locale, 1);
new_entry.tables = tables;
@@ -382,7 +401,7 @@
*/
PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int
*preg_options TSRMLS_DC)
{
- pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex,
strlen(regex) TSRMLS_CC);
+ pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex,
strlen(regex), 0 TSRMLS_CC);
if (extra) {
*extra = pce ? pce->extra : NULL;
@@ -399,7 +418,7 @@
*/
PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int
*preg_options, int *compile_options TSRMLS_DC)
{
- pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex,
strlen(regex) TSRMLS_CC);
+ pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex,
strlen(regex), 0 TSRMLS_CC);
if (extra) {
*extra = pce ? pce->extra : NULL;
@@ -416,7 +435,7 @@
/* }}} */
/* {{{ add_offset_pair */
-static inline void add_offset_pair(zval *result, char *str, int len, int
offset, char *name)
+static inline void add_offset_pair(zval *result, char *str, int len, int
offset, char *name, zend_bool make_unicode TSRMLS_DC)
{
zval *match_pair;
@@ -425,45 +444,83 @@
INIT_PZVAL(match_pair);
/* Add (match, offset) to the return value */
- add_next_index_stringl(match_pair, str, len, 1);
+ if (make_unicode) {
+ add_next_index_utf8_stringl(match_pair, str, len, 1);
+ } else {
+ add_next_index_stringl(match_pair, str, len, 1);
+ }
add_next_index_long(match_pair, offset);
if (name) {
zval_add_ref(&match_pair);
- zend_hash_update(Z_ARRVAL_P(result), name, strlen(name)+1,
&match_pair, sizeof(zval *), NULL);
+ if (make_unicode) {
+ UErrorCode status = U_ZERO_ERROR;
+ UChar *u = NULL;
+ int u_len;
+ zend_string_to_unicode_ex(UG(utf8_conv), &u, &u_len,
name, strlen(name), &status);
+ zend_u_hash_update(Z_ARRVAL_P(result), IS_UNICODE,
ZSTR(u), u_len+1, &match_pair, sizeof(zval *), NULL);
+ efree(u);
+ } else {
+ zend_hash_update(Z_ARRVAL_P(result), name,
strlen(name)+1, &match_pair, sizeof(zval *), NULL);
+ }
}
zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair,
sizeof(zval *), NULL);
}
/* }}} */
-static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{
*/
+/* {{{ php_do_pcre_match */
+static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global)
{
/* parameters */
- char *regex; /* Regular
expression */
- char *subject; /* String to
match against */
+ zstr regex; /* Regular
expression */
+ zstr subject; /* String to
match against */
int regex_len;
int subject_len;
pcre_cache_entry *pce; /* Compiled regular
expression */
zval *subpats = NULL; /* Array for
subpatterns */
long flags = 0; /* Match control flags
*/
long start_offset = 0; /* Where the new search
starts */
+ zend_uchar str_type;
+ char *regex_utf8 = NULL, *subject_utf8 = NULL;
+ int regex_utf8_len, subject_utf8_len;
+ UErrorCode status = U_ZERO_ERROR;
- if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, ((global) ?
"ssz|ll" : "ss|zll"), &regex, &regex_len,
- &subject,
&subject_len, &subpats, &flags, &start_offset) == FAILURE) {
+ if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, ((global) ?
"TTz|ll" : "TT|zll"), &regex, &regex_len, &str_type,
+ &subject,
&subject_len, &str_type, &subpats, &flags, &start_offset) == FAILURE) {
RETURN_FALSE;
}
+
+ if (str_type == IS_UNICODE) {
+ zend_unicode_to_string_ex(UG(utf8_conv), &regex_utf8,
&regex_utf8_len, regex.u, regex_len, &status);
+ zend_unicode_to_string_ex(UG(utf8_conv), &subject_utf8,
&subject_utf8_len, subject.u, subject_len, &status);
+ regex.s = regex_utf8;
+ regex_len = regex_utf8_len;
+ subject.s = subject_utf8;
+ subject_len = subject_utf8_len;
+ }
/* Compile regex or get it from cache. */
- if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC))
== NULL) {
+ if ((pce = pcre_get_compiled_regex_cache(regex.s, regex_len, (str_type
== IS_UNICODE) TSRMLS_CC)) == NULL) {
+ if (str_type == IS_UNICODE) {
+ efree(regex_utf8);
+ efree(subject_utf8);
+ }
RETURN_FALSE;
}
- php_pcre_match_impl(pce, subject, subject_len, return_value, subpats,
- global, ZEND_NUM_ARGS() >= 4, flags, start_offset TSRMLS_CC);
+ php_pcre_match_impl(pce, subject.s, subject_len, return_value, subpats,
+ global, ZEND_NUM_ARGS() >= 4, flags, start_offset, (str_type ==
IS_UNICODE) TSRMLS_CC);
+
+ if (str_type == IS_UNICODE) {
+ efree(regex_utf8);
+ efree(subject_utf8);
+ }
}
+/* }}} */
+/* {{{ php_pcre_match_impl */
PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int
subject_len, zval *return_value,
- zval *subpats, int global, int use_flags, long flags, long start_offset
TSRMLS_DC)
+ zval *subpats, int global, int use_flags, long flags, long
start_offset, zend_bool is_utf8 TSRMLS_DC)
{
zval *result_set, /* Holds a set of
subpatterns after
a global match */
@@ -512,11 +569,23 @@
offset_capture = 0;
}
- /* Negative offset counts from the end of the string. */
- if (start_offset < 0) {
- start_offset = subject_len + start_offset;
+ if (is_utf8) {
+ int k = 0;
+ /* Calculate byte offset from codepoint offset */
if (start_offset < 0) {
- start_offset = 0;
+ k = subject_len;
+ U8_BACK_N(subject, 0, k, -start_offset);
+ } else {
+ U8_FWD_N(subject, k, subject_len, start_offset);
+ }
+ start_offset = k;
+ } else {
+ /* Negative offset counts from the end of the string. */
+ if (start_offset < 0) {
+ start_offset = subject_len + start_offset;
+ if (start_offset < 0) {
+ start_offset = 0;
+ }
}
}
@@ -630,10 +699,15 @@
for (i = 0; i < count; i++) {
if (offset_capture) {
add_offset_pair(match_sets[i], (char *)stringlist[i],
-
offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
+
offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL, is_utf8
TSRMLS_CC);
} else {
-
add_next_index_stringl(match_sets[i], (char *)stringlist[i],
-
offsets[(i<<1)+1] - offsets[i<<1], 1);
+ if (is_utf8) {
+
add_next_index_utf8_stringl(match_sets[i], (char *)stringlist[i],
+
offsets[(i<<1)+1] -
offsets[i<<1], 1);
+ } else {
+
add_next_index_stringl(match_sets[i], (char *)stringlist[i],
+
offsets[(i<<1)+1] - offsets[i<<1], 1);
+ }
}
}
/*
@@ -642,8 +716,14 @@
* arrays with empty strings.
*/
if (count < num_subpats) {
- for (; i < num_subpats;
i++) {
-
add_next_index_string(match_sets[i], "", 1);
+ if (is_utf8) {
+ for (; i <
num_subpats; i++) {
+
add_next_index_unicode(match_sets[i], EMPTY_STR, 1);
+ }
+ } else {
+ for (; i <
num_subpats; i++) {
+
add_next_index_string(match_sets[i], "", 1);
+ }
}
}
} else {
@@ -656,14 +736,25 @@
for (i = 0; i < count; i++) {
if (offset_capture) {
add_offset_pair(result_set, (char *)stringlist[i],
-
offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1],
subpat_names[i]);
+
offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1],
+
subpat_names[i], is_utf8 TSRMLS_CC);
} else {
if
(subpat_names[i]) {
-
add_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
+ if
(is_utf8) {
+
add_assoc_utf8_stringl(result_set, subpat_names[i], (char *)stringlist[i],
+
offsets[(i<<1)+1] - offsets[i<<1],
1);
+ } else {
+
add_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
+
offsets[(i<<1)+1] - offsets[i<<1], 1);
+ }
+ }
+ if (is_utf8) {
+
add_next_index_utf8_stringl(result_set, (char *)stringlist[i],
+
offsets[(i<<1)+1] -
offsets[i<<1], 1);
+ } else {
+
add_next_index_stringl(result_set, (char *)stringlist[i],
offsets[(i<<1)+1] - offsets[i<<1], 1);
}
-
add_next_index_stringl(result_set, (char *)stringlist[i],
-
offsets[(i<<1)+1] - offsets[i<<1], 1);
}
}
/* And add it to the output
array */
@@ -675,14 +766,24 @@
if (offset_capture) {
add_offset_pair(subpats, (char *)stringlist[i],
offsets[(i<<1)+1] - offsets[i<<1],
-
offsets[i<<1], subpat_names[i]);
+
offsets[i<<1], subpat_names[i], is_utf8 TSRMLS_CC);
} else {
if (subpat_names[i]) {
-
add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
-
offsets[(i<<1)+1] - offsets[i<<1], 1);
+ if (is_utf8) {
+
add_assoc_utf8_stringl(subpats, subpat_names[i], (char *)stringlist[i],
+
offsets[(i<<1)+1] - offsets[i<<1], 1);
+ } else {
+
add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
+
offsets[(i<<1)+1] - offsets[i<<1], 1);
+ }
+ }
+ if (is_utf8) {
+
add_next_index_utf8_stringl(subpats, (char *)stringlist[i],
+
offsets[(i<<1)+1] - offsets[i<<1], 1);
+ } else {
+
add_next_index_stringl(subpats, (char *)stringlist[i],
+
offsets[(i<<1)+1] - offsets[i<<1], 1);
}
-
add_next_index_stringl(subpats, (char *)stringlist[i],
-
offsets[(i<<1)+1] - offsets[i<<1], 1);
}
}
}
@@ -696,7 +797,12 @@
to achieve this, unless we're already at the end of
the string. */
if (g_notempty != 0 && start_offset < subject_len) {
offsets[0] = start_offset;
- offsets[1] = start_offset + 1;
+ if (is_utf8) {
+ offsets[1] = start_offset;
+ U8_FWD_1(subject, offsets[1],
subject_len);
+ } else {
+ offsets[1] = start_offset + 1;
+ }
} else
break;
} else {
@@ -921,7 +1027,7 @@
pcre_cache_entry *pce; /* Compiled regular
expression */
/* Compile regex or get it from cache. */
- if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC))
== NULL) {
+ if ((pce = pcre_get_compiled_regex_cache(regex, regex_len, 0
TSRMLS_CC)) == NULL) {
return NULL;
}
@@ -1368,7 +1474,7 @@
}
/* Compile regex or get it from cache. */
- if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC))
== NULL) {
+ if ((pce = pcre_get_compiled_regex_cache(regex, regex_len, 0
TSRMLS_CC)) == NULL) {
RETURN_FALSE;
}
@@ -1452,7 +1558,7 @@
if (offset_capture) {
/* Add (match, offset) pair to the
return value */
- add_offset_pair(return_value,
last_match, &subject[offsets[0]]-last_match, next_offset, NULL);
+ add_offset_pair(return_value,
last_match, &subject[offsets[0]]-last_match, next_offset, NULL, 0 TSRMLS_CC);
} else {
/* Add the piece to the return value */
add_next_index_stringl(return_value,
last_match,
@@ -1474,7 +1580,7 @@
/* If we have matched a delimiter */
if (!no_empty || match_len > 0) {
if (offset_capture) {
-
add_offset_pair(return_value, &subject[offsets[i<<1]], match_len,
offsets[i<<1], NULL);
+
add_offset_pair(return_value, &subject[offsets[i<<1]], match_len,
offsets[i<<1], NULL, 0 TSRMLS_CC);
} else {
add_next_index_stringl(return_value,
&subject[offsets[i<<1]],
@@ -1531,7 +1637,7 @@
{
if (offset_capture) {
/* Add the last (match, offset) pair to the return
value */
- add_offset_pair(return_value, &subject[start_offset],
subject_len - start_offset, start_offset, NULL);
+ add_offset_pair(return_value, &subject[start_offset],
subject_len - start_offset, start_offset, NULL, 0 TSRMLS_CC);
} else {
/* Add the last piece to the return value */
add_next_index_stringl(return_value, last_match,
subject + subject_len - last_match, 1);
@@ -1647,7 +1753,7 @@
}
/* Compile regex or get it from cache. */
- if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC))
== NULL) {
+ if ((pce = pcre_get_compiled_regex_cache(regex, regex_len, 0
TSRMLS_CC)) == NULL) {
RETURN_FALSE;
}
http://cvs.php.net/viewvc.cgi/php-src/ext/pcre/php_pcre.h?r1=1.49&r2=1.50&diff_format=u
Index: php-src/ext/pcre/php_pcre.h
diff -u php-src/ext/pcre/php_pcre.h:1.49 php-src/ext/pcre/php_pcre.h:1.50
--- php-src/ext/pcre/php_pcre.h:1.49 Thu Jul 20 21:19:05 2006
+++ php-src/ext/pcre/php_pcre.h Mon Aug 28 20:36:50 2006
@@ -16,7 +16,7 @@
+----------------------------------------------------------------------+
*/
-/* $Id: php_pcre.h,v 1.49 2006/07/20 21:19:05 helly Exp $ */
+/* $Id: php_pcre.h,v 1.50 2006/08/28 20:36:50 andrei Exp $ */
#ifndef PHP_PCRE_H
#define PHP_PCRE_H
@@ -58,12 +58,13 @@
#endif
int compile_options;
int refcount;
+ zend_bool from_unicode;
} pcre_cache_entry;
-PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int
regex_len TSRMLS_DC);
+PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int
regex_len, zend_bool regex_is_utf8 TSRMLS_DC);
PHPAPI void php_pcre_match_impl( pcre_cache_entry *pce, char *subject, int
subject_len, zval *return_value,
- zval *subpats, int global, int use_flags, long flags, long start_offset
TSRMLS_DC);
+ zval *subpats, int global, int use_flags, long flags, long
start_offset, zend_bool is_utf8 TSRMLS_DC);
PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int
subject_len, zval *return_value,
int is_callable_replace, int *result_len, int limit, int *replace_count
TSRMLS_DC);
http://cvs.php.net/viewvc.cgi/php-src/ext/spl/spl_iterators.c?r1=1.147&r2=1.148&diff_format=u
Index: php-src/ext/spl/spl_iterators.c
diff -u php-src/ext/spl/spl_iterators.c:1.147
php-src/ext/spl/spl_iterators.c:1.148
--- php-src/ext/spl/spl_iterators.c:1.147 Fri Jul 21 21:09:49 2006
+++ php-src/ext/spl/spl_iterators.c Mon Aug 28 20:36:50 2006
@@ -16,7 +16,7 @@
+----------------------------------------------------------------------+
*/
-/* $Id: spl_iterators.c,v 1.147 2006/07/21 21:09:49 helly Exp $ */
+/* $Id: spl_iterators.c,v 1.148 2006/08/28 20:36:50 andrei Exp $ */
#ifdef HAVE_CONFIG_H
# include "config.h"
@@ -1444,7 +1444,7 @@
zval_ptr_dtor(&intern->current.data);
ALLOC_INIT_ZVAL(intern->current.data);
php_pcre_match_impl(intern->u.regex.pce, subject, subject_len,
&zcount,
- intern->current.data, intern->u.regex.mode ==
REGIT_MODE_ALL_MATCHES, intern->u.regex.use_flags, intern->u.regex.preg_flags,
0 TSRMLS_CC);
+ intern->current.data, intern->u.regex.mode ==
REGIT_MODE_ALL_MATCHES, intern->u.regex.use_flags, intern->u.regex.preg_flags,
0, 0 TSRMLS_CC);
count =
zend_hash_num_elements(Z_ARRVAL_P(intern->current.data));
RETVAL_BOOL(count > 0);
break;
--
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php