andrei Mon Oct 2 16:52:22 2006 UTC Modified files: /php-src unicode-progress.txt /php-src/ext/standard string.c Log: Make stripos() work with Unicode strings. http://cvs.php.net/viewvc.cgi/php-src/unicode-progress.txt?r1=1.50&r2=1.51&diff_format=u Index: php-src/unicode-progress.txt diff -u php-src/unicode-progress.txt:1.50 php-src/unicode-progress.txt:1.51 --- php-src/unicode-progress.txt:1.50 Fri Sep 22 19:35:05 2006 +++ php-src/unicode-progress.txt Mon Oct 2 16:52:21 2006 @@ -27,7 +27,6 @@ Params API. Rest - no idea yet. stristr() - stripos() strripos() str_replace() stri_replace() @@ -158,6 +157,7 @@ strip_tags() stripcslashes() stripslashes() + stripos() strpbrk() strpos() strrchr() http://cvs.php.net/viewvc.cgi/php-src/ext/standard/string.c?r1=1.595&r2=1.596&diff_format=u Index: php-src/ext/standard/string.c diff -u php-src/ext/standard/string.c:1.595 php-src/ext/standard/string.c:1.596 --- php-src/ext/standard/string.c:1.595 Mon Oct 2 01:11:04 2006 +++ php-src/ext/standard/string.c Mon Oct 2 16:52:22 2006 @@ -18,7 +18,7 @@ +----------------------------------------------------------------------+ */ -/* $Id: string.c,v 1.595 2006/10/02 01:11:04 pollita Exp $ */ +/* $Id: string.c,v 1.596 2006/10/02 16:52:22 andrei Exp $ */ /* Synced with php 3.0 revision 1.193 1999-06-16 [ssb] */ @@ -2625,7 +2625,7 @@ } /* }}} */ -/* {{{ proto int stripos(string haystack, string needle [, int offset]) +/* {{{ proto int stripos(string haystack, string needle [, int offset]) U Finds position of first occurrence of a string within another, case insensitive */ PHP_FUNCTION(stripos) { @@ -2633,12 +2633,12 @@ long offset = 0; int haystack_len, needle_len = 0; zend_uchar str_type; - void *haystack_dup, *needle_dup = NULL; + void *haystack_dup = NULL, *needle_dup = NULL; char needle_char[2]; char c = 0; - UChar u_needle_char[3]; - UChar32 ch = 0; + UChar u_needle_char[8]; void *found = NULL; + int cu_offset = 0; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ZZ|l", &haystack, &needle, &offset) == FAILURE) { return; @@ -2662,6 +2662,7 @@ if (!Z_UNILEN_PP(needle) || Z_UNILEN_PP(needle) > haystack_len) { RETURN_FALSE; } + /* convert both strings to the same type */ if (Z_TYPE_PP(haystack) != Z_TYPE_PP(needle)) { str_type = zend_get_unified_string_type(2 TSRMLS_CC, Z_TYPE_PP(haystack), Z_TYPE_PP(needle)); convert_to_explicit_type_ex(haystack, str_type); @@ -2669,11 +2670,9 @@ } needle_len = Z_UNILEN_PP(needle); if (Z_TYPE_PP(haystack) == IS_UNICODE) { - haystack_dup = php_u_strtolower(Z_USTRVAL_PP(haystack), &haystack_len, UG(default_locale)); - needle_dup = php_u_strtolower(Z_USTRVAL_PP(needle), &needle_len, UG(default_locale)); - found = zend_u_memnstr((UChar *)haystack_dup + offset, - (UChar *)needle_dup, needle_len, - (UChar *)haystack_dup + haystack_len); + /* calculate codeunit offset */ + U16_FWD_N(Z_USTRVAL_PP(haystack), cu_offset, haystack_len, offset); + found = php_u_stristr(Z_USTRVAL_PP(haystack) + cu_offset, Z_USTRVAL_PP(needle), haystack_len, needle_len TSRMLS_CC); } else { haystack_dup = estrndup(Z_STRVAL_PP(haystack), haystack_len); php_strtolower((char *)haystack_dup, haystack_len); @@ -2688,14 +2687,22 @@ case IS_LONG: case IS_BOOL: if (Z_TYPE_PP(haystack) == IS_UNICODE) { - ch = u_tolower((UChar32)Z_LVAL_PP(needle)); + if (Z_LVAL_PP(needle) < 0 || Z_LVAL_PP(needle) > 0x10FFFF) { + php_error(E_WARNING, "Needle argument codepoint value out of range (0 - 0x10FFFF)"); + RETURN_FALSE; + } + needle_len = zend_codepoint_to_uchar((UChar32)Z_LVAL_PP(needle), u_needle_char); } else { c = tolower((char)Z_LVAL_PP(needle)); } break; case IS_DOUBLE: if (Z_TYPE_PP(haystack) == IS_UNICODE) { - ch = u_tolower((UChar32)Z_DVAL_PP(needle)); + if ((UChar32)Z_DVAL_PP(needle) < 0 || (UChar32)Z_DVAL_PP(needle) > 0x10FFFF) { + php_error(E_WARNING, "Needle argument codepoint value out of range (0 - 0x10FFFF)"); + RETURN_FALSE; + } + needle_len = zend_codepoint_to_uchar((UChar32)Z_DVAL_PP(needle), u_needle_char); } else { c = tolower((char)Z_DVAL_PP(needle)); } @@ -2707,18 +2714,12 @@ } if (Z_TYPE_PP(haystack) == IS_UNICODE) { - if (U_IS_BMP(ch)) { - u_needle_char[needle_len++] = ch; - u_needle_char[needle_len] = 0; - } else { - u_needle_char[needle_len++] = U16_LEAD(ch); - u_needle_char[needle_len++] = U16_TRAIL(ch); - u_needle_char[needle_len] = 0; - } - haystack_dup = php_u_strtolower(Z_USTRVAL_PP(haystack), &haystack_len, UG(default_locale)); - found = zend_u_memnstr((UChar *)haystack_dup + offset, - (UChar *)u_needle_char, needle_len, - (UChar *)haystack_dup + haystack_len); + /* calculate codeunit offset */ + U16_FWD_N(Z_USTRVAL_PP(haystack), cu_offset, haystack_len, offset); + u_needle_char[needle_len] = 0; + found = php_u_stristr(Z_USTRVAL_PP(haystack) + cu_offset, + u_needle_char, haystack_len, needle_len TSRMLS_CC); + } else { needle_char[0] = c; needle_char[1] = '\0'; @@ -2731,14 +2732,21 @@ } } - efree(haystack_dup); + if (haystack_dup) { + efree(haystack_dup); + } if (needle_dup) { efree(needle_dup); } if (found) { if (Z_TYPE_PP(haystack) == IS_UNICODE) { - RETURN_LONG((UChar *)found - (UChar *)haystack_dup); + /* Simple subtraction will not suffice, since there may be + supplementary codepoints. We count how many codepoints there are + between the starting offset and the found location and add them + to the starting codepoint offset. */ + RETURN_LONG(offset + u_countChar32(Z_USTRVAL_PP(haystack) + cu_offset, + (UChar*)found - (Z_USTRVAL_PP(haystack) + cu_offset))); } else { RETURN_LONG((char *)found - (char *)haystack_dup); }
-- PHP CVS Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php