andrei Tue Sep 19 17:37:48 2006 UTC Modified files: /php-src/ext/pcre php_pcre.c Log: Fix offset calculation in add_offset_pair(). http://cvs.php.net/viewvc.cgi/php-src/ext/pcre/php_pcre.c?r1=1.200&r2=1.201&diff_format=u Index: php-src/ext/pcre/php_pcre.c diff -u php-src/ext/pcre/php_pcre.c:1.200 php-src/ext/pcre/php_pcre.c:1.201 --- php-src/ext/pcre/php_pcre.c:1.200 Tue Sep 19 11:48:59 2006 +++ php-src/ext/pcre/php_pcre.c Tue Sep 19 17:37:48 2006 @@ -16,12 +16,10 @@ +----------------------------------------------------------------------+ */ -/* $Id: php_pcre.c,v 1.200 2006/09/19 11:48:59 tony2001 Exp $ */ +/* $Id: php_pcre.c,v 1.201 2006/09/19 17:37:48 andrei Exp $ */ /* UTODO * - PCRE_NO_UTF8_CHECK option for Unicode strings - * - add_offset_pair() should convert offset to refer to codepoints or bytes, - * depending on whether subject string is IS_UNICODE or IS_STRING * * php_pcre_split_impl(): * - Avoid the /./ bump for Unicode strings with U8_FWD_1() @@ -67,6 +65,11 @@ PHP_PCRE_BAD_UTF8_ERROR, }; +typedef struct { + char *str; + int byte_offset; + int cp_offset; +} offset_map_t; ZEND_DECLARE_MODULE_GLOBALS(pcre); @@ -435,25 +438,32 @@ /* }}} */ /* {{{ add_offset_pair */ -static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name, zend_bool make_unicode TSRMLS_DC) +static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name, offset_map_t *prev TSRMLS_DC) { zval *match_pair; + int tmp; ALLOC_ZVAL(match_pair); array_init(match_pair); INIT_PZVAL(match_pair); /* Add (match, offset) to the return value */ - if (make_unicode) { - add_next_index_utf8_stringl(match_pair, str, len, 1); - } else { - add_next_index_stringl(match_pair, str, len, 1); + add_next_index_utf8_stringl(match_pair, str, len, 1); + + /* Calculate codepoint offset from the previous chunk */ + if (offset) { + tmp = prev->byte_offset; + while (tmp < offset) { + U8_FWD_1(prev->str, tmp, offset); + prev->cp_offset++; + } + prev->byte_offset = tmp; } - add_next_index_long(match_pair, offset); + add_next_index_long(match_pair, prev->cp_offset); if (name) { zval_add_ref(&match_pair); - if (make_unicode) { + if (UG(unicode)) { UErrorCode status = U_ZERO_ERROR; UChar *u = NULL; int u_len; @@ -686,6 +696,8 @@ /* If subpatterns array has been passed, fill it in with values. */ if (subpats != NULL) { + offset_map_t map = { subject, 0, 0 }; + /* Try to get the list of substrings and display a warning if failed. */ if (pcre_get_substring_list(subject, offsets, count, &stringlist) < 0) { efree(subpat_names); @@ -700,7 +712,7 @@ for (i = 0; i < count; i++) { if (offset_capture) { add_offset_pair(match_sets[i], (char *)stringlist[i], - offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL, UG(unicode) TSRMLS_CC); + offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL, &map TSRMLS_CC); } else { add_next_index_utf8_stringl(match_sets[i], (char *)stringlist[i], offsets[(i<<1)+1] - offsets[i<<1], 1); @@ -733,7 +745,7 @@ if (offset_capture) { add_offset_pair(result_set, (char *)stringlist[i], offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], - subpat_names[i], UG(unicode) TSRMLS_CC); + subpat_names[i], &map TSRMLS_CC); } else { if (subpat_names[i]) { add_assoc_utf8_stringl(result_set, subpat_names[i], (char *)stringlist[i], @@ -752,7 +764,7 @@ if (offset_capture) { add_offset_pair(subpats, (char *)stringlist[i], offsets[(i<<1)+1] - offsets[i<<1], - offsets[i<<1], subpat_names[i], UG(unicode) TSRMLS_CC); + offsets[i<<1], subpat_names[i], &map TSRMLS_CC); } else { if (subpat_names[i]) { add_assoc_utf8_stringl(subpats, subpat_names[i], (char *)stringlist[i], @@ -1521,6 +1533,7 @@ int no_empty; /* If NO_EMPTY flag is set */ int delim_capture; /* If delimiters should be captured */ int offset_capture; /* If offsets should be captured */ + offset_map_t map = { subject, 0, 0 }; no_empty = flags & PREG_SPLIT_NO_EMPTY; delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE; @@ -1580,7 +1593,7 @@ if (offset_capture) { /* Add (match, offset) pair to the return value */ - add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL, UG(unicode) TSRMLS_CC); + add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL, &map TSRMLS_CC); } else { /* Add the piece to the return value */ add_next_index_utf8_stringl(return_value, last_match, @@ -1603,7 +1616,7 @@ if (!no_empty || match_len > 0) { if (offset_capture) { add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, - offsets[i<<1], NULL, UG(unicode) TSRMLS_CC); + offsets[i<<1], NULL, &map TSRMLS_CC); } else { add_next_index_utf8_stringl(return_value, &subject[offsets[i<<1]], match_len, 1); @@ -1647,7 +1660,7 @@ if (offset_capture) { /* Add the last (match, offset) pair to the return value */ add_offset_pair(return_value, &subject[start_offset], - subject_len - start_offset, start_offset, NULL, UG(unicode) TSRMLS_CC); + subject_len - start_offset, start_offset, NULL, &map TSRMLS_CC); } else { /* Add the last piece to the return value */ add_next_index_utf8_stringl(return_value, last_match, subject + subject_len - last_match, 1);
-- PHP CVS Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php