andrei          Tue Sep 19 17:37:48 2006 UTC

  Modified files:              
    /php-src/ext/pcre   php_pcre.c 
  Log:
  Fix offset calculation in add_offset_pair().
  
  
http://cvs.php.net/viewvc.cgi/php-src/ext/pcre/php_pcre.c?r1=1.200&r2=1.201&diff_format=u
Index: php-src/ext/pcre/php_pcre.c
diff -u php-src/ext/pcre/php_pcre.c:1.200 php-src/ext/pcre/php_pcre.c:1.201
--- php-src/ext/pcre/php_pcre.c:1.200   Tue Sep 19 11:48:59 2006
+++ php-src/ext/pcre/php_pcre.c Tue Sep 19 17:37:48 2006
@@ -16,12 +16,10 @@
    +----------------------------------------------------------------------+
  */
 
-/* $Id: php_pcre.c,v 1.200 2006/09/19 11:48:59 tony2001 Exp $ */
+/* $Id: php_pcre.c,v 1.201 2006/09/19 17:37:48 andrei Exp $ */
 
 /* UTODO
  *  - PCRE_NO_UTF8_CHECK option for Unicode strings
- *  - add_offset_pair() should convert offset to refer to codepoints or bytes,
- *    depending on whether subject string is IS_UNICODE or IS_STRING
  *
  *  php_pcre_split_impl():
  *   - Avoid the /./ bump for Unicode strings with U8_FWD_1()
@@ -67,6 +65,11 @@
        PHP_PCRE_BAD_UTF8_ERROR,
 };
 
+typedef struct {
+       char *str;
+       int byte_offset;
+       int cp_offset;
+} offset_map_t;
 
 ZEND_DECLARE_MODULE_GLOBALS(pcre);
 
@@ -435,25 +438,32 @@
 /* }}} */
 
 /* {{{ add_offset_pair */
-static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name, zend_bool make_unicode TSRMLS_DC)
+static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name, offset_map_t *prev TSRMLS_DC)
 {
        zval *match_pair;
+       int tmp;
 
        ALLOC_ZVAL(match_pair);
        array_init(match_pair);
        INIT_PZVAL(match_pair);
 
        /* Add (match, offset) to the return value */
-       if (make_unicode) {
-               add_next_index_utf8_stringl(match_pair, str, len, 1);
-       } else {
-               add_next_index_stringl(match_pair, str, len, 1);
+       add_next_index_utf8_stringl(match_pair, str, len, 1);
+
+       /* Calculate codepoint offset from the previous chunk */
+       if (offset) {
+               tmp = prev->byte_offset;
+               while (tmp < offset) {
+                       U8_FWD_1(prev->str, tmp, offset);
+                       prev->cp_offset++;
+               }
+               prev->byte_offset = tmp;
        }
-       add_next_index_long(match_pair, offset);
+       add_next_index_long(match_pair, prev->cp_offset);
        
        if (name) {
                zval_add_ref(&match_pair);
-               if (make_unicode) {
+               if (UG(unicode)) {
                        UErrorCode status = U_ZERO_ERROR;
                        UChar *u = NULL;
                        int u_len;
@@ -686,6 +696,8 @@
 
                        /* If subpatterns array has been passed, fill it in 
with values. */
                        if (subpats != NULL) {
+                               offset_map_t map = { subject, 0, 0 };
+
                                /* Try to get the list of substrings and 
display a warning if failed. */
                                if (pcre_get_substring_list(subject, offsets, 
count, &stringlist) < 0) {
                                        efree(subpat_names);
@@ -700,7 +712,7 @@
                                                for (i = 0; i < count; i++) {
                                                        if (offset_capture) {
                                                                
add_offset_pair(match_sets[i], (char *)stringlist[i],
-                                                                               
                offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL, 
UG(unicode) TSRMLS_CC);
+                                                                               
                offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL, &map 
TSRMLS_CC);
                                                        } else {
                                                                
add_next_index_utf8_stringl(match_sets[i], (char *)stringlist[i],
                                                                                
                                        offsets[(i<<1)+1] - offsets[i<<1], 1);
@@ -733,7 +745,7 @@
                                                        if (offset_capture) {
                                                                
add_offset_pair(result_set, (char *)stringlist[i],
                                                                                
                offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1],
-                                                                               
                subpat_names[i], UG(unicode) TSRMLS_CC);
+                                                                               
                subpat_names[i], &map TSRMLS_CC);
                                                        } else {
                                                                if 
(subpat_names[i]) {
                                                                        
add_assoc_utf8_stringl(result_set, subpat_names[i], (char *)stringlist[i],
@@ -752,7 +764,7 @@
                                                if (offset_capture) {
                                                        
add_offset_pair(subpats, (char *)stringlist[i],
                                                                                
        offsets[(i<<1)+1] - offsets[i<<1],
-                                                                               
        offsets[i<<1], subpat_names[i], UG(unicode) TSRMLS_CC);
+                                                                               
        offsets[i<<1], subpat_names[i], &map TSRMLS_CC);
                                                } else {
                                                        if (subpat_names[i]) {
                                                                
add_assoc_utf8_stringl(subpats, subpat_names[i], (char *)stringlist[i],
@@ -1521,6 +1533,7 @@
        int                              no_empty;                      /* If 
NO_EMPTY flag is set */
        int                              delim_capture;         /* If 
delimiters should be captured */
        int                              offset_capture;        /* If offsets 
should be captured */
+       offset_map_t     map = { subject, 0, 0 };
 
        no_empty = flags & PREG_SPLIT_NO_EMPTY;
        delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
@@ -1580,7 +1593,7 @@
 
                                if (offset_capture) {
                                        /* Add (match, offset) pair to the 
return value */
-                                       add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL, UG(unicode) TSRMLS_CC);
+                                       add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL, &map TSRMLS_CC);
                                } else {
                                        /* Add the piece to the return value */
                                        
add_next_index_utf8_stringl(return_value, last_match,
@@ -1603,7 +1616,7 @@
                                        if (!no_empty || match_len > 0) {
                                                if (offset_capture) {
                                                        
add_offset_pair(return_value, &subject[offsets[i<<1]], match_len,
-                                                                               
        offsets[i<<1], NULL, UG(unicode) TSRMLS_CC);
+                                                                               
        offsets[i<<1], NULL, &map TSRMLS_CC);
                                                } else {
                                                        
add_next_index_utf8_stringl(return_value, &subject[offsets[i<<1]],
                                                                                
                                match_len, 1);
@@ -1647,7 +1660,7 @@
                if (offset_capture) {
                        /* Add the last (match, offset) pair to the return 
value */
                        add_offset_pair(return_value, &subject[start_offset],
-                                                       subject_len - start_offset, start_offset, NULL, UG(unicode) TSRMLS_CC);
+                                                       subject_len - start_offset, start_offset, NULL, &map TSRMLS_CC);
                } else {
                        /* Add the last piece to the return value */
                        add_next_index_utf8_stringl(return_value, last_match, 
subject + subject_len - last_match, 1);

-- 
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to