Hi:

Enclosed is a patch to allow PCRE's preg_split to return an array of
(match, offset) pairs, if PREG_SPLIT_OFFSET_CAPTURE is or'd into the
flags parameter. Submitted for inclusion, rejection, extensive flaming,
or suggestions. :)


A bit of background:

I'm currently working on a cross-referencing system that uses character
offsets internally, matching entries in a word index to positions in a
file. The system captures its word list via preg_split, excluding
certain tags, character combinations, and whitespace from indexing.

Not finding an obvious way to capture the match offsets directly, I
tried:

  + Rescanning the input string with strstr(), starting from
    position(last_match) + 1, looking for the current match. While
    reasonably fast at O(n), it reports the wrong offset whenever the
    matched string also occurs inside the preceding delimiter, because
    strstr() finds that earlier occurrence first.

  + A somewhat involved sequence of two preg_split() calls and an
    array_diff(). One split uses PREG_SPLIT_DELIM_CAPTURE, and the
    array_diff() "finds" which strings are delimiters. The resulting
    array is then scanned, keeping a running total of string lengths.
    This works, but has an obviously large memory (and, to a lesser
    extent, run-time) cost.

Alternatives (especially other plain PHP solutions) are welcome.
Otherwise - is there more than a snowball's chance of something like
this being included in a future release?


Thanks in advance,

- Dave
  [EMAIL PROTECTED]

--- php-4.2.1-dist/ext/pcre/php_pcre.c  Thu Feb 28 03:26:35 2002
+++ php-4.2.1/ext/pcre/php_pcre.c       Fri May 17 11:28:02 2002
@@ -37,6 +37,7 @@
 
 #define        PREG_SPLIT_NO_EMPTY                     (1<<0)
 #define PREG_SPLIT_DELIM_CAPTURE       (1<<1)
+#define PREG_SPLIT_OFFSET_CAPTURE      (1<<2)
 
 #define PREG_REPLACE_EVAL                      (1<<0)
 
@@ -100,6 +101,7 @@
        REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | 
CONST_PERSISTENT);
        REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | 
CONST_PERSISTENT);
        REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, 
CONST_CS | CONST_PERSISTENT);
+       REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, 
+CONST_CS | CONST_PERSISTENT);
        REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | 
CONST_PERSISTENT);
        return SUCCESS;
 }
@@ -1080,8 +1082,10 @@
        int                              limit_val = -1;        /* Integer value of 
limit */
        int                              no_empty = 0;          /* If NO_EMPTY flag is 
set */
        int                              delim_capture = 0; /* If delimiters should be 
captured */
+       int                              offset_capture = 0;/* If offsets should be 
+captured */
        int                              count = 0;                     /* Count of 
matched subpatterns */
        int                              start_offset;          /* Where the new 
search starts */
+       int                              next_offset;           /* End of the last 
+delimiter match + 1 */
        int                              g_notempty = 0;        /* If the match should 
not be empty */
        char                    *match,                         /* The current match */
                                        *last_match;            /* Location of last 
match */
@@ -1102,6 +1106,7 @@
                        convert_to_long_ex(flags);
                        no_empty = Z_LVAL_PP(flags) & PREG_SPLIT_NO_EMPTY;
                        delim_capture = Z_LVAL_PP(flags) & PREG_SPLIT_DELIM_CAPTURE;
+                       offset_capture = Z_LVAL_PP(flags) & PREG_SPLIT_OFFSET_CAPTURE;
                }
        }
        
@@ -1123,6 +1128,7 @@
        
        /* Start at the beginning of the string */
        start_offset = 0;
+       next_offset = 0;
        last_match = Z_STRVAL_PP(subject);
        match = NULL;
        
@@ -1143,9 +1149,27 @@
                        match = Z_STRVAL_PP(subject) + offsets[0];
 
                        if (!no_empty || &Z_STRVAL_PP(subject)[offsets[0]] != 
last_match) {
-                               /* Add the piece to the return value */
-                               add_next_index_stringl(return_value, last_match,
-                                                                          
&Z_STRVAL_PP(subject)[offsets[0]]-last_match, 1);
+
+                               if (offset_capture) {
+                                       zval *match_pair;
+                                       ALLOC_ZVAL(match_pair);
+                                       array_init(match_pair);
+                                       INIT_PZVAL(match_pair);
+                               
+                                       /* Add (match, offset) to the return value */
+                                       add_next_index_stringl(match_pair, last_match,
+                                                                          
+&Z_STRVAL_PP(subject)[offsets[0]]-last_match, 1);
+                                       
+                                       add_next_index_long(match_pair, next_offset);
+
+                                       
+zend_hash_next_index_insert(Z_ARRVAL_P(return_value), &match_pair,
+                                                                                      
+         sizeof(zval *), NULL);
+
+                               } else {
+                       /* Add the piece to the return value */
+                                       add_next_index_stringl(return_value, 
+last_match,
+                                                                          
+&Z_STRVAL_PP(subject)[offsets[0]]-last_match, 1);
+                               }
 
                                /* One less left to do */
                                if (limit_val != -1)
@@ -1153,6 +1177,7 @@
                        }
                        
                        last_match = &Z_STRVAL_PP(subject)[offsets[1]];
+            next_offset = offsets[1];
 
                        if (delim_capture) {
                                int i, match_len;
@@ -1185,11 +1210,32 @@
                /* Advance to the position right after the last full match */
                start_offset = offsets[1];
        }
-       
+
+
        if (!no_empty || start_offset != Z_STRLEN_PP(subject))
-               /* Add the last piece to the return value */
-               add_next_index_string(return_value,
-                                                         
&Z_STRVAL_PP(subject)[start_offset], 1);
+       {
+               if (offset_capture) {
+                       zval *match_pair;
+                       ALLOC_ZVAL(match_pair);
+                       array_init(match_pair);
+                       INIT_PZVAL(match_pair);
+               
+                       /* Add the last (match, offset) pair to the return value */
+                       add_next_index_string(match_pair,
+                                                          
+&Z_STRVAL_PP(subject)[start_offset], 1);
+                       
+                       add_next_index_long(match_pair, start_offset);
+
+                       zend_hash_next_index_insert(Z_ARRVAL_P(return_value), 
+&match_pair,
+                                                                               
+sizeof(zval *), NULL);
+
+               } else {
+                       /* Add the last piece to the return value */
+                       add_next_index_string(return_value,
+                                                                       
+&Z_STRVAL_PP(subject)[start_offset], 1);
+               }
+       }
+
        
        /* Clean up */
        efree(offsets);

-- 
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to