pollita         Tue Dec  5 04:13:47 2006 UTC

  Modified files:              
    /php-src/ext/standard       file.c file.h 
    /php-src/ext/standard/tests/file    bug12556.phpt fgetcsv.phpt 
  Log:
  Unicode upgrade for fgetcsv()
  
http://cvs.php.net/viewvc.cgi/php-src/ext/standard/file.c?r1=1.469&r2=1.470&diff_format=u
Index: php-src/ext/standard/file.c
diff -u php-src/ext/standard/file.c:1.469 php-src/ext/standard/file.c:1.470
--- php-src/ext/standard/file.c:1.469   Wed Nov 22 12:56:26 2006
+++ php-src/ext/standard/file.c Tue Dec  5 04:13:46 2006
@@ -21,7 +21,7 @@
    +----------------------------------------------------------------------+
  */
 
-/* $Id: file.c,v 1.469 2006/11/22 12:56:26 pajoye Exp $ */
+/* $Id: file.c,v 1.470 2006/12/05 04:13:46 pollita Exp $ */
 
 /* Synced with php 3.0 revision 1.218 1999-06-16 [ssb] */
 
@@ -1932,43 +1932,6 @@
 }
 /* }}} */
 
-static const char *php_fgetcsv_lookup_trailing_spaces(const char *ptr, size_t len, const char delimiter TSRMLS_DC)
-{
-       int inc_len;
-       unsigned char last_chars[2] = { 0, 0 };
-
-       while (len > 0) {
-               inc_len = (*ptr == '\0' ? 1: php_mblen(ptr, len));
-               switch (inc_len) {
-                       case -2:
-                       case -1:
-                               inc_len = 1;
-                               php_mblen(NULL, 0);
-                               break;
-                       case 0:
-                               goto quit_loop;
-                       case 1:
-                       default:
-                               last_chars[0] = last_chars[1];
-                               last_chars[1] = *ptr;
-                               break;
-               }
-               ptr += inc_len;
-               len -= inc_len;
-       }
-quit_loop:
-       switch (last_chars[1]) {
-               case '\n':
-                       if (last_chars[0] == '\r') {
-                               return ptr - 2;
-                       }
-                       /* break is omitted intentionally */
-               case '\r':
-                       return ptr - 1;
-       }
-       return ptr;
-}
-
 #define FPUTCSV_FLD_CHK(c) memchr(Z_STRVAL_PP(field), c, Z_STRLEN_PP(field))
 
 /* {{{ proto int fputcsv(resource fp, array fields [, string delimiter [, string enclosure]])
@@ -2072,87 +2035,149 @@
 }
 /* }}} */
 
-/* {{{ proto array fgetcsv(resource fp [,int length [, string delimiter [, string enclosure]]])
+/* {{{ proto array fgetcsv(resource fp [,int length [, string delimiter [, string enclosure[, string escape]]]]) U
    Get line from file pointer and parse for CSV fields */
-/* UTODO: Accept unicode contents */
+#define PHP_FGETCSV_TRUNCATE(field) \
+if (argc > 4) { \
+       /* Caller knows about new semantics since they're using new param, allow multichar */ \
+} else if (field##_type == IS_STRING && field##_len > 1) { \
+       php_error_docref(NULL TSRMLS_CC, E_NOTICE, #field " must be a single character"); \
+       delimiter_len = 1; \
+} else if (field##_type == IS_UNICODE && u_countChar32((UChar*)field, field##_len) > 1) { \
+       int __tmp = 0; \
+       php_error_docref(NULL TSRMLS_CC, E_NOTICE, #field " must be a single character"); \
+       U16_FWD_1(((UChar*)field), __tmp, field##_len); \
+       field##_len = __tmp; \
+}
+
 PHP_FUNCTION(fgetcsv)
 {
-       char delimiter = ',';   /* allow this to be set as parameter */
-       char enclosure = '"';   /* allow this to be set as parameter */
-       /* first section exactly as php_fgetss */
-
-       long len = 0;
-       size_t buf_len;
-       char *buf;
+       zend_uchar delimiter_type = IS_STRING, enclosure_type = IS_STRING, escape_type = IS_STRING;
+       char *delimiter = ",", *enclosure = "\"", *escape = "\\";
+       int delimiter_len = 1, enclosure_len = 1, escape_len = 1;
+       long len = -1;
+       zstr buf;
+       int buf_len, argc = ZEND_NUM_ARGS();
        php_stream *stream;
+       zval *zstream;
+       zend_uchar delimiter_free = 0, enclosure_free = 0, escape_free = 0;
 
-       {
-               zval *fd, **len_zv = NULL;
-               char *delimiter_str = NULL;
-               int delimiter_str_len = 0;
-               char *enclosure_str = NULL;
-               int enclosure_str_len = 0;
-
-               if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r|Zss",
-                                       &fd, &len_zv, &delimiter_str, 
&delimiter_str_len,
-                                       &enclosure_str, &enclosure_str_len) == 
FAILURE) {
-                       return;
-               }       
+       if (zend_parse_parameters(argc TSRMLS_CC, "r|l!ttt", &zstream, &len,
+                                               &delimiter, &delimiter_len, &delimiter_type,
+                                               &enclosure, &enclosure_len, &enclosure_type,
+                                               &escape,    &escape_len,    &escape_type) == FAILURE) {
+               return;
+       }
 
-               if (delimiter_str != NULL) {
-                       /* Make sure that there is at least one character in 
string */
-                       if (delimiter_str_len < 1) {
-                               php_error_docref(NULL TSRMLS_CC, E_WARNING, 
"delimiter must be a character");
-                               RETURN_FALSE;
-                       } else if (delimiter_str_len > 1) {
-                               php_error_docref(NULL TSRMLS_CC, E_NOTICE, 
"delimiter must be a single character");
-                       }
+       PHP_STREAM_TO_ZVAL(stream, &zstream);
+
+       /* Make sure that there is at least one character in string,
+        * For userspace BC purposes we generally limit delimiters and 
enclosures to 1 character,
+        * though the code now supports multiple characters
+        *
+        * If this function is called with all five parameters however,
+        * then multiple characters are allowed for all subarguments */
+       if (delimiter_len < 1) {
+               php_error_docref(NULL TSRMLS_CC, E_WARNING, "delimiter must be 
a character");
+               RETURN_FALSE;
+       } else PHP_FGETCSV_TRUNCATE(delimiter);
+
+       if (enclosure_len < 1) {
+               php_error_docref(NULL TSRMLS_CC, E_WARNING, "enclosure must be 
a character");
+               RETURN_FALSE;
+       } else PHP_FGETCSV_TRUNCATE(enclosure);
+
+       if (escape_len < 1) {
+               php_error_docref(NULL TSRMLS_CC, E_WARNING, "escape must be a 
character");
+               RETURN_FALSE;
+       }
+
+       if (len < -1) {
+               php_error_docref(NULL TSRMLS_CC, E_WARNING, "Length parameter 
may not be negative");
+               RETURN_FALSE;
+       } else if (len == 0) {
+               len = -1;
+       }
 
-                       /* use first character from string */
-                       delimiter = delimiter_str[0];
+       if (stream->readbuf_type == IS_STRING) {
+               /* Binary mode stream needs binary delmiter/enclosure */
+               if (delimiter_type == IS_UNICODE) {
+                       if (FAILURE == 
zend_unicode_to_string(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), &delimiter, 
&delimiter_len, (UChar*)delimiter, delimiter_len TSRMLS_CC)) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING, 
"Failed converting delimiter from unicode");
+                               RETVAL_FALSE;
+                               goto cleanup;
+                       }
+                       delimiter_free = 1;
+               }
+               if (enclosure_type == IS_UNICODE) {
+                       if (FAILURE == 
zend_unicode_to_string(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), &enclosure, 
&enclosure_len, (UChar*)enclosure, enclosure_len TSRMLS_CC)) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING, 
"Failed converting enclosure from unicode");
+                               RETVAL_FALSE;
+                               goto cleanup;
+                       }
+                       enclosure_free = 1;
+               }
+               if (escape_type == IS_UNICODE) {
+                       if (FAILURE == 
zend_unicode_to_string(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), &escape, 
&escape_len, (UChar*)escape, escape_len TSRMLS_CC)) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING, 
"Failed converting escape from unicode");
+                               RETVAL_FALSE;
+                               goto cleanup;
+                       }
+                       escape_free = 1;
                }
-       
-               if (enclosure_str != NULL) {
-                       if (enclosure_str_len < 1) {
-                               php_error_docref(NULL TSRMLS_CC, E_WARNING, 
"enclosure must be a character");
-                               RETURN_FALSE;
-                       } else if (enclosure_str_len > 1) {
-                               php_error_docref(NULL TSRMLS_CC, E_NOTICE, 
"enclosure must be a single character");
+       } else {
+               /* Unicode mode stream needs unicode delimiter/enclosure */
+               if (delimiter_type == IS_STRING) {
+                       if (FAILURE == 
zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), 
(UChar**)&delimiter, &delimiter_len, delimiter, delimiter_len TSRMLS_CC)) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING, 
"Failed converting delimiter to unicode");
+                               RETVAL_FALSE;
+                               goto cleanup;
                        }
-
-                       /* use first character from string */
-                       enclosure = enclosure_str[0];
+                       delimiter_free = 1;
                }
-
-               if (len_zv != NULL && Z_TYPE_PP(len_zv) != IS_NULL) {
-                       convert_to_long_ex(len_zv);
-                       len = Z_LVAL_PP(len_zv);
-                       if (len < 0) {
-                               php_error_docref(NULL TSRMLS_CC, E_WARNING, 
"Length parameter may not be negative");
-                               RETURN_FALSE;
-                       } else if (len == 0) {
-                               len = -1;
+               if (enclosure_type == IS_STRING) {
+                       if (FAILURE == 
zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), 
(UChar**)&enclosure, &enclosure_len, enclosure, enclosure_len TSRMLS_CC)) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING, 
"Failed converting enclosure to unicode");
+                               RETVAL_FALSE;
+                               goto cleanup;
                        }
-               } else {
-                       len = -1;
+                       enclosure_free = 1;
+               }
+               if (escape_type == IS_STRING) {
+                       if (FAILURE == 
zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), 
(UChar**)&escape, &escape_len, escape, escape_len TSRMLS_CC)) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING, 
"Failed converting escape to unicode");
+                               RETVAL_FALSE;
+                               goto cleanup;
+                       }
+                       escape_free = 1;
                }
+       }
 
-               PHP_STREAM_TO_ZVAL(stream, &fd);
+       buf.v = php_stream_get_line_ex(stream, stream->readbuf_type, NULL_ZSTR, 0, len, &buf_len);
+       if (!buf.v) {
+               /* No data */
+               RETVAL_FALSE;
+               goto cleanup;
        }
 
-       if (len < 0) {
-               if ((buf = php_stream_get_line(stream, NULL_ZSTR, 0, &buf_len)) 
== NULL) {
-                       RETURN_FALSE;
-               }
+       if (stream->readbuf_type == IS_UNICODE) {
+               /* Unicode mode */
+               php_u_fgetcsv(stream, (UChar*)delimiter, delimiter_len, (UChar*)enclosure, enclosure_len, (UChar*)escape, escape_len, buf.u, buf_len, return_value TSRMLS_CC);
        } else {
-               buf = emalloc(len + 1);
-               if (php_stream_get_line(stream, ZSTR(buf), len + 1, &buf_len) 
== NULL) {
-                       efree(buf);
-                       RETURN_FALSE;
-               }
+               /* Binary mode */
+               php_fgetcsv_ex(stream, delimiter, delimiter_len, enclosure, enclosure_len, escape, escape_len, buf.s, buf_len, return_value TSRMLS_CC);
        }
 
-       php_fgetcsv(stream, delimiter, enclosure, buf_len, buf, return_value 
TSRMLS_CC);
+cleanup:
+       if (delimiter_free) {
+               efree(delimiter);
+       }
+       if (enclosure_free) {
+               efree(enclosure);
+       }
+       if (escape_free) {
+               efree(escape);
+       }
 }
 /* }}} */
 
@@ -2161,266 +2186,442 @@
                size_t buf_len, char *buf,
                zval *return_value TSRMLS_DC)
 {
-       char *temp, *tptr, *bptr, *line_end, *limit;
-       const char escape_char = '\\';
+       char *delim = &delimiter, *enc = &enclosure, *buffer = buf;
+       int delim_len = 1, enc_len = 1, buffer_len = buf_len;
+       zend_uchar type = IS_STRING;
 
-       size_t temp_len, line_end_len;
-       int inc_len;
+       if (stream) {
+               type = stream->readbuf_type;
+       }
 
-       /* initialize internal state */
-       php_mblen(NULL, 0);
+       if (type == IS_UNICODE) {
+               UChar esc = '\\';
 
-       /* Now into new section that parses buf for delimiter/enclosure fields 
*/
+               /* Unicode stream, but binary delimiter/enclosures/prefetch, 
promote to unicode */
+               if (FAILURE == 
zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), 
(UChar**)&delim, &delim_len, &delimiter, 1 TSRMLS_CC)) {
+                       INIT_PZVAL(return_value);
+                       return;
+               }
+               if (FAILURE == 
zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), 
(UChar**)&enc, &enc_len, &enclosure, 1 TSRMLS_CC)) {
+                       efree(delim);
+                       INIT_PZVAL(return_value);
+                       return;
+               }
+               if (FAILURE == 
zend_string_to_unicode(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), 
(UChar**)&buffer, &buffer_len, buf, buf_len TSRMLS_CC)) {
+                       efree(delim);
+                       efree(enc);
+                       INIT_PZVAL(return_value);
+                       return;
+               }
 
-       /* Strip trailing space from buf, saving end of line in case required 
for enclosure field */
+               php_u_fgetcsv(stream, (UChar*)delim, delim_len, (UChar*)enc, 
enc_len, &esc, 1,
+                               (UChar*)buffer, buffer_len, return_value 
TSRMLS_CC);
 
-       bptr = buf;
-       tptr = (char *)php_fgetcsv_lookup_trailing_spaces(buf, buf_len, 
delimiter TSRMLS_CC);
-       line_end_len = buf_len - (size_t)(tptr - buf);
-       line_end = limit = tptr;
+               /* Types converted, free storage */
+               efree(delim);
+               efree(enc);
+               efree(buffer);
+       } else {
+               /* Binary stream with binary delimiter/enclosures/prefetch */
+               php_fgetcsv_ex(stream, delim, delim_len, enc, enc_len, "\\", 1, 
buffer, buffer_len, return_value TSRMLS_CC);
+       }
+}
 
-       /* reserve workspace for building each individual field */
-       temp_len = buf_len;
-       temp = emalloc(temp_len + line_end_len + 1);
+typedef enum _php_fgetcsv_state {
+       PHP_FGETCSV_READY,
+       PHP_FGETCSV_FIELD_NO_ENC,
+       PHP_FGETCSV_FIELD_WITH_ENC,
+       PHP_FGETCSV_POST_ENC,
+} php_fgetcsv_state;
+
+#define PHP_FGETCSV_BIN_CHECK(p, e, m, mlen) ((p) < (e) && (((mlen) == 1 && *(p) == *(m)) || ((mlen) > 1 && (((e) - (p)) >= (mlen)) && memcmp((p), (m), (mlen)) == 0)))
+
+/* Binary mode fgetcsv */
+PHPAPI void php_fgetcsv_ex(php_stream *stream,
+               char *delimiter, int delimiter_len,
+               char *enclosure, int enclosure_len,
+               char *escape, int escape_len,
+               char *buffer, int buffer_len,
+               zval *return_value TSRMLS_DC)
+{
+       php_fgetcsv_state state = PHP_FGETCSV_READY;
+       char *p = buffer, *e = buffer + buffer_len, *field_start = NULL, *field_end = NULL;
 
-       /* Initialize return array */
        array_init(return_value);
 
-       /* Main loop to read CSV fields */
-       /* NB this routine will return a single null entry for a blank line */
-
-       do {
-               char *comp_end, *hunk_begin;
+       while(p < e) {
+               switch (state) {
+                       case PHP_FGETCSV_READY:
+ready_state:
+                               /* Ready to start a new field */
+
+                               /* Is there nothing left to scan? */
+                               if (*p == '\r' || *p == '\n') {
+                                       /* Terminal delimiter, treat as empty 
field */
+                                       p++;
+                                       add_next_index_stringl(return_value, 
"", 0, 1);
+                                       break;
+                               }
 
-               tptr = temp;
+                               /* Is it enclosed? */
+                               if (PHP_FGETCSV_BIN_CHECK(p, e, enclosure, 
enclosure_len)) {
+                                       /* Enclosure encountered, switch state 
*/
+                                       state = PHP_FGETCSV_FIELD_WITH_ENC;
+                                       p += enclosure_len;
+                                       field_start = p;
+                                       break;
+                               }
 
-               /* 1. Strip any leading space */
-               for (;;) {
-                       inc_len = (bptr < limit ? (*bptr == '\0' ? 1: 
php_mblen(bptr, limit - bptr)): 0);
-                       switch (inc_len) {
-                               case -2:
-                               case -1:
-                                       inc_len = 1;
-                                       php_mblen(NULL, 0);
+                               /* Is it an immediate delimiter? */
+                               if (PHP_FGETCSV_BIN_CHECK(p, e, delimiter, 
delimiter_len)) {
+                                       /* Immediate delimiter, treate as empty 
field */
+                                       p += delimiter_len;
+                                       add_next_index_stringl(return_value, 
"", 0, 1);
                                        break;
-                               case 0:
-                                       goto quit_loop_1;
-                               case 1:
-                                       if (!isspace((int)*(unsigned char 
*)bptr) || *bptr == delimiter) {
-                                               goto quit_loop_1;
+                               }
+
+                               /* Whitespace? */
+                               if (*p == ' ' || *p == '\t') {
+                                       p++;
+                                       if (p >= e) break;
+                                       goto ready_state;
+                               }
+
+                               /* Is it an escape character? */
+                               if (PHP_FGETCSV_BIN_CHECK(p, e, escape, 
escape_len)) {
+                                       /* Skip escape sequence and let next 
char be treated as literal */
+                                       p += escape_len;
+                                       /* FALL THROUGH */
+                               }
+
+                               /* Otherwise, starting a new field without 
enclosures */
+                               state = PHP_FGETCSV_FIELD_NO_ENC;
+                               field_start = p;
+                               field_end = NULL;
+                               p++;
+                               break;
+
+                       case PHP_FGETCSV_FIELD_WITH_ENC:
+with_enc:
+                               /* Check for ending enclosure */
+                               if (PHP_FGETCSV_BIN_CHECK(p, e, enclosure, 
enclosure_len)) {
+                                       /* Enclosure encountered, is it paired? 
*/
+                                       if (PHP_FGETCSV_BIN_CHECK(p + 
enclosure_len, e, enclosure, enclosure_len)) {
+                                               /* Double enclosure gets 
translated to single enclosure */
+                                               memmove(p, p + enclosure_len, 
(e - p) - enclosure_len);
+                                               e -= enclosure_len;
+                                               p += enclosure_len;
+                                               goto with_enc;
+                                       } else {
+                                               /* Genuine end enclosure, 
switch state */
+                                               field_end = p;
+                                               p += enclosure_len;
+                                               state = PHP_FGETCSV_POST_ENC;
+                                               goto post_enc;
                                        }
+                               }
+
+                               /* Check for field escapes */
+                               if (PHP_FGETCSV_BIN_CHECK(p, e, escape, 
escape_len)) {
+                                       p += escape_len + 1;
+
+                                       /* Reprocess for ending enclosures */
+                                       goto with_enc;
+                               }
+
+                               /* Simple character */
+                               if (e - p) {
+                                       p++;
+                               }
+
+                               /* Hungry? */
+                               if (((e - p) < enclosure_len) && stream) {
+                                       /* Feed me! */
+                                       int new_len;
+                                       char *new_buf = 
php_stream_get_line(stream, NULL_ZSTR, 0, &new_len);
+
+                                       if (new_buf) {
+                                               int tmp_len = new_len + e - 
field_start;
+                                               char *tmp = emalloc(tmp_len);
+
+                                               /* Realign scan buffer */
+                                               memcpy(tmp, field_start, e - 
field_start);
+                                               memcpy(tmp + (e - field_start), 
new_buf, new_len);
+                                               field_start = tmp;
+                                               if (field_end) {
+                                                       field_end = tmp + 
(field_end - field_start);
+                                               }
+                                               efree(buffer);
+                                               efree(new_buf);
+                                               buffer = tmp;
+                                               buffer_len = tmp_len;
+                                               p = buffer;
+                                               e = buffer + buffer_len;
+                                       }
+                               }
+
+                               if ((e - p) == 0) {
+                                       /* Nothing left to consume the buffer, 
use it */
+                                       add_next_index_stringl(return_value, 
field_start, p - field_start, 1);
+
+                                       /* Loop is dying anyway, but be 
pedantic */
+                                       state = PHP_FGETCSV_READY;
+                                       field_start = field_end = NULL;
                                        break;
-                               default:
-                                       goto quit_loop_1;
-                       }
-                       bptr += inc_len;
+                               }
+                               break;
+
+                       case PHP_FGETCSV_POST_ENC:
+post_enc:
+                               /* Check for delimiters or EOL */
+                               if (p >= e || *p == '\r' || *p == '\n' || 
PHP_FGETCSV_BIN_CHECK(p, e, delimiter, delimiter_len)) {
+                                       int field_len = field_end - field_start;
+                                       char *field;
+
+                                       if ((p - enclosure_len) > field_end) {
+                                               /* There's cruft, append it to 
the proper field */
+                                               int cruft_len = p - (field_end 
+ enclosure_len);
+
+                                               field = emalloc(field_len + 
cruft_len + 1);
+                                               memcpy(field, field_start, 
field_len);
+                                               memcpy(field + field_len, 
field_end + enclosure_len, cruft_len);
+
+                                               field_len += cruft_len;
+                                               field[field_len] = 0;
+                                       } else {
+                                               field = estrndup(field_start, 
field_end - field_start);
+                                       }
+                                       add_next_index_stringl(return_value, 
field, field_len, 0);
+
+                                       /* Reset scanner */
+                                       state = PHP_FGETCSV_READY;
+                                       field_start = field_end = NULL;
+                                       p += delimiter_len;
+                                       goto ready_state;
+                               }
+
+                               /* Queue anything else as cruft */
+                               p++;
+                               break;
+
+                       case PHP_FGETCSV_FIELD_NO_ENC:
+                               /* Check for escapes */
+                               if (PHP_FGETCSV_BIN_CHECK(p, e, escape, 
escape_len)) {
+                                       p += escape_len + 1;
+                               }
+
+                               /* Check for delimiter */
+                               if (p >= e || *p == '\r' || *p == '\n' || 
PHP_FGETCSV_BIN_CHECK(p, e, delimiter, delimiter_len)) {
+                                       add_next_index_stringl(return_value, 
field_start, p - field_start, 1);
+
+                                       /* Reset scanner */
+                                       state = PHP_FGETCSV_READY;
+                                       field_start = field_end = NULL;
+                                       p += delimiter_len;
+                                       goto ready_state;
+                               }
+
+                               /* Simple character */
+                               p++;
+                               break;
                }
+       }
 
-       quit_loop_1:
-               /* 2. Read field, leaving bptr pointing at start of next field 
*/
-               if (inc_len != 0 && *bptr == enclosure) {
-                       int state = 0;
-
-                       bptr++; /* move on to first character in field */
-                       hunk_begin = bptr;
-
-                       /* 2A. handle enclosure delimited field */
-                       for (;;) {
-                               switch (inc_len) {
-                                       case 0:
-                                               switch (state) {
-                                                       case 2:
-                                                               memcpy(tptr, 
hunk_begin, bptr - hunk_begin - 1);
-                                                               tptr += (bptr - 
hunk_begin - 1);
-                                                               hunk_begin = 
bptr;
-                                                               goto 
quit_loop_2;
-
-                                                       case 1:
-                                                               memcpy(tptr, 
hunk_begin, bptr - hunk_begin);
-                                                               tptr += (bptr - 
hunk_begin);
-                                                               hunk_begin = 
bptr;
-                                                               /* break is 
omitted intentionally */
-
-                                                       case 0: {
-                                                               char *new_buf;
-                                                               size_t new_len;
-                                                               char *new_temp;
-
-                                                               memcpy(tptr, 
hunk_begin, bptr - hunk_begin);
-                                                               tptr += (bptr - 
hunk_begin);
-                                                               hunk_begin = 
bptr;
-                                                               if (hunk_begin 
!= line_end) {
-                                                                       
memcpy(tptr, hunk_begin, bptr - hunk_begin);
-                                                                       tptr += 
(bptr - hunk_begin);
-                                                                       
hunk_begin = bptr;
-                                                               }
-
-                                                               /* add the 
embedded line end to the field */
-                                                               memcpy(tptr, 
line_end, line_end_len);
-                                                               tptr += 
line_end_len;
-
-                                                               if ((new_buf = 
php_stream_get_line(stream, NULL_ZSTR, 0, &new_len)) == NULL) {
-                                                                       /* 
we've got an unterminated enclosure,
-                                                                        * 
assign all the data from the start of
-                                                                        * the 
enclosure to end of data to the
-                                                                        * last 
element */
-                                                                       if 
((size_t)temp_len > (size_t)(limit - buf)) { 
-                                                                               
goto quit_loop_2;
-                                                                       }
-                                                                       
zval_dtor(return_value);
-                                                                       
RETVAL_FALSE;
-                                                                       goto 
out;
-                                                               }
-                                                               temp_len += 
new_len;
-                                                               new_temp = 
erealloc(temp, temp_len);
-                                                               tptr = new_temp 
+ (size_t)(tptr - temp);
-                                                               temp = new_temp;
-
-                                                               efree(buf);
-                                                               buf_len = 
new_len;
-                                                               bptr = buf = 
new_buf;
-                                                               hunk_begin = 
buf;
+       efree(buffer);
+}
+/* }}} */
 
-                                                               line_end = 
limit = (char *)php_fgetcsv_lookup_trailing_spaces(buf, buf_len, delimiter 
TSRMLS_CC);
-                                                               line_end_len = 
buf_len - (size_t)(limit - buf); 
+#define PHP_FGETCSV_UNI_CHECK(p, e, m, mlen) ((p) < (e) && (((mlen) == 1 && 
*(p) == *(m)) || ((mlen) > 1 && (((e) - (p)) >= (mlen)) && memcmp((p), (m), 
UBYTES(mlen)) == 0)))
 
-                                                               state = 0;
-                                                       } break;
-                                               }
-                                               break;
+/* Unicode mode fgetcsv */
+PHPAPI void php_u_fgetcsv(php_stream *stream,
+               UChar *delimiter, int delimiter_len,
+               UChar *enclosure, int enclosure_len,
+               UChar *escape, int escape_len,
+               UChar *buffer, int buffer_len,
+               zval *return_value TSRMLS_DC)
+{
+       php_fgetcsv_state state = PHP_FGETCSV_READY;
+       UChar *p = buffer, *e = buffer + buffer_len, *field_start = NULL, 
*field_end = NULL;
 
-                                       case -2:
-                                       case -1:
-                                               php_mblen(NULL, 0);
-                                               /* break is omitted 
intentionally */
-                                       case 1:
-                                               /* we need to determine if the 
enclosure is
-                                                * 'real' or is it escaped */
-                                               switch (state) {
-                                                       case 1: /* escaped */
-                                                               bptr++;
-                                                               state = 0;
-                                                               break;
-                                                       case 2: /* embedded 
enclosure ? let's check it */
-                                                               if (*bptr != 
enclosure) {
-                                                                       /* real 
enclosure */
-                                                                       
memcpy(tptr, hunk_begin, bptr - hunk_begin - 1);
-                                                                       tptr += 
(bptr - hunk_begin - 1);
-                                                                       
hunk_begin = bptr;
-                                                                       goto 
quit_loop_2;
-                                                               }
-                                                               memcpy(tptr, 
hunk_begin, bptr - hunk_begin);
-                                                               tptr += (bptr - 
hunk_begin);
-                                                               bptr++;
-                                                               hunk_begin = 
bptr;
-                                                               state = 0;
-                                                               break;
-                                                       default:
-                                                               if (*bptr == 
escape_char) {
-                                                                       state = 
1;
-                                                               } else if 
(*bptr == enclosure) {
-                                                                       state = 
2;
-                                                               }
-                                                               bptr++;
-                                                               break;
-                                               }
-                                               break;
+       array_init(return_value);
 
-                                       default:
-                                               switch (state) {
-                                                       case 2:
-                                                               /* real 
enclosure */
-                                                               memcpy(tptr, 
hunk_begin, bptr - hunk_begin - 1);
-                                                               tptr += (bptr - 
hunk_begin - 1);
-                                                               hunk_begin = 
bptr;
-                                                               goto 
quit_loop_2;
-                                                       case 1:
-                                                               bptr += inc_len;
-                                                               memcpy(tptr, 
hunk_begin, bptr - hunk_begin);
-                                                               tptr += (bptr - 
hunk_begin);
-                                                               hunk_begin = 
bptr;
-                                                               break;
-                                                       default:
-                                                               bptr += inc_len;
-                                                               break;
-                                               }
-                                               break;
+       while(p < e) {
+               switch (state) {
+                       case PHP_FGETCSV_READY:
+ready_state:
+                               /* Ready to start a new field */
+
+                               /* Is there nothing left to scan? */
+                               if (*p == '\r' || *p == '\n') {
+                                       /* Terminal delimiter, treat as empty 
field */
+                                       p++;
+                                       add_next_index_stringl(return_value, 
"", 0, 1);
+                                       break;
                                }
-                               inc_len = (bptr < limit ? (*bptr == '\0' ? 1: 
php_mblen(bptr, limit - bptr)): 0);
-                       }
 
-               quit_loop_2:
-                       /* look up for a delimiter */
-                       for (;;) {
-                               switch (inc_len) {
-                                       case 0:
-                                               goto quit_loop_3;
-
-                                       case -2:
-                                       case -1:
-                                               inc_len = 1;
-                                               php_mblen(NULL, 0);
-                                               /* break is omitted 
intentionally */
-                                       case 1:
-                                               if (*bptr == delimiter) {
-                                                       goto quit_loop_3;
-                                               }
-                                               break;
-                                       default:
-                                               break;
+                               /* Is it enclosed? */
+                               if (PHP_FGETCSV_UNI_CHECK(p, e, enclosure, 
enclosure_len)) {
+                                       /* Enclosure encountered, switch state 
*/
+                                       state = PHP_FGETCSV_FIELD_WITH_ENC;
+                                       p += enclosure_len;
+                                       field_start = p;
+                                       break;
                                }
-                               bptr += inc_len;
-                               inc_len = (bptr < limit ? (*bptr == '\0' ? 1: 
php_mblen(bptr, limit - bptr)): 0);
-                       }
 
-               quit_loop_3:
-                       memcpy(tptr, hunk_begin, bptr - hunk_begin);
-                       tptr += (bptr - hunk_begin);
-                       bptr += inc_len;
-                       comp_end = tptr;
-               } else {
-                       /* 2B. Handle non-enclosure field */
+                               /* Is it an immediate delimiter? */
+                               if (PHP_FGETCSV_UNI_CHECK(p, e, delimiter, 
delimiter_len)) {
+                                       /* Immediate delimiter, treat as empty 
field */
+                                       p += delimiter_len;
+                                       add_next_index_unicodel(return_value, 
(UChar*)"", 0, 1);
+                                       break;
+                               }
+
+                               /* Whitespace? */
+                               if (*p == ' ' || *p == '\t') {
+                                       p++;
+                                       if (p >= e) break;
+                                       goto ready_state;
+                               }
+
+                               /* Is it an escape character? */
+                               if (PHP_FGETCSV_UNI_CHECK(p, e, escape, 
escape_len)) {
+                                       /* Skip escape sequence and let next 
char be treated as literal */
+                                       p += escape_len;
+                                       /* FALL THROUGH */
+                               }
+
+                               /* Otherwise, starting a new field without 
enclosures */
+                               state = PHP_FGETCSV_FIELD_NO_ENC;
+                               field_start = p;
+                               field_end = NULL;
+                               p++;
+                               break;
+
+                       case PHP_FGETCSV_FIELD_WITH_ENC:
+with_enc:
+                               /* Check for ending enclosure */
+                               if (PHP_FGETCSV_UNI_CHECK(p, e, enclosure, 
enclosure_len)) {
+                                       /* Enclosure encountered, is it paired? 
*/
+                                       if (PHP_FGETCSV_UNI_CHECK(p + 
enclosure_len, e, enclosure, enclosure_len)) {
+                                               /* Double enclosure gets 
translated to single enclosure */
+                                               memmove(p, p + enclosure_len, 
(e - p) - enclosure_len);
+                                               e -= enclosure_len;
+                                               p += enclosure_len;
+                                               goto with_enc;
+                                       } else {
+                                               /* Genuine end enclosure, 
switch state */
+                                               field_end = p;
+                                               p += enclosure_len;
+                                               state = PHP_FGETCSV_POST_ENC;
+                                               goto post_enc;
+                                       }
+                               }
+
+                               /* Check for field escapes */
+                               if (PHP_FGETCSV_UNI_CHECK(p, e, escape, 
escape_len)) {
+                                       p += escape_len + 1;
 
-                       hunk_begin = bptr;
+                                       /* Reprocess for ending enclosures */
+                                       goto with_enc;
+                               }
 
-                       for (;;) {
-                               switch (inc_len) {
-                                       case 0:
-                                               goto quit_loop_4;
-                                       case -2:
-                                       case -1:
-                                               inc_len = 1;
-                                               php_mblen(NULL, 0);
-                                               /* break is omitted 
intentionally */
-                                       case 1:
-                                               if (*bptr == delimiter) {
-                                                       goto quit_loop_4;
+                               /* Simple character */
+                               if (e - p) {
+                                       p++;
+                               }
+
+                               /* Hungry? */
+                               if (((e - p) < enclosure_len) && stream) {
+                                       /* Feed me! */
+                                       int new_len;
+                                       UChar *new_buf = 
(UChar*)php_stream_get_line_ex(stream, IS_UNICODE, NULL_ZSTR, 0, 0, &new_len);
+
+                                       if (new_buf) {
+                                               int tmp_len = new_len + e - 
field_start;
+                                               UChar *tmp = eumalloc(tmp_len);
+
+                                               /* Realign scan buffer, ick -- 
expensive */
+                                               memcpy(tmp, field_start, 
UBYTES(e - field_start));
+                                               memcpy(tmp + (e - field_start), 
new_buf, UBYTES(new_len));
+                                               field_start = tmp;
+                                               if (field_end) {
+                                                       field_end = tmp + 
(field_end - field_start);
                                                }
-                                               break;
-                                       default:
-                                               break;
+                                               efree(buffer);
+                                               efree(new_buf);
+                                               buffer = tmp;
+                                               buffer_len = tmp_len;
+                                               p = buffer;
+                                               e = buffer + buffer_len;
+                                       }
                                }
-                               bptr += inc_len;
-                               inc_len = (bptr < limit ? (*bptr == '\0' ? 1: 
php_mblen(bptr, limit - bptr)): 0);
-                       }
-               quit_loop_4:
-                       memcpy(tptr, hunk_begin, bptr - hunk_begin);
-                       tptr += (bptr - hunk_begin);
-
-                       comp_end = (char 
*)php_fgetcsv_lookup_trailing_spaces(temp, tptr - temp, delimiter TSRMLS_CC);
-                       if (*bptr == delimiter) {
-                               bptr++;
-                       }
-               }
 
-               /* 3. Now pass our field back to php */
-               *comp_end = '\0';
-               add_next_index_stringl(return_value, temp, comp_end - temp, 1);
-       } while (inc_len > 0);
+                               if ((e - p) == 0) {
+                                       /* Nothing left to consume the buffer */
+                                       add_next_index_unicodel(return_value, 
field_start, p - field_start, 1);
+
+                                       /* Loop is dying, but cleanup anyway */
+                                       state = PHP_FGETCSV_READY;
+                                       field_start = field_end = NULL;
+                                       break;
+                               }
+                               break;
 
-out:
-       efree(temp);
-       efree(buf);
+                       case PHP_FGETCSV_POST_ENC:
+post_enc:
+                               /* Check for delimiters or EOL */
+                               if (p >= e || *p == '\r' || *p == '\n' || 
PHP_FGETCSV_UNI_CHECK(p, e, delimiter, delimiter_len)) {
+                                       int field_len = field_end - field_start;
+                                       UChar *field;
+
+                                       if ((p - enclosure_len) > field_end) {
+                                               /* There's cruft, append it to 
the regular field */
+                                               int cruft_len = p - (field_end 
+ enclosure_len);
+
+                                               field = eumalloc(field_len + 
cruft_len + 1);
+                                               memcpy(field, field_start, 
field_len);
+                                               memcpy(field + field_len, 
field_end + enclosure_len, UBYTES(cruft_len));
+                                               field_len += cruft_len;
+                                               field[field_len] = 0;
+                                       } else {
+                                               field = eustrndup(field_start, 
field_len);
+                                       }
+                                       add_next_index_unicodel(return_value, 
field, field_len, 0);
+
+                                       /* Reset scanner state */
+                                       state = PHP_FGETCSV_READY;
+                                       field_start = field_end = NULL;
+                                       p += delimiter_len;
+                                       goto ready_state;
+                               }
+
+                               /* Queue anything else as cruft */
+                               p++;
+                               break;
+
+                       case PHP_FGETCSV_FIELD_NO_ENC:
+                               /* Check for escapes */
+                               if (PHP_FGETCSV_UNI_CHECK(p, e, escape, 
escape_len)) {
+                                       p += escape_len + 1;
+                               }
+
+                               /* Check for delimiter */
+                               if (p >= e || *p == '\r' || *p == '\n' || 
PHP_FGETCSV_UNI_CHECK(p, e, delimiter, delimiter_len)) {
+                                       add_next_index_unicodel(return_value, 
field_start, p - field_start, 1);
+                                       state = PHP_FGETCSV_READY;
+                                       field_start = field_end = NULL;
+                                       p += delimiter_len;
+                                       goto ready_state;
+                               }
+
+                               /* Simple character */
+                               p++;
+                               break;
+               }
+       }
+
+       efree(buffer);
 }
 /* }}} */
 
http://cvs.php.net/viewvc.cgi/php-src/ext/standard/file.h?r1=1.101&r2=1.102&diff_format=u
Index: php-src/ext/standard/file.h
diff -u php-src/ext/standard/file.h:1.101 php-src/ext/standard/file.h:1.102
--- php-src/ext/standard/file.h:1.101   Fri Oct 13 09:55:48 2006
+++ php-src/ext/standard/file.h Tue Dec  5 04:13:46 2006
@@ -16,7 +16,7 @@
    +----------------------------------------------------------------------+
 */
 
-/* $Id: file.h,v 1.101 2006/10/13 09:55:48 bjori Exp $ */
+/* $Id: file.h,v 1.102 2006/12/05 04:13:46 pollita Exp $ */
 
 /* Synced with php 3.0 revision 1.30 1999-06-16 [ssb] */
 
@@ -77,6 +77,11 @@
 PHPAPI int php_mkdir_ex(char *dir, long mode, int options TSRMLS_DC);
 PHPAPI int php_mkdir(char *dir, long mode TSRMLS_DC);
 PHPAPI void php_fgetcsv(php_stream *stream, char delimiter, char enclosure, 
size_t buf_len, char *buf, zval *return_value TSRMLS_DC);
+PHPAPI void php_fgetcsv_ex(php_stream *stream, char *delimiter, int 
delimiter_len, char *enclosure, int enclosure_len, char *escape, int escape_len,
+               char *buffer, int buffer_len, zval *return_value TSRMLS_DC);
+PHPAPI void php_u_fgetcsv(php_stream *stream, UChar *delimiter, int 
delimiter_len, UChar *enclosure, int enclosure_len, UChar *escape, int 
escape_len,
+               UChar *buffer, int buffer_len, zval *return_value TSRMLS_DC);
+
 
 #define META_DEF_BUFSIZE 8192
 
http://cvs.php.net/viewvc.cgi/php-src/ext/standard/tests/file/bug12556.phpt?r1=1.5&r2=1.6&diff_format=u
Index: php-src/ext/standard/tests/file/bug12556.phpt
diff -u php-src/ext/standard/tests/file/bug12556.phpt:1.5 
php-src/ext/standard/tests/file/bug12556.phpt:1.6
--- php-src/ext/standard/tests/file/bug12556.phpt:1.5   Wed May 19 08:54:51 2004
+++ php-src/ext/standard/tests/file/bug12556.phpt       Tue Dec  5 04:13:47 2006
@@ -46,3 +46,41 @@
 2,4,5,line3
 "
 }
+--UEXPECT--
+array(4) {
+  [0]=>
+  unicode(1) "6"
+  [1]=>
+  unicode(1) "7"
+  [2]=>
+  unicode(1) "8"
+  [3]=>
+  unicode(5) "line1"
+}
+array(4) {
+  [0]=>
+  unicode(1) "1"
+  [1]=>
+  unicode(1) "2"
+  [2]=>
+  unicode(1) "3"
+  [3]=>
+  unicode(186) "line2
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+2,4,5,line3
+"
+}
+
http://cvs.php.net/viewvc.cgi/php-src/ext/standard/tests/file/fgetcsv.phpt?r1=1.1&r2=1.2&diff_format=u
Index: php-src/ext/standard/tests/file/fgetcsv.phpt
diff -u php-src/ext/standard/tests/file/fgetcsv.phpt:1.1 
php-src/ext/standard/tests/file/fgetcsv.phpt:1.2
--- php-src/ext/standard/tests/file/fgetcsv.phpt:1.1    Mon Jan 19 03:55:29 2004
+++ php-src/ext/standard/tests/file/fgetcsv.phpt        Tue Dec  5 04:13:47 2006
@@ -28,7 +28,7 @@
        $file = dirname(__FILE__) . 'fgetcsv.csv';
        @unlink($file);
        foreach ($list as $v) {
-               $fp = fopen($file, "w");
+               $fp = fopen($file, "wt");
                fwrite($fp, $v . "\n");
                fclose($fp);
 

-- 
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to