pollita Tue Oct 17 21:42:29 2006 UTC Modified files: /php-src README.UNICODE-UPGRADES Log: More unicode upgrading notes http://cvs.php.net/viewvc.cgi/php-src/README.UNICODE-UPGRADES?r1=1.8&r2=1.9&diff_format=u Index: php-src/README.UNICODE-UPGRADES diff -u php-src/README.UNICODE-UPGRADES:1.8 php-src/README.UNICODE-UPGRADES:1.9 --- php-src/README.UNICODE-UPGRADES:1.8 Tue Oct 17 20:56:28 2006 +++ php-src/README.UNICODE-UPGRADES Tue Oct 17 21:42:28 2006 @@ -407,8 +407,8 @@ This function returns part of a string based on offset and length parameters. - void *str; - int32_t str_len, cp_len; + zstr str; + int str_len, cp_len; zend_uchar str_type; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "tl|l", &str, &str_len, &str_type, &f, &l) == FAILURE) { @@ -417,11 +417,11 @@ The first thing we notice is that the incoming string specifier is 't', which means that we can accept all 3 string types. The 'str' variable is -declared as void*, because it can point to either UChar* or char*. +declared as zstr, because it can point to either UChar* or char*. The actual type of the incoming string is stored in 'str_type' variable. if (str_type == IS_UNICODE) { - cp_len = u_countChar32(str, str_len); + cp_len = u_countChar32(str.u, str_len); } else { cp_len = str_len; } @@ -435,10 +435,10 @@ if (str_type == IS_UNICODE) { int32_t start = 0, end = 0; - U16_FWD_N((UChar*)str, end, str_len, f); + U16_FWD_N(str.u, end, str_len, f); start = end; - U16_FWD_N((UChar*)str, end, str_len, l); - RETURN_UNICODEL((UChar*)str + start, end-start, 1); + U16_FWD_N(str.u, end, str_len, l); + RETURN_UNICODEL(str.u + start, end-start, ZSTR_DUPLICATE); Since codepoint (character) #n is not necessarily at offset #n in Unicode strings, we start at the beginning and iterate forward until we have gone @@ -448,10 +448,10 @@ segment as a Unicode string. 
} else { - RETURN_STRINGL((char*)str + f, l, 1); + RETURN_STRINGL(str.s + f, l, ZSTR_DUPLICATE); } -For native and binary types, we can return the segment directly. +For native strings, we can return the segment directly. strrev() @@ -486,9 +486,9 @@ Unicode type, processes it exactly as before, simply swapping bytes around. For Unicode case, the magic is like this: - int32_t i, x1, x2; - UChar32 ch; - UChar *u_s, *u_n, *u_p; + int32_t i, x1, x2; + UChar32 ch; + UChar *u_s, *u_n, *u_p; u_n = eumalloc(Z_USTRLEN_PP(str)+1); u_p = u_n; @@ -525,6 +525,98 @@ characters (UChar32 type) to 1 or 2 UTF-16 code units (UChar type). +realpath() +---------- + +Filenames use their own converter as it's not uncommon, for example, +to need to access files on a filesystem with latin1 entries while outputting +UTF8 runtime content. + +The most common approach to parsing filenames can be found in realpath(): + +zval **ppfilename; +char *filename; +int filename_len; + +if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "Z", &ppfilename) == FAILURE || + php_stream_path_param_encode(ppfilename, &filename, &filename_len, REPORT_ERRORS, FG(default_context)) == FAILURE) { + return; +} + +Here, the filename is taken first as a generic zval**, then converted (separating if necessary) +and populated into local char* and int storage. The filename will be converted according to +unicode.filesystem_encoding unless the wrapper specified overrides this with its own conversion +function (The http:// wrapper, for example, enforces utf8 conversion). 
+ + +rmdir() +------- + +If the function accepts a context parameter, then this context should be used in place of FG(default_context). + +zval **ppdir, *zcontext = NULL; +char *dir; +int dir_len; + +if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "Z|r", &ppdir, &zcontext) == FAILURE) { + return; +} + +context = php_stream_context_from_zval(zcontext, 0); +if (php_stream_path_param_encode(ppdir, &dir, &dir_len, REPORT_ERRORS, context) == FAILURE) { + return; +} + + +sqlite_query() +-------------- + +If the function's underlying library expects a particular encoding (e.g. UTF8), then the alternate form of +the string parameter may be used with zend_parse_parameters(). + +char *sql; +int sql_len; + +if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s&", &sql, &sql_len, UG(utf8_conv)) == FAILURE) { + return; +} + +Converters +========== + +Standard Converters +------------------- + +The following converters (UConverter*) are initialized by Zend and are always available (regardless of UG(unicode) mode): + UG(utf8_conv) + UG(ascii_conv) + UG(fallback_encoding_conv) - UTF8 unless overridden by INI setting unicode.fallback_encoding + +Additional converters will be optionally initialized depending on INI settings: + UG(runtime_encoding_conv) - unicode.runtime_encoding + . Unicode output generated by a script will be encoded using this converter + + UG(script_encoding_conv) - unicode.script_encoding + . Scripts read from disk will be decoded using this converter + + UG(http_input_encoding_conv) - unicode.http_input_encoding + . HTTP Request data ($_GET / $_POST) will be decoded using this converter + + UG(filesystem_encoding_conv) - unicode.filesystem_encoding + . Filenames and paths will be encoded using this converter + + +Since these additional converters may not be instantiated (because their INI value is not set), all uses of these converters must +be wrapped in ZEND_U_CONVERTER() for safety. 
If the converter hasn't been instantiated, then UG(fallback_encoding_conv) will be +used instead. + +For example, RETURN_RT_STRING("foo", ZSTR_DUPLICATE); expands out to: + RETURN_U_STRING(ZEND_U_CONVERTER(UG(runtime_encoding_conv)), "foo", ZSTR_DUPLICATE); + +This uses UG(runtime_encoding_conv) if it has been set, and UG(fallback_encoding_conv) otherwise. + +Note that the INI setting unicode.stream_encoding does not instantiate a UConverter* automatically for use by the process/thread; +it stores the value as a string for use during fopen()-style calls, where a UConverter* is instantiated for that particular stream. References ==========
-- PHP CVS Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php