cataphract Wed, 27 Oct 2010 18:13:25 +0000 Revision: http://svn.php.net/viewvc?view=revision&revision=304959
Log: - Fixed bug #49687 (utf8_decode vulnerabilities and deficiencies in the number of reported malformed sequences). (Gustavo) #Made a public interface for get_next_char/utf-8 in trunk to use in utf8_decode. #In PHP 5.3, trunk's get_next_char was copied to xml.c because 5.3's #get_next_char is different and is not prepared to recover appropriately from #errors. Bug: http://bugs.php.net/49687 (Assigned) utf8_decode xml_utf8_decode vuln Changed paths: U php/php-src/branches/PHP_5_3/NEWS A php/php-src/branches/PHP_5_3/ext/xml/tests/bug49687.phpt U php/php-src/branches/PHP_5_3/ext/xml/xml.c U php/php-src/trunk/ext/standard/html.c U php/php-src/trunk/ext/standard/html.h A php/php-src/trunk/ext/xml/tests/bug49687.phpt U php/php-src/trunk/ext/xml/xml.c
Modified: php/php-src/branches/PHP_5_3/NEWS =================================================================== --- php/php-src/branches/PHP_5_3/NEWS 2010-10-27 14:56:51 UTC (rev 304958) +++ php/php-src/branches/PHP_5_3/NEWS 2010-10-27 18:13:25 UTC (rev 304959) @@ -152,6 +152,8 @@ other platforms). (Pierre) - Fixed bug #50345 (nanosleep not detected properly on some solaris versions). (Ulf, Tony) +- Fixed bug #49687 (utf8_decode vulnerabilities and deficiencies in the number + of reported malformed sequences). (Gustavo) - Fixed bug #49407 (get_html_translation_table doesn't handle UTF-8). (Gustavo) - Fixed bug #49215 (make fails on glob_wrapper). (Felipe) - Fixed bug #48831 (php -i has different output to php --ini). (Richard, Added: php/php-src/branches/PHP_5_3/ext/xml/tests/bug49687.phpt =================================================================== --- php/php-src/branches/PHP_5_3/ext/xml/tests/bug49687.phpt (rev 0) +++ php/php-src/branches/PHP_5_3/ext/xml/tests/bug49687.phpt 2010-10-27 18:13:25 UTC (rev 304959) @@ -0,0 +1,24 @@ +--TEST-- +Bug #49687 Several utf8_decode deficiencies and vulnerabilities +--SKIPIF-- +<?php +require_once("skipif.inc"); +if (!extension_loaded('xml')) die ("skip xml extension not available"); +?> +--FILE-- +<?php + +$tests = array( + "\x41\xC2\x3E\x42", + "\xE3\x80\x22", + "\x41\x98\xBA\x42\xE2\x98\x43\xE2\x98\xBA\xE2\x98", +); +foreach ($tests as $t) { + echo bin2hex(utf8_decode($t)), "\n"; +} +echo "Done.\n"; +--EXPECT-- +413f3e42 +3f22 +413f3f423f433f3f +Done. Modified: php/php-src/branches/PHP_5_3/ext/xml/xml.c =================================================================== --- php/php-src/branches/PHP_5_3/ext/xml/xml.c 2010-10-27 14:56:51 UTC (rev 304958) +++ php/php-src/branches/PHP_5_3/ext/xml/xml.c 2010-10-27 18:13:25 UTC (rev 304959) @@ -659,10 +659,111 @@ } /* }}} */ +/* copied from trunk's implementation of get_next_char in ext/standard/html.c */ +#define MB_FAILURE(pos, advance) do { \ + *cursor = pos + (advance); \ + *status = FAILURE; \ + return 0; \ +} while (0) + +#define CHECK_LEN(pos, chars_need) ((str_len - (pos)) >= (chars_need)) +#define utf8_lead(c) ((c) < 0x80 || ((c) >= 0xC2 && (c) <= 0xF4)) +#define utf8_trail(c) ((c) >= 0x80 && (c) <= 0xBF) + +/* {{{ php_next_utf8_char + */ +static inline unsigned int php_next_utf8_char( + const unsigned char *str, + size_t str_len, + size_t *cursor, + int *status) +{ + size_t pos = *cursor; + unsigned int this_char = 0; + unsigned char c; + + *status = SUCCESS; + + if (!CHECK_LEN(pos, 1)) + MB_FAILURE(pos, 1); + + /* We'll follow strategy 2. from section 3.6.1 of UTR #36: + * "In a reported illegal byte sequence, do not include any + * non-initial byte that encodes a valid character or is a leading + * byte for a valid sequence.» */ + c = str[pos]; + if (c < 0x80) { + this_char = c; + pos++; + } else if (c < 0xc2) { + MB_FAILURE(pos, 1); + } else if (c < 0xe0) { + if (!CHECK_LEN(pos, 2)) + MB_FAILURE(pos, 1); + + if (!utf8_trail(str[pos + 1])) { + MB_FAILURE(pos, utf8_lead(str[pos + 1]) ? 1 : 2); + } + this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f); + if (this_char < 0x80) { /* non-shortest form */ + MB_FAILURE(pos, 2); + } + pos += 2; + } else if (c < 0xf0) { + size_t avail = str_len - pos; + + if (avail < 3 || + !utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2])) { + if (avail < 2 || utf8_lead(str[pos + 1])) + MB_FAILURE(pos, 1); + else if (avail < 3 || utf8_lead(str[pos + 2])) + MB_FAILURE(pos, 2); + else + MB_FAILURE(pos, 3); + } + + this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f); + if (this_char < 0x800) { /* non-shortest form */ + MB_FAILURE(pos, 3); + } else if (this_char >= 0xd800 && this_char <= 0xdfff) { /* surrogate */ + MB_FAILURE(pos, 3); + } + pos += 3; + } else if (c < 0xf5) { + size_t avail = str_len - pos; + + if (avail < 4 || + !utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]) || + !utf8_trail(str[pos + 3])) { + if (avail < 2 || utf8_lead(str[pos + 1])) + MB_FAILURE(pos, 1); + else if (avail < 3 || utf8_lead(str[pos + 2])) + MB_FAILURE(pos, 2); + else if (avail < 4 || utf8_lead(str[pos + 3])) + MB_FAILURE(pos, 3); + else + MB_FAILURE(pos, 4); + } + + this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f); + if (this_char < 0x10000 || this_char > 0x10FFFF) { /* non-shortest form or outside range */ + MB_FAILURE(pos, 4); + } + pos += 4; + } else { + MB_FAILURE(pos, 1); + } + + *cursor = pos; + return this_char; +} +/* }}} */ + + /* {{{ xml_utf8_decode */ PHPAPI char *xml_utf8_decode(const XML_Char *s, int len, int *newlen, const XML_Char *encoding) { - int pos = len; + size_t pos = 0; char *newbuf = emalloc(len + 1); unsigned int c; char (*decoder)(unsigned short) = NULL; @@ -681,36 +782,15 @@ newbuf[*newlen] = '\0'; return newbuf; } - while (pos > 0) { - c = (unsigned char)(*s); - if (c >= 0xf0) { /* four bytes encoded, 21 bits */ - if(pos-4 >= 0) { - c = ((s[0]&7)<<18) | ((s[1]&63)<<12) | ((s[2]&63)<<6) | (s[3]&63); - } else { - c = '?'; - } - s += 4; - pos -= 4; - } else if (c >= 0xe0) { /* three bytes encoded, 16 bits */ - if(pos-3 >= 0) { - c = ((s[0]&63)<<12) | ((s[1]&63)<<6) | (s[2]&63); - } else { - c = '?'; - } - s += 3; - pos -= 3; - } else if (c >= 0xc0) { /* two bytes encoded, 11 bits */ - if(pos-2 >= 0) { - c = ((s[0]&63)<<6) | (s[1]&63); - } else { - c = '?'; - } - s += 2; - pos -= 2; - } else { - s++; - pos--; + + while (pos < (size_t)len) { + int status = FAILURE; + c = php_next_utf8_char((const unsigned char*)s, (size_t) len, &pos, &status); + + if (status == FAILURE || c > 0xFFU) { + c = '?'; } + newbuf[*newlen] = decoder ? decoder(c) : c; ++*newlen; } Modified: php/php-src/trunk/ext/standard/html.c =================================================================== --- php/php-src/trunk/ext/standard/html.c 2010-10-27 14:56:51 UTC (rev 304958) +++ php/php-src/trunk/ext/standard/html.c 2010-10-27 18:13:25 UTC (rev 304959) @@ -92,9 +92,9 @@ /* {{{ get_next_char */ -static unsigned int get_next_char( +static inline unsigned int get_next_char( enum entity_charset charset, - unsigned char *str, + const unsigned char *str, size_t str_len, size_t *cursor, int *status) @@ -352,6 +352,18 @@ } /* }}} */ +/* {{{ php_next_utf8_char + * Public interface for get_next_char used with UTF-8 */ + PHPAPI unsigned int php_next_utf8_char( + const unsigned char *str, + size_t str_len, + size_t *cursor, + int *status) +{ + return get_next_char(cs_utf_8, str, str_len, cursor, status); +} +/* }}} */ + /* {{{ entity_charset determine_charset * returns the charset identifier based on current locale or a hint. * defaults to UTF-8 */ Modified: php/php-src/trunk/ext/standard/html.h =================================================================== --- php/php-src/trunk/ext/standard/html.h 2010-10-27 14:56:51 UTC (rev 304958) +++ php/php-src/trunk/ext/standard/html.h 2010-10-27 18:13:25 UTC (rev 304959) @@ -57,5 +57,6 @@ PHPAPI char *php_escape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC); PHPAPI char *php_escape_html_entities_ex(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset, zend_bool double_encode TSRMLS_DC); PHPAPI char *php_unescape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC); +PHPAPI unsigned int php_next_utf8_char(const unsigned char *str, size_t str_len, size_t *cursor, int *status); #endif /* HTML_H */ Added: php/php-src/trunk/ext/xml/tests/bug49687.phpt =================================================================== --- php/php-src/trunk/ext/xml/tests/bug49687.phpt (rev 0) +++ php/php-src/trunk/ext/xml/tests/bug49687.phpt 2010-10-27 18:13:25 UTC (rev 304959) @@ -0,0 +1,24 @@ +--TEST-- +Bug #49687 Several utf8_decode deficiencies and vulnerabilities +--SKIPIF-- +<?php +require_once("skipif.inc"); +if (!extension_loaded('xml')) die ("skip xml extension not available"); +?> +--FILE-- +<?php + +$tests = array( + "\x41\xC2\x3E\x42", + "\xE3\x80\x22", + "\x41\x98\xBA\x42\xE2\x98\x43\xE2\x98\xBA\xE2\x98", +); +foreach ($tests as $t) { + echo bin2hex(utf8_decode($t)), "\n"; +} +echo "Done.\n"; +--EXPECT-- +413f3e42 +3f22 +413f3f423f433f3f +Done. Modified: php/php-src/trunk/ext/xml/xml.c =================================================================== --- php/php-src/trunk/ext/xml/xml.c 2010-10-27 14:56:51 UTC (rev 304958) +++ php/php-src/trunk/ext/xml/xml.c 2010-10-27 18:13:25 UTC (rev 304959) @@ -32,6 +32,7 @@ #include "zend_variables.h" #include "ext/standard/php_string.h" #include "ext/standard/info.h" +#include "ext/standard/html.h" #if HAVE_XML @@ -662,7 +663,7 @@ /* {{{ xml_utf8_decode */ PHPAPI char *xml_utf8_decode(const XML_Char *s, int len, int *newlen, const XML_Char *encoding) { - int pos = len; + size_t pos = 0; char *newbuf = emalloc(len + 1); unsigned int c; char (*decoder)(unsigned short) = NULL; @@ -681,36 +682,15 @@ newbuf[*newlen] = '\0'; return newbuf; } - while (pos > 0) { - c = (unsigned char)(*s); - if (c >= 0xf0) { /* four bytes encoded, 21 bits */ - if(pos-4 >= 0) { - c = ((s[0]&7)<<18) | ((s[1]&63)<<12) | ((s[2]&63)<<6) | (s[3]&63); - } else { - c = '?'; - } - s += 4; - pos -= 4; - } else if (c >= 0xe0) { /* three bytes encoded, 16 bits */ - if(pos-3 >= 0) { - c = ((s[0]&63)<<12) | ((s[1]&63)<<6) | (s[2]&63); - } else { - c = '?'; - } - s += 3; - pos -= 3; - } else if (c >= 0xc0) { /* two bytes encoded, 11 bits */ - if(pos-2 >= 0) { - c = ((s[0]&63)<<6) | (s[1]&63); - } else { - c = '?'; - } - s += 2; - pos -= 2; - } else { - s++; - pos--; + + while (pos < (size_t)len) { + int status = FAILURE; + c = php_next_utf8_char((const unsigned char*)s, (size_t) len, &pos, &status); + + if (status == FAILURE || c > 0xFFU) { + c = '?'; } + newbuf[*newlen] = decoder ? decoder(c) : c; ++*newlen; }
-- PHP CVS Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php