cataphract                               Wed, 27 Oct 2010 18:13:25 +0000

Revision: http://svn.php.net/viewvc?view=revision&revision=304959

Log:
- Fixed bug #49687 (utf8_decode vulnerabilities and deficiencies in the number
  of reported malformed sequences). (Gustavo)
#Made a public interface for get_next_char/utf-8 in trunk to use in utf8_decode.
#In PHP 5.3, trunk's get_next_char was copied to xml.c because 5.3's
#get_next_char is different and is not prepared to recover appropriately from
#errors.

Bug: http://bugs.php.net/49687 (Assigned) utf8_decode xml_utf8_decode vuln
      
Changed paths:
    U   php/php-src/branches/PHP_5_3/NEWS
    A   php/php-src/branches/PHP_5_3/ext/xml/tests/bug49687.phpt
    U   php/php-src/branches/PHP_5_3/ext/xml/xml.c
    U   php/php-src/trunk/ext/standard/html.c
    U   php/php-src/trunk/ext/standard/html.h
    A   php/php-src/trunk/ext/xml/tests/bug49687.phpt
    U   php/php-src/trunk/ext/xml/xml.c

Modified: php/php-src/branches/PHP_5_3/NEWS
===================================================================
--- php/php-src/branches/PHP_5_3/NEWS	2010-10-27 14:56:51 UTC (rev 304958)
+++ php/php-src/branches/PHP_5_3/NEWS	2010-10-27 18:13:25 UTC (rev 304959)
@@ -152,6 +152,8 @@
   other platforms). (Pierre)
 - Fixed bug #50345 (nanosleep not detected properly on some solaris versions).
   (Ulf, Tony)
+- Fixed bug #49687 (utf8_decode vulnerabilities and deficiencies in the number
+  of reported malformed sequences). (Gustavo)
 - Fixed bug #49407 (get_html_translation_table doesn't handle UTF-8). (Gustavo)
 - Fixed bug #49215 (make fails on glob_wrapper). (Felipe)
 - Fixed bug #48831 (php -i has different output to php --ini). (Richard,

Added: php/php-src/branches/PHP_5_3/ext/xml/tests/bug49687.phpt
===================================================================
--- php/php-src/branches/PHP_5_3/ext/xml/tests/bug49687.phpt	                        (rev 0)
+++ php/php-src/branches/PHP_5_3/ext/xml/tests/bug49687.phpt	2010-10-27 18:13:25 UTC (rev 304959)
@@ -0,0 +1,24 @@
+--TEST--
+Bug #49687 Several utf8_decode deficiencies and vulnerabilities
+--SKIPIF--
+<?php
+require_once("skipif.inc");
+if (!extension_loaded('xml')) die ("skip xml extension not available");
+?>
+--FILE--
+<?php
+
+$tests = array(
+    "\x41\xC2\x3E\x42",
+    "\xE3\x80\x22",
+    "\x41\x98\xBA\x42\xE2\x98\x43\xE2\x98\xBA\xE2\x98",
+);
+foreach ($tests as $t) {
+    echo bin2hex(utf8_decode($t)), "\n";
+}
+echo "Done.\n";
+--EXPECT--
+413f3e42
+3f22
+413f3f423f433f3f
+Done.

Modified: php/php-src/branches/PHP_5_3/ext/xml/xml.c
===================================================================
--- php/php-src/branches/PHP_5_3/ext/xml/xml.c	2010-10-27 14:56:51 UTC (rev 304958)
+++ php/php-src/branches/PHP_5_3/ext/xml/xml.c	2010-10-27 18:13:25 UTC (rev 304959)
@@ -659,10 +659,111 @@
 }
 /* }}} */

+/* copied from trunk's implementation of get_next_char in ext/standard/html.c */
+#define MB_FAILURE(pos, advance) do { \
+	*cursor = pos + (advance); \
+	*status = FAILURE; \
+	return 0; \
+} while (0)
+
+#define CHECK_LEN(pos, chars_need) ((str_len - (pos)) >= (chars_need))
+#define utf8_lead(c)  ((c) < 0x80 || ((c) >= 0xC2 && (c) <= 0xF4))
+#define utf8_trail(c) ((c) >= 0x80 && (c) <= 0xBF)
+
+/* {{{ php_next_utf8_char
+ */
+static inline unsigned int php_next_utf8_char(
+		const unsigned char *str,
+		size_t str_len,
+		size_t *cursor,
+		int *status)
+{
+	size_t pos = *cursor;
+	unsigned int this_char = 0;
+	unsigned char c;
+
+	*status = SUCCESS;
+
+	if (!CHECK_LEN(pos, 1))
+		MB_FAILURE(pos, 1);
+
+	/* We'll follow strategy 2. from section 3.6.1 of UTR #36:
+		* "In a reported illegal byte sequence, do not include any
+		*  non-initial byte that encodes a valid character or is a leading
+		*  byte for a valid sequence.» */
+	c = str[pos];
+	if (c < 0x80) {
+		this_char = c;
+		pos++;
+	} else if (c < 0xc2) {
+		MB_FAILURE(pos, 1);
+	} else if (c < 0xe0) {
+		if (!CHECK_LEN(pos, 2))
+			MB_FAILURE(pos, 1);
+
+		if (!utf8_trail(str[pos + 1])) {
+			MB_FAILURE(pos, utf8_lead(str[pos + 1]) ? 1 : 2);
+		}
+		this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
+		if (this_char < 0x80) { /* non-shortest form */
+			MB_FAILURE(pos, 2);
+		}
+		pos += 2;
+	} else if (c < 0xf0) {
+		size_t avail = str_len - pos;
+
+		if (avail < 3 ||
+				!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2])) {
+			if (avail < 2 || utf8_lead(str[pos + 1]))
+				MB_FAILURE(pos, 1);
+			else if (avail < 3 || utf8_lead(str[pos + 2]))
+				MB_FAILURE(pos, 2);
+			else
+				MB_FAILURE(pos, 3);
+		}
+
+		this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
+		if (this_char < 0x800) { /* non-shortest form */
+			MB_FAILURE(pos, 3);
+		} else if (this_char >= 0xd800 && this_char <= 0xdfff) { /* surrogate */
+			MB_FAILURE(pos, 3);
+		}
+		pos += 3;
+	} else if (c < 0xf5) {
+		size_t avail = str_len - pos;
+
+		if (avail < 4 ||
+				!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]) ||
+				!utf8_trail(str[pos + 3])) {
+			if (avail < 2 || utf8_lead(str[pos + 1]))
+				MB_FAILURE(pos, 1);
+			else if (avail < 3 || utf8_lead(str[pos + 2]))
+				MB_FAILURE(pos, 2);
+			else if (avail < 4 || utf8_lead(str[pos + 3]))
+				MB_FAILURE(pos, 3);
+			else
+				MB_FAILURE(pos, 4);
+		}
+
+		this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
+		if (this_char < 0x10000 || this_char > 0x10FFFF) { /* non-shortest form or outside range */
+			MB_FAILURE(pos, 4);
+		}
+		pos += 4;
+	} else {
+		MB_FAILURE(pos, 1);
+	}
+
+	*cursor = pos;
+	return this_char;
+}
+/* }}} */
+
+
 /* {{{ xml_utf8_decode */
 PHPAPI char *xml_utf8_decode(const XML_Char *s, int len, int *newlen, const XML_Char *encoding)
 {
-	int pos = len;
+	size_t pos = 0;
 	char *newbuf = emalloc(len + 1);
 	unsigned int c;
 	char (*decoder)(unsigned short) = NULL;
@@ -681,36 +782,15 @@
 		newbuf[*newlen] = '\0';
 		return newbuf;
 	}
-	while (pos > 0) {
-		c = (unsigned char)(*s);
-		if (c >= 0xf0) { /* four bytes encoded, 21 bits */
-			if(pos-4 >= 0) {
-				c = ((s[0]&7)<<18) | ((s[1]&63)<<12) | ((s[2]&63)<<6) | (s[3]&63);
-			} else {
-				c = '?';
-			}
-			s += 4;
-			pos -= 4;
-		} else if (c >= 0xe0) { /* three bytes encoded, 16 bits */
-			if(pos-3 >= 0) {
-				c = ((s[0]&63)<<12) | ((s[1]&63)<<6) | (s[2]&63);
-			} else {
-				c = '?';
-			}
-			s += 3;
-			pos -= 3;
-		} else if (c >= 0xc0) { /* two bytes encoded, 11 bits */
-			if(pos-2 >= 0) {
-				c = ((s[0]&63)<<6) | (s[1]&63);
-			} else {
-				c = '?';
-			}
-			s += 2;
-			pos -= 2;
-		} else {
-			s++;
-			pos--;
+
+	while (pos < (size_t)len) {
+		int status = FAILURE;
+		c = php_next_utf8_char((const unsigned char*)s, (size_t) len, &pos, &status);
+
+		if (status == FAILURE || c > 0xFFU) {
+			c = '?';
 		}
+
 		newbuf[*newlen] = decoder ? decoder(c) : c;
 		++*newlen;
 	}

Modified: php/php-src/trunk/ext/standard/html.c
===================================================================
--- php/php-src/trunk/ext/standard/html.c	2010-10-27 14:56:51 UTC (rev 304958)
+++ php/php-src/trunk/ext/standard/html.c	2010-10-27 18:13:25 UTC (rev 304959)
@@ -92,9 +92,9 @@

 /* {{{ get_next_char
  */
-static unsigned int get_next_char(
+static inline unsigned int get_next_char(
 		enum entity_charset charset,
-		unsigned char *str,
+		const unsigned char *str,
 		size_t str_len,
 		size_t *cursor,
 		int *status)
@@ -352,6 +352,18 @@
 }
 /* }}} */

+/* {{{ php_next_utf8_char
+ * Public interface for get_next_char used with UTF-8 */
+ PHPAPI unsigned int php_next_utf8_char(
+		const unsigned char *str,
+		size_t str_len,
+		size_t *cursor,
+		int *status)
+{
+	return get_next_char(cs_utf_8, str, str_len, cursor, status);
+}
+/* }}} */
+
 /* {{{ entity_charset determine_charset
  * returns the charset identifier based on current locale or a hint.
  * defaults to UTF-8 */

Modified: php/php-src/trunk/ext/standard/html.h
===================================================================
--- php/php-src/trunk/ext/standard/html.h	2010-10-27 14:56:51 UTC (rev 304958)
+++ php/php-src/trunk/ext/standard/html.h	2010-10-27 18:13:25 UTC (rev 304959)
@@ -57,5 +57,6 @@
 PHPAPI char *php_escape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC);
 PHPAPI char *php_escape_html_entities_ex(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset, zend_bool double_encode TSRMLS_DC);
 PHPAPI char *php_unescape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC);
+PHPAPI unsigned int php_next_utf8_char(const unsigned char *str, size_t str_len, size_t *cursor, int *status);

 #endif /* HTML_H */

Added: php/php-src/trunk/ext/xml/tests/bug49687.phpt
===================================================================
--- php/php-src/trunk/ext/xml/tests/bug49687.phpt	                        (rev 0)
+++ php/php-src/trunk/ext/xml/tests/bug49687.phpt	2010-10-27 18:13:25 UTC (rev 304959)
@@ -0,0 +1,24 @@
+--TEST--
+Bug #49687 Several utf8_decode deficiencies and vulnerabilities
+--SKIPIF--
+<?php
+require_once("skipif.inc");
+if (!extension_loaded('xml')) die ("skip xml extension not available");
+?>
+--FILE--
+<?php
+
+$tests = array(
+    "\x41\xC2\x3E\x42",
+    "\xE3\x80\x22",
+    "\x41\x98\xBA\x42\xE2\x98\x43\xE2\x98\xBA\xE2\x98",
+);
+foreach ($tests as $t) {
+    echo bin2hex(utf8_decode($t)), "\n";
+}
+echo "Done.\n";
+--EXPECT--
+413f3e42
+3f22
+413f3f423f433f3f
+Done.

Modified: php/php-src/trunk/ext/xml/xml.c
===================================================================
--- php/php-src/trunk/ext/xml/xml.c	2010-10-27 14:56:51 UTC (rev 304958)
+++ php/php-src/trunk/ext/xml/xml.c	2010-10-27 18:13:25 UTC (rev 304959)
@@ -32,6 +32,7 @@
 #include "zend_variables.h"
 #include "ext/standard/php_string.h"
 #include "ext/standard/info.h"
+#include "ext/standard/html.h"

 #if HAVE_XML

@@ -662,7 +663,7 @@
 /* {{{ xml_utf8_decode */
 PHPAPI char *xml_utf8_decode(const XML_Char *s, int len, int *newlen, const XML_Char *encoding)
 {
-	int pos = len;
+	size_t pos = 0;
 	char *newbuf = emalloc(len + 1);
 	unsigned int c;
 	char (*decoder)(unsigned short) = NULL;
@@ -681,36 +682,15 @@
 		newbuf[*newlen] = '\0';
 		return newbuf;
 	}
-	while (pos > 0) {
-		c = (unsigned char)(*s);
-		if (c >= 0xf0) { /* four bytes encoded, 21 bits */
-			if(pos-4 >= 0) {
-				c = ((s[0]&7)<<18) | ((s[1]&63)<<12) | ((s[2]&63)<<6) | (s[3]&63);
-			} else {
-				c = '?';
-			}
-			s += 4;
-			pos -= 4;
-		} else if (c >= 0xe0) { /* three bytes encoded, 16 bits */
-			if(pos-3 >= 0) {
-				c = ((s[0]&63)<<12) | ((s[1]&63)<<6) | (s[2]&63);
-			} else {
-				c = '?';
-			}
-			s += 3;
-			pos -= 3;
-		} else if (c >= 0xc0) { /* two bytes encoded, 11 bits */
-			if(pos-2 >= 0) {
-				c = ((s[0]&63)<<6) | (s[1]&63);
-			} else {
-				c = '?';
-			}
-			s += 2;
-			pos -= 2;
-		} else {
-			s++;
-			pos--;
+
+	while (pos < (size_t)len) {
+		int status = FAILURE;
+		c = php_next_utf8_char((const unsigned char*)s, (size_t) len, &pos, &status);
+
+		if (status == FAILURE || c > 0xFFU) {
+			c = '?';
 		}
+
 		newbuf[*newlen] = decoder ? decoder(c) : c;
 		++*newlen;
 	}
-- 
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to