[patch] fixed access to uninitialized memory with truncated utf-8 sequence

Dominique Pelle Sat, 28 Jun 2008 01:27:33 -0700

Hi,

Valgrind memory checker detects the following access to uninitialized
memory:


==15698== Conditional jump or move depends on uninitialised value(s)
==15698==    at 0x811CC12: utfc_ptr2len (mbyte.c:1709)
==15698==    by 0x805CB9F: str_foldcase (charset.c:493)
==15698==    by 0x819AEDE: check_keyword_id (syntax.c:3192)
==15698==    by 0x8198DD9: syn_current_attr (syntax.c:1907)
==15698==    by 0x8198B19: get_syntax_attr (syntax.c:1771)
==15698==    by 0x8165729: win_line (screen.c:3895)
==15698==    by 0x81614E5: win_update (screen.c:1765)
==15698==    by 0x815F6F0: update_screen (screen.c:522)
==15698==    by 0x80D4CD6: vgetorpeek (getchar.c:2672)
==15698==    by 0x80D396F: vgetc (getchar.c:1710)
==15698==    by 0x80D3A15: safe_vgetc (getchar.c:1757)
==15698==    by 0x8063FB0: edit (edit.c:711)
==15698==    by 0x812DDB0: invoke_edit (normal.c:8813)
==15698==    by 0x812DD56: nv_edit (normal.c:8786)
==15698==    by 0x81216DE: normal_cmd (normal.c:1152)
==15698==    by 0x80E4E2E: main_loop (main.c:1177)
==15698==    by 0x80E497E: main (main.c:936)
(...then follow many other errors...)

I can reproduce it 100% of the time, but the steps to reproduce it
are too complicated to explain here, and I have not found a simpler
test case.  The attached patch should hopefully suffice to clarify
the problem anyway.

The code in charset.c is:

 447 #ifdef FEAT_MBYTE
 448     if (enc_utf8 || (has_mbyte && MB_BYTE2LEN(STR_CHAR(i)) > 1))
 449     {
 450         if (enc_utf8)
 451         {
 452             int     c, lc;
 453
 454             c = utf_ptr2char(STR_PTR(i));
 455             lc = utf_tolower(c);
 456             if (c != lc)
 457             {
 458                 int     ol = utf_char2len(c);
 459                 int     nl = utf_char2len(lc);
 460
 461                 /* If the byte length changes need to shift the following
 462                  * characters forward or backward. */
 463                 if (ol != nl)
 464                 {
 465                     if (nl > ol)
 466                     {
 467                         if (buf == NULL ? ga_grow(&ga, nl - ol + 1) == FAIL
 468                                                 : len + nl - ol >= buflen)
 469                         {
 470                             /* out of memory, keep old char */
 471                             lc = c;
 472                             nl = ol;
 473                         }
 474                     }
 475                     if (ol != nl)
 476                     {
 477                         if (buf == NULL)
 478                         {
 479                             STRMOVE(GA_PTR(i) + nl, GA_PTR(i) + ol);
 480                             ga.ga_len += nl - ol;
 481                         }
 482                         else
 483                         {
 484                             STRMOVE(buf + i + nl, buf + i + ol);
 485                             len += nl - ol;
 486                         }
 487                     }
 488                 }
!489                 (void)utf_char2bytes(lc, STR_PTR(i));
 490             }
 491         }
 492         /* skip to next multi-byte char */
!493         i += (*mb_ptr2len)(STR_PTR(i));
 494     }
 495     else
 496 #endif


The bug happens when the string at STR_PTR(i) contains a truncated utf-8
sequence.  In that case, utf_ptr2char(STR_PTR(i)) at line 454 returns only
the first byte of the truncated utf-8 sequence.  If this first byte is
>= 0x80, then utf_char2len(c) at line 458 sets 'ol' to something > 1 (which
is inconsistent with the fact that line 454 only returned the first byte).
The call to utf_char2bytes(lc, STR_PTR(i)) at line 489 can then write
multiple bytes, overwriting the NUL at the end of the string, after which
the function keeps accessing several bytes beyond the end of the string
(which are uninitialized).

The attached patch fixes it.  It uses utf_ptr2len(STR_PTR(i)) rather than
utf_char2len(c) to compute the length of the utf-8 sequence, so that invalid
utf-8 sequences can be detected and skipped (converting case for an invalid
utf-8 sequence does not make sense anyway).

I'm using Vim-7.2a BETA (huge) with patches 1-5 on Linux x86, utf-8 locale.

-- Dominique

--~--~---------~--~----~------------~-------~--~----~
You received this message from the "vim_dev" maillist.
For more information, visit http://www.vim.org/maillist.php
-~----------~----~----~----~------~----~------~--~---

Index: charset.c
===================================================================
RCS file: /cvsroot/vim/vim7/src/charset.c,v
retrieving revision 1.29
diff -c -r1.29 charset.c
*** charset.c	24 Jun 2008 21:25:31 -0000	1.29
--- charset.c	28 Jun 2008 07:50:55 -0000
***************
*** 449,461 ****
  	{
  	    if (enc_utf8)
  	    {
! 		int	c, lc;
! 
! 		c = utf_ptr2char(STR_PTR(i));
! 		lc = utf_tolower(c);
! 		if (c != lc)
  		{
- 		    int	    ol = utf_char2len(c);
  		    int	    nl = utf_char2len(lc);
  
  		    /* If the byte length changes need to shift the following
--- 449,468 ----
  	{
  	    if (enc_utf8)
  	    {
! 		int	c = utf_ptr2char(STR_PTR(i));
! 		int	ol = utf_ptr2len(STR_PTR(i));
! 		int	lc = utf_tolower(c);
! 
! 		/* Above call to utf_ptr2char(STR_PTR(i)) may return the
! 		 * first byte when STR_PTR(i) points to an invalid/truncated
! 		 * utf-8 sequence.  Guard against invalid/truncated utf-8
! 		 * sequence or else call to utf_char2bytes(lc, STR_PTR(i))
! 		 * could overwrite NUL and then function would access beyond
! 		 * end of string. Converting case does not make sense for
! 		 * invalid/truncated utf-8 sequence anyway. */
! 		if (c != lc
! 			&& (c < 0x80 || ol > 1)) /* valid utf-8 sequence */
  		{
  		    int	    nl = utf_char2len(lc);
  
  		    /* If the byte length changes need to shift the following

[patch] fixed access to uninitialized memory with truncated utf-8 sequence

Raspunde prin e-mail lui