Ah, thanks very much. I will get this fixed in the released version shortly.
Bill On Wed, Aug 24, 2011 at 10:55 PM, Benjamin Kaduk <ka...@mit.edu> wrote: > tags 633704 patch > thanks > > On Wed, 24 Aug 2011, Bill Poser wrote: > > I'm working on this, but, curiously, on my system I don't get a segfault. >> Instead, I get infinitely many copies of the error message. >> > > It also manifests as an infinite loop for me. The issue is that > sscanf("...%n",...) does not fully match the pattern, so the %n value is not > set, and the value of the variable passed in is unchanged. I think in some > circumstances it could be uninitialized memory (which would explain the > segfaults), but has always started out as zero in my simple test cases. > > Working around the lack of a %n value is fairly trivial, just setting a > sentinel value and checking for it; however, the code attempts to explicitly > check for such "partial matches", so this would be a feature regression (in > some sense). > > I attach my current patch (I think the snprintf return value checks may > need to be >=; I didn't test that part). I worked on this bug at a > bugsquashathon here at MIT this weekend, and kcr was going to help me get > the change uploaded, but it is kind of a nasty patch to review ... > > Annotations inline: > % diff -ruN uni2ascii-4.18.orig//**ascii2uni.c uni2ascii-4.18/ascii2uni.c > % --- uni2ascii-4.18.orig//**ascii2uni.c 2011-05-14 22:15:20.000000000 > -0400 > % +++ uni2ascii-4.18/ascii2uni.c 2011-08-23 20:07:29.000000000 -0400 > % @@ -208,6 +208,7 @@ > % char aHfmt [8+2+1]; > % char aDfmt [8+2+1]; > % char cbuf[5]; > % + char fmt_itoa[12]; > > The scanf pattern %ul stores into an integral type, but we need to know the > number of characters it occupies in the string. To do so, we'll need to > printf it again, into this buffer. It's statically allocated because the > value should fit in 32 bits (if I read the UTF32 type correctly), and the > decimal representation therein will fit in this size, with NUL terminator. > > % FILE *infp; > % % UTF32 num; > % @@ -555,45 +556,64 @@ > % } > % else if (FType == CHENT) { > % if (AllHTMLP){ > % + NConsumed = -1; > % if(sscanf(iptr,aHfmt,&num,&**NConsumed) > 0) { > % - if(*(iptr+NConsumed-1) != ';') { > % + if(NConsumed == -1 || *(iptr+NConsumed-1) != ';') { > > The sentinel value, and we check for a partial match in both ways. > > % MicrosoftStyle++; > % + if (NConsumed == -1) { > % + if (snprintf(fmt_itoa, sizeof(fmt_itoa), "%x", num) > > sizeof(fmt_itoa)-1) { > % + fprintf(stderr, "UTF32 codepoint overflowed static > buffer\n"); > % + exit(BADRECORD); > % + } > % + NConsumed = 3 /* "&#x" */ + strlen(fmt_itoa) + 1 /* ";" > */; > % + } > > We need to know how many characters were used for the partial match; make > sure to not overflow the static buffer. > > % fprintf(stderr, > % _("The HTML/HDML entity %1$s at token %2$lu of line > %3$lu lacks the requisite final semicolon.\n"), > % ExtractSubstring(tmpstr,iptr,** > iptr+NConsumed-3),TokenNumber,**LineNo); > % if(StrictP) {putchar(*iptr++); continue;} > % - else {putu8(num);iptr+=NConsumed;} > % + else {putu8(num);iptr+=NConsumed-1;**} > > The ';' was not matched, so the character in that position is really a part > of something else. > > % } > % else {putu8(num);iptr+=NConsumed;} > % TokenNumber++; > % continue; > % } > % + NConsumed = -1; > % if(sscanf(iptr,aDfmt,&num,&**NConsumed) > 0) { > % - if(*(iptr+NConsumed-1) != ';') { > % + if(NConsumed == -1 || *(iptr+NConsumed-1) != ';') { > % MicrosoftStyle++; > % + if (NConsumed == -1) { > % + if (snprintf(fmt_itoa, sizeof(fmt_itoa), "%u", num) > > sizeof(fmt_itoa)-1) { > % + fprintf(stderr, "UTF32 codepoint overflowed static > buffer\n"); > % + exit(BADRECORD); > % + } > % + NConsumed = 2 /* "&#" */ + strlen(fmt_itoa) + 1 /* ";" */; > % + } > > The same dance, but with a different-length prefix. > > % fprintf(stderr, > % _("The HTML/HDML entity %1$s at token %2$lu of line > %3$lu lacks the requisite final semicolon.\n"), > % ExtractSubstring(tmpstr,iptr,** > iptr+NConsumed-3),TokenNumber,**LineNo); > % if (StrictP) {putchar(*iptr++); continue;} > % - else {putu8(num);iptr+=NConsumed;} > % + else {putu8(num);iptr+=NConsumed-1;**} > % } > % else {putu8(num);iptr+=NConsumed;} > % TokenNumber++; > % continue; > % } > % } > % + NConsumed = -1; > % if(sscanf(iptr,afmt,&enam,&**NConsumed) > 0) { > % + if (NConsumed == -1) NConsumed = 1 /* "&" */ + strlen(enam) + 1 > /* ";" */; > > This case is slightly different, in that we accept any match but check if > it is a valid entity. Update the number of characters consumed by the > partial match early, to ease the check for MicrosoftStyle input below. > > % if( (num = LookupCodeForEntity(enam))) { > % if(*(iptr+NConsumed-1) != ';') { > % MicrosoftStyle++; > % fprintf(stderr,_("The HTML/HDML entity %1$s at token %2$lu > of line %3$lu lacks the requisite final semicolon.\n"),** > ExtractSubstring(tmpstr,iptr,**iptr+NConsumed-3),TokenNumber,**LineNo); > % if(StrictP) {putchar(*iptr++);continue;} > % - else {putu8(num);iptr+=NConsumed;} > % + else {putu8(num);iptr+=NConsumed-1;**} > % } > % else {putu8(num);iptr+=NConsumed;} > % TokenNumber++; > % } > % else { > % + if(*(iptr+NConsumed-1) != ';') NConsumed--; > > Again, don't eat the extra unmatched character. > > -Ben Kaduk > > > % fprintf(stderr,"ascii2uni: unknown HTML/HDML character entity > \"&%s;\" at line %lu\n", > % enam,LineNo); > % putu8(UNI_REPLACEMENT_CHAR); > % diff -ruN uni2ascii-4.18.orig//debian/**changelog uni2ascii-4.18/debian/ > **changelog > % --- uni2ascii-4.18.orig//debian/**changelog 2011-05-17 > 01:30:08.000000000 -0400 > % +++ uni2ascii-4.18/debian/**changelog 2011-08-23 19:24:25.000000000 > -0400 > % @@ -1,3 +1,11 @@ > % +uni2ascii (4.18-1.1) unstable; urgency=low > % + > % + * Non-maintainer upload. > % + * Check for partially-matched sscanf() patterns and consume an > appropriate > % + number of characters (Closes: #633704) > % + > % + -- Benjamin Kaduk <ka...@mit.edu> Tue, 23 Aug 2011 19:22:19 -0400 > % + > % uni2ascii (4.18-1) unstable; urgency=low > % % * New upstream release: