Problem: When calling the in() method of a codecvt facet for a locale that specifies UTF-8 encoding, the method fails to recognize partial (i.e., incomplete) UTF-8 encoding sequences at the end of the source string. Instead of returning the expected std::codecvt_base::partial status code with the returned source position (arg-4, from_next) indexing the start of the incomplete sequence, the method returns std::codecvt_base::ok with the returned source position just past the end of the source string. Nothing from the partial sequence ends up in the destination wide string (as expected).
Compilation: gcc -v --save-temps -Wall -ansi -pedantic -g -o localetest localetest.cxx Compilation output: Using built-in specs. Target: i386-redhat-linux Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-libgcj-multifile --enable-languages=c,c++,objc,obj-c++,java,fortran,ada --enable-java-awt=gtk --disable-dssi --enable-plugin --with-java-home=/usr/lib/jvm/java-1.4.2-gcj-1.4.2.0/jre --with-cpu=generic --host=i386-redhat-linux Thread model: posix gcc version 4.1.1 20070105 (Red Hat 4.1.1-51) /usr/libexec/gcc/i386-redhat-linux/4.1.1/cc1plus -E -quiet -v -D_GNU_SOURCE localetest.cxx -mtune=generic -ansi -Wall -pedantic -fworking-directory -fpch-preprocess -o localetest.ii ignoring nonexistent directory "/usr/lib/gcc/i386-redhat-linux/4.1.1/../../../../i386-redhat-linux/include" #include "..." search starts here: #include <...> search starts here: /usr/lib/gcc/i386-redhat-linux/4.1.1/../../../../include/c++/4.1.1 /usr/lib/gcc/i386-redhat-linux/4.1.1/../../../../include/c++/4.1.1/i386-redhat-linux /usr/lib/gcc/i386-redhat-linux/4.1.1/../../../../include/c++/4.1.1/backward /usr/local/include /usr/lib/gcc/i386-redhat-linux/4.1.1/include /usr/include End of search list. /usr/libexec/gcc/i386-redhat-linux/4.1.1/cc1plus -fpreprocessed localetest.ii -quiet -dumpbase localetest.cxx -mtune=generic -ansi -auxbase localetest -g -Wall -pedantic -ansi -version -o localetest.s GNU C++ version 4.1.1 20070105 (Red Hat 4.1.1-51) (i386-redhat-linux) compiled by GNU C version 4.1.1 20070105 (Red Hat 4.1.1-51). GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 Compiler executable checksum: 4720743fdfefd64206c8550433f6e508 as -V -Qy -o localetest.o localetest.s GNU assembler version 2.17.50.0.6-2.fc6 (i386-redhat-linux) using BFD version 2. 
17.50.0.6-2.fc6 20061020 /usr/libexec/gcc/i386-redhat-linux/4.1.1/collect2 --eh-frame-hdr -m elf_i386 --hash-style=gnu -dynamic-linker /lib/ld-linux.so.2 -o localetest /usr/lib/gcc/i386-redhat-linux/4.1.1/../../../crt1.o /usr/lib/gcc/i386-redhat-linux/4.1.1/../../../crti.o /usr/lib/gcc/i386-redhat-linux/4.1.1/crtbegin.o -L/usr/lib/gcc/i386-redhat-linux/4.1.1 -L/usr/lib/gcc/i386-redhat-linux/4.1.1 -L/usr/lib/gcc/i386-redhat-linux/4.1.1/../../.. localetest.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/i386-redhat-linux/4.1.1/crtend.o /usr/lib/gcc/i386-redhat-linux/4.1.1/../../../crtn.o Test Source File (localetest.cxx): // // This test demonstrates that UTF-8 codecvt facets are ignoring incomplete // trailing encoding sequences. The expected behavior is a return of the // status value std::codecvt_base::partial, with the returned current source // position at the start of the failed sequence. The actual behavior is a // return of std::codecvt_base::ok, with the returned current source position // at the end of the source string (i.e., the incomplete sequence is ignored). // #include <iostream> #include <string> #include <locale> using namespace std; // // Some typedefs to help with facet access. // typedef codecvt_base::result Result; typedef string::traits_type::state_type State; typedef codecvt<wstring::value_type, string::value_type, State> Converter; wchar_t to[256]; // Destination buffer. // // Perform each test iteration fresh, just to make sure that there isn't any // lingering context between tests. // void dotest( const string &test_name, const char *const locale_name, const string &test_string ) { State q; // Shift state context. const string::value_type *me = 0; // Multibyte source current postion. wstring::value_type *we = 0; // Wide destination current position. Result status; // Conversion status. // // Set the current locale. // locale loc(locale_name); locale::global(loc); cout.imbue(loc); // // Start with a clear output buffer. 
// memset(to, 0, sizeof(to)); // // Do the conversion from narrow multibyte to wide unicode. // const Converter& cvt = use_facet<Converter>(loc); memset(&q, 0, sizeof(q)); string::size_type src_size = test_string.size(); status = cvt.in( q, test_string.data(), test_string.data() + src_size, me, to, to + sizeof(to)/sizeof(to[0]), we ); string::size_type mpos = me - test_string.data(); wstring::size_type wpos = we - to; // // Display the results: // cout << endl; cout << test_name << ": " << loc.name() << endl; cout << " Input:"; for ( string::const_iterator i = test_string.begin(); i != test_string.end(); ++i ) cout << " " << hex << ((*i)&0xFF); cout << " \"" << test_string << "\"" << endl; cout << dec << " Result=" << status << " Source=" << mpos << " Dest=" << wpos << endl; cout << " Output:"; for (size_t i = 0; i < wpos; ++i) cout << " " << hex << to[i]; cout << endl; cout << endl; return; } // // Do three tests for each locale: one with a good string, one with a partial // string, and one with an error string. // string from_ok("\xC2\xA1Hasta ma\xC3\xB1\x61na!"); // Whole string, with complete lowercase en-yay // sequence (\xC3\xB1). string from_partial("\xC2\xA1Hasta ma\xC3"); // Partial string, with lowercase en-yay cut // off after the first byte of the two-byte // sequence. string from_error("\xC2\xA1Hasta\xFF ma\xC3\xB1\x61na!"); // An error in the middle of the string, for // comparison purposes. void dolocale(const char *const locale_name) { dotest("Complete", locale_name, from_ok); dotest("Partial", locale_name, from_partial); dotest("Error", locale_name, from_error); return; } // // Do the test across 3 different locales, all with UTF-8 encoding. // int main(int argc, char *argv[]) { dolocale("en_US.UTF-8"); dolocale("es_US.UTF-8"); dolocale("es_CR.UTF-8"); return 0; } Test Output: Complete: en_US.UTF-8 Input: c2 a1 48 61 73 74 61 20 6d 61 c3 b1 61 6e 61 21 "¡Hasta mañana!" 
Result=0 Source=16 Dest=14 Output: a1 48 61 73 74 61 20 6d 61 f1 61 6e 61 21 Partial: en_US.UTF-8 Input: c2 a1 48 61 73 74 61 20 6d 61 c3 "¡Hasta ma�" Result=0 Source=11 Dest=9 Output: a1 48 61 73 74 61 20 6d 61 Error: en_US.UTF-8 Input: c2 a1 48 61 73 74 61 ff 20 6d 61 c3 b1 61 6e 61 21 "¡Hasta� mañana!" Result=2 Source=7 Dest=6 Output: a1 48 61 73 74 61 Complete: es_US.UTF-8 Input: c2 a1 48 61 73 74 61 20 6d 61 c3 b1 61 6e 61 21 "¡Hasta mañana!" Result=0 Source=16 Dest=14 Output: a1 48 61 73 74 61 20 6d 61 f1 61 6e 61 21 Partial: es_US.UTF-8 Input: c2 a1 48 61 73 74 61 20 6d 61 c3 "¡Hasta ma�" Result=0 Source=11 Dest=9 Output: a1 48 61 73 74 61 20 6d 61 Error: es_US.UTF-8 Input: c2 a1 48 61 73 74 61 ff 20 6d 61 c3 b1 61 6e 61 21 "¡Hasta� mañana!" Result=2 Source=7 Dest=6 Output: a1 48 61 73 74 61 Complete: es_CR.UTF-8 Input: c2 a1 48 61 73 74 61 20 6d 61 c3 b1 61 6e 61 21 "¡Hasta mañana!" Result=0 Source=16 Dest=14 Output: a1 48 61 73 74 61 20 6d 61 f1 61 6e 61 21 Partial: es_CR.UTF-8 Input: c2 a1 48 61 73 74 61 20 6d 61 c3 "¡Hasta ma�" Result=0 Source=11 Dest=9 Output: a1 48 61 73 74 61 20 6d 61 Error: es_CR.UTF-8 Input: c2 a1 48 61 73 74 61 ff 20 6d 61 c3 b1 61 6e 61 21 "¡Hasta� mañana!" Result=2 Source=7 Dest=6 Output: a1 48 61 73 74 61 Test Results: Note that each of the error cases properly reports the invalid encoding byte in the source string (\xFF) at source position 7; however, the partial test cases improperly ignore the partial encoding sequence (\xC3) at the end of the partial test strings. -- Summary: Codecvt facets with UTF-8 encoding fail to recognize partial encoding sequences Product: gcc Version: 4.1.1 Status: UNCONFIRMED Severity: major Priority: P3 Component: libstdc++ AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: jcavalla at postini dot com GCC target triplet: i386-redhat-linux http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31643