https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86419

--- Comment #7 from Dimitrij Mijoski <dmjpp at hotmail dot com> ---
I think I found a related bug in the UTF-8 to UCS-2 codecvt,
codecvt_utf8<char16_t>. It can be tested with the following example:

#include <codecvt>

// Reproducer for the result codes of codecvt_utf8<char16_t>::in()
// (UTF-8 -> UCS-2). The same 7-byte UTF-8 input is converted eight times,
// each time with a different input length and output capacity, and the
// returned result plus the two "next" pointers are asserted.
// Assertions annotated with "BUG" are, according to the report, the ones
// that do not hold with the library version being reported against.
// The function has no return statement, so `auto` deduces to void.
auto test_u8_ucs2_in()
{
        // 2 code points, one is 3 bytes and the other is 4 bytes in UTF-8.
        // In UTF-16 the first is a single unit, the second is a surrogate pair.
        // In UCS-2 only the first code point is allowed.
        const char* in = u8"\uAAAA\U0010AAAA";
        // Pre-filled with 'y' -- presumably a sentinel so that slots the
        // conversion never wrote stand out in the printed output below.
        char16_t out[2] = { 'y' , 'y' };

        auto cvt_ptr = make_unique<codecvt_utf8<char16_t>>();
        auto& cvt = *cvt_ptr;
        auto state = mbstate_t{};
        auto in_ptr = in;
        auto out_ptr = out;

        // Every case below starts from a fresh conversion state and nulls the
        // "next" pointers, so the asserted values can only come from that call.

        // Case 1: input is the first 2 bytes (incomplete 3-byte sequence),
        // output capacity 0. Expect partial with nothing consumed or written.
        state = {};
        in_ptr = nullptr;
        out_ptr = nullptr;
        auto res = cvt.in(state, in, in + 2, in_ptr, out, out, out_ptr);
        assert(res == cvt.partial); // BUG, returns OK, should be Partial
        assert(out_ptr == out);
        assert(in_ptr == in);

        // Case 2: same incomplete 2-byte input, output capacity 1.
        // Expect partial with nothing consumed or written.
        state = {};
        in_ptr = nullptr;
        out_ptr = nullptr;
        res = cvt.in(state, in, in + 2, in_ptr, out, out + 1, out_ptr);
        assert(res == cvt.partial); // BUG, returns ERROR, should be Partial
        assert(out_ptr == out);
        assert(in_ptr == in);

        // Case 3: input is the complete first code point (3 bytes),
        // output capacity 0. Expect partial with nothing consumed or written.
        state = {};
        in_ptr = nullptr;
        out_ptr = nullptr;
        res = cvt.in(state, in, in + 3, in_ptr, out, out, out_ptr);
        assert(res == cvt.partial); // BUG, returns OK, should be Partial
        assert(out_ptr == out);
        assert(in_ptr == in);


        // Case 4: complete first code point, output capacity 1.
        // Expect ok: 3 bytes consumed, 1 unit written.
        state = {};
        in_ptr = nullptr;
        out_ptr = nullptr;
        res = cvt.in(state, in, in + 3, in_ptr, out, out + 1, out_ptr);
        assert(res == cvt.ok);
        assert(out_ptr == out + 1);
        assert(in_ptr == in + 3);
        // Dump both output slots in hex after the one fully successful case.
        cout << "UCS2 sequence: " << hex << out[0] << ' ' << out[1] << '\n';

        // Case 5: first code point plus 3 of the 4 bytes of the second,
        // output capacity 1. Expect partial, stopping after the first code point.
        state = {};
        in_ptr = nullptr;
        out_ptr = nullptr;
        res = cvt.in(state, in, in + 6, in_ptr, out, out + 1, out_ptr);
        assert(res == cvt.partial); // BUG, returns OK, should be Partial
        assert(out_ptr == out + 1);
        assert(in_ptr == in + 3);

        // Case 6: same 6-byte input (second code point still incomplete),
        // output capacity 2. Expect partial, stopping after the first code point.
        state = {};
        in_ptr = nullptr;
        out_ptr = nullptr;
        res = cvt.in(state, in, in + 6, in_ptr, out, out + 2, out_ptr);
        assert(res == cvt.partial); // BUG, returns ERROR, should be Partial
        assert(out_ptr == out + 1);
        assert(in_ptr == in + 3);

        // Case 7: both code points complete (7 bytes), output capacity 1.
        // Expect partial, stopping after the first code point.
        state = {};
        in_ptr = nullptr;
        out_ptr = nullptr;
        res = cvt.in(state, in, in + 7, in_ptr, out, out + 1, out_ptr);
        assert(res == cvt.partial); // BUG, returns OK, should be Partial
        assert(out_ptr == out + 1);
        assert(in_ptr == in + 3);

        // Case 8: both code points complete, output capacity 2.
        // Expect error: the second code point is not allowed in UCS-2 (see the
        // note on `in` above); conversion stops after the first code point.
        state = {};
        in_ptr = nullptr;
        out_ptr = nullptr;
        res = cvt.in(state, in, in + 7, in_ptr, out, out + 2, out_ptr);
        assert(res == cvt.error);
        assert(out_ptr == out + 1);
        assert(in_ptr == in + 3);
}


The bug lies in the same function, utf16_in(), that I mentioned in comment #5,
in lines 544-547:
https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=libstdc%2B%2B-v3/src/c%2B%2B11/codecvt.cc;h=0311b15177d0439757e0347f7934b5a09b78f8e3;hb=HEAD#l544

Those lines:

 544             if (s == surrogates::allowed)
 545               return codecvt_base::partial;
 546             else
 547               return codecvt_base::error; // No surrogates in UCS2

Should simply be one line: 

 544               return codecvt_base::partial;

Reply via email to