Hi!

This paper, voted in as a DR, makes some multi-character literals ill-formed.
'abcd' stays valid, but e.g. 'á' is newly invalid in the UTF-8 exec charset
while still valid e.g. in ISO-8859-1, because it is a single character which
needs 2 bytes to be encoded in UTF-8.

The following patch does that by checking (only pedantically, especially
because it is a DR), whenever we'd emit a -Wmultichar warning because the
character constant has more than one byte in it, whether the number of bytes
in the narrow string matches the number of bytes in its CPP_STRING32
interpretation divided by the char32_t size in bytes.  If it does, it is a
normal multi-character literal constant and is diagnosed normally with
-Wmultichar; if the number of bytes is larger, at least one of the c-chars
in the sequence was encoded as 2+ bytes.

Now, doing it this way has 2 drawbacks.  First, some of the diagnostics which
don't result in cpp_interpret_string_1 failures can be printed twice, once
when calling cpp_interpret_string_1 for CPP_CHAR and once for CPP_STRING32.
Second, functionally I think it must work 100% correctly if the host source
character set is UTF-8 (because all valid UTF-8 chars are encodable in
UTF-32), but it might not work for some control codes in UTF-EBCDIC if
that is the source character set (though I don't know if we actually
support it; e.g. Linux iconv certainly doesn't).
All we actually need is to count the number of c-chars in the literal.  An
alternative would be to write a custom character counter which would quietly
interpret/skip over and count escape sequences and decode UTF-8 characters
in between those escape sequences.  But we'd need to have something similar
also for UTF-EBCDIC if it works at all, and from what I've seen, we don't
have anything like that implemented in libcpp or anywhere else in GCC.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
Or ok with some tweaks to avoid the second round of diagnostics from
cpp_interpret_string_1/convert_escape?  Or should the second pass be
reimplemented to count the c-chars manually?

2023-08-25  Jakub Jelinek  <ja...@redhat.com>

        PR c++/110341
libcpp/
        * charset.cc: Implement C++ 26 P1854R4 - Making non-encodable string
        literals ill-formed.
        (narrow_str_to_charconst): Change last argument from cpp_ttype to
        const cpp_token *.  For C++ if pedantic and i > 1 in CPP_CHAR
        interpret token also as CPP_STRING32 and if number of bytes in
        CPP_CHAR is larger than number of characters in the CPP_STRING32,
        pedwarn on it.
        (cpp_interpret_charconst): Adjust narrow_str_to_charconst caller.
gcc/testsuite/
        * g++.dg/cpp26/literals1.C: New test.
        * g++.dg/cpp26/literals2.C: New test.
        * g++.dg/cpp23/wchar-multi1.C (c, d): Expect an error rather than
        warning.

--- libcpp/charset.cc.jj        2023-08-24 15:36:59.000000000 +0200
+++ libcpp/charset.cc   2023-08-25 17:14:14.098733396 +0200
@@ -2567,18 +2567,20 @@ cpp_interpret_string_notranslate (cpp_re
 /* Subroutine of cpp_interpret_charconst which performs the conversion
    to a number, for narrow strings.  STR is the string structure returned
    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
-   cpp_interpret_charconst.  TYPE is the token type.  */
+   cpp_interpret_charconst.  TOKEN is the token.  */
 static cppchar_t
 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
                         unsigned int *pchars_seen, int *unsignedp,
-                        enum cpp_ttype type)
+                        const cpp_token *token)
 {
+  enum cpp_ttype type = token->type;
   size_t width = CPP_OPTION (pfile, char_precision);
   size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
   size_t mask = width_to_mask (width);
   size_t i;
   cppchar_t result, c;
   bool unsigned_p;
+  bool diagnosed = false;
 
   /* The value of a multi-character character constant, or a
      single-character character constant whose representation in the
@@ -2602,7 +2604,37 @@ narrow_str_to_charconst (cpp_reader *pfi
 
   if (type == CPP_UTF8CHAR)
     max_chars = 1;
-  if (i > max_chars)
+  else if (i > 1 && CPP_OPTION (pfile, cplusplus) && CPP_PEDANTIC (pfile))
+    {
+      /* C++ as a DR since
+        P1854R4 - Making non-encodable string literals ill-formed
+        makes multi-character narrow character literals if any of the
+        characters in the literal isn't encodable in char/unsigned char
+        ill-formed.  We need to count the number of c-chars and compare
+        that to str.len.  */
+      cpp_string str2 = { 0, 0 };
+      if (cpp_interpret_string (pfile, &token->val.str, 1, &str2,
+                               CPP_STRING32))
+       {
+         size_t width32 = converter_for_type (pfile, CPP_STRING32).width;
+         size_t nbwc = width32 / width;
+         size_t len = str2.len / nbwc;
+         if (str2.text != token->val.str.text)
+           free ((void *)str2.text);
+         if (str.len > len)
+           {
+             diagnosed
+               = cpp_error (pfile, CPP_DL_PEDWARN,
+                            "character too large for character literal "
+                            "type");
+             if (diagnosed && i > max_chars)
+               i = max_chars;
+           }
+       }
+    }
+  if (diagnosed)
+    /* Already diagnosed above.  */;
+  else if (i > max_chars)
     {
       i = max_chars;
       cpp_error (pfile, type == CPP_UTF8CHAR ? CPP_DL_ERROR : CPP_DL_WARNING,
@@ -2747,7 +2779,7 @@ cpp_interpret_charconst (cpp_reader *pfi
                                    token->type);
   else
     result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp,
-                                     token->type);
+                                     token);
 
   if (str.text != token->val.str.text)
     free ((void *)str.text);
--- gcc/testsuite/g++.dg/cpp26/literals1.C.jj   2023-08-25 17:23:06.662878355 
+0200
+++ gcc/testsuite/g++.dg/cpp26/literals1.C      2023-08-25 17:37:03.085132304 
+0200
@@ -0,0 +1,65 @@
+// C++26 P1854R4 - Making non-encodable string literals ill-formed
+// { dg-do compile { target c++11 } }
+// { dg-require-effective-target int32 }
+// { dg-options "-pedantic-errors -finput-charset=UTF-8 -fexec-charset=UTF-8" }
+
+int a = 'abcd';                                                // { dg-warning 
"multi-character character constant" }
+int b = '\x61\x62\x63\x64';                            // { dg-warning 
"multi-character character constant" }
+int c = 'รก';                                           // { dg-error 
"character too large for character literal type" }
+int d = '๐Ÿ˜';                                           // { dg-error 
"character too large for character literal type" }
+int e = '\N{FACE WITH TEARS OF JOY}';                  // { dg-error 
"character too large for character literal type" }
+                                                       // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } .-1 }
+int f = '\U0001F602';                                  // { dg-error 
"character too large for character literal type" }
+wchar_t g = L'abcd';                                   // { dg-error 
"character constant too long for its type" "" { target c++23 } }
+                                                       // { dg-warning 
"character constant too long for its type" "" { target c++20_down } .-1 }
+wchar_t h = L'\x61\x62\x63\x64';                       // { dg-error 
"character constant too long for its type" "" { target c++23 } }
+                                                       // { dg-warning 
"character constant too long for its type" "" { target c++20_down } .-1 }
+wchar_t i = L'รก';
+char16_t j = u'abcd';                                  // { dg-error 
"character constant too long for its type" }
+char16_t k = u'\x61\x62\x63\x64';                      // { dg-error 
"character constant too long for its type" }
+char16_t l = u'รก';
+char16_t m = u'๐Ÿ˜';                                     // { dg-error 
"character constant too long for its type" }
+char16_t n = u'\N{FACE WITH TEARS OF JOY}';            // { dg-error 
"character constant too long for its type" { target c++23 } }
+                                                       // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } .-1 }
+char16_t o = u'\U0001F602';                            // { dg-error 
"character constant too long for its type" }
+char32_t p = U'abcd';                                  // { dg-error 
"character constant too long for its type" }
+char32_t q = U'\x61\x62\x63\x64';                      // { dg-error 
"character constant too long for its type" }
+char32_t r = U'รก';
+char32_t s = U'๐Ÿ˜';
+char32_t t = U'\N{FACE WITH TEARS OF JOY}';            // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } }
+char32_t u = U'\U0001F602';
+#if __cpp_unicode_characters >= 201411L
+auto v = u8'abcd';                                     // { dg-error 
"character constant too long for its type" "" { target c++17 } }
+auto w = u8'\x61\x62\x63\x64';                         // { dg-error 
"character constant too long for its type" "" { target c++17 } }
+auto x = u8'รก';                                                // { dg-error 
"character constant too long for its type" "" { target c++17 } }
+auto y = u8'๐Ÿ˜';                                        // { dg-error 
"character constant too long for its type" "" { target c++17 } }
+auto z = u8'\N{FACE WITH TEARS OF JOY}';               // { dg-error 
"character constant too long for its type" "" { target c++17 } }
+                                                       // { dg-error "named 
universal character escapes are only valid in" "" { target { c++17 && 
c++20_down } } .-1 }
+auto aa = u8'\U0001F602';                              // { dg-error 
"character constant too long for its type" "" { target c++17 } }
+#endif
+const char *ab = "๐Ÿ˜";
+const char *ac = "\N{FACE WITH TEARS OF JOY}";         // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } }
+const char *ad = "\U0001F602";
+const char16_t *ae = u"๐Ÿ˜";
+const char16_t *af = u"\N{FACE WITH TEARS OF JOY}";    // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } }
+const char16_t *ag = u"\U0001F602";
+const char32_t *ah = U"๐Ÿ˜";
+const char32_t *ai = U"\N{FACE WITH TEARS OF JOY}";    // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } }
+const char32_t *aj = U"\U0001F602";
+auto ak = u8"๐Ÿ˜";
+auto al = u8"\N{FACE WITH TEARS OF JOY}";              // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } }
+auto am = u8"\U0001F602";
+int an = '\x123456789';                                        // { dg-error 
"hex escape sequence out of range" }
+wchar_t ao = L'\x123456789abcdef0';                    // { dg-error "hex 
escape sequence out of range" }
+char16_t ap = u'\x12345678';                           // { dg-error "hex 
escape sequence out of range" }
+char32_t aq = U'\x123456789abcdef0';                   // { dg-error "hex 
escape sequence out of range" }
+#if __cpp_unicode_characters >= 201411L
+auto ar = u8'\x123456789abcdef0';                      // { dg-error "hex 
escape sequence out of range" "" { target c++17 } }
+#endif
+char as = '\xff';
+#if __SIZEOF_WCHAR_T__ * __CHAR_BIT__ == 32
+wchar_t at = L'\xffffffff';
+#elif __SIZEOF_WCHAR_T__ * __CHAR_BIT__ == 16
+wchar_t at = L'\xffff';
+#endif
+int au = '\x1234';                                     // { dg-error "hex 
escape sequence out of range" }
--- gcc/testsuite/g++.dg/cpp26/literals2.C.jj   2023-08-25 17:37:34.549728535 
+0200
+++ gcc/testsuite/g++.dg/cpp26/literals2.C      2023-08-25 17:41:03.923041763 
+0200
@@ -0,0 +1,67 @@
+// C++26 P1854R4 - Making non-encodable string literals ill-formed
+// { dg-do compile { target c++11 } }
+// { dg-require-effective-target int32 }
+// { dg-options "-pedantic-errors -finput-charset=UTF-8 
-fexec-charset=ISO-8859-1" }
+/* { dg-require-iconv "ISO-8859-1" } */
+
+int a = 'abcd';                                                // { dg-warning 
"multi-character character constant" }
+int b = '\x61\x62\x63\x64';                            // { dg-warning 
"multi-character character constant" }
+int c = 'รก';
+int d = '๐Ÿ˜';                                           // { dg-error 
"converting to execution character set" }
+int e = '\N{FACE WITH TEARS OF JOY}';                  // { dg-error 
"converting UCN to execution character set" }
+                                                       // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } .-1 }
+int f = '\U0001F602';                                  // { dg-error 
"converting UCN to execution character set" }
+wchar_t g = L'abcd';                                   // { dg-error 
"character constant too long for its type" "" { target c++23 } }
+                                                       // { dg-warning 
"character constant too long for its type" "" { target c++20_down } .-1 }
+wchar_t h = L'\x61\x62\x63\x64';                       // { dg-error 
"character constant too long for its type" "" { target c++23 } }
+                                                       // { dg-warning 
"character constant too long for its type" "" { target c++20_down } .-1 }
+wchar_t i = L'รก';
+char16_t j = u'abcd';                                  // { dg-error 
"character constant too long for its type" }
+char16_t k = u'\x61\x62\x63\x64';                      // { dg-error 
"character constant too long for its type" }
+char16_t l = u'รก';
+char16_t m = u'๐Ÿ˜';                                     // { dg-error 
"character constant too long for its type" }
+char16_t n = u'\N{FACE WITH TEARS OF JOY}';            // { dg-error 
"character constant too long for its type" { target c++23 } }
+                                                       // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } .-1 }
+char16_t o = u'\U0001F602';                            // { dg-error 
"character constant too long for its type" }
+char32_t p = U'abcd';                                  // { dg-error 
"character constant too long for its type" }
+char32_t q = U'\x61\x62\x63\x64';                      // { dg-error 
"character constant too long for its type" }
+char32_t r = U'รก';
+char32_t s = U'๐Ÿ˜';
+char32_t t = U'\N{FACE WITH TEARS OF JOY}';            // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } }
+char32_t u = U'\U0001F602';
+#if __cpp_unicode_characters >= 201411L
+auto v = u8'abcd';                                     // { dg-error 
"character constant too long for its type" "" { target c++17 } }
+auto w = u8'\x61\x62\x63\x64';                         // { dg-error 
"character constant too long for its type" "" { target c++17 } }
+auto x = u8'รก';                                                // { dg-error 
"character constant too long for its type" "" { target c++17 } }
+auto y = u8'๐Ÿ˜';                                        // { dg-error 
"character constant too long for its type" "" { target c++17 } }
+auto z = u8'\N{FACE WITH TEARS OF JOY}';               // { dg-error 
"character constant too long for its type" "" { target c++17 } }
+                                                       // { dg-error "named 
universal character escapes are only valid in" "" { target { c++17 && 
c++20_down } } .-1 }
+auto aa = u8'\U0001F602';                              // { dg-error 
"character constant too long for its type" "" { target c++17 } }
+#endif
+const char *ab = "๐Ÿ˜";                                  // { dg-error 
"converting to execution character set" }
+const char *ac = "\N{FACE WITH TEARS OF JOY}";         // { dg-error 
"converting UCN to execution character set" }
+                                                       // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } .-1 }
+const char *ad = "\U0001F602";                         // { dg-error 
"converting UCN to execution character set" }
+const char16_t *ae = u"๐Ÿ˜";
+const char16_t *af = u"\N{FACE WITH TEARS OF JOY}";    // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } }
+const char16_t *ag = u"\U0001F602";
+const char32_t *ah = U"๐Ÿ˜";
+const char32_t *ai = U"\N{FACE WITH TEARS OF JOY}";    // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } }
+const char32_t *aj = U"\U0001F602";
+auto ak = u8"๐Ÿ˜";
+auto al = u8"\N{FACE WITH TEARS OF JOY}";              // { dg-error "named 
universal character escapes are only valid in" "" { target c++20_down } }
+auto am = u8"\U0001F602";
+int an = '\x123456789';                                        // { dg-error 
"hex escape sequence out of range" }
+wchar_t ao = L'\x123456789abcdef0';                    // { dg-error "hex 
escape sequence out of range" }
+char16_t ap = u'\x12345678';                           // { dg-error "hex 
escape sequence out of range" }
+char32_t aq = U'\x123456789abcdef0';                   // { dg-error "hex 
escape sequence out of range" }
+#if __cpp_unicode_characters >= 201411L
+auto ar = u8'\x123456789abcdef0';                      // { dg-error "hex 
escape sequence out of range" "" { target c++17 } }
+#endif
+char as = '\xff';
+#if __SIZEOF_WCHAR_T__ * __CHAR_BIT__ == 32
+wchar_t at = L'\xffffffff';
+#elif __SIZEOF_WCHAR_T__ * __CHAR_BIT__ == 16
+wchar_t at = L'\xffff';
+#endif
+int au = '\x1234';                                     // { dg-error "hex 
escape sequence out of range" }
--- gcc/testsuite/g++.dg/cpp23/wchar-multi1.C.jj        2022-08-27 
23:01:28.321565931 +0200
+++ gcc/testsuite/g++.dg/cpp23/wchar-multi1.C   2023-08-25 22:20:42.772015922 
+0200
@@ -4,9 +4,9 @@
 
 char a = 'a';
 int b = 'ab';                  // { dg-warning "multi-character character 
constant" }
-int c = '\u05D9';              // { dg-warning "multi-character character 
constant" }
+int c = '\u05D9';              // { dg-error "character too large for 
character literal type" }
 #if __SIZEOF_INT__ > 2
-int d = '\U0001F525';          // { dg-warning "multi-character character 
constant" "" { target int32 } }
+int d = '\U0001F525';          // { dg-error "character too large for 
character literal type" "" { target int32 } }
 #endif
 int e = 'abcd';                        // { dg-warning "multi-character 
character constant" }
 wchar_t f = L'f';

        Jakub

Reply via email to