Hi,
the Universal Character Names accepted by the C family of compilers are mapped
to those of ISO/IEC 10646, which defines the Universal Character Set codespace
as the range 0-0x10FFFF inclusive. The upper bound is already enforced for
identifiers but not for literals, so the following code is accepted in C99:
#include <stddef.h>
wchar_t a = L'\U00110000';
whereas it is rejected with an error by other compilers (Clang, MSVC).
I'm not sure whether the compiler is really required to issue a diagnostic in
this case. Moreover, a few tests in the testsuite manipulate UCNs outside the
UCS codespace. That's why I suggest issuing a pedantic warning.
Tested on x86_64-suse-linux, OK for the mainline?
2019-09-24 Eric Botcazou <ebotca...@adacore.com>
libcpp/
* charset.c (UCS_LIMIT): New macro.
(ucn_valid_in_identifier): Use it instead of a hardcoded constant.
(_cpp_valid_ucn): Issue a pedantic warning for UCNs larger than
UCS_LIMIT outside of identifiers.
2019-09-24 Eric Botcazou <ebotca...@adacore.com>
gcc/testsuite/
* gcc.dg/cpp/ucs.c: Add test for new warning and adjust.
* gcc.dg/cpp/utf8-5byte-1.c: Add -w to the options.
* gcc.dg/attr-alias-5.c: Likewise.
--
Eric Botcazou
Index: libcpp/charset.c
===================================================================
--- libcpp/charset.c (revision 275988)
+++ libcpp/charset.c (working copy)
@@ -901,6 +901,9 @@ struct ucnrange {
};
#include "ucnid.h"
+/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive. */
+#define UCS_LIMIT 0x10FFFF
+
/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
the start of an identifier, and 0 if C is not valid in an
identifier. We assume C has already gone through the checks of
@@ -915,7 +918,7 @@ ucn_valid_in_identifier (cpp_reader *pfi
int mn, mx, md;
unsigned short valid_flags, invalid_start_flags;
- if (c > 0x10FFFF)
+ if (c > UCS_LIMIT)
return 0;
mn = 0;
@@ -1016,6 +1019,9 @@ ucn_valid_in_identifier (cpp_reader *pfi
whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
or 0060 (`), nor one in the range D800 through DFFF inclusive.
+ If the hexadecimal value is larger than the upper bound of the UCS
+ codespace specified in ISO/IEC 10646, a pedantic warning is issued.
+
*PSTR must be preceded by "\u" or "\U"; it is assumed that the
buffer end is delimited by a non-hex digit. Returns false if the
UCN has not been consumed, true otherwise.
@@ -1135,6 +1141,10 @@ _cpp_valid_ucn (cpp_reader *pfile, const
"universal character %.*s is not valid at the start of an identifier",
(int) (str - base), base);
}
+ else if (result > UCS_LIMIT)
+ cpp_error (pfile, CPP_DL_PEDWARN,
+ "%.*s is outside the UCS codespace",
+ (int) (str - base), base);
*cp = result;
return true;
Index: gcc/testsuite/gcc.dg/attr-alias-5.c
===================================================================
--- gcc/testsuite/gcc.dg/attr-alias-5.c (revision 275988)
+++ gcc/testsuite/gcc.dg/attr-alias-5.c (working copy)
@@ -1,7 +1,7 @@
/* Verify diagnostics for aliases to strings containing extended
identifiers or bad characters. */
/* { dg-do compile } */
-/* { dg-options "-std=gnu99" } */
+/* { dg-options "-std=gnu99 -w" } */
/* { dg-require-alias "" } */
/* { dg-require-ascii-locale "" } */
/* { dg-skip-if "" { powerpc*-*-aix* } } */
Index: gcc/testsuite/gcc.dg/cpp/ucs.c
===================================================================
--- gcc/testsuite/gcc.dg/cpp/ucs.c (revision 275988)
+++ gcc/testsuite/gcc.dg/cpp/ucs.c (working copy)
@@ -39,7 +39,7 @@
#endif
#if WCHAR_MAX >= 0x7ffffff
-# if L'\U1234abcd' != 0x1234abcd
+# if L'\U1234abcd' != 0x1234abcd /* { dg-warning "outside" "" } */
# error bad long ucs /* { dg-bogus "bad" "bad U1234abcd evaluation" } */
# endif
#endif
@@ -49,7 +49,7 @@ void foo ()
int c;
c = L'\ubad'; /* { dg-error "incomplete" "incomplete UCN 1" } */
- c = L"\U1234"[0]; /* { dg-error "incomplete" "incompete UCN 2" } */
+ c = L"\U1234"[0]; /* { dg-error "incomplete" "incomplete UCN 2" } */
c = L'\u000x'; /* { dg-error "incomplete" "non-hex digit in UCN" } */
/* If sizeof(HOST_WIDE_INT) > sizeof(wchar_t), we can get a multi-character
@@ -64,4 +64,6 @@ void foo ()
c = '\u0025'; /* { dg-error "not a valid" "0025 invalid UCN" } */
c = L"\uD800"[0]; /* { dg-error "not a valid" "D800 invalid UCN" } */
c = L'\U0000DFFF'; /* { dg-error "not a valid" "DFFF invalid UCN" } */
+
+ c = L'\U00110000'; /* { dg-warning "outside" "110000 outside UCS" } */
}
Index: gcc/testsuite/gcc.dg/cpp/utf8-5byte-1.c
===================================================================
--- gcc/testsuite/gcc.dg/cpp/utf8-5byte-1.c (revision 275988)
+++ gcc/testsuite/gcc.dg/cpp/utf8-5byte-1.c (working copy)
@@ -1,7 +1,7 @@
/* Test for bug in conversions from 5-byte UTF-8 sequences in
cpplib. */
/* { dg-do run { target { 4byte_wchar_t } } } */
-/* { dg-options "-std=gnu99" } */
+/* { dg-options "-std=gnu99 -w" } */
extern void abort (void);
extern void exit (int);