llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Aaron Ballman (AaronBallman) <details> <summary>Changes</summary> This was the paper that added Universal Character Names to C. --- Full diff: https://github.com/llvm/llvm-project/pull/87228.diff 3 Files Affected: - (added) clang/test/C/C99/n717.c (+69) - (added) clang/test/C/C99/n717.py (+39) - (modified) clang/www/c_status.html (+1-1) ``````````diff diff --git a/clang/test/C/C99/n717.c b/clang/test/C/C99/n717.c new file mode 100644 index 00000000000000..cc1aa0fd5d53cf --- /dev/null +++ b/clang/test/C/C99/n717.c @@ -0,0 +1,69 @@ +// RUN: %clang_cc1 -verify -std=c99 %s +// RUN: %clang_cc1 -verify -std=c99 -fno-dollars-in-identifiers %s + +/* WG14 N717: Clang 17 + * Extended identifiers + */ + +// Used as a sink for UCNs. +#define M(arg) + +// C99 6.4.3p1 specifies the grammar for UCNs. A \u must be followed by exactly +// four hex digits, and \U must be followed by exactly eight. +M(\u1) // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} +M(\u12) // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} +M(\u123) // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} +M(\u1234) // Okay +M(\u12345)// Okay, two tokens (UCN followed by 5) + +M(\U1) // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} +M(\U12) // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} +M(\U123) // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} +M(\U1234) // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} \ + expected-note {{did you mean to use '\u'?}} +M(\U12345) // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} +M(\U123456) // expected-warning 
{{incomplete universal character name; treating as '\' followed by identifier}} +M(\U1234567) // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} +M(\U12345678) // Okay +M(\U123456789) // Okay-ish, two tokens (valid-per-spec-but-actually-invalid UCN followed by 9) + +// C99 6.4.3p2: +// A universal character name shall not specify a character whose short +// identifier is less than 00A0 other than 0024 ($), 0040 (@), or 0060 (`), nor +// one in the range D800 through DFFF inclusive. +// +// We use a python script to generate the test contents for the large ranges +// without edge cases. +// RUN: %python %S/n717.py >%t.inc +// RUN: %clang_cc1 -verify -std=c99 -Wno-unicode-whitespace -Wno-unicode-homoglyph -Wno-unicode-zero-width -Wno-mathematical-notation-identifier-extension %t.inc + +// Now test the ones that should work. Note, these work in C17 and earlier but +// are part of the basic character set in C23 and thus should be diagnosed in +// that mode. They're valid in a character constant, but not valid in an +// identifier, except for U+0024 which is allowed if -fdollars-in-identifiers +// is enabled. +// FIXME: These three should be handled the same way, and should be accepted +// when dollar signs are allowed in identifiers, rather than rejected, see +// GH87106. +M(\u0024) // expected-error {{character '$' cannot be specified by a universal character name}} +M(\U00000024) // expected-error {{character '$' cannot be specified by a universal character name}} +M($) + +// These should always be rejected because they're not valid identifier +// characters. +// FIXME: the diagnostic could be improved to make it clear this is an issue +// with forming an identifier rather than a UCN. 
+M(\u0040) // expected-error {{character '@' cannot be specified by a universal character name}} +M(\u0060) // expected-error {{character '`' cannot be specified by a universal character name}} +M(\U00000040) // expected-error {{character '@' cannot be specified by a universal character name}} +M(\U00000060) // expected-error {{character '`' cannot be specified by a universal character name}} + +// These should always be accepted because they're valid in a character +// constant. +M('\u0024') +M('\u0040') +M('\u0060') + +M('\U00000024') +M('\U00000040') +M('\U00000060') diff --git a/clang/test/C/C99/n717.py b/clang/test/C/C99/n717.py new file mode 100644 index 00000000000000..8c02d336ff6f60 --- /dev/null +++ b/clang/test/C/C99/n717.py @@ -0,0 +1,39 @@ +print("#define M(arg)") + +def test(size): + Prefix = 'U' if size == 8 else 'u' + # [0x0000 to 0x00A0) excluding [0x0020, 0x007F) + for val in [val for val in range(0x0000, 0x00A0) if val < 0x0020 or val >= 0x007F]: + print(f'M(\\{Prefix}{val:0{size}X}) // expected-error {{{{universal character name refers to a control character}}}}') + print('') + + # [0x0020 to 0x007F), excluding 0x0024, 0x0040, and 0x0060 + for val in [val for val in range(0x0020, 0x007F) if val != 0x0024 and val != 0x0040 and val != 0x0060]: + print(f"M(\\{Prefix}{val:0{size}X}) // expected-error {{{{character '{chr(val)}' cannot be specified by a universal character name}}}}") + print('') + + # [0xD800 to 0xDFFF] + for val in range(0xD800, 0xDFFF + 1): + print(f'M(\\{Prefix}{val:0{size}X}) // expected-error {{{{invalid universal character}}}}') + print('') + + # Everything in this range should be accepted, though it may produce a + # warning diagnostic for things like homoglyphs, whitespace, etc. 
+ for val in range(0x00A1, 0xD800): + print(f'M(\\{Prefix}{val:0{size}X})') + print('') + +# Print \u tests +test(4) +# Print \U tests +test(8) + +# Validate that the \U characters have the same identity as the \u characters +# within the valid (short) range. +# This is disabled because enabling the test 1) requires using L because u and +# U don't exist until C11, 2) is questionable in terms of value because the +# code points could be different if L isn't using a Unicode encoding, and 3) +# this addition to the test adds 10x the execution time when running the test. +#for val in range(0x00A1, 0xD800): +# print(f"_Static_assert(L'\\u{val:04X}' == L'\\U{val:08X}', \"\");") +#print('') diff --git a/clang/www/c_status.html b/clang/www/c_status.html index 028234a8961db2..a14bfa2c1efb3d 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -203,7 +203,7 @@ <h2 id="c99">C99 implementation status</h2> <tr> <td>extended identifiers</td> <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n717.htm">N717</a></td> - <td class="unknown" align="center">Unknown</td> + <td class="full" align="center">Clang 17</td> </tr> <tr> <td>hexadecimal floating-point constants</td> `````````` </details> https://github.com/llvm/llvm-project/pull/87228 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits