cor3ntin created this revision. Herald added a project: All. cor3ntin requested review of this revision. Herald added a project: clang. Herald added a subscriber: cfe-commits.
Implement the proposed UAX Profile "Mathematical notation profile for default identifiers". This implements a not-yet-approved Unicode proposal for a vetted UAX31 identifier profile: https://www.unicode.org/L2/L2022/22230-math-profile.pdf. This change mitigates the reported disruption caused by the implementation of UAX31 in C++ and C2x, as these mathematical symbols are commonly used in the scientific community. Fixes #54732 Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D137051 Files: clang/docs/ReleaseNotes.rst clang/include/clang/Basic/DiagnosticLexKinds.td clang/lib/Lex/Lexer.cpp clang/lib/Lex/UnicodeCharSets.h clang/test/Driver/autocomplete.c clang/test/Lexer/unicode.c
Index: clang/test/Lexer/unicode.c =================================================================== --- clang/test/Lexer/unicode.c +++ clang/test/Lexer/unicode.c @@ -45,7 +45,17 @@ extern int _\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a valid Unicode character name}} \ // expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}} +extern int ð; // expected-warning {{mathematical notation character <U+1D6DB> in identifier is a Clang extension}} +extern int â; // expected-error {{character <U+2089> not allowed at the start of an identifier}} \\ + expected-warning {{declaration does not declare anything}} +int a¹bâââââ; // expected-warning 6{{mathematical notation character}} + +int \u{221E} = 1; // expected-warning {{mathematical notation character}} +int \N{MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL} = 1; + // expected-warning@-1 {{mathematical notation character}} + +int a\N{SUBSCRIPT EQUALS SIGN} = 1; // expected-warning {{mathematical notation character}} // This character doesn't have the XID_Start property extern int \U00016AC0; // TANGSA DIGIT ZERO // cxx-error {{expected unqualified-id}} \ Index: clang/test/Driver/autocomplete.c =================================================================== --- clang/test/Driver/autocomplete.c +++ clang/test/Driver/autocomplete.c @@ -111,6 +111,7 @@ // WARNING-NEXT: -Wmain-return-type // WARNING-NEXT: -Wmalformed-warning-check // WARNING-NEXT: -Wmany-braces-around-scalar-init +// WARNING-NEXT: -Wmathematical-notation-identifier-extension // WARNING-NEXT: -Wmax-tokens // WARNING-NEXT: -Wmax-unsigned-zero // RUN: %clang --autocomplete=-Wno-invalid-pp- | FileCheck %s -check-prefix=NOWARNING Index: clang/lib/Lex/UnicodeCharSets.h =================================================================== --- clang/lib/Lex/UnicodeCharSets.h +++ clang/lib/Lex/UnicodeCharSets.h @@ -366,6 +366,34 @@ {0x1E4EC, 0x1E4F9}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, 
{0x1E950, 0x1E959}, {0x1FBF0, 0x1FBF9}, {0xE0100, 0xE01EF}}; +// Clang supports the "Mathematical notation profile" as an extension, +// as described in https://www.unicode.org/L2/L2022/22230-math-profile.pdf +// Math_Start +static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[] = { + {0x02202, 0x02202}, // â + {0x02207, 0x02207}, // â + {0x0221E, 0x0221E}, // â + {0x1D6C1, 0x1D6C1}, // ð + {0x1D6DB, 0x1D6DB}, // ð + {0x1D6FB, 0x1D6FB}, // ð» + {0x1D715, 0x1D715}, // ð + {0x1D735, 0x1D735}, // ðµ + {0x1D74F, 0x1D74F}, // ð + {0x1D76F, 0x1D76F}, // ð¯ + {0x1D789, 0x1D789}, // ð + {0x1D7A9, 0x1D7A9}, // ð© + {0x1D7C3, 0x1D7C3}, // ð +}; + +// Math_Continue +static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[] = { + {0x000B2, 0x000B3}, // ²-³ + {0x000B9, 0x000B9}, // ¹ + {0x02070, 0x02070}, // â° + {0x02074, 0x0207E}, // â´-â¾ + {0x02080, 0x0208E}, // â-â +}; + // C11 D.1, C++11 [charname.allowed] static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[] = { // 1 Index: clang/lib/Lex/Lexer.cpp =================================================================== --- clang/lib/Lex/Lexer.cpp +++ clang/lib/Lex/Lexer.cpp @@ -1457,7 +1457,30 @@ return UnicodeWhitespaceChars.contains(Codepoint); } -static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { +static llvm::SmallString<5> codepointAsHexString(uint32_t C) { + llvm::SmallString<5> CharBuf; + llvm::raw_svector_ostream CharOS(CharBuf); + llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); + return CharBuf; +} + +// To mitigate https://github.com/llvm/llvm-project/issues/54732, +// we allow "Mathematical Notation Characters" in identifiers. +// This is a proposed profile that extends the XID_Start/XID_continue +// with mathematical symbols, superscipts and subscripts digits +// found in some production software. 
+// https://www.unicode.org/L2/L2022/22230-math-profile.pdf +static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool & IsExtension) { + static const llvm::sys::UnicodeCharSet MathStartChars(MathematicalNotationProfileIDStartRanges); + static const llvm::sys::UnicodeCharSet MathContinueChars(MathematicalNotationProfileIDContinueRanges); + if(MathStartChars.contains(C) || (!IsStart && MathContinueChars.contains(C))) { + IsExtension = true; + return true; + } + return false; +} + +static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool & IsExtension) { if (LangOpts.AsmPreprocessor) { return false; } else if (LangOpts.DollarIdents && '$' == C) { @@ -1469,8 +1492,10 @@ // '_' doesn't have the XID_Continue property but is allowed in C and C++. static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges); - return C == '_' || XIDStartChars.contains(C) || - XIDContinueChars.contains(C); + if(C == '_' || XIDStartChars.contains(C) || + XIDContinueChars.contains(C)) + return true; + return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false, IsExtension); } else if (LangOpts.C11) { static const llvm::sys::UnicodeCharSet C11AllowedIDChars( C11AllowedIDCharRanges); @@ -1482,16 +1507,19 @@ } } -static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) { +static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool & IsExtension) { assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint"); + IsExtension = false; if (LangOpts.AsmPreprocessor) { return false; } if (LangOpts.CPlusPlus || LangOpts.C2x) { static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); - return XIDStartChars.contains(C); + if(XIDStartChars.contains(C)) + return true; + return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true, IsExtension); } - if (!isAllowedIDChar(C, LangOpts)) + if 
(!isAllowedIDChar(C, LangOpts, IsExtension)) return false; if (LangOpts.C11) { static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( @@ -1503,6 +1531,18 @@ return !C99DisallowedInitialIDChars.contains(C); } +static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range) { + + static const llvm::sys::UnicodeCharSet MathStartChars(MathematicalNotationProfileIDStartRanges); + static const llvm::sys::UnicodeCharSet MathContinueChars(MathematicalNotationProfileIDContinueRanges); + + assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) + && "Unexpected mathematical notation codepoint"); + Diags.Report(Range.getBegin(), diag::ext_mathematical_notation) + << codepointAsHexString(C) << Range; +} + + static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End) { return CharSourceRange::getCharRange(L.getSourceLocation(Begin), @@ -1602,18 +1642,13 @@ std::lower_bound(std::begin(SortedHomoglyphs), std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); if (Homoglyph->Character == C) { - llvm::SmallString<5> CharBuf; - { - llvm::raw_svector_ostream CharOS(CharBuf); - llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); - } if (Homoglyph->LooksLike) { const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) - << Range << CharBuf << LooksLikeStr; + << Range << codepointAsHexString(C) << LooksLikeStr; } else { Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width) - << Range << CharBuf; + << Range << codepointAsHexString(C); } } } @@ -1624,25 +1659,22 @@ if (isASCII(CodePoint)) return; - bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts); - bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts); + bool IsExtension; + bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension); + bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension); 
if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue)) return; bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue; - llvm::SmallString<5> CharBuf; - llvm::raw_svector_ostream CharOS(CharBuf); - llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4); - if (!IsFirst || InvalidOnlyAtStart) { Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier) - << Range << CharBuf << int(InvalidOnlyAtStart) + << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart) << FixItHint::CreateRemoval(Range); } else { Diags.Report(Range.getBegin(), diag::err_character_not_allowed) - << Range << CharBuf << FixItHint::CreateRemoval(Range); + << Range << codepointAsHexString(CodePoint) << FixItHint::CreateRemoval(Range); } } @@ -1653,8 +1685,8 @@ if (CodePoint == 0) { return false; } - - if (!isAllowedIDChar(CodePoint, LangOpts)) { + bool IsExtension = false; + if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) { if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) return false; if (!isLexingRawMode() && !ParsingPreprocessorDirective && @@ -1667,10 +1699,15 @@ // We got a unicode codepoint that is neither a space nor a // a valid identifier part. // Carry on as if the codepoint was valid for recovery purposes. 
- } else if (!isLexingRawMode()) + } else if (!isLexingRawMode()) { + if(IsExtension) + diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CurPtr, UCNPtr)); + maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, makeCharRange(*this, CurPtr, UCNPtr), /*IsFirst=*/false); + } Result.setFlag(Token::HasUCN); if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || @@ -1693,7 +1730,8 @@ if (Result != llvm::conversionOK) return false; - if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) { + bool IsExtension = false;; + if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts, IsExtension)) { if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) return false; @@ -1706,6 +1744,9 @@ // a valid identifier part. Carry on as if the codepoint was // valid for recovery purposes. } else if (!isLexingRawMode()) { + if(IsExtension) + diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CurPtr, UnicodePtr)); maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false); @@ -1719,9 +1760,13 @@ bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr) { - if (isAllowedInitiallyIDChar(C, LangOpts)) { + bool IsExtension = false; + if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) { if (!isLexingRawMode() && !ParsingPreprocessorDirective && !PP->isPreprocessedOutput()) { + if(IsExtension) + diagnoseExtensionInIdentifier(PP->getDiagnostics(), C, + makeCharRange(*this, BufferPtr, CurPtr)); maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, makeCharRange(*this, BufferPtr, CurPtr), /*IsFirst=*/true); @@ -1735,7 +1780,7 @@ if (!isLexingRawMode() && !ParsingPreprocessorDirective && !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && - !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) { + !isUnicodeWhitespace(C)) { // Non-ASCII characters tend to creep into source code unintentionally. 
// Instead of letting the parser complain about the unknown token, // just drop the character. Index: clang/include/clang/Basic/DiagnosticLexKinds.td =================================================================== --- clang/include/clang/Basic/DiagnosticLexKinds.td +++ clang/include/clang/Basic/DiagnosticLexKinds.td @@ -132,6 +132,9 @@ def warn_utf8_symbol_zero_width : Warning< "identifier contains Unicode character <U+%0> that is invisible in " "some environments">, InGroup<DiagGroup<"unicode-zero-width">>; +def ext_mathematical_notation : ExtWarn< + "mathematical notation character <U+%0> in identifier is a Clang extension">, + InGroup<DiagGroup<"mathematical-notation-identifier-extension">>; def ext_delimited_escape_sequence : Extension< "%select{delimited|named}0 escape sequences are a " Index: clang/docs/ReleaseNotes.rst =================================================================== --- clang/docs/ReleaseNotes.rst +++ clang/docs/ReleaseNotes.rst @@ -344,6 +344,11 @@ - Unicode support has been updated to support Unicode 15.0. New unicode codepoints are supported as appropriate in diagnostics, C and C++ identifiers, and escape sequences. +- In identifiers, Clang allows a restricted set of additional mathematical symbols + as an extension. These symbols correspond to a proposed Unicode + `Mathematical notation profile for default identifiers + <https://www.unicode.org/L2/L2022/22230-math-profile.pdf>`_. + This resolves `Issue 54732 <https://github.com/llvm/llvm-project/issues/54732>`_. - Clang now supports loading multiple configuration files. The files from default configuration paths are loaded first, unless ``--no-default-config`` option is used. All files explicitly specified using ``--config=`` option
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits