[PATCH] D137051: [Clang] Allow additional mathematical symbols in identifiers.

Corentin Jabot via Phabricator via cfe-commits Sun, 30 Oct 2022 15:25:31 -0700

cor3ntin created this revision.
Herald added a project: All.
cor3ntin requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.


Implement the proposed UAX Profile
"Mathematical notation profile for default identifiers".

This implements a not-yet approved Unicode for a vetted
UAX32 identifier profile
https://www.unicode.org/L2/L2022/22230-math-profile.pdf

This change mitigates the reported disruption caused
by the implementation of UAX32 in C++ and C2x,
as these mathematical symbols are commonly used in the
scientific community.

Fixes #54732


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D137051

Files:
  clang/docs/ReleaseNotes.rst
  clang/include/clang/Basic/DiagnosticLexKinds.td
  clang/lib/Lex/Lexer.cpp
  clang/lib/Lex/UnicodeCharSets.h
  clang/test/Driver/autocomplete.c
  clang/test/Lexer/unicode.c

Index: clang/test/Lexer/unicode.c
===================================================================
--- clang/test/Lexer/unicode.c
+++ clang/test/Lexer/unicode.c
@@ -45,7 +45,17 @@
 extern int _\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a valid Unicode character name}} \
                                 // expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}}
 
+extern int ð; // expected-warning {{mathematical notation character <U+1D6DB> in identifier is a Clang extension}}
+extern int â; // expected-error {{character <U+2089> not allowed at the start of an identifier}} \\
+                 expected-warning {{declaration does not declare anything}}
 
+int aÂ¹bâââââ; // expected-warning 6{{mathematical notation character}}
+
+int \u{221E} = 1; // expected-warning {{mathematical notation character}}
+int \N{MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL} = 1;
+                 // expected-warning@-1 {{mathematical notation character}}
+
+int a\N{SUBSCRIPT EQUALS SIGN} = 1; // expected-warning {{mathematical notation character}}
 
 // This character doesn't have the XID_Start property
 extern int  \U00016AC0; // TANGSA DIGIT ZERO  // cxx-error {{expected unqualified-id}} \
Index: clang/test/Driver/autocomplete.c
===================================================================
--- clang/test/Driver/autocomplete.c
+++ clang/test/Driver/autocomplete.c
@@ -111,6 +111,7 @@
 // WARNING-NEXT: -Wmain-return-type
 // WARNING-NEXT: -Wmalformed-warning-check
 // WARNING-NEXT: -Wmany-braces-around-scalar-init
+// WARNING-NEXT: -Wmathematical-notation-identifier-extension
 // WARNING-NEXT: -Wmax-tokens
 // WARNING-NEXT: -Wmax-unsigned-zero
 // RUN: %clang --autocomplete=-Wno-invalid-pp- | FileCheck %s -check-prefix=NOWARNING
Index: clang/lib/Lex/UnicodeCharSets.h
===================================================================
--- clang/lib/Lex/UnicodeCharSets.h
+++ clang/lib/Lex/UnicodeCharSets.h
@@ -366,6 +366,34 @@
     {0x1E4EC, 0x1E4F9}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A},
     {0x1E950, 0x1E959}, {0x1FBF0, 0x1FBF9}, {0xE0100, 0xE01EF}};
 
+// Clang supports the "Mathematical notation profile" as an extension,
+// as described in https://www.unicode.org/L2/L2022/22230-math-profile.pdf
+// Math_Start
+static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[] = {
+    {0x02202, 0x02202}, // â
+    {0x02207, 0x02207}, // â
+    {0x0221E, 0x0221E}, // â
+    {0x1D6C1, 0x1D6C1}, // ð
+    {0x1D6DB, 0x1D6DB}, // ð
+    {0x1D6FB, 0x1D6FB}, // ð»
+    {0x1D715, 0x1D715}, // ð
+    {0x1D735, 0x1D735}, // ðµ
+    {0x1D74F, 0x1D74F}, // ð
+    {0x1D76F, 0x1D76F}, // ð¯
+    {0x1D789, 0x1D789}, // ð
+    {0x1D7A9, 0x1D7A9}, // ð©
+    {0x1D7C3, 0x1D7C3}, // ð
+};
+
+// Math_Continue
+static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[] = {
+    {0x000B2, 0x000B3}, // Â²-Â³
+    {0x000B9, 0x000B9}, // Â¹
+    {0x02070, 0x02070}, // â°
+    {0x02074, 0x0207E}, // â´-â¾
+    {0x02080, 0x0208E}, // â-â
+};
+
 // C11 D.1, C++11 [charname.allowed]
 static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[] = {
   // 1
Index: clang/lib/Lex/Lexer.cpp
===================================================================
--- clang/lib/Lex/Lexer.cpp
+++ clang/lib/Lex/Lexer.cpp
@@ -1457,7 +1457,30 @@
   return UnicodeWhitespaceChars.contains(Codepoint);
 }
 
-static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
+static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
+  llvm::SmallString<5> CharBuf;
+  llvm::raw_svector_ostream CharOS(CharBuf);
+  llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
+  return CharBuf;
+}
+
+// To mitigate https://github.com/llvm/llvm-project/issues/54732,
+// we allow "Mathematical Notation Characters" in identifiers.
+// This is a proposed profile that extends the XID_Start/XID_continue
+// with mathematical symbols, superscipts and subscripts digits
+// found in some production software.
+// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
+static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool & IsExtension) {
+  static const llvm::sys::UnicodeCharSet MathStartChars(MathematicalNotationProfileIDStartRanges);
+  static const llvm::sys::UnicodeCharSet MathContinueChars(MathematicalNotationProfileIDContinueRanges);
+  if(MathStartChars.contains(C) || (!IsStart && MathContinueChars.contains(C))) {
+    IsExtension = true;
+    return true;
+  }
+  return false;
+}
+
+static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool & IsExtension) {
   if (LangOpts.AsmPreprocessor) {
     return false;
   } else if (LangOpts.DollarIdents && '$' == C) {
@@ -1469,8 +1492,10 @@
     // '_' doesn't have the XID_Continue property but is allowed in C and C++.
     static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
     static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
-    return C == '_' || XIDStartChars.contains(C) ||
-           XIDContinueChars.contains(C);
+    if(C == '_' || XIDStartChars.contains(C) ||
+        XIDContinueChars.contains(C))
+      return true;
+    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false, IsExtension);
   } else if (LangOpts.C11) {
     static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
         C11AllowedIDCharRanges);
@@ -1482,16 +1507,19 @@
   }
 }
 
-static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
+static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool & IsExtension) {
   assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
+  IsExtension = false;
   if (LangOpts.AsmPreprocessor) {
     return false;
   }
   if (LangOpts.CPlusPlus || LangOpts.C2x) {
     static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
-    return XIDStartChars.contains(C);
+    if(XIDStartChars.contains(C))
+      return true;
+    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true, IsExtension);
   }
-  if (!isAllowedIDChar(C, LangOpts))
+  if (!isAllowedIDChar(C, LangOpts, IsExtension))
     return false;
   if (LangOpts.C11) {
     static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
@@ -1503,6 +1531,18 @@
   return !C99DisallowedInitialIDChars.contains(C);
 }
 
+static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range) {
+
+  static const llvm::sys::UnicodeCharSet MathStartChars(MathematicalNotationProfileIDStartRanges);
+  static const llvm::sys::UnicodeCharSet MathContinueChars(MathematicalNotationProfileIDContinueRanges);
+
+  assert((MathStartChars.contains(C) || MathContinueChars.contains(C))
+         && "Unexpected mathematical notation codepoint");
+  Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
+      << codepointAsHexString(C) << Range;
+}
+
+
 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
                                             const char *End) {
   return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
@@ -1602,18 +1642,13 @@
       std::lower_bound(std::begin(SortedHomoglyphs),
                        std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
   if (Homoglyph->Character == C) {
-    llvm::SmallString<5> CharBuf;
-    {
-      llvm::raw_svector_ostream CharOS(CharBuf);
-      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
-    }
     if (Homoglyph->LooksLike) {
       const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
       Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
-          << Range << CharBuf << LooksLikeStr;
+          << Range << codepointAsHexString(C) << LooksLikeStr;
     } else {
       Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
-          << Range << CharBuf;
+          << Range << codepointAsHexString(C);
     }
   }
 }
@@ -1624,25 +1659,22 @@
   if (isASCII(CodePoint))
     return;
 
-  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts);
-  bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts);
+  bool IsExtension;
+  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
+  bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
 
   if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
     return;
 
   bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
 
-  llvm::SmallString<5> CharBuf;
-  llvm::raw_svector_ostream CharOS(CharBuf);
-  llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4);
-
   if (!IsFirst || InvalidOnlyAtStart) {
     Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
-        << Range << CharBuf << int(InvalidOnlyAtStart)
+        << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
         << FixItHint::CreateRemoval(Range);
   } else {
     Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
-        << Range << CharBuf << FixItHint::CreateRemoval(Range);
+        << Range << codepointAsHexString(CodePoint) << FixItHint::CreateRemoval(Range);
   }
 }
 
@@ -1653,8 +1685,8 @@
   if (CodePoint == 0) {
     return false;
   }
-
-  if (!isAllowedIDChar(CodePoint, LangOpts)) {
+  bool IsExtension = false;
+  if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
     if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
       return false;
     if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
@@ -1667,10 +1699,15 @@
     // We got a unicode codepoint that is neither a space nor a
     // a valid identifier part.
     // Carry on as if the codepoint was valid for recovery purposes.
-  } else if (!isLexingRawMode())
+  } else if (!isLexingRawMode()) {
+    if(IsExtension)
+      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
+                                            makeCharRange(*this, CurPtr, UCNPtr));
+
     maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UCNPtr),
                               /*IsFirst=*/false);
+  }
 
   Result.setFlag(Token::HasUCN);
   if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
@@ -1693,7 +1730,8 @@
   if (Result != llvm::conversionOK)
     return false;
 
-  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) {
+  bool IsExtension = false;;
+  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts, IsExtension)) {
     if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
       return false;
 
@@ -1706,6 +1744,9 @@
     // a valid identifier part. Carry on as if the codepoint was
     // valid for recovery purposes.
   } else if (!isLexingRawMode()) {
+    if(IsExtension)
+      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
+                                            makeCharRange(*this, CurPtr, UnicodePtr));
     maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UnicodePtr),
                               /*IsFirst=*/false);
@@ -1719,9 +1760,13 @@
 
 bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
                                       const char *CurPtr) {
-  if (isAllowedInitiallyIDChar(C, LangOpts)) {
+  bool IsExtension = false;
+  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
     if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
         !PP->isPreprocessedOutput()) {
+      if(IsExtension)
+        diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
+                                              makeCharRange(*this, BufferPtr, CurPtr));
       maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
                                 makeCharRange(*this, BufferPtr, CurPtr),
                                 /*IsFirst=*/true);
@@ -1735,7 +1780,7 @@
 
   if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
       !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
-      !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) {
+      !isUnicodeWhitespace(C)) {
     // Non-ASCII characters tend to creep into source code unintentionally.
     // Instead of letting the parser complain about the unknown token,
     // just drop the character.
Index: clang/include/clang/Basic/DiagnosticLexKinds.td
===================================================================
--- clang/include/clang/Basic/DiagnosticLexKinds.td
+++ clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -132,6 +132,9 @@
 def warn_utf8_symbol_zero_width : Warning<
   "identifier contains Unicode character <U+%0> that is invisible in "
   "some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
+def ext_mathematical_notation : ExtWarn<
+  "mathematical notation character <U+%0> in identifier is a Clang extension">,
+  InGroup<DiagGroup<"mathematical-notation-identifier-extension">>;
 
 def ext_delimited_escape_sequence : Extension<
   "%select{delimited|named}0 escape sequences are a "
Index: clang/docs/ReleaseNotes.rst
===================================================================
--- clang/docs/ReleaseNotes.rst
+++ clang/docs/ReleaseNotes.rst
@@ -344,6 +344,11 @@
 - Unicode support has been updated to support Unicode 15.0.
   New unicode codepoints are supported as appropriate in diagnostics,
   C and C++ identifiers, and escape sequences.
+- In identifiers, Clang allows a restricted set of additional mathematical symbols
+  as an extension. These symbols correspond to a proposed Unicode
+  `Mathematical notation profile for default identifiers
+  <https://www.unicode.org/L2/L2022/22230-math-profile.pdf>`_.
+  This resolves `Issue 54732 <https://github.com/llvm/llvm-project/issues/54732>`_.
 - Clang now supports loading multiple configuration files. The files from
   default configuration paths are loaded first, unless ``--no-default-config``
   option is used. All files explicitly specified using ``--config=`` option

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D137051: [Clang] Allow additional mathematical symbols in identifiers.

Reply via email to