cor3ntin updated this revision to Diff 438926. cor3ntin marked 3 inline comments as done. cor3ntin added a comment.
- Address style comments - Improve commit message - Enable the warning in -pedantic Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D128059/new/ https://reviews.llvm.org/D128059 Files: clang/docs/ReleaseNotes.rst clang/include/clang/Basic/DiagnosticLexKinds.td clang/lib/Lex/Lexer.cpp clang/test/Lexer/comment-invalid-utf8.c llvm/include/llvm/Support/ConvertUTF.h llvm/lib/Support/ConvertUTF.cpp
Index: llvm/lib/Support/ConvertUTF.cpp =================================================================== --- llvm/lib/Support/ConvertUTF.cpp +++ llvm/lib/Support/ConvertUTF.cpp @@ -417,6 +417,16 @@ return isLegalUTF8(source, length); } +/* + * Exported function to return the size of the first UTF-8 code unit sequence, + * or 0 if the sequence is not valid or does not fit before sourceEnd. + */ +unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) { + int length = trailingBytesForUTF8[*source] + 1; + return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length + : 0; +} + /* --------------------------------------------------------------------- */ static unsigned Index: llvm/include/llvm/Support/ConvertUTF.h =================================================================== --- llvm/include/llvm/Support/ConvertUTF.h +++ llvm/include/llvm/Support/ConvertUTF.h @@ -181,6 +181,8 @@ Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd); +unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd); + unsigned getNumBytesForUTF8(UTF8 firstByte); /*************************************************************************/ Index: clang/lib/Lex/Lexer.cpp =================================================================== --- clang/lib/Lex/Lexer.cpp +++ clang/lib/Lex/Lexer.cpp @@ -2391,13 +2391,39 @@ // // This loop terminates with CurPtr pointing at the newline (or end of buffer) // character that ends the line comment. + + bool WarnOnInvalidUtf8 = + !isLexingRawMode() && + !PP->getDiagnostics().isIgnored(diag::warn_invalid_utf8_in_comment, + getSourceLocation(CurPtr)); + bool UnicodeDecodeFailed = false; + char C; while (true) { C = *CurPtr; // Skip over characters in the fast loop. - while (C != 0 && // Potentially EOF. - C != '\n' && C != '\r') // Newline or DOS-style newline. + // Warn on invalid UTF-8 if the corresponding warning is enabled, emitting a + // diagnostic only once per sequence that cannot be decoded.
+ while ((!WarnOnInvalidUtf8 || isASCII(C)) && C != 0 && // Potentially EOF. + C != '\n' && C != '\r') { // Newline or DOS-style newline. C = *++CurPtr; + UnicodeDecodeFailed = false; + } + + if (WarnOnInvalidUtf8 && !isASCII(C)) { + unsigned Length = llvm::getUTF8SequenceSize( + (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd); + if (Length == 0) { + if (!UnicodeDecodeFailed) + Diag(CurPtr, diag::warn_invalid_utf8_in_comment); + UnicodeDecodeFailed = true; + ++CurPtr; + } else { + UnicodeDecodeFailed = false; + CurPtr += Length; + } + continue; + } const char *NextLine = CurPtr; if (C != 0) { @@ -2664,10 +2690,18 @@ if (C == '/') C = *CurPtr++; + bool WarnOnInvalidUtf8 = + !isLexingRawMode() && + !PP->getDiagnostics().isIgnored(diag::warn_invalid_utf8_in_comment, + getSourceLocation(CurPtr)); + bool UnicodeDecodeFailed = false; + while (true) { // Skip over all non-interesting characters until we find end of buffer or a // (probably ending) '/' character. - if (CurPtr + 24 < BufferEnd && + // When diagnosing invalid UTF-8 sequences we always skip the fast + // vectorized path. + if (!WarnOnInvalidUtf8 && CurPtr + 24 < BufferEnd && // If there is a code-completion point avoid the fast scan because it // doesn't check for '\0'. !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { @@ -2714,9 +2748,27 @@ C = *CurPtr++; } - // Loop to scan the remainder. - while (C != '/' && C != '\0') - C = *CurPtr++; + // Loop to scan the remainder, warning on invalid UTF-8 + // if the corresponding warning is enabled, emitting a diagnostic only once + // per sequence that cannot be decoded. 
+ while (C != '/' && C != '\0') { + if (!WarnOnInvalidUtf8 || isASCII(C)) { + UnicodeDecodeFailed = false; + C = *CurPtr++; + continue; + } + unsigned Length = llvm::getUTF8SequenceSize( + (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd); + if (Length == 0) { + if (!UnicodeDecodeFailed) + Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment); + UnicodeDecodeFailed = true; + C = *CurPtr++; + continue; + } + UnicodeDecodeFailed = false; + C = *((CurPtr += Length) - 1); + } if (C == '/') { FoundSlash: Index: clang/include/clang/Basic/DiagnosticLexKinds.td =================================================================== --- clang/include/clang/Basic/DiagnosticLexKinds.td +++ clang/include/clang/Basic/DiagnosticLexKinds.td @@ -113,6 +113,8 @@ // Unicode and UCNs def err_invalid_utf8 : Error< "source file is not valid UTF-8">; +def warn_invalid_utf8_in_comment : ExtWarn< + "invalid UTF-8 in comment">, InGroup<DiagGroup<"invalid-utf8">>; def err_character_not_allowed : Error< "unexpected character <U+%0>">; def err_character_not_allowed_identifier : Error< Index: clang/docs/ReleaseNotes.rst =================================================================== --- clang/docs/ReleaseNotes.rst +++ clang/docs/ReleaseNotes.rst @@ -267,6 +267,8 @@ - When using class templates without arguments, clang now tells developers that template arguments are missing in certain contexts. This fixes `Issue 55962 <https://github.com/llvm/llvm-project/issues/55962>`_. +- Added ``-Winvalid-utf8`` which diagnoses invalid UTF-8 code unit sequences in + comments. Non-comprehensive list of changes in this release -------------------------------------------------
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits