llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Abhina Sree (abhina-sree) <details> <summary>Changes</summary> This patch enables the fexec-charset option to control the execution charset of string literals. It sets the default internal charset, system charset, and execution charset for z/OS and UTF-8 for all other platforms. This patch depends on adding the CharSetConverter class https://github.com/llvm/llvm-project/pull/138893 --- Patch is 34.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/138895.diff 20 Files Affected: - (modified) clang/docs/LanguageExtensions.rst (+1-2) - (modified) clang/include/clang/Basic/LangOptions.h (+3) - (modified) clang/include/clang/Basic/TokenKinds.h (+7) - (modified) clang/include/clang/Driver/Options.td (+5) - (added) clang/include/clang/Lex/LiteralConverter.h (+36) - (modified) clang/include/clang/Lex/LiteralSupport.h (+8-4) - (modified) clang/include/clang/Lex/Preprocessor.h (+3) - (modified) clang/lib/Driver/ToolChains/Clang.cpp (+13-4) - (modified) clang/lib/Frontend/CompilerInstance.cpp (+4) - (modified) clang/lib/Frontend/InitPreprocessor.cpp (+8-4) - (modified) clang/lib/Lex/CMakeLists.txt (+1) - (added) clang/lib/Lex/LiteralConverter.cpp (+68) - (modified) clang/lib/Lex/LiteralSupport.cpp (+106-22) - (added) clang/test/CodeGen/systemz-charset.c (+35) - (added) clang/test/CodeGen/systemz-charset.cpp (+46) - (modified) clang/test/Driver/cl-options.c (+4-3) - (modified) clang/test/Driver/clang_f_opts.c (+9-3) - (modified) clang/test/Preprocessor/init-s390x.c (+1) - (modified) llvm/include/llvm/TargetParser/Triple.h (+3) - (modified) llvm/lib/TargetParser/Triple.cpp (+7) ``````````diff diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index ebcad44197ce4..44e20623d4d0b 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -416,8 +416,7 @@ Builtin Macros ``__clang_literal_encoding__`` Defined to a narrow string literal that represents the current encoding of narrow string literals, e.g., ``"hello"``. This macro typically expands to - "UTF-8" (but may change in the future if the - ``-fexec-charset="Encoding-Name"`` option is implemented.) + the charset specified by -fexec-charset if specified, or the system charset. ``__clang_wide_literal_encoding__`` Defined to a narrow string literal that represents the current encoding of diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 491e8bee9fd5c..559a4be70b74c 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -633,6 +633,9 @@ class LangOptions : public LangOptionsBase { bool AtomicFineGrainedMemory = false; bool AtomicIgnoreDenormalMode = false; + /// Name of the exec charset to convert the internal charset to. + std::string ExecCharset; + LangOptions(); /// Set language defaults for the given input language and diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h index 1b133dde89587..34f6133973e71 100644 --- a/clang/include/clang/Basic/TokenKinds.h +++ b/clang/include/clang/Basic/TokenKinds.h @@ -101,6 +101,13 @@ inline bool isLiteral(TokenKind K) { isStringLiteral(K) || K == tok::header_name || K == tok::binary_data; } +/// Return true if this is a utf literal kind. +inline bool isUTFLiteral(TokenKind K) { + return K == tok::utf8_char_constant || K == tok::utf8_string_literal || + K == tok::utf16_char_constant || K == tok::utf16_string_literal || + K == tok::utf32_char_constant || K == tok::utf32_string_literal; +} + /// Return true if this is any of tok::annot_* kinds. bool isAnnotation(TokenKind K); diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 30ea75bb108d5..9d352eb1270fe 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -7197,6 +7197,11 @@ let Visibility = [CC1Option, CC1AsOption, FC1Option] in { def tune_cpu : Separate<["-"], "tune-cpu">, HelpText<"Tune for a specific cpu type">, MarshallingInfoString<TargetOpts<"TuneCPU">>; +def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<charset>">, + HelpText<"Set the execution <charset> for string and character literals. " + "Supported character encodings include ISO8859-1, UTF-8, IBM-1047 " + "and those supported by the host icu or iconv library.">, + MarshallingInfoString<LangOpts<"ExecCharset">>; def target_cpu : Separate<["-"], "target-cpu">, HelpText<"Target a specific cpu type">, MarshallingInfoString<TargetOpts<"CPU">>; diff --git a/clang/include/clang/Lex/LiteralConverter.h b/clang/include/clang/Lex/LiteralConverter.h new file mode 100644 index 0000000000000..203111255b791 --- /dev/null +++ b/clang/include/clang/Lex/LiteralConverter.h @@ -0,0 +1,36 @@ +//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H +#define LLVM_CLANG_LEX_LITERALCONVERTER_H + +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/TargetInfo.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CharSet.h" + +enum ConversionAction { NoConversion, ToSystemCharset, ToExecCharset }; + +class LiteralConverter { + llvm::StringRef InternalCharset; + llvm::StringRef SystemCharset; + llvm::StringRef ExecCharset; + llvm::StringMap<llvm::CharSetConverter> CharsetConverters; + +public: + llvm::CharSetConverter *getConverter(const char *Codepage); + llvm::CharSetConverter *getConverter(ConversionAction Action); + llvm::CharSetConverter *createAndInsertCharConverter(const char *To); + void setConvertersFromOptions(const clang::LangOptions &Opts, + const clang::TargetInfo &TInfo, + clang::DiagnosticsEngine &Diags); +}; + +#endif diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h index ea5f63bc20399..114d50c226a29 100644 --- a/clang/include/clang/Lex/LiteralSupport.h +++ b/clang/include/clang/Lex/LiteralSupport.h @@ -17,10 +17,12 @@ #include "clang/Basic/CharInfo.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/TokenKinds.h" +#include "clang/Lex/LiteralConverter.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/CharSet.h" #include "llvm/Support/DataTypes.h" namespace clang { @@ -233,6 +235,7 @@ class StringLiteralParser { const LangOptions &Features; const TargetInfo &Target; DiagnosticsEngine *Diags; + LiteralConverter *LiteralConv; unsigned MaxTokenLength; unsigned SizeBound; @@ -248,16 +251,17 @@ class StringLiteralParser { public: StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP, StringLiteralEvalMethod StringMethod = - StringLiteralEvalMethod::Evaluated); + StringLiteralEvalMethod::Evaluated, + ConversionAction Action = ToExecCharset); StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm, const LangOptions &features, const TargetInfo &target, DiagnosticsEngine *diags = nullptr) : SM(sm), Features(features), Target(target), Diags(diags), - MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), + LiteralConv(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()), EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false), Pascal(false) { - init(StringToks); + init(StringToks, NoConversion); } bool hadError; @@ -305,7 +309,7 @@ class StringLiteralParser { static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix); private: - void init(ArrayRef<Token> StringToks); + void init(ArrayRef<Token> StringToks, ConversionAction Action); bool CopyStringFragment(const Token &Tok, const char *TokBegin, StringRef Fragment); void DiagnoseLexingError(SourceLocation Loc); diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index f2dfd3a349b8b..350cd2d436eb3 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -25,6 +25,7 @@ #include "clang/Basic/TokenKinds.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/Lexer.h" +#include "clang/Lex/LiteralConverter.h" #include "clang/Lex/MacroInfo.h" #include "clang/Lex/ModuleLoader.h" #include "clang/Lex/ModuleMap.h" @@ -156,6 +157,7 @@ class Preprocessor { std::unique_ptr<ScratchBuffer> ScratchBuf; HeaderSearch &HeaderInfo; ModuleLoader &TheModuleLoader; + LiteralConverter LiteralConv; /// External source of macros. ExternalPreprocessorSource *ExternalSource; @@ -1218,6 +1220,7 @@ class Preprocessor { SelectorTable &getSelectorTable() { return Selectors; } Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; } llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; } + LiteralConverter &getLiteralConverter() { return LiteralConv; } void setExternalSource(ExternalPreprocessorSource *Source) { ExternalSource = Source; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index f87549baff5e1..7b364634f8ff7 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -50,6 +50,7 @@ #include "llvm/Frontend/Debug/Options.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Option/ArgList.h" +#include "llvm/Support/CharSet.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Compression.h" @@ -7597,12 +7598,20 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, << value; } - // -fexec_charset=UTF-8 is default. Reject others + // Set the default fexec-charset as the system charset. + CmdArgs.push_back("-fexec-charset"); + CmdArgs.push_back(Args.MakeArgString(Triple.getSystemCharset())); if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) { StringRef value = execCharset->getValue(); - if (!value.equals_insensitive("utf-8")) - D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args) - << value; + llvm::ErrorOr<llvm::CharSetConverter> ErrorOrConverter = + llvm::CharSetConverter::create("UTF-8", value.data()); + if (ErrorOrConverter) { + CmdArgs.push_back("-fexec-charset"); + CmdArgs.push_back(Args.MakeArgString(value)); + } else { + D.Diag(diag::err_drv_invalid_value) + << execCharset->getAsString(Args) << value; + } } RenderDiagnosticsOptions(D, Args, CmdArgs); diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index b59496babb62c..879b81bbadabe 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -32,6 +32,7 @@ #include "clang/Frontend/Utils.h" #include "clang/Frontend/VerifyDiagnosticConsumer.h" #include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/LiteralConverter.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" #include "clang/Sema/CodeCompleteConsumer.h" @@ -537,6 +538,9 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { if (GetDependencyDirectives) PP->setDependencyDirectivesGetter(*GetDependencyDirectives); + + PP->getLiteralConverter().setConvertersFromOptions(getLangOpts(), getTarget(), + getDiagnostics()); } std::string CompilerInstance::getSpecificModuleCachePath(StringRef ModuleHash) { diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 96d6fb64a6319..47b71d0fb98c5 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1058,10 +1058,14 @@ static void InitializePredefinedMacros(const TargetInfo &TI, } } - // Macros to help identify the narrow and wide character sets - // FIXME: clang currently ignores -fexec-charset=. If this changes, - // then this may need to be updated. - Builder.defineMacro("__clang_literal_encoding__", "\"UTF-8\""); + // Macros to help identify the narrow and wide character sets. This is set + // to fexec-charset. If fexec-charset is not specified, the default is the + // system charset. + if (!LangOpts.ExecCharset.empty()) + Builder.defineMacro("__clang_literal_encoding__", LangOpts.ExecCharset); + else + Builder.defineMacro("__clang_literal_encoding__", + TI.getTriple().getSystemCharset()); if (TI.getTypeWidth(TI.getWCharType()) >= 32) { // FIXME: 32-bit wchar_t signals UTF-32. This may change // if -fwide-exec-charset= is ever supported. diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt index f61737cd68021..9e38a1b8fbb44 100644 --- a/clang/lib/Lex/CMakeLists.txt +++ b/clang/lib/Lex/CMakeLists.txt @@ -12,6 +12,7 @@ add_clang_library(clangLex InitHeaderSearch.cpp Lexer.cpp LexHLSLRootSignature.cpp + LiteralConverter.cpp LiteralSupport.cpp MacroArgs.cpp MacroInfo.cpp diff --git a/clang/lib/Lex/LiteralConverter.cpp b/clang/lib/Lex/LiteralConverter.cpp new file mode 100644 index 0000000000000..a89781e182e83 --- /dev/null +++ b/clang/lib/Lex/LiteralConverter.cpp @@ -0,0 +1,68 @@ +//===--- LiteralConverter.cpp - Translator for String Literals -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/LiteralConverter.h" +#include "clang/Basic/DiagnosticDriver.h" + +using namespace llvm; + +llvm::CharSetConverter *LiteralConverter::getConverter(const char *Codepage) { + auto Iter = CharsetConverters.find(Codepage); + if (Iter != CharsetConverters.end()) + return &Iter->second; + return nullptr; +} + +llvm::CharSetConverter * +LiteralConverter::getConverter(ConversionAction Action) { + StringRef CodePage; + if (Action == ToSystemCharset) + CodePage = SystemCharset; + else if (Action == ToExecCharset) + CodePage = ExecCharset; + else + CodePage = InternalCharset; + return getConverter(CodePage.data()); +} + +llvm::CharSetConverter * +LiteralConverter::createAndInsertCharConverter(const char *To) { + const char *From = InternalCharset.data(); + llvm::CharSetConverter *Converter = getConverter(To); + if (Converter) + return Converter; + + ErrorOr<CharSetConverter> ErrorOrConverter = + llvm::CharSetConverter::create(From, To); + if (!ErrorOrConverter) + return nullptr; + CharsetConverters.insert_or_assign(StringRef(To), + std::move(*ErrorOrConverter)); + return getConverter(To); +} + +void LiteralConverter::setConvertersFromOptions( + const clang::LangOptions &Opts, const clang::TargetInfo &TInfo, + clang::DiagnosticsEngine &Diags) { + using namespace llvm; + SystemCharset = TInfo.getTriple().getSystemCharset(); + InternalCharset = "UTF-8"; + ExecCharset = Opts.ExecCharset.empty() ? InternalCharset : Opts.ExecCharset; + // Create converter between internal and system charset + if (!InternalCharset.equals(SystemCharset)) + createAndInsertCharConverter(SystemCharset.data()); + + // Create converter between internal and exec charset specified + // in fexec-charset option. + if (InternalCharset.equals(ExecCharset)) + return; + if (!createAndInsertCharConverter(ExecCharset.data())) { + Diags.Report(clang::diag::err_drv_invalid_value) + << "-fexec-charset" << ExecCharset; + } +} diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 75ad977d64b24..927c4e1823a19 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -134,7 +134,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, FullSourceLoc Loc, unsigned CharWidth, DiagnosticsEngine *Diags, const LangOptions &Features, - StringLiteralEvalMethod EvalMethod) { + StringLiteralEvalMethod EvalMethod, + llvm::CharSetConverter *Converter) { const char *EscapeBegin = ThisTokBuf; bool Delimited = false; bool EndDelimiterFound = false; @@ -146,6 +147,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, // that would have been \", which would not have been the end of string. unsigned ResultChar = *ThisTokBuf++; char Escape = ResultChar; + bool Translate = true; + bool Invalid = false; switch (ResultChar) { // These map to themselves. case '\\': case '\'': case '"': case '?': break; @@ -186,6 +189,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, ResultChar = 11; break; case 'x': { // Hex escape. + Translate = false; ResultChar = 0; if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') { Delimited = true; @@ -249,6 +253,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, case '4': case '5': case '6': case '7': { // Octal escapes. --ThisTokBuf; + Translate = false; ResultChar = 0; // Octal escapes are a series of octal digits with maximum length 3. @@ -334,6 +339,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, << std::string(1, ResultChar); break; default: + Invalid = true; if (!Diags) break; @@ -367,6 +373,15 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, HadError = true; } + if (Translate && Converter) { + // Invalid escapes are written as '?' and then translated. + char ByteChar = Invalid ? '?' : ResultChar; + SmallString<8> ResultCharConv; + Converter->convert(StringRef(&ByteChar, 1), ResultCharConv); + assert(ResultCharConv.size() == 1 && + "Char size increased after translation"); + ResultChar = ResultCharConv[0]; + } return ResultChar; } @@ -1739,6 +1754,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, HadError = false; Kind = kind; + LiteralConverter *LiteralConv = &PP.getLiteralConverter(); const char *TokBegin = begin; @@ -1805,6 +1821,10 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, largest_character_for_kind = 0x7Fu; } + llvm::CharSetConverter *Converter = nullptr; + if (!isUTFLiteral(Kind) && LiteralConv) + Converter = LiteralConv->getConverter(ToExecCharset); + while (begin != end) { // Is this a span of non-escape characters? if (begin[0] != '\\') { @@ -1842,6 +1862,16 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, HadError = true; PP.Diag(Loc, diag::err_character_too_large); } + if (!HadError && Converter) { + assert(Kind != tok::wide_char_constant && + "Wide character translation not supported"); + char ByteChar = *tmp_out_start; + SmallString<1> ConvertedChar; + Converter->convert(StringRef(&ByteChar, 1), ConvertedChar); + assert(ConvertedChar.size() == 1 && + "Char size increased after translation"); + *tmp_out_start = ConvertedChar[0]; + } } } @@ -1849,16 +1879,35 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, } // Is this a Universal Character Name escape? if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') { - unsigned short UcnLen = 0; - if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, - FullSourceLoc(Loc, PP.getSourceManager()), - &PP.getDiagnostics(), PP.getLangOpts(), true)) { - HadError = true; - } else if (*buffer_begin > largest_character_for_kind) { - HadError = true; - PP.Diag(Loc, diag::err_character_too_large); + if (Converter == nullptr) { + unsigned short UcnLen = 0; + if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, + FullSourceLoc(Loc, PP.getSourceManager()), + &PP.getDiagnostics(), PP.getLangOpts(), true)) { + HadError = true; + } else if (*buffer_begin > largest_character_for_kind) { + HadError = true; + PP.Diag(Loc, diag::err_character_too_large); + } + } else { + char Cp[8]; + char *ResultPtr = Cp; + unsigned CharByteWidth = 1; + EncodeUCNEscape(TokBegin, begin, end, ResultPtr, HadError, + FullSourceLoc(Loc, PP.getSourceManager()), + CharByteWidth, &PP.getDiagnostics(), PP.getLangOpts()); + if (!HadError) { + SmallString<8> CpConv; + Converter->convert(StringRef(Cp), CpConv); + if (CpConv.size() > 1) { + HadError = true; + PP.Diag(Loc, diag::err_character_too_large); + } else { + memcpy(Cp, CpConv.data(), CpConv.size()); + *buffer_begin = *Cp; + } + } } - ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/138895 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits