https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/138895
>From 586094419b9b2e1aa493b5a47af1a510e55bbf54 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Fri, 8 May 2026 12:16:13 -0400 Subject: [PATCH 1/2] This patch enables the fexec-charset option to control the execution charset of string literals. It sets the default internal charset, system charset, and execution charset for z/OS and UTF-8 for all other platforms. --- .../clang/Basic/DiagnosticFrontendKinds.td | 2 +- .../include/clang/Basic/DiagnosticLexKinds.td | 2 + clang/include/clang/Basic/LangOptions.h | 3 + clang/include/clang/Lex/LiteralSupport.h | 19 +- clang/include/clang/Lex/Preprocessor.h | 3 + clang/include/clang/Lex/TextEncodingConfig.h | 34 ++++ clang/include/clang/Options/Options.td | 5 + clang/lib/Frontend/CompilerInstance.cpp | 6 + clang/lib/Frontend/FrontendAction.cpp | 4 +- clang/lib/Frontend/InitPreprocessor.cpp | 15 +- clang/lib/Lex/CMakeLists.txt | 1 + clang/lib/Lex/LiteralSupport.cpp | 170 ++++++++++++++---- clang/lib/Lex/PPDirectives.cpp | 6 +- clang/lib/Lex/TextEncodingConfig.cpp | 45 +++++ clang/test/CodeGen/systemz-charset.c | 58 ++++++ clang/test/CodeGen/systemz-charset.cpp | 46 +++++ clang/test/Preprocessor/init-s390x.c | 1 + llvm/include/llvm/TargetParser/Triple.h | 4 + llvm/lib/TargetParser/Triple.cpp | 7 + 19 files changed, 385 insertions(+), 46 deletions(-) create mode 100644 clang/include/clang/Lex/TextEncodingConfig.h create mode 100644 clang/lib/Lex/TextEncodingConfig.cpp create mode 100644 clang/test/CodeGen/systemz-charset.c create mode 100644 clang/test/CodeGen/systemz-charset.cpp diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td index f384a97b6825e..61f96759862c4 100644 --- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td +++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td @@ -359,7 +359,7 @@ def err_non_default_visibility_dllimport : Error< "non-default visibility cannot be applied to 'dllimport' declaration">; def err_ifunc_resolver_return : Error< "ifunc resolver function must return a pointer">; - +def err_fe_text_encoding_config : Error<"failed to set fexec-charset to '%0'">; def warn_atomic_op_misaligned : Warning< "misaligned atomic operation may incur " "significant performance penalty" diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 85fa290de6fd9..f1ebbb40ceb4d 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -287,6 +287,8 @@ def ext_string_too_long : Extension<"string literal of length %0 exceeds " "support">, InGroup<OverlengthStrings>; def err_character_too_large : Error< "character too large for enclosing character literal type">; +def err_exec_charset_conversion_failed + : Error<"conversion to execution encoding failed: '%0'">; def warn_c99_compat_unicode_literal : Warning< "unicode literals are incompatible with C99">, InGroup<C99Compat>, DefaultIgnore; diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 64b12b6fd72c7..1501bc0e38218 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -618,6 +618,9 @@ class LangOptions : public LangOptionsBase { /// The allocation token mode. std::optional<llvm::AllocTokenMode> AllocTokenMode; + /// Name of the execution encoding to convert the internal encoding to. + std::string ExecEncoding; + LangOptions(); /// Set language defaults for the given input language and diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h index ea5f63bc20399..6b404403ed95f 100644 --- a/clang/include/clang/Lex/LiteralSupport.h +++ b/clang/include/clang/Lex/LiteralSupport.h @@ -17,11 +17,13 @@ #include "clang/Basic/CharInfo.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/TokenKinds.h" +#include "clang/Lex/TextEncodingConfig.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/DataTypes.h" +#include "llvm/Support/TextEncoding.h" namespace clang { @@ -233,6 +235,7 @@ class StringLiteralParser { const LangOptions &Features; const TargetInfo &Target; DiagnosticsEngine *Diags; + TextEncodingConfig *TEC; unsigned MaxTokenLength; unsigned SizeBound; @@ -246,18 +249,19 @@ class StringLiteralParser { StringLiteralEvalMethod EvalMethod; public: - StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP, - StringLiteralEvalMethod StringMethod = - StringLiteralEvalMethod::Evaluated); + StringLiteralParser( + ArrayRef<Token> StringToks, Preprocessor &PP, + StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated, + ConversionAction Action = CA_ToExecEncoding); StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm, const LangOptions &features, const TargetInfo &target, DiagnosticsEngine *diags = nullptr) - : SM(sm), Features(features), Target(target), Diags(diags), + : SM(sm), Features(features), Target(target), Diags(diags), TEC(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()), EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false), Pascal(false) { - init(StringToks); + init(StringToks, CA_NoConversion); } bool hadError; @@ -305,9 +309,10 @@ class StringLiteralParser { static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix); private: - void init(ArrayRef<Token> StringToks); + void init(ArrayRef<Token> StringToks, ConversionAction Action); bool CopyStringFragment(const Token &Tok, const char *TokBegin, - StringRef Fragment); + StringRef Fragment, + llvm::TextEncodingConverter *Converter); void DiagnoseLexingError(SourceLocation Loc); }; diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 8cba21539e48a..62cbe2dc5ce57 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -30,6 +30,7 @@ #include "clang/Lex/ModuleMap.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/PPEmbedParameters.h" +#include "clang/Lex/TextEncodingConfig.h" #include "clang/Lex/Token.h" #include "clang/Lex/TokenLexer.h" #include "clang/Support/Compiler.h" @@ -198,6 +199,7 @@ class Preprocessor { std::unique_ptr<ScratchBuffer> ScratchBuf; HeaderSearch &HeaderInfo; ModuleLoader &TheModuleLoader; + TextEncodingConfig TEC; /// External source of macros. ExternalPreprocessorSource *ExternalSource; @@ -1269,6 +1271,7 @@ class Preprocessor { SelectorTable &getSelectorTable() { return Selectors; } Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; } llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; } + TextEncodingConfig &getTextEncodingConfig() { return TEC; } void setExternalSource(ExternalPreprocessorSource *Source) { ExternalSource = Source; diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h new file mode 100644 index 0000000000000..09967a81beeed --- /dev/null +++ b/clang/include/clang/Lex/TextEncodingConfig.h @@ -0,0 +1,34 @@ +//===-- clang/Lex/TextEncodingConfig.h - Text Conversion Config -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H +#define LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H + +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/TargetInfo.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/TextEncoding.h" + +enum ConversionAction { CA_NoConversion, CA_ToExecEncoding }; + +class TextEncodingConfig { + llvm::StringRef ExecEncoding; + llvm::TextEncodingConverter *ToExecEncodingConverter = nullptr; + +public: + llvm::TextEncodingConverter *getConverter(ConversionAction Action) const; + static std::error_code + setConvertersFromOptions(TextEncodingConfig &TEC, + const clang::LangOptions &Opts); + + llvm::StringRef getExecEncoding() { return ExecEncoding; } +}; + +#endif diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 5eeabf4c33b76..73bce00b921ea 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -7504,6 +7504,11 @@ let Visibility = [CC1Option, CC1AsOption, FC1Option] in { def tune_cpu : Separate<["-"], "tune-cpu">, HelpText<"Tune for a specific cpu type">, MarshallingInfoString<TargetOpts<"TuneCPU">>; +def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<encoding>">, + HelpText<"Set the execution <encoding> for string and character literals. " + "Supported character encodings include ISO-8859-1, UTF-8, IBM1047, " + "and possibly those supported by ICU or the host iconv library.">, + MarshallingInfoString<LangOpts<"ExecEncoding">>; def target_cpu : Separate<["-"], "target-cpu">, HelpText<"Target a specific cpu type">, MarshallingInfoString<TargetOpts<"CPU">>; diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 91eda7392784f..c9b5342b7e8d9 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -34,6 +34,7 @@ #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Lex/TextEncodingConfig.h" #include "clang/Sema/CodeCompleteConsumer.h" #include "clang/Sema/ParsedAttr.h" #include "clang/Sema/Sema.h" @@ -547,6 +548,11 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { if (GetDependencyDirectives) PP->setDependencyDirectivesGetter(*GetDependencyDirectives); + + if (auto EC = TextEncodingConfig::setConvertersFromOptions( + PP->getTextEncodingConfig(), getLangOpts())) + PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config) + << PP->getTextEncodingConfig().getExecEncoding(); } // ASTContext diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index 3bb1375fc5b77..47eb6ca1b87e6 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -525,7 +525,9 @@ static SourceLocation ReadOriginalFileName(CompilerInstance &CI, if (T.isAtStartOfLine() || T.getKind() != tok::string_literal) return SourceLocation(); - StringLiteralParser Literal(T, CI.getPreprocessor()); + StringLiteralParser Literal(T, CI.getPreprocessor(), + StringLiteralEvalMethod::Evaluated, + CA_NoConversion); if (Literal.hadError) return SourceLocation(); RawLexer->LexFromRawLexer(T); diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 3f0468a938149..200eab9b971a7 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1033,10 +1033,17 @@ static void InitializePredefinedMacros(const TargetInfo &TI, } } - // Macros to help identify the narrow and wide character sets - // FIXME: clang currently ignores -fexec-charset=. If this changes, - // then this may need to be updated. - Builder.defineMacro("__clang_literal_encoding__", "\"UTF-8\""); + // Macros to help identify the narrow and wide character sets. This is set + // to fexec-charset. If fexec-charset is not specified, the default is the + // system charset. + if (!LangOpts.ExecEncoding.empty()) + Builder.defineMacro("__clang_literal_encoding__", + Twine("\"" + LangOpts.ExecEncoding + "\"")); + else + Builder.defineMacro( + "__clang_literal_encoding__", + Twine("\"" + TI.getTriple().getDefaultNarrowTextEncoding() + "\"")); + if (TI.getTypeWidth(TI.getWCharType()) >= 32) { // FIXME: 32-bit wchar_t signals UTF-32. This may change // if -fwide-exec-charset= is ever supported. diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt index f61737cd68021..106a5d3b126be 100644 --- a/clang/lib/Lex/CMakeLists.txt +++ b/clang/lib/Lex/CMakeLists.txt @@ -29,6 +29,7 @@ add_clang_library(clangLex Preprocessor.cpp PreprocessorLexer.cpp ScratchBuffer.cpp + TextEncodingConfig.cpp TokenConcatenation.cpp TokenLexer.cpp diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 482146ccf8654..9b8835bbf5e35 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -126,6 +126,17 @@ static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) { return false; } +static llvm::ErrorOr<char> +convertCharacter(StringRef Char, const llvm::TextEncodingConverter &Converter) { + SmallString<8> ResultCharConv; + std::error_code EC = Converter.convert(Char, ResultCharConv); + if (EC) + return EC; + else if (ResultCharConv.size() > 1) + return std::error_code(E2BIG, std::generic_category()); + return ResultCharConv[0]; +} + /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in /// either a character or a string literal. static unsigned ProcessCharEscape(const char *ThisTokBegin, @@ -134,7 +145,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, FullSourceLoc Loc, unsigned CharWidth, DiagnosticsEngine *Diags, const LangOptions &Features, - StringLiteralEvalMethod EvalMethod) { + StringLiteralEvalMethod EvalMethod, + llvm::TextEncodingConverter *Converter) { const char *EscapeBegin = ThisTokBuf; bool Delimited = false; bool EndDelimiterFound = false; @@ -146,6 +158,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, // that would have been \", which would not have been the end of string. unsigned ResultChar = *ThisTokBuf++; char Escape = ResultChar; + bool Transcode = true; + bool Invalid = false; switch (ResultChar) { // These map to themselves. case '\\': case '\'': case '"': case '?': break; @@ -186,6 +200,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, ResultChar = 11; break; case 'x': { // Hex escape. + Transcode = false; ResultChar = 0; if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') { Delimited = true; @@ -249,6 +264,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, case '4': case '5': case '6': case '7': { // Octal escapes. --ThisTokBuf; + Transcode = false; ResultChar = 0; // Octal escapes are a series of octal digits with maximum length 3. @@ -272,6 +288,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, } case 'o': { bool Overflow = false; + Transcode = false; if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') { HadError = true; if (Diags) @@ -334,6 +351,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, << std::string(1, ResultChar); break; default: + Invalid = true; if (!Diags) break; @@ -367,6 +385,21 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, HadError = true; } + if (!HadError && EvalMethod != StringLiteralEvalMethod::Unevaluated && + Transcode && Converter) { + // Invalid escapes are written as '?' and then translated. + assert(ResultChar <= std::numeric_limits<char>::max()); + char ByteChar = Invalid ? '?' : ResultChar; + auto ErrorOrChar = convertCharacter(StringRef(&ByteChar, 1), *Converter); + if (ErrorOrChar) + ResultChar = *ErrorOrChar; + else { + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_exec_charset_conversion_failed) + << ErrorOrChar.getError().message(); + HadError = true; + } + } return ResultChar; } @@ -1811,6 +1844,11 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, uint32_t *buffer_begin = &codepoint_buffer.front(); uint32_t *buffer_end = buffer_begin + codepoint_buffer.size(); + const TextEncodingConfig &TEC = PP.getTextEncodingConfig(); + llvm::TextEncodingConverter *Converter = nullptr; + if (isOrdinary()) + Converter = TEC.getConverter(CA_ToExecEncoding); + // Unicode escapes representing characters that cannot be correctly // represented in a single code unit are disallowed in character literals // by this implementation. @@ -1825,7 +1863,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, } else if (tok::utf32_char_constant == Kind) { largest_character_for_kind = 0x10FFFF; } else { - largest_character_for_kind = 0x7Fu; + largest_character_for_kind = (Converter == nullptr) ? 0x7Fu : 0xFFu; } while (begin != end) { @@ -1865,6 +1903,22 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, HadError = true; PP.Diag(Loc, diag::err_character_too_large); } + if (!HadError && Converter) { + assert(isOrdinary() && "Only ordinary characters are supported"); + std::string UTF8String; + convertUTF32ToUTF8String( + ArrayRef<char>(reinterpret_cast<const char *>(tmp_out_start), + 4), + UTF8String); + auto ErrorOrChar = convertCharacter(UTF8String, *Converter); + if (ErrorOrChar) { + *tmp_out_start = *ErrorOrChar; + } else { + HadError = true; + PP.Diag(Loc, diag::err_exec_charset_conversion_failed) + << ErrorOrChar.getError().message(); + } + } } } @@ -1872,16 +1926,37 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, } // Is this a Universal Character Name escape? if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') { - unsigned short UcnLen = 0; - if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, - FullSourceLoc(Loc, PP.getSourceManager()), - &PP.getDiagnostics(), PP.getLangOpts(), true)) { - HadError = true; - } else if (*buffer_begin > largest_character_for_kind) { - HadError = true; - PP.Diag(Loc, diag::err_character_too_large); + if (Converter == nullptr) { + unsigned short UcnLen = 0; + if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, + FullSourceLoc(Loc, PP.getSourceManager()), + &PP.getDiagnostics(), PP.getLangOpts(), true)) { + HadError = true; + } else if (*buffer_begin > largest_character_for_kind) { + HadError = true; + PP.Diag(Loc, diag::err_character_too_large); + } + } else { + char Cp[5]; + char *ResultPtr = Cp; + EncodeUCNEscape(TokBegin, begin, end, ResultPtr, HadError, + FullSourceLoc(Loc, PP.getSourceManager()), + /*CharByteWidth=*/1u, &PP.getDiagnostics(), + PP.getLangOpts()); + assert(ResultPtr - Cp <= 4 && + "unexpected result size for UCN escape character"); + if (!HadError) { + auto ErrorOrChar = + convertCharacter(StringRef(Cp, ResultPtr - Cp), *Converter); + if (ErrorOrChar) + *buffer_begin = *ErrorOrChar; + else { + PP.Diag(Loc, diag::err_exec_charset_conversion_failed) + << ErrorOrChar.getError().message(); + HadError = true; + } + } } - ++buffer_begin; continue; } @@ -1890,7 +1965,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, ProcessCharEscape(TokBegin, begin, end, HadError, FullSourceLoc(Loc, PP.getSourceManager()), CharWidth, &PP.getDiagnostics(), PP.getLangOpts(), - StringLiteralEvalMethod::Evaluated); + StringLiteralEvalMethod::Evaluated, nullptr); *buffer_begin++ = result; } @@ -2000,16 +2075,18 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, /// StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP, - StringLiteralEvalMethod EvalMethod) + StringLiteralEvalMethod EvalMethod, + ConversionAction Action) : SM(PP.getSourceManager()), Features(PP.getLangOpts()), Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()), - MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), - ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false), - Pascal(false) { - init(StringToks); + TEC(&PP.getTextEncodingConfig()), MaxTokenLength(0), SizeBound(0), + CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()), + EvalMethod(EvalMethod), hadError(false), Pascal(false) { + init(StringToks, Action); } -void StringLiteralParser::init(ArrayRef<Token> StringToks){ +void StringLiteralParser::init(ArrayRef<Token> StringToks, + ConversionAction Action) { // The literal token may have come from an invalid source location (e.g. due // to a PCH error), in which case the token length will be 0. if (StringToks.empty() || StringToks[0].getLength() < 2) @@ -2101,6 +2178,10 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){ SourceLocation UDSuffixTokLoc; + llvm::TextEncodingConverter *Converter = nullptr; + if (isOrdinary() && TEC) + Converter = TEC->getConverter(Action); + for (unsigned i = 0, e = StringToks.size(); i != e; ++i) { const char *ThisTokBuf = &TokenBuf[0]; // Get the spelling of the token, which eliminates trigraphs, etc. We know @@ -2211,7 +2292,8 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){ StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos); // Copy everything before the \r\n sequence into the string literal. - if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF)) + if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF, + Converter)) hadError = true; // Point into the \n inside the \r\n sequence and operate on the @@ -2250,24 +2332,32 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){ // Copy the character span over. if (CopyStringFragment(StringToks[i], ThisTokBegin, - StringRef(InStart, ThisTokBuf - InStart))) + StringRef(InStart, ThisTokBuf - InStart), + Converter)) hadError = true; continue; } // Is this a Universal Character Name escape? if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' || ThisTokBuf[1] == 'N') { - EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, - ResultPtr, hadError, + char *Cp = ResultPtr; + EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr, + hadError, FullSourceLoc(StringToks[i].getLocation(), SM), CharByteWidth, Diags, Features); + if (!hadError && Converter) { + SmallString<8> CpConv; + Converter->convert(StringRef(Cp), CpConv); + memcpy(Cp, CpConv.data(), CpConv.size()); + ResultPtr = Cp + CpConv.size(); + } continue; } // Otherwise, this is a non-UCN escape character. Process it. - unsigned ResultChar = - ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError, - FullSourceLoc(StringToks[i].getLocation(), SM), - CharByteWidth * 8, Diags, Features, EvalMethod); + unsigned ResultChar = ProcessCharEscape( + ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError, + FullSourceLoc(StringToks[i].getLocation(), SM), CharByteWidth * 8, + Diags, Features, EvalMethod, Converter); if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of @@ -2343,12 +2433,29 @@ static const char *resyncUTF8(const char *Err, const char *End) { /// This function copies from Fragment, which is a sequence of bytes /// within Tok's contents (which begin at TokBegin) into ResultPtr. /// Performs widening for multi-byte characters. -bool StringLiteralParser::CopyStringFragment(const Token &Tok, - const char *TokBegin, - StringRef Fragment) { +bool StringLiteralParser::CopyStringFragment( + const Token &Tok, const char *TokBegin, StringRef Fragment, + llvm::TextEncodingConverter *Converter) { + const llvm::UTF8 *ErrorPtrTmp; - if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp)) + if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp)) { + if (Converter) { + assert(isOrdinary() && "Only ordinary literals are supported"); + SmallString<64> CpConv; + char *Cp = ResultPtr - Fragment.size(); + auto EC = Converter->convert(Fragment, CpConv); + if (!EC) { + memcpy(Cp, CpConv.data(), CpConv.size()); + ResultPtr = Cp + CpConv.size(); + } else { // there was a conversion error + if (Diags) + Diags->Report(Tok.getLocation(), + diag::err_exec_charset_conversion_failed) + << EC.message(); + } + } return false; + } // If we see bad encoding for unprefixed string literals, warn and // simply copy the byte values, for compatibility with gcc and older @@ -2465,7 +2572,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, } else { ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError, FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8, - Diags, Features, StringLiteralEvalMethod::Evaluated); + Diags, Features, StringLiteralEvalMethod::Evaluated, + /*TextEncodingConfig=*/nullptr); --ByteNo; } assert(!HadError && "This method isn't valid on erroneous strings"); diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 6e90f20572f1f..1add87d2a5177 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -1650,7 +1650,8 @@ void Preprocessor::HandleLineDirective() { return; } else { // Parse and validate the string, converting it into a unique ID. - StringLiteralParser Literal(StrTok, *this); + StringLiteralParser Literal( + StrTok, *this, StringLiteralEvalMethod::Evaluated, CA_NoConversion); assert(Literal.isOrdinary() && "Didn't allow wide strings in"); if (Literal.hadError) { DiscardUntilEndOfDirective(); @@ -1801,7 +1802,8 @@ void Preprocessor::HandleDigitDirective(Token &DigitTok) { return; } else { // Parse and validate the string, converting it into a unique ID. - StringLiteralParser Literal(StrTok, *this); + StringLiteralParser Literal( + StrTok, *this, StringLiteralEvalMethod::Evaluated, CA_NoConversion); assert(Literal.isOrdinary() && "Didn't allow wide strings in"); if (Literal.hadError) { DiscardUntilEndOfDirective(); diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp new file mode 100644 index 0000000000000..b89d5baefcc23 --- /dev/null +++ b/clang/lib/Lex/TextEncodingConfig.cpp @@ -0,0 +1,45 @@ +//===--- TextEncodingConfig.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/TextEncodingConfig.h" +#include "clang/Basic/DiagnosticDriver.h" + +using namespace llvm; + +llvm::TextEncodingConverter * +TextEncodingConfig::getConverter(ConversionAction Action) const { + switch (Action) { + case CA_ToExecEncoding: + return ToExecEncodingConverter; + default: + return nullptr; + } +} + +std::error_code +TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC, + const clang::LangOptions &Opts) { + using namespace llvm; + + const char *UTF8 = "UTF-8"; + TEC.ExecEncoding = + Opts.ExecEncoding.empty() ? UTF8 : Opts.ExecEncoding.c_str(); + + // Create converter between internal and exec encoding specified + // in fexec-charset option. + if (TEC.ExecEncoding == UTF8) + return std::error_code(); + ErrorOr<TextEncodingConverter> ErrorOrConverter = + llvm::TextEncodingConverter::create(UTF8, TEC.ExecEncoding); + if (ErrorOrConverter) + TEC.ToExecEncodingConverter = + new TextEncodingConverter(std::move(*ErrorOrConverter)); + else + return ErrorOrConverter.getError(); + return std::error_code(); +} diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c new file mode 100644 index 0000000000000..897b9d2eeefa1 --- /dev/null +++ b/clang/test/CodeGen/systemz-charset.c @@ -0,0 +1,58 @@ +// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s +// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -DIBM1047_ONLY=1 -o - | FileCheck %s --check-prefix=CHECK-UTF8 + +const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +//CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00" +//CHECK-UTF8: c"ABCDEFGHIJKLMNOPQRSTUVWXYZ\00" + +const char *LowerCaseLetters = "abcdefghijklmnopqrstuvwxyz"; +//CHECK: c"\81\82\83\84\85\86\87\88\89\91\92\93\94\95\96\97\98\99\A2\A3\A4\A5\A6\A7\A8\A9\00" +//CHECK-UTF8: c"abcdefghijklmnopqrstuvwxyz\00" + +const char *Digits = "0123456789"; +//CHECK: c"\F0\F1\F2\F3\F4\F5\F6\F7\F8\F9\00" +//CHECK-UTF8: c"0123456789\00" + +const char *SpecialCharacters = " .<(+|&!$*);^-/,%%_>`:#@="; +//CHECK: c"@KLMNOPZ[\\]^_`akllmnyz{|~\00" +//CHECK-UTF8: c" .<(+|&!$*);^-/,%%_>`:#@=\00" + +const char *EscapeCharacters = "\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: c"/\16\0C\15\0D\05\0B\E0}\7Fo\00" +//CHECK-UTF8: c"\07\08\0C\0A\0D\09\0B\\'\22?\00" + +const char *InvalidEscape = "\y\z"; +//CHECK: c"oo\00" +//CHECK-UTF8: c"yz\00" + +const char *HexCharacters = "\x12\x13\x14"; +//CHECK: c"\12\13\14\00" +//CHECK-UTF8: c"\12\13\14\00" + +const char *OctalCharacters = "\141\142\143"; +//CHECK: c"abc\00" +//CHECK-UTF8: c"abc\00" + +const char singleChar = 'a'; +//CHECK: i8 -127 +//CHECK-UTF8: 97 + +#ifndef IBM1047_ONLY +const char cent = '¢'; +//CHECK: i8 74 + +const char currency = '¤'; +//CHECK: i8 -97 +#endif + +const char *UcnCharacters = "\u00E2\u00AC\U000000DF"; +//CHECK: c"B\B0Y\00" +//CHECK-UTF8: c"\C3\A2\C2\AC\C3\9F\00" + +const char *Unicode = "ÿ"; +//CHECK: c"\DF\00" +//CHECK-UTF8: c"\C3\BF\00" + +// RUN: not %clang_cc1 -fexec-charset invalid %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +// CHECK-ERROR: error: failed to set fexec-charset to 'invalid' + diff --git a/clang/test/CodeGen/systemz-charset.cpp b/clang/test/CodeGen/systemz-charset.cpp new file mode 100644 index 0000000000000..f7becd5b39492 --- /dev/null +++ b/clang/test/CodeGen/systemz-charset.cpp @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -std=c++17 -fexec-charset IBM-1047 -o - | FileCheck %s + +const char *RawString = R"(Hello\n)"; +//CHECK: c"\C8\85\93\93\96\E0\95\00" + +const char *MultiLineRawString = R"( +Hello +There)"; +//CHECK: c"\15\C8\85\93\93\96\15\E3\88\85\99\85\00" + +char UnicodeChar8 = u8'1'; +//CHECK: i8 49 +char16_t UnicodeChar16 = u'1'; +//CHECK: i16 49 +char32_t UnicodeChar32 = U'1'; +//CHECK: i32 49 + +const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00" + +const char16_t *EscapeCharacters16 = u"\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: [12 x i16] [i16 7, i16 8, i16 12, i16 10, i16 13, i16 9, i16 11, i16 92, i16 39, i16 34, i16 63, i16 0] + +const char32_t *EscapeCharacters32 = U"\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: [12 x i32] [i32 7, i32 8, i32 12, i32 10, i32 13, i32 9, i32 11, i32 92, i32 39, i32 34, i32 63, i32 0] + +const char *UnicodeString8 = u8"Hello"; +//CHECK: c"Hello\00" +const char16_t *UnicodeString16 = u"Hello"; +//CHECK: [6 x i16] [i16 72, i16 101, i16 108, i16 108, i16 111, i16 0] +const char32_t *UnicodeString32 = U"Hello"; +//CHECK: [6 x i32] [i32 72, i32 101, i32 108, i32 108, i32 111, i32 0] + +const char *UnicodeRawString8 = u8R"("Hello\")"; +//CHECK: c"\22Hello\\\22\00" +const char16_t *UnicodeRawString16 = uR"("Hello\")"; +//CHECK: [9 x i16] [i16 34, i16 72, i16 101, i16 108, i16 108, i16 111, i16 92, i16 34, i16 0] +const char32_t *UnicodeRawString32 = UR"("Hello\")"; +//CHECK: [9 x i32] [i32 34, i32 72, i32 101, i32 108, i32 108, i32 111, i32 92, i32 34, i32 0] + +const char *UnicodeUCNString8 = u8"\u00E2\u00AC\U000000DF"; +//CHECK: c"\C3\A2\C2\AC\C3\9F\00" +const char16_t *UnicodeUCNString16 = u"\u00E2\u00AC\U000000DF"; +//CHECK: [4 x i16] [i16 226, i16 172, i16 223, i16 0] +const char32_t *UnicodeUCNString32 = U"\u00E2\u00AC\U000000DF"; +//CHECK: [4 x i32] [i32 226, i32 172, i32 223, i32 0] diff --git a/clang/test/Preprocessor/init-s390x.c b/clang/test/Preprocessor/init-s390x.c index a8fbde46cbb75..4414c7d919879 100644 --- a/clang/test/Preprocessor/init-s390x.c +++ b/clang/test/Preprocessor/init-s390x.c @@ -206,4 +206,5 @@ // S390X-ZOS: #define __TOS_390__ 1 // S390X-ZOS: #define __TOS_MVS__ 1 // S390X-ZOS: #define __XPLINK__ 1 +// S390X-ZOS: #define __clang_literal_encoding__ "IBM-1047" // S390X-ZOS-GNUXX: #define __wchar_t 1 diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index d5a42d9646c18..74e6ab17d9b3c 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -528,6 +528,10 @@ class Triple { /// For example, "fooos1.2.3" would return "1.2.3". LLVM_ABI StringRef getEnvironmentVersionString() const; + /// Get the default system encoding of the triple. + /// For example, "IBM-1047" for z/OS, "UTF-8" for others + LLVM_ABI StringRef getDefaultNarrowTextEncoding() const; + /// @} /// @name Convenience Predicates /// @{ diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index c6515425b7eb5..1f1812c9f4096 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -1678,6 +1678,13 @@ StringRef Triple::getOSAndEnvironmentName() const { return Tmp.split('-').second; // Strip second component } +// Default encoding on z/OS is IBM-1047 and UTF-8 otherwise +StringRef Triple::getDefaultNarrowTextEncoding() const { + if (getOS() == llvm::Triple::ZOS) + return "IBM-1047"; + return "UTF-8"; +} + static VersionTuple parseVersionFromName(StringRef Name) { VersionTuple Version; Version.tryParse(Name); >From 418a6c515338789dcd6b63be21fbb8f28b61e409 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Mon, 11 May 2026 14:45:38 -0400 Subject: [PATCH 2/2] move conversion into EncodeUCNEscape, update testcase --- clang/lib/Lex/LiteralSupport.cpp | 50 ++++++++++++-------------- clang/test/CodeGen/systemz-charset.cpp | 3 ++ 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 9b8835bbf5e35..59ece0dbf79ed 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -784,11 +784,11 @@ static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, /// StringLiteralParser. When we decide to implement UCN's for identifiers, /// we will likely rework our support for UCN's. static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, - const char *ThisTokEnd, - char *&ResultBuf, bool &HadError, - FullSourceLoc Loc, unsigned CharByteWidth, - DiagnosticsEngine *Diags, - const LangOptions &Features) { + const char *ThisTokEnd, char *&ResultBuf, + bool &HadError, FullSourceLoc Loc, + unsigned CharByteWidth, DiagnosticsEngine *Diags, + const LangOptions &Features, + llvm::TextEncodingConverter *Converter) { typedef uint32_t UTF32; UTF32 UcnVal = 0; unsigned short UcnLen = 0; @@ -875,6 +875,20 @@ static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, } // Update the buffer. ResultBuf += bytesToWrite; + + if (Converter) { + SmallString<4> CpConv; + char *Cp = ResultBuf - bytesToWrite; + auto EC = Converter->convert(StringRef(Cp, bytesToWrite), CpConv); + if (!EC) { + memcpy(Cp, CpConv.data(), CpConv.size()); + ResultBuf = Cp + CpConv.size(); + } else { + Diags->Report(Loc, diag::err_exec_charset_conversion_failed) + << EC.message(); + HadError = true; + } + } } /// integer-constant: [C99 6.4.4.1] @@ -1942,20 +1956,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, EncodeUCNEscape(TokBegin, begin, end, ResultPtr, HadError, FullSourceLoc(Loc, PP.getSourceManager()), /*CharByteWidth=*/1u, &PP.getDiagnostics(), - PP.getLangOpts()); - assert(ResultPtr - Cp <= 4 && - "unexpected result size for UCN escape character"); - if (!HadError) { - auto ErrorOrChar = - convertCharacter(StringRef(Cp, ResultPtr - Cp), *Converter); - if (ErrorOrChar) - *buffer_begin = *ErrorOrChar; - else { - PP.Diag(Loc, diag::err_exec_charset_conversion_failed) - << ErrorOrChar.getError().message(); - HadError = true; - } - } + PP.getLangOpts(), Converter); + if (!HadError) + *buffer_begin = *Cp; } ++buffer_begin; continue; @@ -2340,17 +2343,10 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks, // Is this a Universal Character Name escape? if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' || ThisTokBuf[1] == 'N') { - char *Cp = ResultPtr; EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr, hadError, FullSourceLoc(StringToks[i].getLocation(), SM), - CharByteWidth, Diags, Features); - if (!hadError && Converter) { - SmallString<8> CpConv; - Converter->convert(StringRef(Cp), CpConv); - memcpy(Cp, CpConv.data(), CpConv.size()); - ResultPtr = Cp + CpConv.size(); - } + CharByteWidth, Diags, Features, Converter); continue; } // Otherwise, this is a non-UCN escape character. Process it. diff --git a/clang/test/CodeGen/systemz-charset.cpp b/clang/test/CodeGen/systemz-charset.cpp index f7becd5b39492..8ce4e906325dd 100644 --- a/clang/test/CodeGen/systemz-charset.cpp +++ b/clang/test/CodeGen/systemz-charset.cpp @@ -15,6 +15,9 @@ char16_t UnicodeChar16 = u'1'; char32_t UnicodeChar32 = U'1'; //CHECK: i32 49 +int FourChar = '1234'; +//CHECK: i32 -235736076 + const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?"; //CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00" _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
