https://github.com/abhina-sree created https://github.com/llvm/llvm-project/pull/196568
None >From 5f9b389a8d09367107d54af8cb2e7ec94244bf6e Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Fri, 8 May 2026 12:19:11 -0400 Subject: [PATCH] Add format string handling --- clang/include/clang/AST/Expr.h | 6 ++ clang/include/clang/AST/FormatString.h | 12 +-- clang/include/clang/Basic/TargetInfo.h | 3 + clang/include/clang/Lex/TextEncodingConfig.h | 3 +- clang/include/clang/Sema/Sema.h | 2 +- clang/lib/AST/Expr.cpp | 14 +++ clang/lib/AST/FormatString.cpp | 86 ++++++++++--------- clang/lib/AST/FormatStringParsing.h | 36 +++++--- clang/lib/AST/PrintfFormatString.cpp | 89 +++++++++++++------- clang/lib/AST/ScanfFormatString.cpp | 23 +++-- clang/lib/Basic/TargetInfo.cpp | 3 + clang/lib/Frontend/CompilerInstance.cpp | 2 +- clang/lib/Lex/TextEncodingConfig.cpp | 11 ++- clang/lib/Sema/SemaChecking.cpp | 54 +++++++----- clang/lib/Sema/SemaExpr.cpp | 5 +- clang/test/CodeGen/systemz-charset.c | 2 + llvm/include/llvm/Support/TextEncoding.h | 10 +++ llvm/lib/Support/TextEncoding.cpp | 19 +++++ 18 files changed, 258 insertions(+), 122 deletions(-) diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 393fe275c6269..d01afcff4095d 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -28,6 +28,7 @@ #include "clang/Basic/LangOptions.h" #include "clang/Basic/SyncScope.h" #include "clang/Basic/TypeTraits.h" +#include "clang/Lex/TextEncodingConfig.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SmallVector.h" @@ -2066,6 +2067,11 @@ class PredefinedExpr final return getIdentKindName(getIdentKind()); } + static std::string + ComputeNameAndTranslate(PredefinedIdentKind IK, const Decl *CurrentDecl, + TextEncodingConfig &TEC, + bool ForceElaboratedPrinting = false); + static std::string ComputeName(PredefinedIdentKind IK, const Decl *CurrentDecl, bool ForceElaboratedPrinting = false); diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h index a3382e1a1d007..a24ade2d71ee9 100644 --- a/clang/include/clang/AST/FormatString.h +++ b/clang/include/clang/AST/FormatString.h @@ -19,6 +19,7 @@ #define LLVM_CLANG_AST_FORMATSTRING_H #include "clang/AST/CanonicalType.h" +#include "llvm/Support/TextEncoding.h" #include <optional> namespace clang { @@ -728,7 +729,8 @@ class FormatStringHandler { virtual bool HandleInvalidPrintfConversionSpecifier( const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier, - unsigned specifierLen) { + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { return true; } @@ -744,10 +746,10 @@ class FormatStringHandler { // Scanf-specific handlers. - virtual bool - HandleInvalidScanfConversionSpecifier(const analyze_scanf::ScanfSpecifier &FS, - const char *startSpecifier, - unsigned specifierLen) { + virtual bool HandleInvalidScanfConversionSpecifier( + const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier, + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { return true; } diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 9f7d2a17a0f8a..ec7d4fcd4d8e3 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -38,6 +38,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Error.h" +#include "llvm/Support/TextEncoding.h" #include "llvm/Support/VersionTuple.h" #include "llvm/TargetParser/Triple.h" #include <cassert> @@ -323,6 +324,8 @@ class TargetInfo : public TransferrableTargetInfo, virtual ~TargetInfo(); + llvm::TextEncodingConverter *FormatStrConverter; + /// Retrieve the target options. TargetOptions &getTargetOpts() const { assert(TargetOpts && "Missing target options"); diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h index 09967a81beeed..f4ef578eb2991 100644 --- a/clang/include/clang/Lex/TextEncodingConfig.h +++ b/clang/include/clang/Lex/TextEncodingConfig.h @@ -26,7 +26,8 @@ class TextEncodingConfig { llvm::TextEncodingConverter *getConverter(ConversionAction Action) const; static std::error_code setConvertersFromOptions(TextEncodingConfig &TEC, - const clang::LangOptions &Opts); + const clang::LangOptions &Opts, + clang::TargetInfo &TInfo); llvm::StringRef getExecEncoding() { return ExecEncoding; } }; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index e2bc5593efa97..8ac5cc175fd2f 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -55,7 +55,7 @@ #include "clang/Basic/TemplateKinds.h" #include "clang/Basic/TokenKinds.h" #include "clang/Basic/TypeTraits.h" -#include "clang/Lex/LiteralConverter.h" +#include "clang/Lex/TextEncodingConfig.h" #include "clang/Sema/AnalysisBasedWarnings.h" #include "clang/Sema/Attr.h" #include "clang/Sema/CleanupInfo.h" diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 64d61dbc3d128..e067df4cefd7b 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -668,6 +668,20 @@ StringRef PredefinedExpr::getIdentKindName(PredefinedIdentKind IK) { llvm_unreachable("Unknown ident kind for PredefinedExpr"); } +std::string PredefinedExpr::ComputeNameAndTranslate( + PredefinedIdentKind IK, const Decl *CurrentDecl, TextEncodingConfig &TEC, + bool ForceElaboratedPrinting) { + using namespace clang::charinfo; + std::string Result = ComputeName(IK, CurrentDecl, ForceElaboratedPrinting); + llvm::TextEncodingConverter *Converter = TEC.getConverter(CA_ToExecEncoding); + if (Converter) { + SmallString<128> Converted; + Converter->convert(Result, Converted); + Result = std::string(Converted); + } + return Result; +} + // FIXME: Maybe this should use DeclPrinter with a special "print predefined // expr" policy instead. std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK, diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp index 7e1ac0de6dcaf..0d449fb5f0904 100644 --- a/clang/lib/AST/FormatString.cpp +++ b/clang/lib/AST/FormatString.cpp @@ -33,8 +33,9 @@ FormatStringHandler::~FormatStringHandler() {} // scanf format strings. //===----------------------------------------------------------------------===// -OptionalAmount clang::analyze_format_string::ParseAmount(const char *&Beg, - const char *E) { +OptionalAmount clang::analyze_format_string::ParseAmount( + const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter) { const char *I = Beg; UpdateOnReturn<const char *> UpdateBeg(Beg, I); @@ -42,7 +43,7 @@ OptionalAmount clang::analyze_format_string::ParseAmount(const char *&Beg, bool hasDigits = false; for (; I != E; ++I) { - char c = *I; + char c = FormatStrConverter.convert(*I); if (c >= '0' && c <= '9') { hasDigits = true; accumulator = (accumulator * 10) + (c - '0'); @@ -60,21 +61,22 @@ OptionalAmount clang::analyze_format_string::ParseAmount(const char *&Beg, } OptionalAmount clang::analyze_format_string::ParseNonPositionAmount( - const char *&Beg, const char *E, unsigned &argIndex) { - if (*Beg == '*') { + const char *&Beg, const char *E, unsigned &argIndex, + const llvm::TextEncodingConverter &FormatStrConverter) { + if (FormatStrConverter.convert(*Beg) == '*') { ++Beg; return OptionalAmount(OptionalAmount::Arg, argIndex++, Beg, 0, false); } - return ParseAmount(Beg, E); + return ParseAmount(Beg, E, FormatStrConverter); } OptionalAmount clang::analyze_format_string::ParsePositionAmount( FormatStringHandler &H, const char *Start, const char *&Beg, const char *E, - PositionContext p) { - if (*Beg == '*') { + PositionContext p, const llvm::TextEncodingConverter &FormatStrConverter) { + if (FormatStrConverter.convert(*Beg) == '*') { const char *I = Beg + 1; - const OptionalAmount &Amt = ParseAmount(I, E); + const OptionalAmount &Amt = ParseAmount(I, E, FormatStrConverter); if (Amt.getHowSpecified() == OptionalAmount::NotSpecified) { H.HandleInvalidPosition(Beg, I - Beg, p); @@ -89,7 +91,7 @@ OptionalAmount clang::analyze_format_string::ParsePositionAmount( assert(Amt.getHowSpecified() == OptionalAmount::Constant); - if (*I == '$') { + if (FormatStrConverter.convert(*I) == '$') { // Handle positional arguments // Special case: '*0$', since this is an easy mistake. @@ -109,18 +111,21 @@ OptionalAmount clang::analyze_format_string::ParsePositionAmount( return OptionalAmount(false); } - return ParseAmount(Beg, E); + return ParseAmount(Beg, E, FormatStrConverter); } bool clang::analyze_format_string::ParseFieldWidth( FormatStringHandler &H, FormatSpecifier &CS, const char *Start, - const char *&Beg, const char *E, unsigned *argIndex) { + const char *&Beg, const char *E, unsigned *argIndex, + const llvm::TextEncodingConverter &FormatStrConverter) { // FIXME: Support negative field widths. if (argIndex) { - CS.setFieldWidth(ParseNonPositionAmount(Beg, E, *argIndex)); + CS.setFieldWidth( + ParseNonPositionAmount(Beg, E, *argIndex, FormatStrConverter)); } else { const OptionalAmount Amt = ParsePositionAmount( - H, Start, Beg, E, analyze_format_string::FieldWidthPos); + H, Start, Beg, E, analyze_format_string::FieldWidthPos, + FormatStrConverter); if (Amt.isInvalid()) return true; @@ -129,14 +134,13 @@ bool clang::analyze_format_string::ParseFieldWidth( return false; } -bool clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H, - FormatSpecifier &FS, - const char *Start, - const char *&Beg, - const char *E) { +bool clang::analyze_format_string::ParseArgPosition( + FormatStringHandler &H, FormatSpecifier &FS, const char *Start, + const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter) { const char *I = Beg; - const OptionalAmount &Amt = ParseAmount(I, E); + const OptionalAmount &Amt = ParseAmount(I, E, FormatStrConverter); if (I == E) { // No more characters left? @@ -144,7 +148,8 @@ bool clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H, return true; } - if (Amt.getHowSpecified() == OptionalAmount::Constant && *(I++) == '$') { + if (Amt.getHowSpecified() == OptionalAmount::Constant && + FormatStrConverter.convert(*(I++)) == '$') { // Warn that positional arguments are non-standard. H.HandlePosition(Start, I - Start); @@ -165,16 +170,15 @@ bool clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H, return false; } -bool clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H, - FormatSpecifier &FS, - const char *&I, - const char *E, - const LangOptions &LO) { +bool clang::analyze_format_string::ParseVectorModifier( + FormatStringHandler &H, FormatSpecifier &FS, const char *&I, const char *E, + const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter) { if (!LO.OpenCL) return false; const char *Start = I; - if (*I == 'v') { + if (FormatStrConverter.convert(*I) == 'v') { ++I; if (I == E) { @@ -182,7 +186,7 @@ bool clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H, return true; } - OptionalAmount NumElts = ParseAmount(I, E); + OptionalAmount NumElts = ParseAmount(I, E, FormatStrConverter); if (NumElts.getHowSpecified() != OptionalAmount::Constant) { H.HandleIncompleteSpecifier(Start, E - Start); return true; @@ -194,22 +198,20 @@ bool clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H, return false; } -bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, - const char *&I, - const char *E, - const LangOptions &LO, - bool IsScanf) { +bool clang::analyze_format_string::ParseLengthModifier( + FormatSpecifier &FS, const char *&I, const char *E, const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter, bool IsScanf) { LengthModifier::Kind lmKind = LengthModifier::None; const char *lmPosition = I; - switch (*I) { + switch (FormatStrConverter.convert(*I)) { default: return false; case 'h': ++I; - if (I != E && *I == 'h') { + if (I != E && FormatStrConverter.convert(*I) == 'h') { ++I; lmKind = LengthModifier::AsChar; - } else if (I != E && *I == 'l' && LO.OpenCL) { + } else if (I != E && FormatStrConverter.convert(*I) == 'l' && LO.OpenCL) { ++I; lmKind = LengthModifier::AsShortLong; } else { @@ -218,7 +220,7 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, break; case 'l': ++I; - if (I != E && *I == 'l') { + if (I != E && FormatStrConverter.convert(*I) == 'l') { ++I; lmKind = LengthModifier::AsLongLong; } else { @@ -251,7 +253,9 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, // be parsed as the GNU extension 'a' length modifier. If not, this // will be parsed as a conversion specifier. ++I; - if (I != E && (*I == 's' || *I == 'S' || *I == '[')) { + if (I != E && (FormatStrConverter.convert(*I) == 's' || + FormatStrConverter.convert(*I) == 'S' || + FormatStrConverter.convert(*I) == '[')) { lmKind = LengthModifier::AsAllocate; break; } @@ -269,7 +273,8 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, // scanf: AsInt64 case 'I': if (I + 1 != E && I + 2 != E) { - if (I[1] == '6' && I[2] == '4') { + if (FormatStrConverter.convert(I[1]) == '6' && + FormatStrConverter.convert(I[2]) == '4') { I += 3; lmKind = LengthModifier::AsInt64; break; @@ -277,7 +282,8 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, if (IsScanf) return false; - if (I[1] == '3' && I[2] == '2') { + if (FormatStrConverter.convert(I[1]) == '3' && + FormatStrConverter.convert(I[2]) == '2') { I += 3; lmKind = LengthModifier::AsInt32; break; diff --git a/clang/lib/AST/FormatStringParsing.h b/clang/lib/AST/FormatStringParsing.h index 401528481a9d6..531bc291e0b5b 100644 --- a/clang/lib/AST/FormatStringParsing.h +++ b/clang/lib/AST/FormatStringParsing.h @@ -35,29 +35,43 @@ template <typename T> class UpdateOnReturn { namespace analyze_format_string { -OptionalAmount ParseAmount(const char *&Beg, const char *E); -OptionalAmount ParseNonPositionAmount(const char *&Beg, const char *E, - unsigned &argIndex); +OptionalAmount +ParseAmount(const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter); -OptionalAmount ParsePositionAmount(FormatStringHandler &H, const char *Start, - const char *&Beg, const char *E, - PositionContext p); +OptionalAmount +ParseNonPositionAmount(const char *&Beg, const char *E, unsigned &argIndex, + const llvm::TextEncodingConverter &FormatStrConverter); + +OptionalAmount +ParsePositionAmount(FormatStringHandler &H, const char *Start, const char *&Beg, + const char *E, PositionContext p, + const llvm::TextEncodingConverter &FormatStrConverter); + +OptionalAmount +ParsePositionAmount(FormatStringHandler &H, const char *Start, const char *&Beg, + const char *E, PositionContext p, + const llvm::TextEncodingConverter &FormatStrConverter); bool ParseFieldWidth(FormatStringHandler &H, FormatSpecifier &CS, const char *Start, const char *&Beg, const char *E, - unsigned *argIndex); + unsigned *argIndex, + const llvm::TextEncodingConverter &FormatStrConverter); bool ParseArgPosition(FormatStringHandler &H, FormatSpecifier &CS, - const char *Start, const char *&Beg, const char *E); + const char *Start, const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter); bool ParseVectorModifier(FormatStringHandler &H, FormatSpecifier &FS, - const char *&Beg, const char *E, - const LangOptions &LO); + const char *&Beg, const char *E, const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter); /// Returns true if a LengthModifier was parsed and installed in the /// FormatSpecifier& argument, and false otherwise. bool ParseLengthModifier(FormatSpecifier &FS, const char *&Beg, const char *E, - const LangOptions &LO, bool IsScanf = false); + const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter, + bool IsScanf = false); /// Returns true if the invalid specifier in \p SpecifierBegin is a UTF-8 /// string; check that it won't go further than \p FmtStrEnd and write diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp index 6610a2de9e083..7efcc554ec136 100644 --- a/clang/lib/AST/PrintfFormatString.cpp +++ b/clang/lib/AST/PrintfFormatString.cpp @@ -35,14 +35,17 @@ typedef clang::analyze_format_string::SpecifierResult<PrintfSpecifier> using analyze_format_string::ParseNonPositionAmount; -static bool ParsePrecision(FormatStringHandler &H, PrintfSpecifier &FS, - const char *Start, const char *&Beg, const char *E, - unsigned *argIndex) { +static bool +ParsePrecision(FormatStringHandler &H, PrintfSpecifier &FS, const char *Start, + const char *&Beg, const char *E, unsigned *argIndex, + const llvm::TextEncodingConverter &FormatStrConverter) { if (argIndex) { - FS.setPrecision(ParseNonPositionAmount(Beg, E, *argIndex)); + FS.setPrecision( + ParseNonPositionAmount(Beg, E, *argIndex, FormatStrConverter)); } else { const OptionalAmount Amt = ParsePositionAmount( - H, Start, Beg, E, analyze_format_string::PrecisionPos); + H, Start, Beg, E, analyze_format_string::PrecisionPos, + FormatStrConverter); if (Amt.isInvalid()) return true; FS.setPrecision(Amt); @@ -50,11 +53,14 @@ static bool ParsePrecision(FormatStringHandler &H, PrintfSpecifier &FS, return false; } -static bool ParseObjCFlags(FormatStringHandler &H, PrintfSpecifier &FS, - const char *FlagBeg, const char *E, bool Warn) { +static bool +ParseObjCFlags(FormatStringHandler &H, PrintfSpecifier &FS, const char *FlagBeg, + const char *E, bool Warn, + const llvm::TextEncodingConverter &FormatStrConverter) { StringRef Flag(FlagBeg, E - FlagBeg); // Currently there is only one flag. - if (Flag == "tt") { + if (Flag.size() == 2 && FormatStrConverter.convert(FlagBeg[0]) == 't' && + FormatStrConverter.convert(FlagBeg[1]) == 't') { FS.setHasObjCTechnicalTerm(FlagBeg); return false; } @@ -81,6 +87,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, const char *Start = nullptr; UpdateOnReturn<const char *> UpdateBeg(Beg, I); + const llvm::TextEncodingConverter &FormatStrConverter = + *Target.FormatStrConverter; // Look for a '%' character that indicates the start of a format specifier. for (; I != E; ++I) { char c = *I; @@ -89,7 +97,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, H.HandleNullChar(I); return true; } - if (c == '%') { + if (FormatStrConverter.convert(c) == '%') { Start = I++; // Record the start of the format specifier. break; } @@ -107,7 +115,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, } PrintfSpecifier FS; - if (ParseArgPosition(H, FS, Start, I, E)) + if (ParseArgPosition(H, FS, Start, I, E, FormatStrConverter)) return true; if (I == E) { @@ -117,13 +125,17 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, return true; } - if (*I == '{') { + if (FormatStrConverter.convert(*I) == '{') { ++I; unsigned char PrivacyFlags = 0; StringRef MatchedStr; do { - StringRef Str(I, E - I); + const char *II; + std::string S(I, E - I); + for (unsigned long i = 0; i < S.length(); ++i) + S[i] = FormatStrConverter.convert(S[i]); + StringRef Str(S); std::string Match = "^[[:space:]]*" "(private|public|sensitive|mask\\.[^[:space:],}]*)" "[[:space:]]*(,|})"; @@ -132,25 +144,38 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, if (R.match(Str, &Matches)) { MatchedStr = Matches[1]; + II = I; I += Matches[0].size(); + while (FormatStrConverter.convert(*II) == ' ') + ++II; + // Set the privacy flag if the privacy annotation in the // comma-delimited segment is at least as strict as the privacy // annotations in previous comma-delimited segments. if (MatchedStr.starts_with("mask")) { - StringRef MaskType = MatchedStr.substr(sizeof("mask.") - 1); + StringRef MaskType(II + sizeof("mask.") - 1, + MatchedStr.size() - sizeof("mask.") + 1); unsigned Size = MaskType.size(); + if (Warn && (Size == 0 || Size > 8)) H.handleInvalidMaskType(MaskType); FS.setMaskType(MaskType); - } else if (MatchedStr == "sensitive") + } else if (MatchedStr == "sensitive") { + StringRef ProxyMatchedStr(II, sizeof("sensitive") - 1); + MatchedStr = ProxyMatchedStr; PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsSensitive; - else if (PrivacyFlags != - clang::analyze_os_log::OSLogBufferItem::IsSensitive && - MatchedStr == "private") + } else if (PrivacyFlags != + clang::analyze_os_log::OSLogBufferItem::IsSensitive && + MatchedStr == "private") { + StringRef ProxyMatchedStr(II, sizeof("private") - 1); + MatchedStr = ProxyMatchedStr; PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsPrivate; - else if (PrivacyFlags == 0 && MatchedStr == "public") + } else if (PrivacyFlags == 0 && MatchedStr == "public") { + StringRef ProxyMatchedStr(II, sizeof("public") - 1); + MatchedStr = ProxyMatchedStr; PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsPublic; + } } else { size_t CommaOrBracePos = Str.find_if([](char c) { return c == ',' || c == '}'; }); @@ -165,7 +190,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, I += CommaOrBracePos + 1; } // Continue until the closing brace is found. - } while (*(I - 1) == ','); + } while (FormatStrConverter.convert(*(I - 1)) == ','); // Set the privacy flag. switch (PrivacyFlags) { @@ -188,7 +213,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, // Look for flags (if any). bool hasMore = true; for (; I != E; ++I) { - switch (*I) { + switch (FormatStrConverter.convert(*I)) { default: hasMore = false; break; @@ -225,7 +250,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, // Look for the field width (if any). if (ParseFieldWidth(H, FS, Start, I, E, - FS.usesPositionalArg() ? nullptr : &argIndex)) + FS.usesPositionalArg() ? nullptr : &argIndex, + FormatStrConverter)) return true; if (I == E) { @@ -236,7 +262,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, } // Look for the precision (if any). - if (*I == '.') { + if (FormatStrConverter.convert(*I) == '.') { ++I; if (I == E) { if (Warn) @@ -245,7 +271,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, } if (ParsePrecision(H, FS, Start, I, E, - FS.usesPositionalArg() ? nullptr : &argIndex)) + FS.usesPositionalArg() ? nullptr : &argIndex, + FormatStrConverter)) return true; if (I == E) { @@ -256,11 +283,11 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, } } - if (ParseVectorModifier(H, FS, I, E, LO)) + if (ParseVectorModifier(H, FS, I, E, LO, FormatStrConverter)) return true; // Look for the length modifier. - if (ParseLengthModifier(FS, I, E, LO) && I == E) { + if (ParseLengthModifier(FS, I, E, LO, FormatStrConverter) && I == E) { // No more characters left? if (Warn) H.HandleIncompleteSpecifier(Start, E - Start); @@ -274,7 +301,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, // enables better recovery, and we don't know if // these flags are applicable until later. const char *ObjCModifierFlagsStart = nullptr, *ObjCModifierFlagsEnd = nullptr; - if (*I == '[') { + if (FormatStrConverter.convert(*I) == '[') { ObjCModifierFlagsStart = I; ++I; auto flagStart = I; @@ -286,8 +313,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, return true; } // Did we find the closing ']'? - if (*I == ']') { - if (ParseObjCFlags(H, FS, flagStart, I, Warn)) + if (FormatStrConverter.convert(*I) == ']') { + if (ParseObjCFlags(H, FS, flagStart, I, Warn, FormatStrConverter)) return true; ++I; break; @@ -307,7 +334,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, // Finally, look for the conversion specifier. const char *conversionPosition = I++; ConversionSpecifier::Kind k = ConversionSpecifier::InvalidSpecifier; - switch (*conversionPosition) { + switch (FormatStrConverter.convert(*conversionPosition)) { default: break; // C99: 7.19.6.1 (section 8). @@ -470,7 +497,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, FS.setConversionSpecifier(CS); } // Assume the conversion takes one argument. - return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len); + return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len, + FormatStrConverter); } return PrintfSpecifierResult(Start, FS); } @@ -480,7 +508,6 @@ bool clang::analyze_format_string::ParsePrintfString( const TargetInfo &Target, bool isFreeBSDKPrintf) { unsigned argIndex = 0; - // Keep looking for a format specifier until we have exhausted the string. while (I != E) { const PrintfSpecifierResult &FSR = ParsePrintfSpecifier( diff --git a/clang/lib/AST/ScanfFormatString.cpp b/clang/lib/AST/ScanfFormatString.cpp index 90cbbd60bbcf5..c63171844d90d 100644 --- a/clang/lib/AST/ScanfFormatString.cpp +++ b/clang/lib/AST/ScanfFormatString.cpp @@ -81,7 +81,8 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, const char *I = Beg; const char *Start = nullptr; UpdateOnReturn<const char *> UpdateBeg(Beg, I); - + const llvm::TextEncodingConverter &FormatStrConverter = + *Target.FormatStrConverter; // Look for a '%' character that indicates the start of a format specifier. for (; I != E; ++I) { char c = *I; @@ -90,7 +91,9 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, H.HandleNullChar(I); return true; } - if (c == '%') { + SmallString<1> ConvertedChar; + FormatStrConverter.convert(StringRef(&c, 1), ConvertedChar); + if (ConvertedChar[0] == '%') { Start = I++; // Record the start of the format specifier. break; } @@ -107,7 +110,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, } ScanfSpecifier FS; - if (ParseArgPosition(H, FS, Start, I, E)) + if (ParseArgPosition(H, FS, Start, I, E, FormatStrConverter)) return true; if (I == E) { @@ -117,7 +120,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, } // Look for '*' flag if it is present. - if (*I == '*') { + if (FormatStrConverter.convert(*I) == '*') { FS.setSuppressAssignment(I); if (++I == E) { H.HandleIncompleteSpecifier(Start, E - Start); @@ -127,7 +130,8 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, // Look for the field width (if any). Unlike printf, this is either // a fixed integer or isn't present. - const OptionalAmount &Amt = clang::analyze_format_string::ParseAmount(I, E); + const OptionalAmount &Amt = + clang::analyze_format_string::ParseAmount(I, E, FormatStrConverter); if (Amt.getHowSpecified() != OptionalAmount::NotSpecified) { assert(Amt.getHowSpecified() == OptionalAmount::Constant); FS.setFieldWidth(Amt); @@ -140,7 +144,9 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, } // Look for the length modifier. - if (ParseLengthModifier(FS, I, E, LO, /*IsScanf=*/true) && I == E) { + if (ParseLengthModifier(FS, I, E, LO, FormatStrConverter, + /*IsScanf=*/true) && + I == E) { // No more characters left? H.HandleIncompleteSpecifier(Start, E - Start); return true; @@ -155,7 +161,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, // Finally, look for the conversion specifier. const char *conversionPosition = I++; ScanfConversionSpecifier::Kind k = ScanfConversionSpecifier::InvalidSpecifier; - switch (*conversionPosition) { + switch (FormatStrConverter.convert(*conversionPosition)) { default: break; case '%': @@ -262,7 +268,8 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, FS.setConversionSpecifier(CS); } // Assume the conversion takes one argument. - return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len); + return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len, + FormatStrConverter); } return ScanfSpecifierResult(Start, FS); } diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index e6ae89e0948c5..43efca42886cc 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -194,6 +194,9 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) { MaxOpenCLWorkGroupSize = 1024; MaxBitIntWidth.reset(); + + FormatStrConverter = new llvm::TextEncodingConverter( + std::move(*llvm::TextEncodingConverter::createNoopConverter())); } // Out of line virtual dtor for TargetInfo. diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index c9b5342b7e8d9..83945d203762c 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -550,7 +550,7 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { PP->setDependencyDirectivesGetter(*GetDependencyDirectives); if (auto EC = TextEncodingConfig::setConvertersFromOptions( - PP->getTextEncodingConfig(), getLangOpts())) + PP->getTextEncodingConfig(), getLangOpts(), getTarget())) PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config) << PP->getTextEncodingConfig().getExecEncoding(); } diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp index b89d5baefcc23..427b75a1c0a8b 100644 --- a/clang/lib/Lex/TextEncodingConfig.cpp +++ b/clang/lib/Lex/TextEncodingConfig.cpp @@ -23,7 +23,8 @@ TextEncodingConfig::getConverter(ConversionAction Action) const { std::error_code TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC, - const clang::LangOptions &Opts) { + const clang::LangOptions &Opts, + clang::TargetInfo &TInfo) { using namespace llvm; const char *UTF8 = "UTF-8"; @@ -41,5 +42,13 @@ TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC, new TextEncodingConverter(std::move(*ErrorOrConverter)); else return ErrorOrConverter.getError(); + + ErrorOrConverter = llvm::TextEncodingConverter::create(TEC.SystemEncoding, + TEC.InternalEncoding); + + if (ErrorOrConverter) + TInfo.FormatStrConverter = + new TextEncodingConverter(std::move(*ErrorOrConverter)); + return std::error_code(); } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 4706fa5d3cde0..9b15c23c7494d 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -104,6 +104,7 @@ #include "llvm/Support/Locale.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SaveAndRestore.h" +#include "llvm/Support/TextEncoding.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/RISCVTargetParser.h" #include "llvm/TargetParser/Triple.h" @@ -7872,10 +7873,10 @@ class CheckFormatHandler : public analyze_format_string::FormatStringHandler { ArrayRef<FixItHint> Fixit = {}); protected: - bool HandleInvalidConversionSpecifier(unsigned argIndex, SourceLocation Loc, - const char *startSpec, - unsigned specifierLen, - const char *csStart, unsigned csLen); + bool HandleInvalidConversionSpecifier( + unsigned argIndex, SourceLocation Loc, const char *startSpec, + unsigned specifierLen, const char *csStart, unsigned csLen, + const llvm::TextEncodingConverter &FormatStrConverter); void HandlePositionalNonpositionalArgs(SourceLocation Loc, const char *startSpec, @@ -8105,7 +8106,8 @@ void UncoveredArgHandler::Diagnose(Sema &S, bool IsFunctionCall, bool CheckFormatHandler::HandleInvalidConversionSpecifier( unsigned argIndex, SourceLocation Loc, const char *startSpec, - unsigned specifierLen, const char *csStart, unsigned csLen) { + unsigned specifierLen, const char *csStart, unsigned csLen, + const llvm::TextEncodingConverter &FormatStrConverter) { bool keepGoing = true; if (argIndex < NumDataArgs) { // Consider the argument coverered, even though the specifier doesn't @@ -8120,7 +8122,13 @@ bool CheckFormatHandler::HandleInvalidConversionSpecifier( keepGoing = false; } - StringRef Specifier(csStart, csLen); + // The csStart points to a character that has already been converted to the + // exec charset, so we have to reverse the conversion to allow diagnostic + // message to match an expected value when using -verify option, + std::string RS(csStart, csLen); + for (unsigned int i = 0; i < RS.size(); ++i) + RS[i] = FormatStrConverter.convert(RS[i]); + StringRef Specifier(RS); // If the specifier in non-printable, it could be the first byte of a UTF-8 // sequence. In that case, print the UTF-8 code point. If not, print the byte @@ -8274,7 +8282,8 @@ class CheckPrintfHandler : public CheckFormatHandler { bool HandleInvalidPrintfConversionSpecifier( const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier, - unsigned specifierLen) override; + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) override; void handleInvalidMaskType(StringRef MaskType) override; @@ -8414,13 +8423,14 @@ class DecomposePrintfHandler : public CheckPrintfHandler { bool CheckPrintfHandler::HandleInvalidPrintfConversionSpecifier( const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier, - unsigned specifierLen) { + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { const analyze_printf::PrintfConversionSpecifier &CS = FS.getConversionSpecifier(); return HandleInvalidConversionSpecifier( FS.getArgIndex(), getLocationOfByte(CS.getStart()), startSpecifier, - specifierLen, CS.getStart(), CS.getLength()); + specifierLen, CS.getStart(), CS.getLength(), FormatStrConverter); } void CheckPrintfHandler::handleInvalidMaskType(StringRef MaskType) { @@ -8928,15 +8938,15 @@ bool CheckPrintfHandler::HandlePrintfSpecifier( // Check for using an Objective-C specific conversion specifier // in a non-ObjC literal. if (!allowsObjCArg() && CS.isObjCArg()) { - return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier, - specifierLen); + return HandleInvalidPrintfConversionSpecifier( + FS, startSpecifier, specifierLen, *Target.FormatStrConverter); } // %P can only be used with os_log. if (FSType != FormatStringType::OSLog && CS.getKind() == ConversionSpecifier::PArg) { - return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier, - specifierLen); + return HandleInvalidPrintfConversionSpecifier( + FS, startSpecifier, specifierLen, *Target.FormatStrConverter); } // %n is not allowed with os_log. @@ -8955,8 +8965,8 @@ bool CheckPrintfHandler::HandlePrintfSpecifier( (CS.getKind() == ConversionSpecifier::PArg || CS.getKind() == ConversionSpecifier::sArg || CS.getKind() == ConversionSpecifier::ObjCObjArg)) { - return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier, - specifierLen); + return HandleInvalidPrintfConversionSpecifier( + FS, startSpecifier, specifierLen, *Target.FormatStrConverter); } // Check for use of public/private annotation outside of os_log(). @@ -9614,10 +9624,10 @@ class CheckScanfHandler : public CheckFormatHandler { const char *startSpecifier, unsigned specifierLen) override; - bool - HandleInvalidScanfConversionSpecifier(const analyze_scanf::ScanfSpecifier &FS, - const char *startSpecifier, - unsigned specifierLen) override; + bool HandleInvalidScanfConversionSpecifier( + const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier, + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) override; void HandleIncompleteScanList(const char *start, const char *end) override; }; @@ -9633,13 +9643,15 @@ void CheckScanfHandler::HandleIncompleteScanList(const char *start, bool CheckScanfHandler::HandleInvalidScanfConversionSpecifier( const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier, - unsigned specifierLen) { + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { + const analyze_scanf::ScanfConversionSpecifier &CS = FS.getConversionSpecifier(); return HandleInvalidConversionSpecifier( FS.getArgIndex(), getLocationOfByte(CS.getStart()), startSpecifier, - specifierLen, CS.getStart(), CS.getLength()); + specifierLen, CS.getStart(), CS.getLength(), FormatStrConverter); } bool CheckScanfHandler::HandleScanfSpecifier( diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 83d57a917fa1e..ee3ceefd8f97e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -3636,8 +3636,9 @@ ExprResult Sema::BuildPredefinedExpr(SourceLocation Loc, // the string. bool ForceElaboratedPrinting = IK == PredefinedIdentKind::Function && getLangOpts().MSVCCompat; - auto Str = - PredefinedExpr::ComputeName(IK, currentDecl, ForceElaboratedPrinting); + auto Str = PredefinedExpr::ComputeNameAndTranslate( + IK, currentDecl, getPreprocessor().getTextEncodingConfig(), + ForceElaboratedPrinting); unsigned Length = Str.length(); llvm::APInt LengthI(32, Length + 1); diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c index 5279b780531c3..78ae3353224af 100644 --- a/clang/test/CodeGen/systemz-charset.c +++ b/clang/test/CodeGen/systemz-charset.c @@ -1,6 +1,8 @@ // RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s // RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -DIBM1047_ONLY=1 -o - | FileCheck %s --check-prefix=CHECK-UTF8 +int printf(char const *, ...); + const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; //CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00" //CHECK-UTF8: c"ABCDEFGHIJKLMNOPQRSTUVWXYZ\00" diff --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h index 8a304910aa5dd..8f5a6122ede45 100644 --- a/llvm/include/llvm/Support/TextEncoding.h +++ b/llvm/include/llvm/Support/TextEncoding.h @@ -105,6 +105,8 @@ class TextEncodingConverter { LLVM_ABI static ErrorOr<TextEncodingConverter> create(StringRef From, StringRef To); + LLVM_ABI static ErrorOr<TextEncodingConverter> createNoopConverter(); + TextEncodingConverter(const TextEncodingConverter &) = delete; TextEncodingConverter &operator=(const TextEncodingConverter &) = delete; @@ -135,6 +137,14 @@ class TextEncodingConverter { return std::string(Result); return EC; } + + char convert(char SingleChar) const { + SmallString<1> Result; + auto EC = Converter->convert(StringRef(&SingleChar, 1), Result); + if (!EC) + return Result[0]; + return '\0'; + } }; } // namespace llvm diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index d36f02c1300b9..5c1d9696686a2 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -356,3 +356,22 @@ ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From, return std::make_error_code(std::errc::invalid_argument); #endif } + +class TextEncodingConverterNoop final + : public details::TextEncodingConverterImplBase { + +public: + TextEncodingConverterNoop() {} + + std::error_code convertString(StringRef Source, + SmallVectorImpl<char> &Result) override { + Result.assign(Source.begin(), Source.end()); + return std::error_code(); + } + + void reset() override {} +}; + +ErrorOr<TextEncodingConverter> TextEncodingConverter::createNoopConverter() { + return TextEncodingConverter(std::make_unique<TextEncodingConverterNoop>()); +} _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
