twoh created this revision. This patch implements section 4.3 of http://open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4220.pdf: when a raw string literal that contains a newline character is stringized, each newline character is replaced with the \n escape code. Without this patch, the included test case (macro_raw_string.cpp) results in a compilation failure.
https://reviews.llvm.org/D39279 Files: lib/Lex/Lexer.cpp test/Preprocessor/macro_raw_string.cpp unittests/Lex/LexerTest.cpp
Index: unittests/Lex/LexerTest.cpp =================================================================== --- unittests/Lex/LexerTest.cpp +++ unittests/Lex/LexerTest.cpp @@ -37,7 +37,7 @@ DiagID(new DiagnosticIDs()), Diags(DiagID, new DiagnosticOptions, new IgnoringDiagConsumer()), SourceMgr(Diags, FileMgr), - TargetOpts(new TargetOptions) + TargetOpts(new TargetOptions) { TargetOpts->Triple = "x86_64-apple-darwin11.1.0"; Target = TargetInfo::CreateTargetInfo(Diags, TargetOpts); @@ -478,4 +478,18 @@ EXPECT_TRUE(LexedTokens.empty()); } +TEST_F(LexerTest, StringizingRasString) { + std::string String1 = R"(foo + {"bar":[]} + baz)"; + SmallString<128> String2; + String2 += String1.c_str(); + + String1 = Lexer::Stringify(StringRef(String1)); + Lexer::Stringify(String2); + + EXPECT_EQ(String1, R"(foo\n {\"bar\":[]}\n baz)"); + EXPECT_EQ(String2, R"(foo\n {\"bar\":[]}\n baz)"); +} + } // anonymous namespace Index: test/Preprocessor/macro_raw_string.cpp =================================================================== --- /dev/null +++ test/Preprocessor/macro_raw_string.cpp @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -E -std=c++11 %s -o %t +// RUN: %clang_cc1 %t + +#define FOO(str) foo(#str) + +extern void foo(const char *str); + +void bar() { + FOO(R"(foo + bar)"); +} Index: lib/Lex/Lexer.cpp =================================================================== --- lib/Lex/Lexer.cpp +++ lib/Lex/Lexer.cpp @@ -210,27 +210,46 @@ } /// Stringify - Convert the specified string into a C string, with surrounding -/// ""'s, and with escaped \ and " characters. +/// ""'s, and with escaped \ and " characters. The function replaces each +/// newline character with the "\n" escape code as well. std::string Lexer::Stringify(StringRef Str, bool Charify) { std::string Result = Str; char Quote = Charify ? 
'\'' : '"'; - for (unsigned i = 0, e = Result.size(); i != e; ++i) { + for (unsigned i = 0, e = Result.size(); i < e; ++i) { if (Result[i] == '\\' || Result[i] == Quote) { Result.insert(Result.begin()+i, '\\'); ++i; ++e; } + else if (auto Size = getEscapedNewLineSize(Result.substr(i).data())) { + Result.erase(Result.begin()+i, Result.begin()+i+Size); + Result.insert(Result.begin()+i, '\\'); + Result.insert(Result.begin()+i+1, 'n'); + i+=2; e+=(2-Size); + } } return Result; } /// Stringify - Convert the specified string into a C string by escaping '\' -/// and " characters. This does not add surrounding ""'s to the string. +/// and " characters. The function replaces each newline character with the +/// "\n" escape code as well. This does not add surrounding ""'s to the string. void Lexer::Stringify(SmallVectorImpl<char> &Str) { - for (unsigned i = 0, e = Str.size(); i != e; ++i) { + for (unsigned i = 0, e = Str.size(); i < e; ++i) { if (Str[i] == '\\' || Str[i] == '"') { Str.insert(Str.begin()+i, '\\'); ++i; ++e; } + else if (Str[i] == '\n' || Str[i] == '\r') { + unsigned Size = 1; + if ((i < e-1) && (Str[i+1] == '\n' || Str[i+1] == '\r') && + Str[i] != Str[i+1]) + Size += 1; + + Str.erase(Str.begin()+i, Str.begin()+i+Size); + Str.insert(Str.begin()+i, '\\'); + Str.insert(Str.begin()+i+1, 'n'); + i+=2; e+=(2-Size); + } } } @@ -367,7 +386,7 @@ /// to point to a constant buffer with the data already in it (avoiding a /// copy). The caller is not allowed to modify the returned buffer pointer /// if an internal buffer is returned. -unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, +unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid) { assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); @@ -592,17 +611,17 @@ if (TheTok.getKind() == tok::eof) { break; } - + // If we haven't hit the end of the preprocessor directive, skip this // token. 
if (!TheTok.isAtStartOfLine()) continue; - + // We've passed the end of the preprocessor directive, and will look // at this token again below. InPreprocessorDirective = false; } - + // Keep track of the # of lines in the preamble. if (TheTok.isAtStartOfLine()) { unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; @@ -619,13 +638,13 @@ ActiveCommentLoc = TheTok.getLocation(); continue; } - + if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { - // This is the start of a preprocessor directive. + // This is the start of a preprocessor directive. Token HashTok = TheTok; InPreprocessorDirective = true; ActiveCommentLoc = SourceLocation(); - + // Figure out which directive this is. Since we're lexing raw tokens, // we don't have an identifier table available. Instead, just look at // the raw identifier to recognize and categorize preprocessor directives. @@ -665,7 +684,7 @@ break; } } - + // We only end up here if we didn't recognize the preprocessor // directive or it was one that can't occur in the preamble at this // point. Roll back the current token to the location of the '#'. @@ -678,7 +697,7 @@ // the preamble. break; } while (true); - + SourceLocation End; if (ActiveCommentLoc.isValid()) End = ActiveCommentLoc; // don't truncate a decl comment. @@ -700,13 +719,13 @@ // trigraphs. bool Invalid = false; const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); - + // If they request the first char of the token, we're trivially done. if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) return TokStart; - + unsigned PhysOffset = 0; - + // The usual case is that tokens don't contain anything interesting. Skip // over the uninteresting characters. If a token only consists of simple // chars, this method is extremely fast. @@ -717,23 +736,23 @@ --CharNo; ++PhysOffset; } - + // If we have a character that may be a trigraph or escaped newline, use a // lexer to parse it correctly. 
for (; CharNo; --CharNo) { unsigned Size; Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts); TokPtr += Size; PhysOffset += Size; } - + // Final detail: if we end up on an escaped newline, we want to return the // location of the actual byte of the token. For example foo\<newline>bar // advanced by 3 should return the location of b, not of \\. One compounding // detail of this is that the escape may be made by a trigraph. if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; - + return TokStart.getLocWithOffset(PhysOffset); } @@ -768,7 +787,7 @@ Len = Len - Offset; else return Loc; - + return Loc.getLocWithOffset(Len); } @@ -965,7 +984,7 @@ // For macro arguments we need to check that the argument did not come // from an inner macro, e.g: "MAC1( MAC2(foo) )" - + // Loc points to the argument id of the macro definition, move to the // macro expansion. Loc = SM.getImmediateExpansionRange(Loc).first; @@ -1791,15 +1810,15 @@ // getAndAdvanceChar. if (C == '\\') C = getAndAdvanceChar(CurPtr, Result); - + if (C == '\n' || C == '\r' || // Newline. (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1; FormTokenWithChars(Result, CurPtr-1, tok::unknown); return true; } - + if (C == 0) { if (isCodeCompletionPoint(CurPtr-1)) { PP->CodeCompleteNaturalLanguage(); @@ -2228,7 +2247,7 @@ std::string Spelling = PP->getSpelling(Result, &Invalid); if (Invalid) return true; - + assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?"); Spelling[1] = '*'; // Change prefix to "/*". Spelling += "*/"; // add suffix. @@ -2554,16 +2573,16 @@ resetExtendedTokenMode(); return true; // Have a token. } - + // If we are in raw mode, return this event as an EOF token. Let the caller // that put us in raw mode handle the event. 
if (isLexingRawMode()) { Result.startToken(); BufferPtr = BufferEnd; FormTokenWithChars(Result, BufferEnd, tok::eof); return true; } - + if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { PP->setRecordedPreambleConditionalStack(ConditionalStack); ConditionalStack.clear(); @@ -2675,7 +2694,7 @@ if (CurPtr != BufferStart && CurPtr[-1] != '\n' && CurPtr[-1] != '\r') return false; - + // Check to see if we have <<<<<<< or >>>>. if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") && !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")) @@ -2685,7 +2704,7 @@ // it. if (CurrentConflictMarkerState || isLexingRawMode()) return false; - + ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; // Check to see if there is an ending marker somewhere in the buffer at the @@ -2695,7 +2714,7 @@ // Diagnose this, and ignore to the end of line. Diag(CurPtr, diag::err_conflict_marker); CurrentConflictMarkerState = Kind; - + // Skip ahead to the end of line. We know this exists because the // end-of-conflict marker starts with \r or \n. while (*CurPtr != '\r' && *CurPtr != '\n') { @@ -2705,7 +2724,7 @@ BufferPtr = CurPtr; return true; } - + // No end of conflict marker found. return false; } @@ -2719,35 +2738,35 @@ if (CurPtr != BufferStart && CurPtr[-1] != '\n' && CurPtr[-1] != '\r') return false; - + // If we have a situation where we don't care about conflict markers, ignore // it. if (!CurrentConflictMarkerState || isLexingRawMode()) return false; - + // Check to see if we have the marker (4 characters in a row). for (unsigned i = 1; i != 4; ++i) if (CurPtr[i] != CurPtr[0]) return false; - + // If we do have it, search for the end of the conflict marker. This could // fail if it got skipped with a '#if 0' or something. Note that CurPtr might // be the end of conflict marker. if (const char *End = FindConflictEnd(CurPtr, BufferEnd, CurrentConflictMarkerState)) { CurPtr = End; - + // Skip ahead to the end of line. 
while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') ++CurPtr; - + BufferPtr = CurPtr; - + // No longer in the conflict marker. CurrentConflictMarkerState = CMK_None; return true; } - + return false; } @@ -3056,7 +3075,7 @@ // We know the lexer hasn't changed, so just try again with this lexer. // (We manually eliminate the tail call to avoid recursion.) goto LexNextToken; - + case 26: // DOS & CP/M EOF: "^Z". // If we're in Microsoft extensions mode, treat this as end of file. if (LangOpts.MicrosoftExt) { @@ -3068,7 +3087,7 @@ // If Microsoft extensions are disabled, this is just random garbage. Kind = tok::unknown; break; - + case '\r': if (CurPtr[0] == '\n') Char = getAndAdvanceChar(CurPtr, Result); @@ -3131,7 +3150,7 @@ // We only saw whitespace, so just try again with this lexer. // (We manually eliminate the tail call to avoid recursion.) goto LexNextToken; - + // C99 6.4.4.1: Integer Constants. // C99 6.4.4.2: Floating Constants. case '0': case '1': case '2': case '3': case '4': @@ -3630,7 +3649,7 @@ // If this is '====' and we're in a conflict marker, ignore it. if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) goto LexNextToken; - + Kind = tok::equalequal; CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); } else { @@ -3717,7 +3736,7 @@ } return LexUnicode(Result, CodePoint, CurPtr); } - + if (isLexingRawMode() || ParsingPreprocessorDirective || PP->isPreprocessedOutput()) { ++CurPtr;
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits