[PATCH] D39279: Stringizing raw string literals containing newline

Taewook Oh via Phabricator via cfe-commits Tue, 24 Oct 2017 23:59:23 -0700

twoh created this revision.

This patch implements 4.3 of 
http://open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4220.pdf. If a raw string 
contains a newline character, replace each newline character with the \n escape 
code. Without this patch, included test case (macro_raw_string.cpp) results 
compilation failure.



https://reviews.llvm.org/D39279

Files:
  lib/Lex/Lexer.cpp
  test/Preprocessor/macro_raw_string.cpp
  unittests/Lex/LexerTest.cpp

Index: unittests/Lex/LexerTest.cpp
===================================================================
--- unittests/Lex/LexerTest.cpp
+++ unittests/Lex/LexerTest.cpp
@@ -37,7 +37,7 @@
       DiagID(new DiagnosticIDs()),
       Diags(DiagID, new DiagnosticOptions, new IgnoringDiagConsumer()),
       SourceMgr(Diags, FileMgr),
-      TargetOpts(new TargetOptions) 
+      TargetOpts(new TargetOptions)
   {
     TargetOpts->Triple = "x86_64-apple-darwin11.1.0";
     Target = TargetInfo::CreateTargetInfo(Diags, TargetOpts);
@@ -478,4 +478,18 @@
   EXPECT_TRUE(LexedTokens.empty());
 }
 
+TEST_F(LexerTest, StringizingRasString) {
+  std::string String1 = R"(foo
+    {"bar":[]}
+    baz)";
+  SmallString<128> String2;
+  String2 += String1.c_str();
+
+  String1 = Lexer::Stringify(StringRef(String1));
+  Lexer::Stringify(String2);
+
+  EXPECT_EQ(String1, R"(foo\n    {\"bar\":[]}\n    baz)");
+  EXPECT_EQ(String2, R"(foo\n    {\"bar\":[]}\n    baz)");
+}
+
 } // anonymous namespace
Index: test/Preprocessor/macro_raw_string.cpp
===================================================================
--- /dev/null
+++ test/Preprocessor/macro_raw_string.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -E -std=c++11 %s -o %t
+// RUN: %clang_cc1 %t
+
+#define FOO(str) foo(#str)
+
+extern void foo(const char *str);
+
+void bar() {
+  FOO(R"(foo
+    bar)");
+}
Index: lib/Lex/Lexer.cpp
===================================================================
--- lib/Lex/Lexer.cpp
+++ lib/Lex/Lexer.cpp
@@ -210,27 +210,46 @@
 }
 
 /// Stringify - Convert the specified string into a C string, with surrounding
-/// ""'s, and with escaped \ and " characters.
+/// ""'s, and with escaped \ and " characters. The function replaces each
+/// newline character with the "\n" escape code as well.
 std::string Lexer::Stringify(StringRef Str, bool Charify) {
   std::string Result = Str;
   char Quote = Charify ? '\'' : '"';
-  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
+  for (unsigned i = 0, e = Result.size(); i < e; ++i) {
     if (Result[i] == '\\' || Result[i] == Quote) {
       Result.insert(Result.begin()+i, '\\');
       ++i; ++e;
     }
+    else if (auto Size = getEscapedNewLineSize(Result.substr(i).data())) {
+      Result.erase(Result.begin()+i, Result.begin()+i+Size);
+      Result.insert(Result.begin()+i, '\\');
+      Result.insert(Result.begin()+i+1, 'n');
+      i+=2; e+=(2-Size);
+    }
   }
   return Result;
 }
 
 /// Stringify - Convert the specified string into a C string by escaping '\'
-/// and " characters.  This does not add surrounding ""'s to the string.
+/// and " characters. The function replaces each newline character with the
+/// "\n" escape code as well. This does not add surrounding ""'s to the string.
 void Lexer::Stringify(SmallVectorImpl<char> &Str) {
-  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+  for (unsigned i = 0, e = Str.size(); i < e; ++i) {
     if (Str[i] == '\\' || Str[i] == '"') {
       Str.insert(Str.begin()+i, '\\');
       ++i; ++e;
     }
+    else if (Str[i] == '\n' || Str[i] == '\r') {
+      unsigned Size = 1;
+      if ((i < e-1) && (Str[i+1] == '\n' || Str[i+1] == '\r') &&
+          Str[i] != Str[i+1])
+        Size += 1;
+
+      Str.erase(Str.begin()+i, Str.begin()+i+Size);
+      Str.insert(Str.begin()+i, '\\');
+      Str.insert(Str.begin()+i+1, 'n');
+      i+=2; e+=(2-Size);
+    }
   }
 }
 
@@ -367,7 +386,7 @@
 /// to point to a constant buffer with the data already in it (avoiding a
 /// copy).  The caller is not allowed to modify the returned buffer pointer
 /// if an internal buffer is returned.
-unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 
+unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                             const SourceManager &SourceMgr,
                             const LangOptions &LangOpts, bool *Invalid) {
   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
@@ -592,17 +611,17 @@
       if (TheTok.getKind() == tok::eof) {
         break;
       }
-      
+
       // If we haven't hit the end of the preprocessor directive, skip this
       // token.
       if (!TheTok.isAtStartOfLine())
         continue;
-        
+
       // We've passed the end of the preprocessor directive, and will look
       // at this token again below.
       InPreprocessorDirective = false;
     }
-    
+
     // Keep track of the # of lines in the preamble.
     if (TheTok.isAtStartOfLine()) {
       unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
@@ -619,13 +638,13 @@
         ActiveCommentLoc = TheTok.getLocation();
       continue;
     }
-    
+
     if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
-      // This is the start of a preprocessor directive. 
+      // This is the start of a preprocessor directive.
       Token HashTok = TheTok;
       InPreprocessorDirective = true;
       ActiveCommentLoc = SourceLocation();
-      
+
       // Figure out which directive this is. Since we're lexing raw tokens,
       // we don't have an identifier table available. Instead, just look at
       // the raw identifier to recognize and categorize preprocessor directives.
@@ -665,7 +684,7 @@
           break;
         }
       }
-      
+
       // We only end up here if we didn't recognize the preprocessor
       // directive or it was one that can't occur in the preamble at this
       // point. Roll back the current token to the location of the '#'.
@@ -678,7 +697,7 @@
     // the preamble.
     break;
   } while (true);
-  
+
   SourceLocation End;
   if (ActiveCommentLoc.isValid())
     End = ActiveCommentLoc; // don't truncate a decl comment.
@@ -700,13 +719,13 @@
   // trigraphs.
   bool Invalid = false;
   const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
-  
+
   // If they request the first char of the token, we're trivially done.
   if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
     return TokStart;
-  
+
   unsigned PhysOffset = 0;
-  
+
   // The usual case is that tokens don't contain anything interesting.  Skip
   // over the uninteresting characters.  If a token only consists of simple
   // chars, this method is extremely fast.
@@ -717,23 +736,23 @@
     --CharNo;
     ++PhysOffset;
   }
-  
+
   // If we have a character that may be a trigraph or escaped newline, use a
   // lexer to parse it correctly.
   for (; CharNo; --CharNo) {
     unsigned Size;
     Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
     TokPtr += Size;
     PhysOffset += Size;
   }
-  
+
   // Final detail: if we end up on an escaped newline, we want to return the
   // location of the actual byte of the token.  For example foo\<newline>bar
   // advanced by 3 should return the location of b, not of \\.  One compounding
   // detail of this is that the escape may be made by a trigraph.
   if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
     PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
-  
+
   return TokStart.getLocWithOffset(PhysOffset);
 }
 
@@ -768,7 +787,7 @@
     Len = Len - Offset;
   else
     return Loc;
-  
+
   return Loc.getLocWithOffset(Len);
 }
 
@@ -965,7 +984,7 @@
 
     // For macro arguments we need to check that the argument did not come
     // from an inner macro, e.g: "MAC1( MAC2(foo) )"
-    
+
     // Loc points to the argument id of the macro definition, move to the
     // macro expansion.
     Loc = SM.getImmediateExpansionRange(Loc).first;
@@ -1791,15 +1810,15 @@
     // getAndAdvanceChar.
     if (C == '\\')
       C = getAndAdvanceChar(CurPtr, Result);
-    
+
     if (C == '\n' || C == '\r' ||             // Newline.
         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
         Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
       return true;
     }
-    
+
     if (C == 0) {
       if (isCodeCompletionPoint(CurPtr-1)) {
         PP->CodeCompleteNaturalLanguage();
@@ -2228,7 +2247,7 @@
   std::string Spelling = PP->getSpelling(Result, &Invalid);
   if (Invalid)
     return true;
-  
+
   assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
   Spelling[1] = '*';   // Change prefix to "/*".
   Spelling += "*/";    // add suffix.
@@ -2554,16 +2573,16 @@
       resetExtendedTokenMode();
     return true;  // Have a token.
   }
- 
+
   // If we are in raw mode, return this event as an EOF token.  Let the caller
   // that put us in raw mode handle the event.
   if (isLexingRawMode()) {
     Result.startToken();
     BufferPtr = BufferEnd;
     FormTokenWithChars(Result, BufferEnd, tok::eof);
     return true;
   }
-  
+
   if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
     PP->setRecordedPreambleConditionalStack(ConditionalStack);
     ConditionalStack.clear();
@@ -2675,7 +2694,7 @@
   if (CurPtr != BufferStart &&
       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
     return false;
-  
+
   // Check to see if we have <<<<<<< or >>>>.
   if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
       !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
@@ -2685,7 +2704,7 @@
   // it.
   if (CurrentConflictMarkerState || isLexingRawMode())
     return false;
-  
+
   ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
 
   // Check to see if there is an ending marker somewhere in the buffer at the
@@ -2695,7 +2714,7 @@
     // Diagnose this, and ignore to the end of line.
     Diag(CurPtr, diag::err_conflict_marker);
     CurrentConflictMarkerState = Kind;
-    
+
     // Skip ahead to the end of line.  We know this exists because the
     // end-of-conflict marker starts with \r or \n.
     while (*CurPtr != '\r' && *CurPtr != '\n') {
@@ -2705,7 +2724,7 @@
     BufferPtr = CurPtr;
     return true;
   }
-  
+
   // No end of conflict marker found.
   return false;
 }
@@ -2719,35 +2738,35 @@
   if (CurPtr != BufferStart &&
       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
     return false;
-  
+
   // If we have a situation where we don't care about conflict markers, ignore
   // it.
   if (!CurrentConflictMarkerState || isLexingRawMode())
     return false;
-  
+
   // Check to see if we have the marker (4 characters in a row).
   for (unsigned i = 1; i != 4; ++i)
     if (CurPtr[i] != CurPtr[0])
       return false;
-  
+
   // If we do have it, search for the end of the conflict marker.  This could
   // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
   // be the end of conflict marker.
   if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
                                         CurrentConflictMarkerState)) {
     CurPtr = End;
-    
+
     // Skip ahead to the end of line.
     while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
       ++CurPtr;
-    
+
     BufferPtr = CurPtr;
-    
+
     // No longer in the conflict marker.
     CurrentConflictMarkerState = CMK_None;
     return true;
   }
-  
+
   return false;
 }
 
@@ -3056,7 +3075,7 @@
     // We know the lexer hasn't changed, so just try again with this lexer.
     // (We manually eliminate the tail call to avoid recursion.)
     goto LexNextToken;
-      
+
   case 26:  // DOS & CP/M EOF: "^Z".
     // If we're in Microsoft extensions mode, treat this as end of file.
     if (LangOpts.MicrosoftExt) {
@@ -3068,7 +3087,7 @@
     // If Microsoft extensions are disabled, this is just random garbage.
     Kind = tok::unknown;
     break;
-      
+
   case '\r':
     if (CurPtr[0] == '\n')
       Char = getAndAdvanceChar(CurPtr, Result);
@@ -3131,7 +3150,7 @@
     // We only saw whitespace, so just try again with this lexer.
     // (We manually eliminate the tail call to avoid recursion.)
     goto LexNextToken;
-      
+
   // C99 6.4.4.1: Integer Constants.
   // C99 6.4.4.2: Floating Constants.
   case '0': case '1': case '2': case '3': case '4':
@@ -3630,7 +3649,7 @@
       // If this is '====' and we're in a conflict marker, ignore it.
       if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
         goto LexNextToken;
-      
+
       Kind = tok::equalequal;
       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
     } else {
@@ -3717,7 +3736,7 @@
       }
       return LexUnicode(Result, CodePoint, CurPtr);
     }
-    
+
     if (isLexingRawMode() || ParsingPreprocessorDirective ||
         PP->isPreprocessedOutput()) {
       ++CurPtr;

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D39279: Stringizing raw string literals containing newline

Reply via email to