[PATCH] D46000: [AST] Added a helper to extract a user-friendly text of a comment.

Ilya Biryukov via Phabricator via cfe-commits Fri, 11 May 2018 07:36:03 -0700

ilya-biryukov updated this revision to Diff 146326.
ilya-biryukov marked an inline comment as done.
ilya-biryukov added a comment.


- Simplify test code with SourceManagerForFile.


Repository:
  rC Clang

https://reviews.llvm.org/D46000

Files:
  include/clang/AST/CommentLexer.h
  include/clang/AST/RawCommentList.h
  lib/AST/CommentLexer.cpp
  lib/AST/RawCommentList.cpp
  unittests/AST/CMakeLists.txt
  unittests/AST/CommentTextTest.cpp

Index: unittests/AST/CommentTextTest.cpp
===================================================================
--- /dev/null
+++ unittests/AST/CommentTextTest.cpp
@@ -0,0 +1,122 @@
+//===- unittest/AST/CommentTextTest.cpp - Comment text extraction test ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests for user-friendly output formatting of comments, i.e.
+// RawComment::getFormattedText().
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/AST/RawCommentList.h"
+#include "clang/Basic/CommentOptions.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticIDs.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/FileSystemOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/VirtualFileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <gtest/gtest.h>
+
+namespace clang {
+
+class CommentTextTest : public ::testing::Test {
+protected:
+  std::string formatComment(llvm::StringRef CommentText) {
+    SourceManagerForFile FileSourceMgr("comment-test.cpp", CommentText);
+    SourceManager& SourceMgr = FileSourceMgr.get();
+
+    auto CommentStartOffset = CommentText.find("/");
+    assert(CommentStartOffset != llvm::StringRef::npos);
+    FileID File = SourceMgr.getMainFileID();
+
+    SourceRange CommentRange(
+        SourceMgr.getLocForStartOfFile(File).getLocWithOffset(
+            CommentStartOffset),
+        SourceMgr.getLocForEndOfFile(File));
+    CommentOptions EmptyOpts;
+    // FIXME: technically, merged that we set here is incorrect, but that
+    // shouldn't matter.
+    RawComment Comment(SourceMgr, CommentRange, EmptyOpts, /*Merged=*/true);
+    DiagnosticsEngine Diags(new DiagnosticIDs, new DiagnosticOptions);
+    return Comment.getFormattedText(SourceMgr, Diags);
+  }
+};
+
+TEST_F(CommentTextTest, FormattedText) {
+  // clang-format off
+  auto ExpectedOutput =
+R"(This function does this and that.
+For example,
+   Runnning it in that case will give you
+   this result.
+That's about it.)";
+  // Two-slash comments.
+  EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+// This function does this and that.
+// For example,
+//    Runnning it in that case will give you
+//    this result.
+// That's about it.)cpp"));
+
+  // Three-slash comments.
+  EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+/// This function does this and that.
+/// For example,
+///    Runnning it in that case will give you
+///    this result.
+/// That's about it.)cpp"));
+
+  // Block comments.
+  EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+/* This function does this and that.
+ * For example,
+ *    Runnning it in that case will give you
+ *    this result.
+ * That's about it.*/)cpp"));
+
+  // Doxygen-style block comments.
+  EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+/** This function does this and that.
+  * For example,
+  *    Runnning it in that case will give you
+  *    this result.
+  * That's about it.*/)cpp"));
+
+  // Weird indentation.
+  EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+       // This function does this and that.
+  //      For example,
+  //         Runnning it in that case will give you
+        //   this result.
+       // That's about it.)cpp"));
+  // clang-format on
+}
+
+TEST_F(CommentTextTest, KeepsDoxygenControlSeqs) {
+  // clang-format off
+  auto ExpectedOutput =
+R"(\brief This is the brief part of the comment.
+\param a something about a.
+@param b something about b.)";
+
+  EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+/// \brief This is the brief part of the comment.
+/// \param a something about a.
+/// @param b something about b.)cpp"));
+  // clang-format on
+}
+
+} // namespace clang
Index: unittests/AST/CMakeLists.txt
===================================================================
--- unittests/AST/CMakeLists.txt
+++ unittests/AST/CMakeLists.txt
@@ -9,6 +9,7 @@
   ASTVectorTest.cpp
   CommentLexer.cpp
   CommentParser.cpp
+  CommentTextTest.cpp
   DataCollectionTest.cpp
   DeclPrinterTest.cpp
   DeclTest.cpp
Index: lib/AST/RawCommentList.cpp
===================================================================
--- lib/AST/RawCommentList.cpp
+++ lib/AST/RawCommentList.cpp
@@ -335,3 +335,98 @@
              BeforeThanCompare<RawComment>(SourceMgr));
   std::swap(Comments, MergedComments);
 }
+
+std::string RawComment::getFormattedText(const ASTContext &Ctx) const {
+  return getFormattedText(Ctx.getSourceManager(), Ctx.getDiagnostics());
+}
+
+std::string RawComment::getFormattedText(const SourceManager &SourceMgr,
+                                         DiagnosticsEngine &Diags) const {
+  llvm::StringRef CommentText = getRawText(SourceMgr);
+  if (CommentText.empty())
+    return "";
+
+  llvm::BumpPtrAllocator Allocator;
+  // We do not parse any commands, so CommentOptions are ignored by
+  // comments::Lexer. Therefore, we just use default-constructed options.
+  CommentOptions DefOpts;
+  comments::CommandTraits EmptyTraits(Allocator, DefOpts);
+  comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(),
+                    CommentText.begin(), CommentText.end(),
+                    /*ParseCommands=*/false);
+
+  std::string Result;
+  // A column number of the first non-whitespace token in the comment text.
+  // We skip whitespace up to this column, but keep the whitespace after this
+  // column. IndentColumn is calculated when lexing the first line and reused
+  // for the rest of lines.
+  unsigned IndentColumn = 0;
+
+  // Processes one line of the comment and adds it to the result.
+  // Handles skipping the indent at the start of the line.
+  // Returns false when eof is reached and true otherwise.
+  auto LexLine = [&](bool IsFirstLine) -> bool {
+    comments::Token Tok;
+    // Lex the first token on the line. We handle it separately, because we to
+    // fix up its indentation.
+    L.lex(Tok);
+    if (Tok.is(comments::tok::eof))
+      return false;
+    if (Tok.is(comments::tok::newline)) {
+      Result += "\n";
+      return true;
+    }
+    llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
+    bool LocInvalid = false;
+    unsigned TokColumn =
+        SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
+    assert(!LocInvalid && "getFormattedText for invalid location");
+
+    // Amount of leading whitespace in TokText.
+    size_t WhitespaceLen = TokText.find_first_not_of(" \t");
+    if (WhitespaceLen == StringRef::npos)
+      WhitespaceLen = TokText.size();
+    // Remember the amount of whitespace we skipped in the first line to remove
+    // indent up to that column in the following lines.
+    if (IsFirstLine)
+      IndentColumn = TokColumn + WhitespaceLen;
+
+    // Amount of leading whitespace we actually want to skip.
+    // For the first line we skip all the whitespace.
+    // For the rest of the lines, we skip whitespace up to IndentColumn.
+    unsigned SkipLen =
+        IsFirstLine
+            ? WhitespaceLen
+            : std::min<size_t>(
+                  WhitespaceLen,
+                  std::max<int>(static_cast<int>(IndentColumn) - TokColumn, 0));
+    llvm::StringRef Trimmed = TokText.drop_front(SkipLen);
+    Result += Trimmed;
+    // Lex all tokens in the rest of the line.
+    for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
+      if (Tok.is(comments::tok::newline)) {
+        Result += "\n";
+        return true;
+      }
+      Result += L.getSpelling(Tok, SourceMgr);
+    }
+    // We've reached the end of file token.
+    return false;
+  };
+
+  auto DropTrailingNewLines = [](std::string &Str) {
+    while (Str.back() == '\n')
+      Str.pop_back();
+  };
+
+  // Proces first line separately to remember indent for the following lines.
+  if (!LexLine(/*IsFirstLine=*/true)) {
+    DropTrailingNewLines(Result);
+    return Result;
+  }
+  // Process the rest of the lines.
+  while (LexLine(/*IsFirstLine=*/false))
+    ;
+  DropTrailingNewLines(Result);
+  return Result;
+}
Index: lib/AST/CommentLexer.cpp
===================================================================
--- lib/AST/CommentLexer.cpp
+++ lib/AST/CommentLexer.cpp
@@ -294,6 +294,39 @@
   assert(CommentState == LCS_InsideBCPLComment ||
          CommentState == LCS_InsideCComment);
 
+  // Handles lexing non-command text, i.e. text and newline.
+  auto HandleNonCommandToken = [&]() -> void {
+    assert(State == LS_Normal);
+
+    const char *TokenPtr = BufferPtr;
+    assert(TokenPtr < CommentEnd);
+    switch (*TokenPtr) {
+      case '\n':
+      case '\r':
+          TokenPtr = skipNewline(TokenPtr, CommentEnd);
+          formTokenWithChars(T, TokenPtr, tok::newline);
+
+          if (CommentState == LCS_InsideCComment)
+            skipLineStartingDecorations();
+          return;
+
+      default: {
+          StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
+          size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
+                           .find_first_of(TokStartSymbols);
+          if (End != StringRef::npos)
+            TokenPtr += End;
+          else
+            TokenPtr = CommentEnd;
+          formTextToken(T, TokenPtr);
+          return;
+      }
+    }
+  };
+
+  if (!ParseCommands)
+    return HandleNonCommandToken();
+
   switch (State) {
   case LS_Normal:
     break;
@@ -315,136 +348,116 @@
   }
 
   assert(State == LS_Normal);
-
   const char *TokenPtr = BufferPtr;
   assert(TokenPtr < CommentEnd);
-  while (TokenPtr != CommentEnd) {
-    switch(*TokenPtr) {
-      case '\\':
-      case '@': {
-        // Commands that start with a backslash and commands that start with
-        // 'at' have equivalent semantics.  But we keep information about the
-        // exact syntax in AST for comments.
-        tok::TokenKind CommandKind =
-            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
+  switch(*TokenPtr) {
+    case '\\':
+    case '@': {
+      // Commands that start with a backslash and commands that start with
+      // 'at' have equivalent semantics.  But we keep information about the
+      // exact syntax in AST for comments.
+      tok::TokenKind CommandKind =
+          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
+      TokenPtr++;
+      if (TokenPtr == CommentEnd) {
+        formTextToken(T, TokenPtr);
+        return;
+      }
+      char C = *TokenPtr;
+      switch (C) {
+      default:
+        break;
+
+      case '\\': case '@': case '&': case '$':
+      case '#':  case '<': case '>': case '%':
+      case '\"': case '.': case ':':
+        // This is one of \\ \@ \& \$ etc escape sequences.
         TokenPtr++;
-        if (TokenPtr == CommentEnd) {
-          formTextToken(T, TokenPtr);
-          return;
-        }
-        char C = *TokenPtr;
-        switch (C) {
-        default:
-          break;
-
-        case '\\': case '@': case '&': case '$':
-        case '#':  case '<': case '>': case '%':
-        case '\"': case '.': case ':':
-          // This is one of \\ \@ \& \$ etc escape sequences.
+        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
+          // This is the \:: escape sequence.
           TokenPtr++;
-          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
-            // This is the \:: escape sequence.
-            TokenPtr++;
-          }
-          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
-          formTokenWithChars(T, TokenPtr, tok::text);
-          T.setText(UnescapedText);
-          return;
         }
+        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
+        formTokenWithChars(T, TokenPtr, tok::text);
+        T.setText(UnescapedText);
+        return;
+      }
 
-        // Don't make zero-length commands.
-        if (!isCommandNameStartCharacter(*TokenPtr)) {
-          formTextToken(T, TokenPtr);
-          return;
-        }
+      // Don't make zero-length commands.
+      if (!isCommandNameStartCharacter(*TokenPtr)) {
+        formTextToken(T, TokenPtr);
+        return;
+      }
 
-        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
-        unsigned Length = TokenPtr - (BufferPtr + 1);
-
-        // Hardcoded support for lexing LaTeX formula commands
-        // \f$ \f[ \f] \f{ \f} as a single command.
-        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
-          C = *TokenPtr;
-          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
-            TokenPtr++;
-            Length++;
-          }
-        }
+      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
+      unsigned Length = TokenPtr - (BufferPtr + 1);
 
-        StringRef CommandName(BufferPtr + 1, Length);
-
-        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
-        if (!Info) {
-          if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
-            StringRef CorrectedName = Info->Name;
-            SourceLocation Loc = getSourceLocation(BufferPtr);
-            SourceLocation EndLoc = getSourceLocation(TokenPtr);
-            SourceRange FullRange = SourceRange(Loc, EndLoc);
-            SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
-            Diag(Loc, diag::warn_correct_comment_command_name)
-              << FullRange << CommandName << CorrectedName
-              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
-          } else {
-            formTokenWithChars(T, TokenPtr, tok::unknown_command);
-            T.setUnknownCommandName(CommandName);
-            Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
-                << SourceRange(T.getLocation(), T.getEndLocation());
-            return;
-          }
-        }
-        if (Info->IsVerbatimBlockCommand) {
-          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
-          return;
-        }
-        if (Info->IsVerbatimLineCommand) {
-          setupAndLexVerbatimLine(T, TokenPtr, Info);
-          return;
+      // Hardcoded support for lexing LaTeX formula commands
+      // \f$ \f[ \f] \f{ \f} as a single command.
+      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
+        C = *TokenPtr;
+        if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
+          TokenPtr++;
+          Length++;
         }
-        formTokenWithChars(T, TokenPtr, CommandKind);
-        T.setCommandID(Info->getID());
-        return;
       }
 
-      case '&':
-        lexHTMLCharacterReference(T);
-        return;
-
-      case '<': {
-        TokenPtr++;
-        if (TokenPtr == CommentEnd) {
-          formTextToken(T, TokenPtr);
+      StringRef CommandName(BufferPtr + 1, Length);
+
+      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
+      if (!Info) {
+        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
+          StringRef CorrectedName = Info->Name;
+          SourceLocation Loc = getSourceLocation(BufferPtr);
+          SourceLocation EndLoc = getSourceLocation(TokenPtr);
+          SourceRange FullRange = SourceRange(Loc, EndLoc);
+          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
+          Diag(Loc, diag::warn_correct_comment_command_name)
+            << FullRange << CommandName << CorrectedName
+            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
+        } else {
+          formTokenWithChars(T, TokenPtr, tok::unknown_command);
+          T.setUnknownCommandName(CommandName);
+          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
+              << SourceRange(T.getLocation(), T.getEndLocation());
           return;
         }
-        const char C = *TokenPtr;
-        if (isHTMLIdentifierStartingCharacter(C))
-          setupAndLexHTMLStartTag(T);
-        else if (C == '/')
-          setupAndLexHTMLEndTag(T);
-        else
-          formTextToken(T, TokenPtr);
+      }
+      if (Info->IsVerbatimBlockCommand) {
+        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
         return;
       }
-
-      case '\n':
-      case '\r':
-        TokenPtr = skipNewline(TokenPtr, CommentEnd);
-        formTokenWithChars(T, TokenPtr, tok::newline);
-
-        if (CommentState == LCS_InsideCComment)
-          skipLineStartingDecorations();
+      if (Info->IsVerbatimLineCommand) {
+        setupAndLexVerbatimLine(T, TokenPtr, Info);
         return;
+      }
+      formTokenWithChars(T, TokenPtr, CommandKind);
+      T.setCommandID(Info->getID());
+      return;
+    }
 
-      default: {
-        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
-                         find_first_of("\n\r\\@&<");
-        if (End != StringRef::npos)
-          TokenPtr += End;
-        else
-          TokenPtr = CommentEnd;
+    case '&':
+      lexHTMLCharacterReference(T);
+      return;
+
+    case '<': {
+      TokenPtr++;
+      if (TokenPtr == CommentEnd) {
         formTextToken(T, TokenPtr);
         return;
       }
+      const char C = *TokenPtr;
+      if (isHTMLIdentifierStartingCharacter(C))
+        setupAndLexHTMLStartTag(T);
+      else if (C == '/')
+        setupAndLexHTMLEndTag(T);
+      else
+        formTextToken(T, TokenPtr);
+      return;
     }
+
+    default:
+      return HandleNonCommandToken();
   }
 }
 
@@ -727,14 +740,13 @@
 }
 
 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
-             const CommandTraits &Traits,
-             SourceLocation FileLoc,
-             const char *BufferStart, const char *BufferEnd):
-    Allocator(Allocator), Diags(Diags), Traits(Traits),
-    BufferStart(BufferStart), BufferEnd(BufferEnd),
-    FileLoc(FileLoc), BufferPtr(BufferStart),
-    CommentState(LCS_BeforeComment), State(LS_Normal) {
-}
+             const CommandTraits &Traits, SourceLocation FileLoc,
+             const char *BufferStart, const char *BufferEnd,
+             bool ParseCommands)
+    : Allocator(Allocator), Diags(Diags), Traits(Traits),
+      BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
+      BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
+      ParseCommands(ParseCommands) {}
 
 void Lexer::lex(Token &T) {
 again:
Index: include/clang/AST/RawCommentList.h
===================================================================
--- include/clang/AST/RawCommentList.h
+++ include/clang/AST/RawCommentList.h
@@ -111,6 +111,33 @@
     return extractBriefText(Context);
   }
 
+  /// Returns sanitized comment text, suitable for presentation in editor UIs.
+  /// E.g. will transform:
+  ///     // This is a long multiline comment.
+  ///     //   Parts of it  might be indented.
+  ///     /* The comments styles might be mixed. */
+  ///  into
+  ///     "This is a long multiline comment.\n"
+  ///     "  Parts of it  might be indented.\n"
+  ///     "The comments styles might be mixed."
+  /// Also removes leading indentation and sanitizes some common cases:
+  ///     /* This is a first line.
+  ///      *   This is a second line. It is indented.
+  ///      * This is a third line. */
+  /// and
+  ///     /* This is a first line.
+  ///          This is a second line. It is indented.
+  ///     This is a third line. */
+  /// will both turn into:
+  ///     "This is a first line.\n"
+  ///     "  This is a second line. It is indented.\n"
+  ///     "This is a third line."
+  std::string getFormattedText(const ASTContext &Ctx) const;
+  /// An overload with more fine-grained parameters. Only for unit tests, use
+  /// the overload with ASTContext in the rest of the code.
+  std::string getFormattedText(const SourceManager &SourceMgr,
+                               DiagnosticsEngine &Diags) const;
+
   /// Parse the comment, assuming it is attached to decl \c D.
   comments::FullComment *parse(const ASTContext &Context,
                                const Preprocessor *PP, const Decl *D) const;
Index: include/clang/AST/CommentLexer.h
===================================================================
--- include/clang/AST/CommentLexer.h
+++ include/clang/AST/CommentLexer.h
@@ -281,6 +281,11 @@
   /// command, including command marker.
   SmallString<16> VerbatimBlockEndCommandName;
 
+  /// If true, the commands, html tags, etc will be parsed and reported as
+  /// separate tokens inside the comment body. If false, the comment text will
+  /// be parsed into text and newline tokens.
+  bool ParseCommands;
+
   /// Given a character reference name (e.g., "lt"), return the character that
   /// it stands for (e.g., "<").
   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
@@ -315,12 +320,11 @@
   /// Eat string matching regexp \code \s*\* \endcode.
   void skipLineStartingDecorations();
 
-  /// Lex stuff inside comments.  CommentEnd should be set correctly.
+  /// Lex comment text, including commands if ParseCommands is set to true.
   void lexCommentText(Token &T);
 
-  void setupAndLexVerbatimBlock(Token &T,
-                                const char *TextBegin,
-                                char Marker, const CommandInfo *Info);
+  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
+                                const CommandInfo *Info);
 
   void lexVerbatimBlockFirstLine(Token &T);
 
@@ -343,14 +347,13 @@
 
 public:
   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
-        const CommandTraits &Traits,
-        SourceLocation FileLoc,
-        const char *BufferStart, const char *BufferEnd);
+        const CommandTraits &Traits, SourceLocation FileLoc,
+        const char *BufferStart, const char *BufferEnd,
+        bool ParseCommands = true);
 
   void lex(Token &T);
 
-  StringRef getSpelling(const Token &Tok,
-                        const SourceManager &SourceMgr,
+  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                         bool *Invalid = nullptr) const;
 };

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D46000: [AST] Added a helper to extract a user-friendly text of a comment.

Reply via email to