[PATCH] D74962: [clang][Tooling] Add a way to tokenize a FileRange

2020-02-26 Thread Kadir Cetinkaya via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG8c2cf499e611: [clang][Tooling] Add a way to tokenize a 
FileRange (authored by kadircet).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D74962/new/

https://reviews.llvm.org/D74962

Files:
  clang/include/clang/Tooling/Syntax/Tokens.h
  clang/lib/Tooling/Syntax/Tokens.cpp
  clang/unittests/Tooling/Syntax/TokensTest.cpp

Index: clang/unittests/Tooling/Syntax/TokensTest.cpp
===================================================================
--- clang/unittests/Tooling/Syntax/TokensTest.cpp
+++ clang/unittests/Tooling/Syntax/TokensTest.cpp
@@ -153,11 +153,17 @@
     }
   }
 
-  /// Add a new file, run syntax::tokenize() on it and return the results.
+  /// Add a new file; run syntax::tokenize() on the annotated range if any,
+  /// otherwise on the whole file, and return the results.
   std::vector<syntax::Token> tokenize(llvm::StringRef Text) {
+    llvm::Annotations Annot(Text);
+    auto FID = SourceMgr->createFileID(
+        llvm::MemoryBuffer::getMemBufferCopy(Annot.code()));
     // FIXME: pass proper LangOptions.
+    if (Annot.ranges().empty())
+      return syntax::tokenize(FID, *SourceMgr, LangOptions());
     return syntax::tokenize(
-        SourceMgr->createFileID(llvm::MemoryBuffer::getMemBufferCopy(Text)),
+        syntax::FileRange(FID, Annot.range().Begin, Annot.range().End),
         *SourceMgr, LangOptions());
   }
 
@@ -258,6 +264,20 @@
               ElementsAre(Kind(tok::kw_int),
                           AllOf(HasText("a"), Kind(tok::identifier)),
                           Kind(tok::semi)));
+  EXPECT_THAT(tokenize("int [[main() {]]}"),
+              ElementsAre(AllOf(HasText("main"), Kind(tok::identifier)),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace)));
+  EXPECT_THAT(tokenize("int [[main() {   ]]}"),
+              ElementsAre(AllOf(HasText("main"), Kind(tok::identifier)),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace)));
+  // First token is partially parsed, last token is fully included even though
+  // only a part of it is contained in the range.
+  EXPECT_THAT(tokenize("int m[[ain() {ret]]urn 0;}"),
+              ElementsAre(AllOf(HasText("ain"), Kind(tok::identifier)),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::kw_return)));
 }
 
 TEST_F(TokenCollectorTest, Basic) {
Index: clang/lib/Tooling/Syntax/Tokens.cpp
===================================================================
--- clang/lib/Tooling/Syntax/Tokens.cpp
+++ clang/lib/Tooling/Syntax/Tokens.cpp
@@ -67,7 +67,8 @@
   auto F = First.range(SM);
   auto L = Last.range(SM);
   assert(F.file() == L.file() && "tokens from different files");
-  assert((F == L || F.endOffset() <= L.beginOffset()) && "wrong order of tokens");
+  assert((F == L || F.endOffset() <= L.beginOffset()) &&
+ "wrong order of tokens");
   return FileRange(F.file(), F.beginOffset(), L.endOffset());
 }
 
@@ -307,7 +308,8 @@
   return Expansions;
 }
 
-std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM,
+std::vector<syntax::Token> syntax::tokenize(const FileRange &FR,
+                                            const SourceManager &SM,
                                             const LangOptions &LO) {
   std::vector<syntax::Token> Tokens;
   IdentifierTable Identifiers(LO);
@@ -322,18 +324,28 @@
     Tokens.push_back(syntax::Token(T));
   };
 
-  Lexer L(FID, SM.getBuffer(FID), SM, LO);
+  auto SrcBuffer = SM.getBufferData(FR.file());
+  Lexer L(SM.getLocForStartOfFile(FR.file()), LO, SrcBuffer.data(),
+          SrcBuffer.data() + FR.beginOffset(),
+          // We can't make BufEnd point to FR.endOffset, as Lexer requires a
+          // null-terminated buffer.
+          SrcBuffer.data() + SrcBuffer.size());
 
   clang::Token T;
-  while (!L.LexFromRawLexer(T))
+  while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset())
     AddToken(T);
-  // 'eof' is only the last token if the input is null-terminated. Never store
-  // it, for consistency.
-  if (T.getKind() != tok::eof)
+  // LexFromRawLexer returns true when it lexes the last token of the file;
+  // add it iff it starts within the range we are interested in.
+  if (SM.getFileOffset(T.getLocation()) < FR.endOffset())
     AddToken(T);
   return Tokens;
 }
 
+std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM,
+                                            const LangOptions &LO) {
+  return tokenize(syntax::FileRange(FID, 0, SM.getFileIDSize(FID)), SM, LO);
+}
+
/// Records information required to construct mappings for the token buffer that
 /// we are collecting.
 class TokenCollector::CollectPPExpansions : public PPCallbacks {
Index: clang/include/clang/Tooling/Syntax/Tokens.h
===================================================================
--- clang/include/clang/Tooling/Syntax/Tokens.h
+++ clang/include/clang/Tooling/Syntax/Tokens.h
@@ -339,6 +339,12 @@
 /// The result will *not* have a 'eof' token at the end.
 std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
                                     const LangOptions &LO);
+/// Similar to the one above, but tokenizes only a part of the file. Note that
+/// the first token might be incomplete if FR.beginOffset is not at the
+/// beginning of a token, and the last token returned will start before
+/// FR.endOffset but might end after it.
+std::vector<syntax::Token>
+tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);
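
For orientation, a minimal usage sketch of the two overloads after this change
(the variable names and the Begin/End offsets are illustrative; assumes a valid
SourceManager SM, LangOptions LO, and FileID FID are in scope):

  // Tokenize the whole file:
  std::vector<syntax::Token> All = syntax::tokenize(FID, SM, LO);
  // Tokenize only offsets [Begin, End); per the header docs, the first token
  // may be clipped and the last token may extend past End:
  std::vector<syntax::Token> Part =
      syntax::tokenize(syntax::FileRange(FID, Begin, End), SM, LO);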

[PATCH] D74962: [clang][Tooling] Add a way to tokenize a FileRange

2020-02-26 Thread Kadir Cetinkaya via Phabricator via cfe-commits
kadircet updated this revision to Diff 246646.
kadircet added a comment.

- Address comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D74962/new/

https://reviews.llvm.org/D74962

Files:
  clang/include/clang/Tooling/Syntax/Tokens.h
  clang/lib/Tooling/Syntax/Tokens.cpp
  clang/unittests/Tooling/Syntax/TokensTest.cpp


[PATCH] D74962: [clang][Tooling] Add a way to tokenize a FileRange

2020-02-26 Thread Kadir Cetinkaya via Phabricator via cfe-commits
kadircet marked 2 inline comments as done.
kadircet added inline comments.



Comment at: clang/lib/Tooling/Syntax/Tokens.cpp:335
   clang::Token T;
-  while (!L.LexFromRawLexer(T))
+  while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset())
     AddToken(T);

sammccall wrote:
> Discussed offline, this loop includes an extra token if the truncation is at 
> whitespace between tokens. (Please test this case)
> 
> Also the eof comment is confusing.
> 
> I think the loop should be rewritten.
Instead of rewriting the loop, I just changed the after-the-loop check to
verify that the latest lexed token is in range.
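
For reference, a condensed sketch of the resulting logic (names as in the diff
above; an illustration, not the verbatim committed code):

  clang::Token T;
  // Stop once the raw lexer reports the last token, or we have lexed past the
  // end of the requested range.
  while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset())
    AddToken(T);
  // Keep the token that terminated the loop only if it *starts* inside the
  // range; this drops the spurious extra token when the range ends in
  // whitespace between tokens.
  if (SM.getFileOffset(T.getLocation()) < FR.endOffset())
    AddToken(T);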


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D74962/new/

https://reviews.llvm.org/D74962




[PATCH] D74962: [clang][Tooling] Add a way to tokenize a FileRange

2020-02-21 Thread Sam McCall via Phabricator via cfe-commits
sammccall added inline comments.



Comment at: clang/lib/Tooling/Syntax/Tokens.cpp:335
   clang::Token T;
-  while (!L.LexFromRawLexer(T))
+  while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset())
     AddToken(T);

Discussed offline, this loop includes an extra token if the truncation is at 
whitespace between tokens. (Please test this case)

Also the eof comment is confusing.

I think the loop should be rewritten.
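
For concreteness, the whitespace case looks like this with the existing
TokensTest matchers (this is the test the updated diff above adds; the range
ends in the whitespace after '{', so the trailing '}' must not be returned):

  EXPECT_THAT(tokenize("int [[main() {   ]]}"),
              ElementsAre(AllOf(HasText("main"), Kind(tok::identifier)),
                          Kind(tok::l_paren), Kind(tok::r_paren),
                          Kind(tok::l_brace)));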



Comment at: clang/lib/Tooling/Syntax/Tokens.cpp:346
+const LangOptions &LO) {
+  return tokenize(syntax::FileRange(SM, SM.getLocForStartOfFile(FID),
+SM.getLocForEndOfFile(FID)),

nit: FileRange(FID, 0, SM.getFileIDSize(FID)) is a lot more direct :-)
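
Side by side, the two equivalent spellings (a sketch; the variable names are
illustrative):

  // Via SourceLocations, as in the current diff:
  auto ViaLocs = syntax::FileRange(SM, SM.getLocForStartOfFile(FID),
                                   SM.getLocForEndOfFile(FID));
  // Directly from offsets, as suggested:
  auto ViaOffsets = syntax::FileRange(FID, /*BeginOffset=*/0,
                                      /*EndOffset=*/SM.getFileIDSize(FID));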


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D74962/new/

https://reviews.llvm.org/D74962




[PATCH] D74962: [clang][Tooling] Add a way to tokenize a FileRange

2020-02-21 Thread Kadir Cetinkaya via Phabricator via cfe-commits
kadircet created this revision.
kadircet added a reviewer: sammccall.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D74962

Files:
  clang/include/clang/Tooling/Syntax/Tokens.h
  clang/lib/Tooling/Syntax/Tokens.cpp
  clang/unittests/Tooling/Syntax/TokensTest.cpp

Index: clang/unittests/Tooling/Syntax/TokensTest.cpp
===================================================================
--- clang/unittests/Tooling/Syntax/TokensTest.cpp
+++ clang/unittests/Tooling/Syntax/TokensTest.cpp
@@ -153,11 +153,17 @@
     }
   }
 
-  /// Add a new file, run syntax::tokenize() on it and return the results.
+  /// Add a new file; run syntax::tokenize() on the annotated range if any,
+  /// otherwise on the whole file, and return the results.
   std::vector<syntax::Token> tokenize(llvm::StringRef Text) {
+    llvm::Annotations Annot(Text);
+    auto FID = SourceMgr->createFileID(
+        llvm::MemoryBuffer::getMemBufferCopy(Annot.code()));
     // FIXME: pass proper LangOptions.
+    if (Annot.ranges().empty())
+      return syntax::tokenize(FID, *SourceMgr, LangOptions());
     return syntax::tokenize(
-        SourceMgr->createFileID(llvm::MemoryBuffer::getMemBufferCopy(Text)),
+        syntax::FileRange(FID, Annot.range().Begin, Annot.range().End),
         *SourceMgr, LangOptions());
   }
 
@@ -258,6 +264,16 @@
               ElementsAre(Kind(tok::kw_int),
                           AllOf(HasText("a"), Kind(tok::identifier)),
                           Kind(tok::semi)));
+  EXPECT_THAT(tokenize("int [[main() {]]}"),
+              ElementsAre(AllOf(HasText("main"), Kind(tok::identifier)),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace)));
+  // First token is partially parsed, last token is fully included even though
+  // only a part of it is contained in the range.
+  EXPECT_THAT(tokenize("int m[[ain() {ret]]urn 0;}"),
+              ElementsAre(AllOf(HasText("ain"), Kind(tok::identifier)),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::kw_return)));
 }
 
 TEST_F(TokenCollectorTest, Basic) {
Index: clang/lib/Tooling/Syntax/Tokens.cpp
===================================================================
--- clang/lib/Tooling/Syntax/Tokens.cpp
+++ clang/lib/Tooling/Syntax/Tokens.cpp
@@ -67,7 +67,8 @@
   auto F = First.range(SM);
   auto L = Last.range(SM);
   assert(F.file() == L.file() && "tokens from different files");
-  assert((F == L || F.endOffset() <= L.beginOffset()) && "wrong order of tokens");
+  assert((F == L || F.endOffset() <= L.beginOffset()) &&
+ "wrong order of tokens");
   return FileRange(F.file(), F.beginOffset(), L.endOffset());
 }
 
@@ -307,7 +308,8 @@
   return Expansions;
 }
 
-std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM,
+std::vector<syntax::Token> syntax::tokenize(const FileRange &FR,
+                                            const SourceManager &SM,
                                             const LangOptions &LO) {
   std::vector<syntax::Token> Tokens;
   IdentifierTable Identifiers(LO);
@@ -322,10 +324,15 @@
     Tokens.push_back(syntax::Token(T));
   };
 
-  Lexer L(FID, SM.getBuffer(FID), SM, LO);
+  auto SrcBuffer = SM.getBufferData(FR.file());
+  Lexer L(SM.getLocForStartOfFile(FR.file()), LO, SrcBuffer.data(),
+          SrcBuffer.data() + FR.beginOffset(),
+          // We can't make BufEnd point to FR.endOffset, as Lexer requires a
+          // null-terminated buffer.
+          SrcBuffer.data() + SrcBuffer.size());
 
   clang::Token T;
-  while (!L.LexFromRawLexer(T))
+  while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset())
     AddToken(T);
   // 'eof' is only the last token if the input is null-terminated. Never store
   // it, for consistency.
@@ -334,6 +341,13 @@
   return Tokens;
 }
 
+std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM,
+                                            const LangOptions &LO) {
+  return tokenize(syntax::FileRange(SM, SM.getLocForStartOfFile(FID),
+                                    SM.getLocForEndOfFile(FID)),
+                  SM, LO);
+}
+
 /// Records information required to construct mappings for the token buffer that
 /// we are collecting.
 class TokenCollector::CollectPPExpansions : public PPCallbacks {
Index: clang/include/clang/Tooling/Syntax/Tokens.h
===================================================================
--- clang/include/clang/Tooling/Syntax/Tokens.h
+++ clang/include/clang/Tooling/Syntax/Tokens.h
@@ -339,6 +339,12 @@
 /// The result will *not* have a 'eof' token at the end.
 std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
                                     const LangOptions &LO);
+/// Similar to the one above, but tokenizes only a part of the file. Note that
+/// the first token might be incomplete if FR.beginOffset is not at the
+/// beginning of a token, and the last token returned will start before
+/// FR.endOffset but might end after it.
+std::vector<syntax::Token>
+tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);
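
To make those boundary semantics concrete, a worked example derived from the
partial-token test above (offsets computed for this snippet):

  // Code:   "int main() {return 0;}"
  // Offsets: 'm' of "main" is at 4, "return" spans [12, 18).
  // tokenize(syntax::FileRange(FID, 5, 15), SM, LO) yields:
  //   "ain" (clipped identifier), "(", ")", "{", "return" -- the last token
  //   is kept in full because it starts at offset 12, before the range end
  //   at offset 15.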