[PATCH] D74962: [clang][Tooling] Add a way to tokenize a FileRange
This revision was automatically updated to reflect the committed changes. Closed by commit rG8c2cf499e611: [clang][Tooling] Add a way to tokenize a FileRange (authored by kadircet). Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D74962/new/ https://reviews.llvm.org/D74962 Files: clang/include/clang/Tooling/Syntax/Tokens.h clang/lib/Tooling/Syntax/Tokens.cpp clang/unittests/Tooling/Syntax/TokensTest.cpp Index: clang/unittests/Tooling/Syntax/TokensTest.cpp === --- clang/unittests/Tooling/Syntax/TokensTest.cpp +++ clang/unittests/Tooling/Syntax/TokensTest.cpp @@ -153,11 +153,17 @@ } } - /// Add a new file, run syntax::tokenize() on it and return the results. + /// Add a new file, run syntax::tokenize() on the range if any, run it on the + /// whole file otherwise and return the results. std::vector tokenize(llvm::StringRef Text) { +llvm::Annotations Annot(Text); +auto FID = SourceMgr->createFileID( +llvm::MemoryBuffer::getMemBufferCopy(Annot.code())); // FIXME: pass proper LangOptions. +if (Annot.ranges().empty()) + return syntax::tokenize(FID, *SourceMgr, LangOptions()); return syntax::tokenize( -SourceMgr->createFileID(llvm::MemoryBuffer::getMemBufferCopy(Text)), +syntax::FileRange(FID, Annot.range().Begin, Annot.range().End), *SourceMgr, LangOptions()); } @@ -258,6 +264,20 @@ ElementsAre(Kind(tok::kw_int), AllOf(HasText("a"), Kind(tok::identifier)), Kind(tok::semi))); + EXPECT_THAT(tokenize("int [[main() {]]}"), + ElementsAre(AllOf(HasText("main"), Kind(tok::identifier)), + Kind(tok::l_paren), Kind(tok::r_paren), + Kind(tok::l_brace))); + EXPECT_THAT(tokenize("int [[main() { ]]}"), + ElementsAre(AllOf(HasText("main"), Kind(tok::identifier)), + Kind(tok::l_paren), Kind(tok::r_paren), + Kind(tok::l_brace))); + // First token is partially parsed, last token is fully included even though + // only a part of it is contained in the range. 
+ EXPECT_THAT(tokenize("int m[[ain() {ret]]urn 0;}"), + ElementsAre(AllOf(HasText("ain"), Kind(tok::identifier)), + Kind(tok::l_paren), Kind(tok::r_paren), + Kind(tok::l_brace), Kind(tok::kw_return))); } TEST_F(TokenCollectorTest, Basic) { Index: clang/lib/Tooling/Syntax/Tokens.cpp === --- clang/lib/Tooling/Syntax/Tokens.cpp +++ clang/lib/Tooling/Syntax/Tokens.cpp @@ -67,7 +67,8 @@ auto F = First.range(SM); auto L = Last.range(SM); assert(F.file() == L.file() && "tokens from different files"); - assert((F == L || F.endOffset() <= L.beginOffset()) && "wrong order of tokens"); + assert((F == L || F.endOffset() <= L.beginOffset()) && + "wrong order of tokens"); return FileRange(F.file(), F.beginOffset(), L.endOffset()); } @@ -307,7 +308,8 @@ return Expansions; } -std::vector syntax::tokenize(FileID FID, const SourceManager &SM, +std::vector syntax::tokenize(const FileRange &FR, +const SourceManager &SM, const LangOptions &LO) { std::vector Tokens; IdentifierTable Identifiers(LO); @@ -322,18 +324,28 @@ Tokens.push_back(syntax::Token(T)); }; - Lexer L(FID, SM.getBuffer(FID), SM, LO); + auto SrcBuffer = SM.getBufferData(FR.file()); + Lexer L(SM.getLocForStartOfFile(FR.file()), LO, SrcBuffer.data(), + SrcBuffer.data() + FR.beginOffset(), + // We can't make BufEnd point to FR.endOffset, as Lexer requires a + // null terminated buffer. + SrcBuffer.data() + SrcBuffer.size()); clang::Token T; - while (!L.LexFromRawLexer(T)) + while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset()) AddToken(T); - // 'eof' is only the last token if the input is null-terminated. Never store - // it, for consistency. - if (T.getKind() != tok::eof) + // LexFromRawLexer returns true when it parses the last token of the file, add + // it iff it starts within the range we are interested in. 
+ if (SM.getFileOffset(T.getLocation()) < FR.endOffset()) AddToken(T); return Tokens; } +std::vector syntax::tokenize(FileID FID, const SourceManager &SM, +const LangOptions &LO) { + return tokenize(syntax::FileRange(FID, 0, SM.getFileIDSize(FID)), SM, LO); +} + /// Records information reqired to construct mappings for the token buffer that /// we are collecting. class TokenCollector::CollectPPExpansions : public PPCallbacks { Index: clang/include/clang/Tooling/Syntax/Tokens.h ===
[PATCH] D74962: [clang][Tooling] Add a way to tokenize a FileRange
kadircet updated this revision to Diff 246646. kadircet added a comment. - Address comments Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D74962/new/ https://reviews.llvm.org/D74962 Files: clang/include/clang/Tooling/Syntax/Tokens.h clang/lib/Tooling/Syntax/Tokens.cpp clang/unittests/Tooling/Syntax/TokensTest.cpp Index: clang/unittests/Tooling/Syntax/TokensTest.cpp === --- clang/unittests/Tooling/Syntax/TokensTest.cpp +++ clang/unittests/Tooling/Syntax/TokensTest.cpp @@ -153,11 +153,17 @@ } } - /// Add a new file, run syntax::tokenize() on it and return the results. + /// Add a new file, run syntax::tokenize() on the range if any, run it on the + /// whole file otherwise and return the results. std::vector tokenize(llvm::StringRef Text) { +llvm::Annotations Annot(Text); +auto FID = SourceMgr->createFileID( +llvm::MemoryBuffer::getMemBufferCopy(Annot.code())); // FIXME: pass proper LangOptions. +if (Annot.ranges().empty()) + return syntax::tokenize(FID, *SourceMgr, LangOptions()); return syntax::tokenize( -SourceMgr->createFileID(llvm::MemoryBuffer::getMemBufferCopy(Text)), +syntax::FileRange(FID, Annot.range().Begin, Annot.range().End), *SourceMgr, LangOptions()); } @@ -258,6 +264,20 @@ ElementsAre(Kind(tok::kw_int), AllOf(HasText("a"), Kind(tok::identifier)), Kind(tok::semi))); + EXPECT_THAT(tokenize("int [[main() {]]}"), + ElementsAre(AllOf(HasText("main"), Kind(tok::identifier)), + Kind(tok::l_paren), Kind(tok::r_paren), + Kind(tok::l_brace))); + EXPECT_THAT(tokenize("int [[main() { ]]}"), + ElementsAre(AllOf(HasText("main"), Kind(tok::identifier)), + Kind(tok::l_paren), Kind(tok::r_paren), + Kind(tok::l_brace))); + // First token is partially parsed, last token is fully included even though + // only a part of it is contained in the range. 
+ EXPECT_THAT(tokenize("int m[[ain() {ret]]urn 0;}"), + ElementsAre(AllOf(HasText("ain"), Kind(tok::identifier)), + Kind(tok::l_paren), Kind(tok::r_paren), + Kind(tok::l_brace), Kind(tok::kw_return))); } TEST_F(TokenCollectorTest, Basic) { Index: clang/lib/Tooling/Syntax/Tokens.cpp === --- clang/lib/Tooling/Syntax/Tokens.cpp +++ clang/lib/Tooling/Syntax/Tokens.cpp @@ -67,7 +67,8 @@ auto F = First.range(SM); auto L = Last.range(SM); assert(F.file() == L.file() && "tokens from different files"); - assert((F == L || F.endOffset() <= L.beginOffset()) && "wrong order of tokens"); + assert((F == L || F.endOffset() <= L.beginOffset()) && + "wrong order of tokens"); return FileRange(F.file(), F.beginOffset(), L.endOffset()); } @@ -307,7 +308,8 @@ return Expansions; } -std::vector syntax::tokenize(FileID FID, const SourceManager &SM, +std::vector syntax::tokenize(const FileRange &FR, +const SourceManager &SM, const LangOptions &LO) { std::vector Tokens; IdentifierTable Identifiers(LO); @@ -322,18 +324,28 @@ Tokens.push_back(syntax::Token(T)); }; - Lexer L(FID, SM.getBuffer(FID), SM, LO); + auto SrcBuffer = SM.getBufferData(FR.file()); + Lexer L(SM.getLocForStartOfFile(FR.file()), LO, SrcBuffer.data(), + SrcBuffer.data() + FR.beginOffset(), + // We can't make BufEnd point to FR.endOffset, as Lexer requires a + // null terminated buffer. + SrcBuffer.data() + SrcBuffer.size()); clang::Token T; - while (!L.LexFromRawLexer(T)) + while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset()) AddToken(T); - // 'eof' is only the last token if the input is null-terminated. Never store - // it, for consistency. - if (T.getKind() != tok::eof) + // LexFromRawLexer returns true when it parses the last token of the file, add + // it iff it starts within the range we are interested in. 
+ if (SM.getFileOffset(T.getLocation()) < FR.endOffset()) AddToken(T); return Tokens; } +std::vector syntax::tokenize(FileID FID, const SourceManager &SM, +const LangOptions &LO) { + return tokenize(syntax::FileRange(FID, 0, SM.getFileIDSize(FID)), SM, LO); +} + /// Records information reqired to construct mappings for the token buffer that /// we are collecting. class TokenCollector::CollectPPExpansions : public PPCallbacks { Index: clang/include/clang/Tooling/Syntax/Tokens.h === --- clang/include/clang/Tooling/Syntax/Tokens.h +++ clang/include/clang/Tooling/
[PATCH] D74962: [clang][Tooling] Add a way to tokenize a FileRange
kadircet marked 2 inline comments as done. kadircet added inline comments. Comment at: clang/lib/Tooling/Syntax/Tokens.cpp:335 clang::Token T; - while (!L.LexFromRawLexer(T)) + while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset()) AddToken(T); sammccall wrote: > Discussed offline, this loop includes an extra token if the truncation is at > whitespace between tokens. (Please test this case) > > Also the eof comment is confusing. > > I think the loop should be rewritten. Instead of rewriting the loop, I just changed the after-the-loop check to verify that the last lexed token starts within the range. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D74962/new/ https://reviews.llvm.org/D74962 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D74962: [clang][Tooling] Add a way to tokenize a FileRange
sammccall added inline comments. Comment at: clang/lib/Tooling/Syntax/Tokens.cpp:335 clang::Token T; - while (!L.LexFromRawLexer(T)) + while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset()) AddToken(T); Discussed offline, this loop includes an extra token if the truncation is at whitespace between tokens. (Please test this case) Also the eof comment is confusing. I think the loop should be rewritten. Comment at: clang/lib/Tooling/Syntax/Tokens.cpp:346 +const LangOptions &LO) { + return tokenize(syntax::FileRange(SM, SM.getLocForStartOfFile(FID), +SM.getLocForEndOfFile(FID)), nit: FileRange(FID, 0, SM.getFileIDSize(FID)) is a lot more direct :-) Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D74962/new/ https://reviews.llvm.org/D74962 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D74962: [clang][Tooling] Add a way to tokenize a FileRange
kadircet created this revision. kadircet added a reviewer: sammccall. Herald added a project: clang. Herald added a subscriber: cfe-commits. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D74962 Files: clang/include/clang/Tooling/Syntax/Tokens.h clang/lib/Tooling/Syntax/Tokens.cpp clang/unittests/Tooling/Syntax/TokensTest.cpp Index: clang/unittests/Tooling/Syntax/TokensTest.cpp === --- clang/unittests/Tooling/Syntax/TokensTest.cpp +++ clang/unittests/Tooling/Syntax/TokensTest.cpp @@ -153,11 +153,17 @@ } } - /// Add a new file, run syntax::tokenize() on it and return the results. + /// Add a new file, run syntax::tokenize() on the range if any, run it on the + /// whole file otherwise and return the results. std::vector tokenize(llvm::StringRef Text) { +llvm::Annotations Annot(Text); +auto FID = SourceMgr->createFileID( +llvm::MemoryBuffer::getMemBufferCopy(Annot.code())); // FIXME: pass proper LangOptions. +if (Annot.ranges().empty()) + return syntax::tokenize(FID, *SourceMgr, LangOptions()); return syntax::tokenize( -SourceMgr->createFileID(llvm::MemoryBuffer::getMemBufferCopy(Text)), +syntax::FileRange(FID, Annot.range().Begin, Annot.range().End), *SourceMgr, LangOptions()); } @@ -258,6 +264,16 @@ ElementsAre(Kind(tok::kw_int), AllOf(HasText("a"), Kind(tok::identifier)), Kind(tok::semi))); + EXPECT_THAT(tokenize("int [[main() {]]}"), + ElementsAre(AllOf(HasText("main"), Kind(tok::identifier)), + Kind(tok::l_paren), Kind(tok::r_paren), + Kind(tok::l_brace))); + // First token is partially parsed, last token is fully included even though + // only a part of it is contained in the range. 
+ EXPECT_THAT(tokenize("int m[[ain() {ret]]urn 0;}"), + ElementsAre(AllOf(HasText("ain"), Kind(tok::identifier)), + Kind(tok::l_paren), Kind(tok::r_paren), + Kind(tok::l_brace), Kind(tok::kw_return))); } TEST_F(TokenCollectorTest, Basic) { Index: clang/lib/Tooling/Syntax/Tokens.cpp === --- clang/lib/Tooling/Syntax/Tokens.cpp +++ clang/lib/Tooling/Syntax/Tokens.cpp @@ -67,7 +67,8 @@ auto F = First.range(SM); auto L = Last.range(SM); assert(F.file() == L.file() && "tokens from different files"); - assert((F == L || F.endOffset() <= L.beginOffset()) && "wrong order of tokens"); + assert((F == L || F.endOffset() <= L.beginOffset()) && + "wrong order of tokens"); return FileRange(F.file(), F.beginOffset(), L.endOffset()); } @@ -307,7 +308,8 @@ return Expansions; } -std::vector syntax::tokenize(FileID FID, const SourceManager &SM, +std::vector syntax::tokenize(const FileRange &FR, +const SourceManager &SM, const LangOptions &LO) { std::vector Tokens; IdentifierTable Identifiers(LO); @@ -322,10 +324,15 @@ Tokens.push_back(syntax::Token(T)); }; - Lexer L(FID, SM.getBuffer(FID), SM, LO); + auto SrcBuffer = SM.getBufferData(FR.file()); + Lexer L(SM.getLocForStartOfFile(FR.file()), LO, SrcBuffer.data(), + SrcBuffer.data() + FR.beginOffset(), + // We can't make BufEnd point to FR.endOffset, as Lexer requires a + // null terminated buffer. + SrcBuffer.data() + SrcBuffer.size()); clang::Token T; - while (!L.LexFromRawLexer(T)) + while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset()) AddToken(T); // 'eof' is only the last token if the input is null-terminated. Never store // it, for consistency. @@ -334,6 +341,13 @@ return Tokens; } +std::vector syntax::tokenize(FileID FID, const SourceManager &SM, +const LangOptions &LO) { + return tokenize(syntax::FileRange(SM, SM.getLocForStartOfFile(FID), +SM.getLocForEndOfFile(FID)), + SM, LO); +} + /// Records information reqired to construct mappings for the token buffer that /// we are collecting. 
class TokenCollector::CollectPPExpansions : public PPCallbacks { Index: clang/include/clang/Tooling/Syntax/Tokens.h === --- clang/include/clang/Tooling/Syntax/Tokens.h +++ clang/include/clang/Tooling/Syntax/Tokens.h @@ -339,6 +339,12 @@ /// The result will *not* have a 'eof' token at the end. std::vector tokenize(FileID FID, const SourceManager &SM, const LangOptions &LO); +/// Similar to one above, instead of whole file tokenizes a part of it. Note +/// that, the first token might be incomplete if FR.startOffset is not at the +/// beginning of a token, and