[PATCH] D119162: [Pseudo] Token/TokenStream, PP directive parser.

Sam McCall via Phabricator via cfe-commits Mon, 07 Feb 2022 10:15:31 -0800

sammccall created this revision.
sammccall added a reviewer: hokein.
Herald added a subscriber: mgorny.
sammccall requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.


The TokenStream class is the representation of the source code that will
be fed into the GLR parser.

This patch allows a "raw" TokenStream to be built by reading source code.
It also supports scanning a TokenStream to find the directive structure.

Next steps (with placeholders in the code): heuristically choosing a
path through #ifs, preprocessing the code by stripping directives and
comments, cooking raw_identifiers.
These will produce a suitable stream to feed into the parser proper.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D119162

Files:
  clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
  clang/include/clang/Tooling/Syntax/Pseudo/Token.h
  clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
  clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
  clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
  clang/lib/Tooling/Syntax/Pseudo/Token.cpp
  clang/test/Syntax/Inputs/example.c
  clang/test/Syntax/lex.test
  clang/tools/clang-pseudo/ClangPseudo.cpp

Index: clang/tools/clang-pseudo/ClangPseudo.cpp
===================================================================
--- clang/tools/clang-pseudo/ClangPseudo.cpp
+++ clang/tools/clang-pseudo/ClangPseudo.cpp
@@ -6,7 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Basic/LangOptions.h"
 #include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -21,19 +24,31 @@
     CheckGrammar("check-grammar", desc("Parse and check a BNF grammar file."),
                  init(""));
 
+static opt<std::string> Source("source", desc("Source file"));
+static opt<bool> PrintSource("print-source", desc("Print token stream"));
+static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
+static opt<bool>
+    PrintPPStructure("print-pp-structure",
+                     desc("Print directive structure of source code"));
+
+// Reads the file at Path; on failure prints an error and exits the process.
+static std::string readOrDie(llvm::StringRef Path) {
+  auto Text = llvm::MemoryBuffer::getFile(Path);
+  if (std::error_code EC = Text.getError()) {
+    llvm::errs() << "Error: can't read file '" << Path
+                 << "': " << EC.message() << "\n";
+    ::exit(1);
+  }
+  return Text.get()->getBuffer().str();
+}
+
 int main(int argc, char *argv[]) {
   llvm::cl::ParseCommandLineOptions(argc, argv, "");
 
   if (CheckGrammar.getNumOccurrences()) {
-    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
-        llvm::MemoryBuffer::getFile(CheckGrammar);
-    if (std::error_code EC = Text.getError()) {
-      llvm::errs() << "Error: can't read grammar file '" << CheckGrammar
-                   << "': " << EC.message() << "\n";
-      return 1;
-    }
+    std::string Text = readOrDie(CheckGrammar);
     std::vector<std::string> Diags;
-    auto RSpecs = Grammar::parseBNF(Text.get()->getBuffer(), Diags);
+    auto RSpecs = Grammar::parseBNF(Text, Diags);
 
     if (!Diags.empty()) {
       llvm::errs() << llvm::join(Diags, "\n");
@@ -43,5 +58,20 @@
                                   CheckGrammar);
     return 0;
   }
+
+  if (Source.getNumOccurrences()) {
+    std::string Text = readOrDie(Source);
+    clang::LangOptions LangOpts; // FIXME: use real options.
+    auto Stream = clang::syntax::pseudo::lex(Text, LangOpts);
+    auto Structure = clang::syntax::pseudo::PPStructure::parse(Stream);
+
+    if (PrintPPStructure)
+      llvm::outs() << Structure;
+    if (PrintSource)
+      Stream.print(llvm::outs());
+    if (PrintTokens)
+      llvm::outs() << Stream;
+  }
+
   return 0;
 }
Index: clang/test/Syntax/lex.test
===================================================================
--- /dev/null
+++ clang/test/Syntax/lex.test
@@ -0,0 +1,38 @@
+// RUN: clang-pseudo -source %S/Inputs/example.c -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace
+     SOURCE: int is_debug() {
+SOURCE-NEXT: #ifndef NDEBUG
+SOURCE-NEXT:   return 1; // in debug mode
+SOURCE-NEXT: #else
+SOURCE-NEXT:  return 0;
+SOURCE-NEXT: #end
+SOURCE-NEXT: }
+// RUN: clang-pseudo -source %S/Inputs/example.c -print-tokens | FileCheck %s -check-prefix=TOKEN
+TOKEN:   0: raw_identifier   0:0 "int" flags=1
+TOKEN-NEXT: raw_identifier   0:0 "is_debug"
+TOKEN-NEXT: l_paren          0:0 "("
+TOKEN-NEXT: r_paren          0:0 ")"
+TOKEN-NEXT: l_brace          0:0 "{"
+TOKEN-NEXT: hash             1:0 "#" flags=1
+TOKEN-NEXT: raw_identifier   1:0 "ifndef"
+TOKEN-NEXT: raw_identifier   1:0 "NDEBUG"
+TOKEN-NEXT: raw_identifier   2:2 "return" flags=1
+TOKEN-NEXT: numeric_constant 2:2 "1"
+TOKEN-NEXT: semi             2:2 ";"
+TOKEN-NEXT: comment          2:2 "// in debug mode"
+TOKEN-NEXT: hash             3:0 "#" flags=1
+TOKEN-NEXT: raw_identifier   3:0 "else"
+TOKEN-NEXT: raw_identifier   4:2 "return" flags=1
+TOKEN-NEXT: numeric_constant 4:2 "0"
+TOKEN-NEXT: semi             4:2 ";"
+TOKEN-NEXT: hash             5:0 "#" flags=1
+TOKEN-NEXT: raw_identifier   5:0 "endif"
+TOKEN-NEXT: r_brace          6:0 "}" flags=1
+// RUN: clang-pseudo -source %S/Inputs/example.c -print-pp-structure | FileCheck %s -check-prefix=PPS --strict-whitespace
+     PPS: code (5 tokens)
+PPS-NEXT: #ifndef (3 tokens)
+PPS-NEXT:   code (4 tokens)
+PPS-NEXT: #else (2 tokens)
+PPS-NEXT:   code (3 tokens)
+PPS-NEXT: #endif (2 tokens)
+PPS-NEXT: code (1 tokens)
+
Index: clang/test/Syntax/Inputs/example.c
===================================================================
--- /dev/null
+++ clang/test/Syntax/Inputs/example.c
@@ -0,0 +1,7 @@
+int is_debug() {
+#ifndef NDEBUG
+  return 1; // in debug mode
+#else
+  return 0;
+#endif
+}
Index: clang/lib/Tooling/Syntax/Pseudo/Token.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/Token.cpp
@@ -0,0 +1,94 @@
+//===--- Token.cpp - Tokens and token streams in the pseudoparser ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// Prints a token as: kind line:indent "escaped text" [pair=N] [flags=hex].
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
+  OS << llvm::formatv("{0} {1}:{2} \"", clang::tok::getTokenName(T.Kind),
+                      T.Line, T.Indent);
+  llvm::printEscapedString(T.text(), OS);
+  OS << '"';
+  if (T.Pair != Token::Invalid)
+    OS << "  pair=" << T.Pair;
+  if (T.Flags)
+    OS << llvm::format("  flags=%x", T.Flags); // Same format as TokenStream.
+  return OS;
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const TokenStream &TS) {
+  OS << "Index               Kind    Line  Text\n";
+  for (const auto &T : TS.tokens()) {
+    OS << llvm::format("%5d:  %16s %4d:%-2d  ", TS.index(T),
+                       clang::tok::getTokenName(T.Kind), T.Line, T.Indent);
+    OS << '"';
+    llvm::printEscapedString(T.text(), OS);
+    OS << '"';
+    if (T.Pair != Token::Invalid)
+      OS << "  pair=" << T.Pair;
+    if (T.Flags)
+      OS << llvm::format("  flags=%x", T.Flags);
+    OS << '\n';
+  }
+  return OS;
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token::Range &R) {
+  OS << llvm::formatv("[{0},{1})", R.Begin, R.End);
+  return OS;
+}
+
+TokenStream::TokenStream(std::shared_ptr<void> Payload)
+    : Payload(std::move(Payload)) {
+  Storage.emplace_back();
+  Storage.back().Kind = clang::tok::eof;
+}
+
+void TokenStream::finalize() {
+  unsigned LastLine = Storage.back().Line;
+  Storage.emplace_back();
+  Storage.back().Kind = tok::eof;
+  Storage.back().Line = LastLine + 1;
+
+  Tokens = Storage;
+  Tokens = Tokens.drop_front().drop_back();
+}
+
+void TokenStream::print(llvm::raw_ostream &OS) const {
+  bool FirstToken = true;
+  unsigned LastLine = -1;
+  StringRef LastText;
+  for (const auto &T : tokens()) {
+    StringRef Text = T.text();
+    if (FirstToken) {
+      FirstToken = false;
+    } else if (T.Line == LastLine) {
+      if (LastText.data() + LastText.size() != Text.data())
+        OS << ' ';
+    } else {
+      OS << '\n';
+      OS.indent(T.Indent);
+    }
+    OS << Text;
+    LastLine = T.Line;
+    LastText = Text;
+  }
+  if (!FirstToken)
+    OS << '\n';
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
@@ -0,0 +1,197 @@
+//===--- Preprocess.cpp - Preprocess token streams ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+class Parser {
+public:
+  explicit Parser(const TokenStream &Code) : Code(Code), Tok(&Code.front()) {}
+  void parse(PPStructure *result) { parse(result, /*TopLevel=*/true); }
+
+private:
+  // Roles that a directive might take within a conditional block.
+  enum class Cond { None, If, Else, End };
+  static Cond classifyDirective(tok::PPKeywordKind kind) {
+    switch (kind) {
+    case clang::tok::pp_if:
+    case clang::tok::pp_ifdef:
+    case clang::tok::pp_ifndef:
+      return Cond::If;
+    case clang::tok::pp_elif:
+    case clang::tok::pp_elifdef:
+    case clang::tok::pp_elifndef:
+    case clang::tok::pp_else:
+      return Cond::Else;
+    case clang::tok::pp_endif:
+      return Cond::End;
+    default:
+      return Cond::None;
+    }
+  }
+
+  // Parses tokens starting at Tok into PP.
+  // If we reach an #end or #else directive that ends PP, returns it.
+  // If TopLevel is true, then we do not expect #end and always return None.
+  llvm::Optional<PPStructure::Directive> parse(PPStructure *PP, bool TopLevel) {
+    auto StartsDirective =
+        [&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
+          if (Tok->flag(LexFlags::StartsPPLine)) {
+            // If we considered a comment at the start of a PP-line, it doesn't
+            // start a directive but the directive can still start after it.
+            if (Tok->Kind == tok::comment)
+              AllowDirectiveAt = Tok + 1;
+            return Tok->Kind == tok::hash;
+          }
+          return Tok->Kind == tok::hash && AllowDirectiveAt == Tok;
+        };
+    while (Tok->Kind != tok::eof) {
+      while (StartsDirective()) {
+        PPStructure::Directive Directive;
+        parseDirective(&Directive);
+        Cond Kind = classifyDirective(Directive.Kind);
+        if (Kind == Cond::If) {
+          PPStructure::Conditional Conditional;
+          Conditional.Branches.emplace_back();
+          Conditional.Branches.back().first = std::move(Directive);
+          parseConditional(&Conditional);
+          PP->Chunks.push_back(std::move(Conditional));
+          continue;
+        }
+        // Unexpected #else or #endif at top level; parse as normal directives.
+        if (Kind == Cond::None || TopLevel) {
+          PP->Chunks.push_back(std::move(Directive));
+          continue;
+        }
+        assert(Kind == Cond::Else || Kind == Cond::End);
+        return std::move(Directive);
+      }
+      const Token *Start = Tok;
+      while (Tok->Kind != tok::eof && !StartsDirective())
+        ++Tok;
+      if (Tok != Start)
+        PP->Chunks.push_back(PPStructure::Code{
+            Token::Range{Code.index(*Start), Code.index(*Tok)}});
+    }
+    return None;
+  }
+
+  // Parse the rest of a conditional section, after seeing the #if directive.
+  // Returns after consuming the #end directive.
+  void parseConditional(PPStructure::Conditional *C) {
+    assert(C->Branches.size() == 1 &&
+           C->Branches.front().second.Chunks.empty() &&
+           "Should be ready to parse first branch body");
+    while (Tok->Kind != tok::eof) {
+      auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false);
+      if (!Terminator) {
+        assert(Tok->Kind == tok::eof && "gave up parsing before eof?");
+        C->End.Tokens = Token::Range::empty(Code.index(*Tok));
+        return;
+      }
+      if (classifyDirective(Terminator->Kind) == Cond::End) {
+        C->End = std::move(*Terminator);
+        return;
+      }
+      assert(classifyDirective(Terminator->Kind) == Cond::Else &&
+             "ended branch unexpectedly");
+      C->Branches.emplace_back();
+      C->Branches.back().first = std::move(*Terminator);
+    }
+  }
+
+  // Parse a directive. Tok is the hash.
+  void parseDirective(PPStructure::Directive *D) {
+    assert(Tok->Kind == tok::hash);
+    D->Tokens.Begin = Code.index(*Tok);
+    do {
+      ++Tok;
+    } while (Tok->Kind == tok::comment && !Tok->flag(LexFlags::StartsPPLine));
+    // Technically directive names can be spelled with UCNs or split over lines.
+    // In practice, this never happens.
+    if (Tok->Kind == tok::raw_identifier)
+      D->Kind = Idents.get(Tok->text()).getPPKeywordID();
+    while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine))
+      ++Tok;
+    D->Tokens.End = Code.index(*Tok);
+  }
+
+  const TokenStream &Code;
+  const Token *Tok;
+  clang::IdentifierTable Idents;
+};
+
+} // namespace
+
+PPStructure PPStructure::parse(const TokenStream &Code) {
+  PPStructure Result;
+  Parser(Code).parse(&Result);
+  return Result;
+}
+
+static llvm::StringLiteral ppKeywordName(tok::PPKeywordKind kind) {
+  switch (kind) {
+#define PPKEYWORD(x)                                                           \
+  case tok::pp_##x:                                                            \
+    return #x;
+#include "clang/Basic/TokenKinds.def"
+  default:
+    return "unknown";
+  }
+}
+
+static void dump(llvm::raw_ostream &OS, const PPStructure &PP,
+                 unsigned Indent) {
+  auto DumpDirective = [&](const PPStructure::Directive &Directive) {
+    OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n",
+                                       ppKeywordName(Directive.Kind),
+                                       Directive.Tokens.size());
+  };
+
+  for (const auto &Chunk : PP.Chunks) {
+    switch (Chunk.kind()) {
+    case PPStructure::Chunk::K_Empty:
+      llvm_unreachable("invalid chunk");
+    case PPStructure::Chunk::K_Code: {
+      const PPStructure::Code &Code(Chunk);
+      OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n",
+                                         Code.Tokens.size());
+      break;
+    }
+    case PPStructure::Chunk::K_Directive: {
+      const PPStructure::Directive &Directive(Chunk);
+      DumpDirective(Directive);
+      break;
+    }
+    case PPStructure::Chunk::K_Conditional: {
+      const PPStructure::Conditional &Conditional(Chunk);
+      for (const auto &Branch : Conditional.Branches) {
+        DumpDirective(Branch.first);
+        dump(OS, Branch.second, Indent + 2);
+      }
+      DumpDirective(Conditional.End);
+      break;
+    }
+    }
+  }
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const PPStructure &PP) {
+  dump(OS, PP, 0);
+  return OS;
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
@@ -0,0 +1,77 @@
+//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
+  clang::SourceLocation Start;
+  // Tokenize using clang's lexer in raw mode.
+  // std::string guarantees null-termination, which the lexer needs.
+  clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
+                     Code.data() + Code.size());
+  Lexer.SetCommentRetentionState(true);
+
+  TokenStream Result;
+  clang::Token CT;
+  unsigned LastOffset = 0;
+  unsigned Line = 0;
+  unsigned Indent = 0;
+  for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
+       Lexer.LexFromRawLexer(CT)) {
+    unsigned Offset =
+        CT.getLocation().getRawEncoding() - Start.getRawEncoding();
+
+    Token Tok;
+    Tok.Data = &Code[Offset];
+    Tok.Length = CT.getLength();
+    Tok.Kind = CT.getKind();
+
+    // Update current line number and indentation from raw source code.
+    unsigned NewLineStart = 0;
+    for (unsigned i = LastOffset; i < Offset; ++i) {
+      if (Code[i] == '\n') {
+        NewLineStart = i + 1;
+        ++Line;
+      }
+    }
+    // Indentation isn't always well defined when lines are continued.
+    if ((NewLineStart || !LastOffset) && CT.isAtStartOfLine()) {
+      Indent = 0;
+      for (char c : StringRef(Code).slice(NewLineStart, Offset)) {
+        if (c == ' ')
+          ++Indent;
+        else if (c == '\t')
+          Indent += 8;
+        else
+          break;
+      }
+    }
+    Tok.Indent = Indent;
+    Tok.Line = Line;
+
+    if (CT.isAtStartOfLine())
+      Tok.setFlag(LexFlags::StartsPPLine);
+    if (CT.needsCleaning() || CT.hasUCN())
+      Tok.setFlag(LexFlags::DirtyIdentifier);
+
+    Result.push(Tok);
+    LastOffset = Offset;
+  }
+  Result.finalize();
+  return Result;
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
===================================================================
--- clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -3,7 +3,10 @@
 add_clang_library(clangToolingSyntaxPseudo
   Grammar.cpp
   GrammarBNF.cpp
-  
+  Lex.cpp
+  Preprocess.cpp
+  Token.cpp
+
   LINK_LIBS
   clangBasic
   clangLex
Index: clang/include/clang/Tooling/Syntax/Pseudo/Token.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Pseudo/Token.h
@@ -0,0 +1,172 @@
+//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Tokens are the first level of abstraction above bytes used in pseudoparsing.
+// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
+// The tokens are wrapped into pseudo::Token, along with line/indent info.
+//
+// Unlike clang, we make multiple passes over the whole file, out-of-order.
+// Therefore we retain the whole token sequence in memory. (This is feasible as
+// we process one file at a time). pseudo::TokenStream holds such a stream.
+// The initial stream holds the raw tokens read from the file, later passes
+// operate on derived TokenStreams (e.g. with directives stripped).
+//
+// Similar facilities from clang that are *not* used:
+//  - SourceManager: designed around multiple files and precise macro expansion.
+//  - clang::Token: coupled to SourceManager, doesn't retain layout info.
+//                  (pseudo::Token is similar, but without SourceLocations).
+//  - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
+//                  (pseudo::TokenStream is similar, but a flat token list).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
+#define LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
+
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <vector>
+
+namespace clang {
+class LangOptions;
+namespace syntax {
+namespace pseudo {
+
+/// A single C++ or preprocessor token.
+///
+/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
+/// SourceManager - we are not dealing with multiple files.
+struct Token {
+  /// An Index identifies a token within a stream.
+  using Index = uint32_t;
+  /// A sentinel Index indicating no token.
+  constexpr static Index Invalid = std::numeric_limits<Index>::max();
+  struct Range;
+
+  /// The token text.
+  ///
+  /// Typically from the original source file, but may have been synthesized.
+  StringRef text() const { return StringRef(Data, Length); }
+  const char *Data;
+  uint32_t Length;
+
+  /// Zero-based line number.
+  uint32_t Line = 0;
+  /// Width of whitespace before the first token on this line.
+  uint8_t Indent = 0;
+  /// Flags have some meaning defined by the function that produced this stream.
+  uint8_t Flags = 0;
+  // Helpers to get/set Flags based on `enum class`.
+  template <class T> bool flag(T Mask) const {
+    return Flags & uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
+  }
+  template <class T> void setFlag(T Mask) {
+    Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
+  }
+
+  /// The type of token as determined by clang's lexer.
+  clang::tok::TokenKind Kind = clang::tok::unknown;
+  /// If this token is a bracket, the index of the matching bracket.
+  Index Pair = Invalid;
+
+  const Token &next() const { return *(this + 1); }
+  const Token &prev() const { return *(this - 1); }
+  Token &next() { return *(this + 1); }
+  Token &prev() { return *(this - 1); }
+};
+static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
+
+/// A half-open range of tokens within a stream.
+struct Token::Range {
+  Token::Index Begin = 0;
+  Token::Index End = 0;
+
+  uint32_t size() const { return End - Begin; }
+  static Range empty(unsigned Index) { return Range{Index, Index}; }
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
+
+/// A complete sequence of Tokens representing a source file.
+///
+/// This may match a raw file from disk, or be derived from a previous stream.
+/// For example, stripping comments from a TokenStream results in a new stream.
+///
+/// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes:
+///       int      main   (        )        ;
+///   eof kw_int   ident  l_paren  r_paren  semi   eof
+///       front()                           back()
+///       0        1      2        3        4      5
+class TokenStream {
+public:
+  /// Create an empty stream.
+  ///
+  /// Initially, the stream is mutable and not finalized.
+  /// It may only be read after finalize() is called.
+  ///
+  /// Payload is an opaque object which will be owned by the stream.
+  /// e.g. an allocator to hold backing storage for synthesized token text.
+  explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
+
+  /// Append a token to the stream, which must not be finalized.
+  void push(Token T) { Storage.push_back(std::move(T)); }
+
+  /// Finalize the token stream, allowing it to be read, but no longer written.
+  void finalize();
+
+  /// Returns the index of T within the stream.
+  ///
+  /// T must be within the stream or the end sentinel (not the start sentinel).
+  Token::Index index(const Token &T) const {
+    assert(&T != Storage.data() && "start sentinel");
+    assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
+    return &T - Tokens.data();
+  }
+
+  MutableArrayRef<Token> tokens() { return Tokens; }
+  ArrayRef<Token> tokens() const { return Tokens; }
+  MutableArrayRef<Token> tokens(Token::Range R) {
+    return Tokens.slice(R.Begin, R.End);
+  }
+  ArrayRef<Token> tokens(Token::Range R) const {
+    return Tokens.slice(R.Begin, R.End);
+  }
+
+  /// May return the end sentinel if the stream is empty.
+  Token &front() { return Storage[1]; }
+  const Token &front() const { return Storage[1]; }
+
+  /// Print the tokens in this stream to the output stream.
+  ///
+  /// The presence of newlines/spaces is preserved, but not the quantity.
+  void print(llvm::raw_ostream &) const;
+
+private:
+  std::shared_ptr<void> Payload;
+
+  MutableArrayRef<Token> Tokens;
+  std::vector<Token> Storage;
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
+
+/// Extracts a token stream from the source code.
+/// The tokens will reference the data of the provided string.
+TokenStream lex(const std::string &, const clang::LangOptions &);
+/// Token::Flags bits set by lex(); each must be a distinct nonzero bit.
+enum class LexFlags : uint8_t { DirtyIdentifier = 2, StartsPPLine = 1 };
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif
Index: clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
@@ -0,0 +1,145 @@
+//===--- Preprocess.h - Preprocess token streams -----------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// PPStructure: the tree of directives and #if sections in a TokenStream.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
+#define LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
+
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include <vector>
+
+namespace clang {
+class LangOptions;
+namespace syntax {
+namespace pseudo {
+
+/// Describes the structure of a source file, as seen by the preprocessor.
+///
+/// The structure is a tree, whose leaves are plain source code and directives,
+/// and whose internal nodes are #if...#endif sections.
+///
+/// (root)
+/// |-+ Directive                    #include <stdio.h>
+/// |-+ Code                         int main() {
+/// | `                                printf("hello, ");
+/// |-+ Conditional -+ Directive     #ifndef NDEBUG
+/// | |-+ Code                         printf("debug\n");
+/// | |-+ Directive                  #else
+/// | |-+ Code                         printf("production\n");
+/// | `-+ Directive                  #endif
+/// |-+ Code                           return 0;
+///   `                              }
+///
+/// Unlike the clang preprocessor, we model the full tree explicitly.
+/// This class does not recognize macro usage, only directives.
+struct PPStructure {
+  /// A range of code containing no directives.
+  struct Code {
+    Token::Range Tokens;
+  };
+  /// A preprocessor directive.
+  struct Directive {
+    /// Raw tokens making up the directive, starting with `#`.
+    Token::Range Tokens;
+    clang::tok::PPKeywordKind Kind = clang::tok::pp_not_keyword;
+  };
+  /// A preprocessor conditional section.
+  ///
+  /// This starts with an #if, #ifdef, #ifndef etc directive.
+  /// It covers all #else branches, and spans until the matching #endif.
+  struct Conditional {
+    /// The sequence of directives that introduce top-level alternative parses.
+    ///
+    /// The first branch will have an #if type directive.
+    /// Subsequent branches will have #else type directives.
+    std::vector<std::pair<Directive, PPStructure>> Branches;
+    /// The directive terminating the conditional, should be #endif.
+    Directive End;
+  };
+
+  /// Some piece of the file. {One of Code, Directive, Conditional}.
+  class Chunk; // Defined below.
+  std::vector<Chunk> Chunks;
+
+  /// Extract preprocessor structure by examining the raw tokens.
+  static PPStructure parse(const TokenStream &);
+
+  /// Determine heuristically a set of conditional branches to take.
+  ///
+  /// Current heuristics (in preference order):
+  ///  - respect constants: `#if 1`, `#elif false` etc.
+  ///  - avoid paths that reach #error
+  ///  - maximize non-comment tokens seen
+  ///  - maximize number of directives seen
+  void chooseBranches(const TokenStream &) {
+    llvm_unreachable("unimplemented");
+  }
+
+  /// Produce a derived token stream without directives and not-taken branches.
+  ///
+  /// Additionally, raw identifiers are "cooked", converting them to identifiers
+  /// or keywords according to the LangOptions.
+  ///
+  /// The input TokenStream should be the one this structure describes.
+  TokenStream preprocess(const TokenStream &,
+                         const clang::LangOptions &) const {
+    llvm_unreachable("unimplemented");
+  }
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure &);
+
+// FIXME: This approximates std::variant<Code, Directive, Conditional>.
+//         Switch once we can use C++17.
+class PPStructure::Chunk {
+public:
+  enum Kind { K_Empty, K_Code, K_Directive, K_Conditional };
+  Kind kind() const {
+    return CodeVariant        ? K_Code
+           : DirectiveVariant ? K_Directive
+                              : K_Conditional;
+  }
+
+  Chunk() = delete;
+  Chunk(const Chunk &) = delete;
+  Chunk(Chunk &&) = default;
+  Chunk &operator=(const Chunk &) = delete;
+  Chunk &operator=(Chunk &&) = default;
+  ~Chunk() = default;
+
+  // T => Chunk constructor.
+  Chunk(Code C) : CodeVariant(std::move(C)) {}
+  Chunk(Directive C) : DirectiveVariant(std::move(C)) {}
+  Chunk(Conditional C) : ConditionalVariant(std::move(C)) {}
+
+  // Chunk => T& and const T& conversions.
+#define CONVERSION(CONST, V)                                                   \
+  explicit operator CONST V &() CONST { return *V##Variant; }
+  CONVERSION(const, Code);
+  CONVERSION(, Code);
+  CONVERSION(const, Directive);
+  CONVERSION(, Directive);
+  CONVERSION(const, Conditional);
+  CONVERSION(, Conditional);
+#undef CONVERSION
+
+private:
+  // Wasteful, a union variant would be better!
+  llvm::Optional<Code> CodeVariant;
+  llvm::Optional<Directive> DirectiveVariant;
+  llvm::Optional<Conditional> ConditionalVariant;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D119162: [Pseudo] Token/TokenStream, PP directive parser.

Reply via email to