[Pseudo] Token/TokenStream, PP directive parser.

The TokenStream class is the representation of the source code that will
be fed into the GLR parser.

This patch allows a "raw" TokenStream to be built by reading source code.
It also supports scanning a TokenStream to find the directive structure.

Next steps (with placeholders in the code): heuristically choosing a
path through #ifs, preprocessing the code by stripping directives and comments.
These will produce a suitable stream to feed into the parser proper.

Differential Revision: https://reviews.llvm.org/D119162
This commit is contained in:
Sam McCall 2022-02-07 19:11:16 +01:00
parent 70ff6fbeb9
commit 7c1ee5e95f
13 changed files with 1192 additions and 2 deletions

View File

@ -68,6 +68,9 @@ const char *getPunctuatorSpelling(TokenKind Kind) LLVM_READNONE;
/// tokens like 'int' and 'dynamic_cast'. Returns NULL for other token kinds.
const char *getKeywordSpelling(TokenKind Kind) LLVM_READNONE;
/// Returns the spelling of preprocessor keywords, such as "else".
const char *getPPKeywordSpelling(PPKeywordKind Kind) LLVM_READNONE;
/// Return true if this is a raw identifier or an identifier kind.
inline bool isAnyIdentifier(TokenKind K) {
return (K == tok::identifier) || (K == tok::raw_identifier);

View File

@ -0,0 +1,148 @@
//===--- Preprocess.h - Preprocess token streams -----------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The pseudoparser tries to match a token stream to the C++ grammar.
// Preprocessor #defines and other directives are not part of this grammar, and
// should be removed before the file can be parsed.
//
// Conditional blocks like #if...#else...#endif are particularly tricky, as
// simply stripping the directives may not produce a grammatical result:
//
// return
// #ifndef DEBUG
// 1
// #else
// 0
// #endif
// ;
//
// This header supports analyzing and removing the directives in a source file.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
#define LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
#include "clang/Basic/TokenKinds.h"
#include "clang/Tooling/Syntax/Pseudo/Token.h"
#include <vector>
namespace clang {
class LangOptions;
namespace syntax {
namespace pseudo {
/// Describes the structure of a source file, as seen by the preprocessor.
///
/// The structure is a tree, whose leaves are plain source code and directives,
/// and whose internal nodes are #if...#endif sections.
///
/// (root)
/// |-+ Directive #include <stdio.h>
/// |-+ Code int main() {
/// | ` printf("hello, ");
/// |-+ Conditional -+ Directive #ifndef NDEBUG
/// | |-+ Code printf("debug\n");
/// | |-+ Directive #else
/// | |-+ Code printf("production\n");
/// | `-+ Directive #endif
/// |-+ Code return 0;
/// ` }
///
/// Unlike the clang preprocessor, we model the full tree explicitly.
/// This class does not recognize macro usage, only directives.
struct PPStructure {
  /// A range of code (and possibly comments) containing no directives.
  struct Code {
    /// The tokens spanned by this chunk, as indices into the parsed stream.
    Token::Range Tokens;
  };
  /// A preprocessor directive.
  struct Directive {
    /// Raw tokens making up the directive, starting with `#`.
    Token::Range Tokens;
    /// The directive keyword (e.g. pp_if); pp_not_keyword if unrecognized.
    clang::tok::PPKeywordKind Kind = clang::tok::pp_not_keyword;
  };
  /// A preprocessor conditional section.
  ///
  /// This starts with an #if, #ifdef, #ifndef etc directive.
  /// It covers all #else branches, and spans until the matching #endif.
  struct Conditional {
    /// The sequence of directives that introduce top-level alternative parses.
    ///
    /// The first branch will have an #if type directive.
    /// Subsequent branches will have #else type directives.
    std::vector<std::pair<Directive, PPStructure>> Branches;
    /// The directive terminating the conditional, should be #endif.
    Directive End;
  };
  /// Some piece of the file. {One of Code, Directive, Conditional}.
  class Chunk; // Defined below.
  std::vector<Chunk> Chunks;
  /// Extract preprocessor structure by examining the raw tokens.
  static PPStructure parse(const TokenStream &);
  // FIXME: add heuristic selection of conditional branches.
  // FIXME: allow deriving a preprocessed stream
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure &);
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Chunk &);
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Code &);
llvm::raw_ostream &operator<<(llvm::raw_ostream &,
const PPStructure::Directive &);
llvm::raw_ostream &operator<<(llvm::raw_ostream &,
const PPStructure::Conditional &);
// FIXME: This approximates std::variant<Code, Directive, Conditional>.
// Switch once we can use C++17.
class PPStructure::Chunk {
public:
  /// Discriminator for the engaged alternative. K_Empty only occurs for
  /// objects whose variant members are all empty (e.g. after being moved
  /// from, depending on Optional's move semantics).
  enum Kind { K_Empty, K_Code, K_Directive, K_Conditional };
  Kind kind() const {
    return CodeVariant          ? K_Code
           : DirectiveVariant   ? K_Directive
           : ConditionalVariant ? K_Conditional
                                : K_Empty;
  }

  // Move-only: chunks own their payload and must be constructed from one of
  // the three alternatives below.
  Chunk() = delete;
  Chunk(const Chunk &) = delete;
  Chunk(Chunk &&) = default;
  Chunk &operator=(const Chunk &) = delete;
  Chunk &operator=(Chunk &&) = default;
  ~Chunk() = default;

  // T => Chunk constructor.
  Chunk(Code C) : CodeVariant(std::move(C)) {}
  Chunk(Directive C) : DirectiveVariant(std::move(C)) {}
  Chunk(Conditional C) : ConditionalVariant(std::move(C)) {}

  // Chunk => T& and const T& conversions.
  // NOTE: these dereference the Optional unchecked; consult kind() first.
#define CONVERSION(CONST, V)                                                   \
  explicit operator CONST V &() CONST { return *V##Variant; }
  CONVERSION(const, Code);
  CONVERSION(, Code);
  CONVERSION(const, Directive);
  CONVERSION(, Directive);
  CONVERSION(const, Conditional);
  CONVERSION(, Conditional);
#undef CONVERSION

private:
  // Wasteful, a union variant would be better!
  llvm::Optional<Code> CodeVariant;
  llvm::Optional<Directive> DirectiveVariant;
  llvm::Optional<Conditional> ConditionalVariant;
};
} // namespace pseudo
} // namespace syntax
} // namespace clang
#endif

View File

@ -0,0 +1,202 @@
//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Tokens are the first level of abstraction above bytes used in pseudoparsing.
// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
// Each token is wrapped into a pseudo::Token, along with line/indent info.
//
// Unlike clang, we make multiple passes over the whole file, out-of-order.
// Therefore we retain the whole token sequence in memory. (This is feasible as
// we process one file at a time). pseudo::TokenStream holds such a stream.
// The initial stream holds the raw tokens read from the file, later passes
// operate on derived TokenStreams (e.g. with directives stripped).
//
// Similar facilities from clang that are *not* used:
// - SourceManager: designed around multiple files and precise macro expansion.
// - clang::Token: coupled to SourceManager, doesn't retain layout info.
// (pseudo::Token is similar, but without SourceLocations).
// - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
// (pseudo::TokenStream is similar, but a flat token list).
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
#define LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
#include "clang/Basic/LLVM.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
namespace clang {
class LangOptions;
namespace syntax {
namespace pseudo {
/// A single C++ or preprocessor token.
///
/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
/// SourceManager - we are not dealing with multiple files.
struct Token {
  /// An Index identifies a token within a stream.
  using Index = uint32_t;
  /// A sentinel Index indicating no token.
  constexpr static Index Invalid = std::numeric_limits<Index>::max();
  struct Range;

  /// The token text.
  ///
  /// Typically from the original source file, but may have been synthesized.
  StringRef text() const { return StringRef(Data, Length); }
  // Non-owning pointer into the source buffer (or into synthesized storage
  // owned by the stream's payload).
  const char *Data = nullptr;
  uint32_t Length = 0;

  /// Zero-based line number for the start of the token.
  /// This refers to the original source file as written.
  uint32_t Line = 0;
  /// Width of whitespace before the first token on this line.
  uint8_t Indent = 0;
  /// Flags have some meaning defined by the function that produced this stream.
  uint8_t Flags = 0;
  // Helpers to get/set Flags based on `enum class`.
  template <class T> bool flag(T Mask) const {
    return Flags & uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
  }
  template <class T> void setFlag(T Mask) {
    Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
  }

  /// The type of token as determined by clang's lexer.
  clang::tok::TokenKind Kind = clang::tok::unknown;
};
static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
/// A half-open range of tokens within a stream: indices [Begin, End).
struct Token::Range {
  Index Begin = 0;
  Index End = 0;

  /// Number of tokens covered by the range.
  uint32_t size() const { return End - Begin; }
  /// A zero-width range positioned at \p Idx (e.g. a marker between tokens).
  // Note: the parameter was previously named `Index`, shadowing the type.
  static Range emptyAt(Index Idx) { return Range{Idx, Idx}; }
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
/// A complete sequence of Tokens representing a source file.
///
/// This may match a raw file from disk, or be derived from a previous stream.
/// For example, stripping comments from a TokenStream results in a new stream.
///
/// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes:
/// int main ( ) ;
/// eof kw_int ident l_paren r_paren semi eof
/// front() back()
/// 0 1 2 3 4 5
class TokenStream {
public:
  /// Create an empty stream.
  ///
  /// Initially, the stream is appendable and not finalized.
  /// The token sequence may only be accessed after finalize() is called.
  ///
  /// Payload is an opaque object which will be owned by the stream.
  /// e.g. an allocator to hold backing storage for synthesized token text.
  explicit TokenStream(std::shared_ptr<void> Payload = nullptr);

  /// Append a token to the stream, which must not be finalized.
  void push(Token T) {
    assert(!isFinalized());
    Storage.push_back(std::move(T));
  }

  /// Finalize the token stream, allowing tokens to be accessed.
  /// Tokens may no longer be appended.
  void finalize();
  bool isFinalized() const;

  /// Returns the index of T within the stream.
  ///
  /// T must be within the stream or the end sentinel (not the start sentinel).
  Token::Index index(const Token &T) const {
    assert(isFinalized());
    assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
    assert(&T != Storage.data() && "start sentinel");
    // Tokens starts at Storage[1], so the pointer difference is the index.
    return &T - Tokens.data();
  }

  /// All real tokens, excluding the eof sentinels at either end.
  ArrayRef<Token> tokens() const {
    assert(isFinalized());
    return Tokens;
  }
  /// The tokens covered by a half-open Range.
  ArrayRef<Token> tokens(Token::Range R) const {
    return tokens().slice(R.Begin, R.End - R.Begin);
  }

  /// May return the end sentinel if the stream is empty.
  const Token &front() const {
    assert(isFinalized());
    // Storage[0] is the start sentinel; the first real token follows it.
    return Storage[1];
  }

  /// Print the tokens in this stream to the output stream.
  ///
  /// The presence of newlines/spaces is preserved, but not the quantity.
  void print(llvm::raw_ostream &) const;

private:
  std::shared_ptr<void> Payload;

  // View of Storage without the sentinels; empty until finalize().
  MutableArrayRef<Token> Tokens;
  std::vector<Token> Storage; // eof + Tokens + eof
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
/// Extracts a raw token stream from the source code.
///
/// All tokens will reference the data of the provided string.
/// "word-like" tokens such as identifiers and keywords will be raw_identifier.
TokenStream lex(const std::string &, const clang::LangOptions &);
enum class LexFlags : uint8_t {
  /// Marks the token at the start of a logical preprocessor line.
  /// This is a position where a directive might start.
  ///
  /// Here, the first # is StartsPPLine, but second is not (same logical line):
  ///   #define X(error) \
  ///   #error // not a directive!
  ///
  /// Careful, the directive may not start exactly on the StartsPPLine token:
  ///   /*comment*/ #include <foo.h>
  StartsPPLine = 1 << 0,
  /// Marks tokens containing trigraphs, escaped newlines, UCNs etc.
  /// The text() of such tokens will contain the raw trigraph.
  NeedsCleaning = 1 << 1,
};
/// Derives a token stream by decoding escapes and interpreting raw_identifiers.
///
/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and
/// their backing data is owned by the returned stream.
/// raw_identifier tokens are assigned specific types (identifier, keyword etc).
///
/// The StartsPPLine flag is preserved.
///
/// Formally the identifier correctly happens before preprocessing, while we
/// should only cook raw_identifiers that survive preprocessing.
/// However, ignoring the Token::Kind of tokens in directives achieves the same.
/// (And having cooked token kinds in PP-disabled sections is useful for us).
TokenStream cook(const TokenStream &, const clang::LangOptions &);
} // namespace pseudo
} // namespace syntax
} // namespace clang
#endif

View File

@ -46,6 +46,15 @@ const char *tok::getKeywordSpelling(TokenKind Kind) {
return nullptr;
}
const char *tok::getPPKeywordSpelling(tok::PPKeywordKind Kind) {
  switch (Kind) {
// Expands to `case tok::pp_if: return "if";` etc for each directive keyword
// listed in TokenKinds.def.
#define PPKEYWORD(x) case tok::pp_##x: return #x;
#include "clang/Basic/TokenKinds.def"
  default: break;
  }
  // Not a recognized preprocessor keyword (e.g. pp_not_keyword).
  return nullptr;
}
bool tok::isAnnotation(TokenKind Kind) {
switch (Kind) {
#define ANNOTATION(X) case annot_ ## X: return true;

View File

@ -3,9 +3,12 @@ set(LLVM_LINK_COMPONENTS Support)
add_clang_library(clangToolingSyntaxPseudo
Grammar.cpp
GrammarBNF.cpp
Lex.cpp
LRGraph.cpp
LRTable.cpp
LRTableBuild.cpp
Preprocess.cpp
Token.cpp
LINK_LIBS
clangBasic

View File

@ -0,0 +1,114 @@
//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/LiteralSupport.h"
#include "clang/Tooling/Syntax/Pseudo/Token.h"
namespace clang {
namespace syntax {
namespace pseudo {
TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
  clang::SourceLocation Start;
  // Tokenize using clang's lexer in raw mode.
  // std::string guarantees null-termination, which the lexer needs.
  clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
                     Code.data() + Code.size());
  Lexer.SetCommentRetentionState(true); // Comments become tokens, not skipped.

  TokenStream Result;
  clang::Token CT;
  // Offset of the previous token's start; the gap [LastOffset, Offset) is
  // scanned below to count newlines crossed between tokens.
  unsigned LastOffset = 0;
  unsigned Line = 0;
  unsigned Indent = 0;
  for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
       Lexer.LexFromRawLexer(CT)) {
    // Raw-encoding difference yields the byte offset into Code, since the
    // lexer was constructed with Start as the buffer's location.
    unsigned Offset =
        CT.getLocation().getRawEncoding() - Start.getRawEncoding();

    Token Tok;
    Tok.Data = &Code[Offset];
    Tok.Length = CT.getLength();
    Tok.Kind = CT.getKind();

    // Update current line number and indentation from raw source code.
    // NewLineStart is the offset just past the last '\n' seen (0 if none).
    unsigned NewLineStart = 0;
    for (unsigned i = LastOffset; i < Offset; ++i) {
      if (Code[i] == '\n') {
        NewLineStart = i + 1;
        ++Line;
      }
    }
    // Recompute indent when this token begins a new line (or is the very
    // first token, LastOffset == 0); otherwise reuse the previous indent.
    if (NewLineStart || !LastOffset) {
      Indent = 0;
      for (char c : StringRef(Code).slice(NewLineStart, Offset)) {
        if (c == ' ')
          ++Indent;
        else if (c == '\t')
          Indent += 8; // A tab counts as 8 columns.
        else
          break;
      }
    }
    Tok.Indent = Indent;
    Tok.Line = Line;

    if (CT.isAtStartOfLine())
      Tok.setFlag(LexFlags::StartsPPLine);
    // Tokens with escaped newlines/trigraphs/UCNs are cleaned later by cook().
    if (CT.needsCleaning() || CT.hasUCN())
      Tok.setFlag(LexFlags::NeedsCleaning);

    Result.push(Tok);
    LastOffset = Offset;
  }
  Result.finalize();
  return Result;
}
TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
  // Owns the backing bytes of any token text rewritten below; passed to the
  // result stream as its payload so the text outlives this function.
  auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
  clang::IdentifierTable Identifiers(LangOpts);
  TokenStream Result(CleanedStorage);

  // Note: Tok is a copy, so mutating it does not affect the input stream.
  for (auto Tok : Code.tokens()) {
    if (Tok.flag(LexFlags::NeedsCleaning)) {
      // Remove escaped newlines and trigraphs.
      llvm::SmallString<64> CleanBuffer;
      const char *Pos = Tok.text().begin();
      while (Pos < Tok.text().end()) {
        unsigned CharSize = 0;
        CleanBuffer.push_back(
            clang::Lexer::getCharAndSizeNoWarn(Pos, CharSize, LangOpts));
        assert(CharSize != 0 && "no progress!");
        Pos += CharSize;
      }
      // Remove universal character names (UCN).
      llvm::SmallString<64> UCNBuffer;
      clang::expandUCNs(UCNBuffer, CleanBuffer);

      // Point the token at a stable copy of the cleaned text.
      llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage);
      Tok.Data = Text.data();
      Tok.Length = Text.size();
      Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
    }

    // Cook raw_identifiers into identifier, keyword, etc.
    if (Tok.Kind == tok::raw_identifier)
      Tok.Kind = Identifiers.get(Tok.text()).getTokenID();

    Result.push(std::move(Tok));
  }
  Result.finalize();
  return Result;
}
} // namespace pseudo
} // namespace syntax
} // namespace clang

View File

@ -0,0 +1,206 @@
//===--- Preprocess.cpp - Preprocess token streams ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/Support/FormatVariadic.h"
namespace clang {
namespace syntax {
namespace pseudo {
namespace {
/// Recursive-descent parser turning a token stream into a PPStructure tree.
/// A single forward pass; Tok is the cursor into the finalized stream.
class PPParser {
public:
  explicit PPParser(const TokenStream &Code) : Code(Code), Tok(&Code.front()) {}
  void parse(PPStructure *Result) { parse(Result, /*TopLevel=*/true); }

private:
  // Roles that a directive might take within a conditional block.
  enum class Cond { None, If, Else, End };
  static Cond classifyDirective(tok::PPKeywordKind K) {
    switch (K) {
    case clang::tok::pp_if:
    case clang::tok::pp_ifdef:
    case clang::tok::pp_ifndef:
      return Cond::If;
    case clang::tok::pp_elif:
    case clang::tok::pp_elifdef:
    case clang::tok::pp_elifndef:
    case clang::tok::pp_else:
      return Cond::Else;
    case clang::tok::pp_endif:
      return Cond::End;
    default:
      return Cond::None;
    }
  }

  // Parses tokens starting at Tok into PP.
  // If we reach an End or Else directive that ends PP, returns it.
  // If TopLevel is true, then we do not expect End and always return None.
  llvm::Optional<PPStructure::Directive> parse(PPStructure *PP, bool TopLevel) {
    // The mutable init-capture persists across calls: after a comment at the
    // start of a PP-line, the hash may appear one token later.
    auto StartsDirective =
        [&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
          if (Tok->flag(LexFlags::StartsPPLine)) {
            // If we considered a comment at the start of a PP-line, it doesn't
            // start a directive but the directive can still start after it.
            if (Tok->Kind == tok::comment)
              AllowDirectiveAt = Tok + 1;
            return Tok->Kind == tok::hash;
          }
          return Tok->Kind == tok::hash && AllowDirectiveAt == Tok;
        };
    // Each iteration adds one chunk (or returns, if we see #endif).
    while (Tok->Kind != tok::eof) {
      // If there's no directive here, we have a code chunk.
      if (!StartsDirective()) {
        const Token *Start = Tok;
        do
          ++Tok;
        while (Tok->Kind != tok::eof && !StartsDirective());
        PP->Chunks.push_back(PPStructure::Code{
            Token::Range{Code.index(*Start), Code.index(*Tok)}});
        continue;
      }

      // We have some kind of directive.
      PPStructure::Directive Directive;
      parseDirective(&Directive);
      Cond Kind = classifyDirective(Directive.Kind);
      if (Kind == Cond::If) {
        // #if or similar, starting a nested conditional block.
        PPStructure::Conditional Conditional;
        Conditional.Branches.emplace_back();
        Conditional.Branches.back().first = std::move(Directive);
        parseConditional(&Conditional);
        PP->Chunks.push_back(std::move(Conditional));
      } else if ((Kind == Cond::Else || Kind == Cond::End) && !TopLevel) {
        // #endif or similar, ending this PPStructure scope.
        // (#endif is unexpected at the top level, treat as simple directive).
        return std::move(Directive);
      } else {
        // #define or similar, a simple directive at the current scope.
        PP->Chunks.push_back(std::move(Directive));
      }
    }
    return None;
  }

  // Parse the rest of a conditional section, after seeing the If directive.
  // Returns after consuming the End directive.
  void parseConditional(PPStructure::Conditional *C) {
    assert(C->Branches.size() == 1 &&
           C->Branches.front().second.Chunks.empty() &&
           "Should be ready to parse first branch body");
    while (Tok->Kind != tok::eof) {
      auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false);
      if (!Terminator) {
        // Unterminated conditional: record an empty End range at eof.
        assert(Tok->Kind == tok::eof && "gave up parsing before eof?");
        C->End.Tokens = Token::Range::emptyAt(Code.index(*Tok));
        return;
      }
      if (classifyDirective(Terminator->Kind) == Cond::End) {
        C->End = std::move(*Terminator);
        return;
      }
      // An #else/#elif directive starts the next branch.
      assert(classifyDirective(Terminator->Kind) == Cond::Else &&
             "ended branch unexpectedly");
      C->Branches.emplace_back();
      C->Branches.back().first = std::move(*Terminator);
    }
  }

  // Parse a directive. Tok is the hash.
  void parseDirective(PPStructure::Directive *D) {
    assert(Tok->Kind == tok::hash);

    // Directive spans from the hash until the end of line or file.
    const Token *Begin = Tok++;
    while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine))
      ++Tok;
    ArrayRef<Token> Tokens{Begin, Tok};
    D->Tokens = {Code.index(*Tokens.begin()), Code.index(*Tokens.end())};

    // Directive name is the first non-comment token after the hash.
    Tokens = Tokens.drop_front().drop_while(
        [](const Token &T) { return T.Kind == tok::comment; });
    if (!Tokens.empty())
      D->Kind = PPKeywords.get(Tokens.front().text()).getPPKeywordID();
  }

  const TokenStream &Code;           // The stream being parsed.
  const Token *Tok;                  // Cursor: next unconsumed token.
  clang::IdentifierTable PPKeywords; // Recognizes directive keywords.
};
} // namespace
PPStructure PPStructure::parse(const TokenStream &Code) {
  // Delegate to the stateful single-pass parser, filling a fresh tree.
  PPStructure Tree;
  PPParser Parser(Code);
  Parser.parse(&Tree);
  return Tree;
}
static void dump(llvm::raw_ostream &OS, const PPStructure &, unsigned Indent);
// Render a directive summary, e.g. `#include (8 tokens)`.
static void dump(llvm::raw_ostream &OS, const PPStructure::Directive &D,
                 unsigned Indent) {
  const char *Name = tok::getPPKeywordSpelling(D.Kind);
  OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n", Name,
                                     D.Tokens.size());
}
// Render a plain-code chunk, summarized by its token count only.
static void dump(llvm::raw_ostream &OS, const PPStructure::Code &C,
                 unsigned Indent) {
  uint32_t NumTokens = C.Tokens.size();
  OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n", NumTokens);
}
// Render a conditional: each branch's directive, then its body nested two
// columns deeper; the terminating directive aligns with the branches.
static void dump(llvm::raw_ostream &OS,
                 const PPStructure::Conditional &Conditional, unsigned Indent) {
  for (const auto &B : Conditional.Branches) {
    dump(OS, B.first, Indent);      // #if/#elif/#else directive.
    dump(OS, B.second, Indent + 2); // Branch body.
  }
  dump(OS, Conditional.End, Indent); // #endif (or empty if unterminated).
}
// Dispatch on the chunk's runtime kind to the typed dump overload above.
// The casts are the Chunk's explicit conversion operators; safe because we
// checked kind() first.
static void dump(llvm::raw_ostream &OS, const PPStructure::Chunk &Chunk,
                 unsigned Indent) {
  switch (Chunk.kind()) {
  case PPStructure::Chunk::K_Empty:
    llvm_unreachable("invalid chunk");
  case PPStructure::Chunk::K_Code:
    return dump(OS, (const PPStructure::Code &)Chunk, Indent);
  case PPStructure::Chunk::K_Directive:
    return dump(OS, (const PPStructure::Directive &)Chunk, Indent);
  case PPStructure::Chunk::K_Conditional:
    return dump(OS, (const PPStructure::Conditional &)Chunk, Indent);
  }
}
// A structure prints as its chunks in order; it has no header of its own.
static void dump(llvm::raw_ostream &OS, const PPStructure &PP,
                 unsigned Indent) {
  for (const PPStructure::Chunk &C : PP.Chunks)
    dump(OS, C, Indent);
}
// Define operator<< in terms of dump() functions above.
// Each expansion emits the operator for one type, printing at indent 0.
#define OSTREAM_DUMP(Type)                                                     \
  llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Type &T) {        \
    dump(OS, T, 0);                                                            \
    return OS;                                                                 \
  }
OSTREAM_DUMP(PPStructure)
OSTREAM_DUMP(PPStructure::Chunk)
OSTREAM_DUMP(PPStructure::Directive)
OSTREAM_DUMP(PPStructure::Conditional)
OSTREAM_DUMP(PPStructure::Code)
// Keep the helper macro file-local.
#undef OSTREAM_DUMP
} // namespace pseudo
} // namespace syntax
} // namespace clang

View File

@ -0,0 +1,98 @@
//===--- Token.cpp - Tokens and token streams in the pseudoparser ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/Pseudo/Token.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormatVariadic.h"
namespace clang {
namespace syntax {
namespace pseudo {
// Render one token as: `<kind> <line>:<indent> "<escaped text>"[ flags=<hex>]`.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
  OS << llvm::formatv("{0} {1}:{2} ", clang::tok::getTokenName(T.Kind), T.Line,
                      T.Indent);
  // Quote and escape the text so the token stays on a single output line.
  OS << '"';
  llvm::printEscapedString(T.text(), OS);
  OS << '"';
  // Flag bits are producer-defined; print raw hex only when any are set.
  if (T.Flags)
    OS << llvm::format(" flags=%x", T.Flags);
  return OS;
}
// Render the whole stream as an aligned table, one token per row.
// The column widths here are matched by lit tests (e.g. test/Syntax/lex.c).
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const TokenStream &TS) {
  OS << "Index Kind Line Text\n";
  for (const auto &T : TS.tokens()) {
    OS << llvm::format("%5d: %16s %4d:%-2d ", TS.index(T),
                       clang::tok::getTokenName(T.Kind), T.Line, T.Indent);
    OS << '"';
    llvm::printEscapedString(T.text(), OS);
    OS << '"';
    if (T.Flags)
      OS << llvm::format(" flags=%x", T.Flags);
    OS << '\n';
  }
  return OS;
}
// Half-open interval notation, e.g. `[3,7)`.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token::Range &R) {
  return OS << llvm::formatv("[{0},{1})", R.Begin, R.End);
}
TokenStream::TokenStream(std::shared_ptr<void> Payload)
    : Payload(std::move(Payload)) {
  // Seed the stream with the leading eof sentinel; push() appends after it.
  Token BeginSentinel;
  BeginSentinel.Kind = clang::tok::eof;
  Storage.push_back(std::move(BeginSentinel));
}
void TokenStream::finalize() {
  assert(!isFinalized());
  // Read the last real token's line BEFORE appending the sentinel.
  unsigned LastLine = Storage.back().Line;
  Storage.emplace_back();
  Storage.back().Kind = tok::eof;
  // Place the trailing eof on its own line, past all real tokens.
  Storage.back().Line = LastLine + 1;

  // Tokens views Storage without the two sentinels.
  Tokens = Storage;
  Tokens = Tokens.drop_front().drop_back();
}
bool TokenStream::isFinalized() const {
  // The constructor always pushes the start sentinel, so Storage is never
  // empty and its first element is always eof.
  assert(!Storage.empty() && Storage.front().Kind == tok::eof);
  // finalize() appends a trailing eof; a lone sentinel means "still open".
  return Storage.size() > 1 && Storage.back().Kind == tok::eof;
}
void TokenStream::print(llvm::raw_ostream &OS) const {
bool FirstToken = true;
unsigned LastLine = -1;
StringRef LastText;
for (const auto &T : tokens()) {
StringRef Text = T.text();
if (FirstToken) {
FirstToken = false;
} else if (T.Line == LastLine) {
if (LastText.data() + LastText.size() != Text.data())
OS << ' ';
} else {
OS << '\n';
OS.indent(T.Indent);
}
OS << Text;
LastLine = T.Line;
LastText = Text;
}
if (!FirstToken)
OS << '\n';
}
} // namespace pseudo
} // namespace syntax
} // namespace clang

52
clang/test/Syntax/lex.c Normal file
View File

@ -0,0 +1,52 @@
int is_debug() {
#ifndef NDEBUG
return 1; // in debug mode
#else
return 0;
#endif
}
/* This comment gets lexed along with the input above! We just don't CHECK it.
RUN: clang-pseudo -source %s -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace
SOURCE: int is_debug() {
SOURCE-NEXT: #ifndef NDEBUG
SOURCE-NEXT: return 1; // in debug mode
SOURCE-NEXT: #else
SOURCE-NEXT: return 0;
 SOURCE-NEXT: #endif
SOURCE-NEXT: }
RUN: clang-pseudo -source %s -print-tokens | FileCheck %s -check-prefix=TOKEN
TOKEN: 0: raw_identifier 0:0 "int" flags=1
TOKEN-NEXT: raw_identifier 0:0 "is_debug"
TOKEN-NEXT: l_paren 0:0 "("
TOKEN-NEXT: r_paren 0:0 ")"
TOKEN-NEXT: l_brace 0:0 "{"
TOKEN-NEXT: hash 1:0 "#" flags=1
TOKEN-NEXT: raw_identifier 1:0 "ifndef"
TOKEN-NEXT: raw_identifier 1:0 "NDEBUG"
TOKEN-NEXT: raw_identifier 2:2 "return" flags=1
TOKEN-NEXT: numeric_constant 2:2 "1"
TOKEN-NEXT: semi 2:2 ";"
TOKEN-NEXT: comment 2:2 "// in debug mode"
TOKEN-NEXT: hash 3:0 "#" flags=1
TOKEN-NEXT: raw_identifier 3:0 "else"
TOKEN-NEXT: raw_identifier 4:2 "return" flags=1
TOKEN-NEXT: numeric_constant 4:2 "0"
TOKEN-NEXT: semi 4:2 ";"
TOKEN-NEXT: hash 5:0 "#" flags=1
TOKEN-NEXT: raw_identifier 5:0 "endif"
TOKEN-NEXT: r_brace 6:0 "}" flags=1
RUN: clang-pseudo -source %s -print-pp-structure | FileCheck %s -check-prefix=PPS --strict-whitespace
PPS: code (5 tokens)
PPS-NEXT: #ifndef (3 tokens)
PPS-NEXT: code (4 tokens)
PPS-NEXT: #else (2 tokens)
PPS-NEXT: code (3 tokens)
PPS-NEXT: #endif (2 tokens)
PPS-NEXT: code (2 tokens)
^ including this block comment
*******************************************************************************/

View File

@ -6,9 +6,12 @@
//
//===----------------------------------------------------------------------===//
#include "clang/Basic/LangOptions.h"
#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
#include "clang/Tooling/Syntax/Pseudo/Token.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
@ -25,13 +28,19 @@ static opt<bool> PrintGraph("print-graph",
desc("Print the LR graph for the grammar"));
static opt<bool> PrintTable("print-table",
desc("Print the LR table for the grammar"));
static opt<std::string> Source("source", desc("Source file"));
static opt<bool> PrintSource("print-source", desc("Print token stream"));
static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
static opt<bool>
PrintPPStructure("print-pp-structure",
desc("Print directive structure of source code"));
static std::string readOrDie(llvm::StringRef Path) {
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
llvm::MemoryBuffer::getFile(Path);
if (std::error_code EC = Text.getError()) {
llvm::errs() << "Error: can't read file '" << Path << "': " << EC.message()
<< "\n";
llvm::errs() << "Error: can't read grammar file '" << Path
<< "': " << EC.message() << "\n";
::exit(1);
}
return Text.get()->getBuffer().str();
@ -60,5 +69,19 @@ int main(int argc, char *argv[]) {
return 0;
}
if (Source.getNumOccurrences()) {
std::string Text = readOrDie(Source);
clang::LangOptions LangOpts; // FIXME: use real options.
auto Stream = clang::syntax::pseudo::lex(Text, LangOpts);
auto Structure = clang::syntax::pseudo::PPStructure::parse(Stream);
if (PrintPPStructure)
llvm::outs() << Structure;
if (PrintSource)
Stream.print(llvm::outs());
if (PrintTokens)
llvm::outs() << Stream;
}
return 0;
}

View File

@ -5,6 +5,8 @@ set(LLVM_LINK_COMPONENTS
add_clang_unittest(ClangPseudoTests
GrammarTest.cpp
LRTableTest.cpp
PreprocessTest.cpp
TokenTest.cpp
)
clang_target_link_libraries(ClangPseudoTests

View File

@ -0,0 +1,152 @@
//===--- TokenTest.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Tooling/Syntax/Pseudo/Token.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace clang {
namespace syntax {
namespace pseudo {
namespace {
using testing::_;
using testing::ElementsAre;
using testing::Matcher;
using testing::Pair;
using testing::StrEq;
using Chunk = PPStructure::Chunk;
// Matches any value with a `.Tokens` Token::Range whose token texts from
// stream TS, joined with single spaces, equal the expected string.
MATCHER_P2(tokensAre, TS, Tokens, "tokens are " + std::string(Tokens)) {
  std::vector<llvm::StringRef> Texts;
  for (const Token &Tok : TS.tokens(arg.Tokens))
    Texts.push_back(Tok.text());
  return Matcher<std::string>(StrEq(Tokens))
      .MatchAndExplain(llvm::join(Texts, " "), result_listener);
}

// Matches a PPStructure::Chunk by its runtime kind() discriminator.
MATCHER_P(chunkKind, K, "") { return arg.kind() == K; }
// End-to-end: lex+cook a snippet, then verify the parsed directive tree,
// including a nested #if/#else inside an #ifdef/#elif conditional.
TEST(PPStructure, Parse) {
  LangOptions Opts;
  std::string Code = R"cpp(
#include <foo.h>
int main() {
#ifdef HAS_FOO
#if HAS_BAR
foo(bar);
#else
foo(0)
#endif
#elif NEEDS_FOO
#error missing_foo
#endif
}
)cpp";

  TokenStream S = cook(lex(Code, Opts), Opts);
  PPStructure PP = PPStructure::parse(S);

  // Top level: #include, code, the #ifdef conditional, the closing brace.
  ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Directive),
                                     chunkKind(Chunk::K_Code),
                                     chunkKind(Chunk::K_Conditional),
                                     chunkKind(Chunk::K_Code)));

  EXPECT_THAT((const PPStructure::Directive &)PP.Chunks[0],
              tokensAre(S, "# include < foo . h >"));
  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[1],
              tokensAre(S, "int main ( ) {"));
  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[3], tokensAre(S, "}"));

  // The conditional has two branches (#ifdef, #elif) and a matching #endif.
  const PPStructure::Conditional &Ifdef(PP.Chunks[2]);
  EXPECT_THAT(Ifdef.Branches,
              ElementsAre(Pair(tokensAre(S, "# ifdef HAS_FOO"), _),
                          Pair(tokensAre(S, "# elif NEEDS_FOO"), _)));
  EXPECT_THAT(Ifdef.End, tokensAre(S, "# endif"));

  const PPStructure &HasFoo(Ifdef.Branches[0].second);
  const PPStructure &NeedsFoo(Ifdef.Branches[1].second);

  // First branch body is exactly the nested #if/#else conditional.
  EXPECT_THAT(HasFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Conditional)));
  const PPStructure::Conditional &If(HasFoo.Chunks[0]);
  EXPECT_THAT(If.Branches, ElementsAre(Pair(tokensAre(S, "# if HAS_BAR"), _),
                                       Pair(tokensAre(S, "# else"), _)));
  EXPECT_THAT(If.Branches[0].second.Chunks,
              ElementsAre(chunkKind(Chunk::K_Code)));
  EXPECT_THAT(If.Branches[1].second.Chunks,
              ElementsAre(chunkKind(Chunk::K_Code)));

  // Second branch holds only the #error directive, with its keyword resolved.
  EXPECT_THAT(NeedsFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Directive)));
  const PPStructure::Directive &Error(NeedsFoo.Chunks[0]);
  EXPECT_THAT(Error, tokensAre(S, "# error missing_foo"));
  EXPECT_EQ(Error.Kind, tok::pp_error);
}
// A #define split across line continuations and interleaved with block
// comments must still be recognized as one directive chunk.
TEST(PPStructure, ParseUgly) {
  LangOptions LO;
  std::string Source = R"cpp(
/*A*/ # /*B*/ \
/*C*/ \
define \
BAR /*D*/
/*E*/
)cpp";
  TokenStream Stream = cook(lex(Source, LO), LO);
  PPStructure Structure = PPStructure::parse(Stream);
  ASSERT_THAT(Structure.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
                                            chunkKind(Chunk::K_Directive),
                                            chunkKind(Chunk::K_Code)));
  // Code chunks on either side of the directive.
  EXPECT_THAT((const PPStructure::Code &)Structure.Chunks[0],
              tokensAre(Stream, "/*A*/"));
  EXPECT_THAT((const PPStructure::Code &)Structure.Chunks[2],
              tokensAre(Stream, "/*E*/"));
  // The directive spans everything from "#" through the trailing comment.
  const PPStructure::Directive &Define(Structure.Chunks[1]);
  EXPECT_THAT(Define, tokensAre(Stream, "# /*B*/ /*C*/ define BAR /*D*/"));
  EXPECT_EQ(Define.Kind, tok::pp_define);
}
// Mismatched directives: a stray #endif becomes a plain directive chunk, and
// an unterminated #if is closed implicitly at end-of-file.
TEST(PPStructure, ParseBroken) {
  LangOptions Opts;
  std::string Code = R"cpp(
a
#endif // mismatched
#if X
b
)cpp";
  TokenStream S = cook(lex(Code, Opts), Opts);
  PPStructure PP = PPStructure::parse(S);
  ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
                                     chunkKind(Chunk::K_Directive),
                                     chunkKind(Chunk::K_Conditional)));
  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[0], tokensAre(S, "a"));
  // The unmatched #endif is kept as an ordinary directive.
  const PPStructure::Directive &Endif(PP.Chunks[1]);
  EXPECT_EQ(Endif.Kind, tok::pp_endif);
  EXPECT_THAT(Endif, tokensAre(S, "# endif // mismatched"));
  const PPStructure::Conditional &X(PP.Chunks[2]);
  // ASSERT (not EXPECT): the .front() accesses below are undefined behavior
  // if Branches is empty, so the test must stop here on failure.
  ASSERT_EQ(1u, X.Branches.size());
  // The (only) branch of the broken conditional section runs until eof.
  EXPECT_EQ(tok::pp_if, X.Branches.front().first.Kind);
  EXPECT_THAT(X.Branches.front().second.Chunks,
              ElementsAre(chunkKind(Chunk::K_Code)));
  // The missing terminating directive is marked as pp_not_keyword.
  EXPECT_EQ(tok::pp_not_keyword, X.End.Kind);
  EXPECT_EQ(0u, X.End.Tokens.size());
}
} // namespace
} // namespace pseudo
} // namespace syntax
} // namespace clang

View File

@ -0,0 +1,178 @@
//===--- TokenTest.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/Pseudo/Token.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/TokenKinds.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace clang {
namespace syntax {
namespace pseudo {
namespace {
using testing::AllOf;
using testing::ElementsAre;
using testing::ElementsAreArray;
using testing::Not;
// Matches a Token by kind and exact spelling.
MATCHER_P2(token, Text, Kind, "") {
  return arg.Kind == Kind && arg.text() == Text;
}
// Matches a Token carrying the given LexFlags bit.
MATCHER_P(hasFlag, Flag, "") { return arg.flag(Flag); }
// Matches a Token's recorded source line number and line indentation.
MATCHER_P2(lineIndent, Line, Indent, "") {
  return arg.Line == (unsigned)Line && arg.Indent == (unsigned)Indent;
}
// Lexes a small file and checks both the raw token stream and the cooked
// stream (raw identifiers resolved to keywords/identifiers).
TEST(TokenTest, Lex) {
  LangOptions Opts;
  std::string Code = R"cpp(
#include <stdio.h>
int main() {
return 42; // the answer
}
)cpp";
  TokenStream Raw = lex(Code, Opts);
  ASSERT_TRUE(Raw.isFinalized());
  EXPECT_THAT(Raw.tokens(),
              ElementsAreArray({
                  // Lexing of directives is weird, especially <angled> strings.
                  token("#", tok::hash),
                  token("include", tok::raw_identifier),
                  token("<", tok::less),
                  token("stdio", tok::raw_identifier),
                  token(".", tok::period),
                  token("h", tok::raw_identifier),
                  token(">", tok::greater),
                  token("int", tok::raw_identifier),
                  token("main", tok::raw_identifier),
                  token("(", tok::l_paren),
                  token(")", tok::r_paren),
                  token("{", tok::l_brace),
                  token("return", tok::raw_identifier),
                  token("42", tok::numeric_constant),
                  token(";", tok::semi),
                  token("// the answer", tok::comment),
                  token("}", tok::r_brace),
              }));
  TokenStream Cooked = cook(Raw, Opts);
  ASSERT_TRUE(Cooked.isFinalized());
  EXPECT_THAT(Cooked.tokens(),
              ElementsAreArray({
                  // Cooked identifier types in directives are not meaningful.
                  token("#", tok::hash),
                  token("include", tok::identifier),
                  token("<", tok::less),
                  token("stdio", tok::identifier),
                  token(".", tok::period),
                  token("h", tok::identifier),
                  token(">", tok::greater),
                  // Outside directives, keywords are recognized by cook().
                  token("int", tok::kw_int),
                  token("main", tok::identifier),
                  token("(", tok::l_paren),
                  token(")", tok::r_paren),
                  token("{", tok::l_brace),
                  token("return", tok::kw_return),
                  token("42", tok::numeric_constant),
                  token(";", tok::semi),
                  token("// the answer", tok::comment),
                  token("}", tok::r_brace),
              }));
  // Check raw tokens point back into original source code.
  EXPECT_EQ(Raw.tokens().front().text().begin(), &Code[Code.find('#')]);
}
// Backslash-newline splices: the raw stream keeps the original spelling and
// flags tokens that need cleaning; cook() yields the spliced spellings.
TEST(TokenTest, LineContinuation) {
  LangOptions Opts;
  std::string Code = R"cpp(
one_\
token
two \
tokens
)cpp";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(
      Raw.tokens(),
      // The splice inside an identifier keeps "one_\\\ntoken" as one raw
      // token that needs cleaning; the splice between tokens attaches the
      // "\\\n" to the following token instead.
      ElementsAre(AllOf(token("one_\\\ntoken", tok::raw_identifier),
                        hasFlag(LexFlags::StartsPPLine),
                        hasFlag(LexFlags::NeedsCleaning), lineIndent(1, 0)),
                  AllOf(token("two", tok::raw_identifier),
                        hasFlag(LexFlags::StartsPPLine),
                        Not(hasFlag(LexFlags::NeedsCleaning))),
                  AllOf(token("\\\ntokens", tok::raw_identifier),
                        Not(hasFlag(LexFlags::StartsPPLine)),
                        hasFlag(LexFlags::NeedsCleaning))));
  TokenStream Cooked = cook(Raw, Opts);
  EXPECT_THAT(
      Cooked.tokens(),
      ElementsAre(AllOf(token("one_token", tok::identifier), lineIndent(1, 0)),
                  token("two", tok::identifier),
                  token("tokens", tok::identifier)));
}
// Trigraphs, digraphs, alternative operator spellings, and UCNs: the raw
// stream records interpreted kinds but original spellings; cook() performs
// the textual substitutions.
TEST(TokenTest, EncodedCharacters) {
  LangOptions Opts;
  Opts.Trigraphs = true;
  Opts.Digraphs = true;
  Opts.C99 = true; // UCNs
  Opts.CXXOperatorNames = true;
  std::string Code = R"(and <: ??! '??=' \u00E9)";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(
      Raw.tokens(),
      ElementsAre( // and is not recognized as && until cook().
          AllOf(token("and", tok::raw_identifier),
                Not(hasFlag(LexFlags::NeedsCleaning))),
          // Digraphs are just different spellings of tokens.
          AllOf(token("<:", tok::l_square),
                Not(hasFlag(LexFlags::NeedsCleaning))),
          // Trigraps are interpreted, still need text cleaning.
          AllOf(token(R"(??!)", tok::pipe), hasFlag(LexFlags::NeedsCleaning)),
          // Trigraphs must be substituted inside constants too.
          AllOf(token(R"('??=')", tok::char_constant),
                hasFlag(LexFlags::NeedsCleaning)),
          // UCNs need substitution.
          AllOf(token(R"(\u00E9)", tok::raw_identifier),
                hasFlag(LexFlags::NeedsCleaning))));
  TokenStream Cooked = cook(Raw, Opts);
  EXPECT_THAT(
      Cooked.tokens(),
      ElementsAre(token("and", tok::ampamp), // alternate spelling recognized
                  token("<:", tok::l_square),
                  token("|", tok::pipe), // trigraph substituted
                  token("'#'", tok::char_constant), // trigraph substituted
                  token("é", tok::identifier))); // UCN substituted
}
// Line/indent bookkeeping: every token records its line number and that
// line's indentation; a continued line keeps its own indent.
// NOTE(review): the expected indents below (e.g. lineIndent(0, 3) and
// lineIndent(2, 2)) imply leading spaces inside the raw string that this
// listing appears to have stripped — confirm against the original source.
TEST(TokenTest, Indentation) {
  LangOptions Opts;
  std::string Code = R"cpp( hello world
no_indent \
line_was_continued
)cpp";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(Raw.tokens(), ElementsAreArray({
                                lineIndent(0, 3), // hello
                                lineIndent(0, 3), // world
                                lineIndent(1, 0), // no_indent
                                lineIndent(2, 2), // line_was_continued
                            }));
}
} // namespace
} // namespace pseudo
} // namespace syntax
} // namespace clang