[Pseudo] Token/TokenStream, PP directive parser.

The TokenStream class is the representation of the source code that will
be fed into the GLR parser.

This patch allows a "raw" TokenStream to be built by reading source code.
It also supports scanning a TokenStream to find the directive structure.

Next steps (with placeholders in the code): heuristically choosing a
path through #ifs, preprocessing the code by stripping directives and comments.
These will produce a suitable stream to feed into the parser proper.

Differential Revision: https://reviews.llvm.org/D119162
This commit is contained in:
Sam McCall 2022-02-07 19:11:16 +01:00
parent 70ff6fbeb9
commit 7c1ee5e95f
13 changed files with 1192 additions and 2 deletions

View File

@ -68,6 +68,9 @@ const char *getPunctuatorSpelling(TokenKind Kind) LLVM_READNONE;
/// tokens like 'int' and 'dynamic_cast'. Returns NULL for other token kinds.
const char *getKeywordSpelling(TokenKind Kind) LLVM_READNONE;
/// Returns the spelling of preprocessor keywords, such as "else".
const char *getPPKeywordSpelling(PPKeywordKind Kind) LLVM_READNONE;
/// Return true if this is a raw identifier or an identifier kind.
inline bool isAnyIdentifier(TokenKind K) {
return (K == tok::identifier) || (K == tok::raw_identifier);

View File

@ -0,0 +1,148 @@
//===--- Preprocess.h - Preprocess token streams -----------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The pseudoparser tries to match a token stream to the C++ grammar.
// Preprocessor #defines and other directives are not part of this grammar, and
// should be removed before the file can be parsed.
//
// Conditional blocks like #if...#else...#endif are particularly tricky, as
// simply stripping the directives may not produce a grammatical result:
//
// return
// #ifndef DEBUG
// 1
// #else
// 0
// #endif
// ;
//
// This header supports analyzing and removing the directives in a source file.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
#define LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
#include "clang/Basic/TokenKinds.h"
#include "clang/Tooling/Syntax/Pseudo/Token.h"
#include <vector>
namespace clang {
class LangOptions;
namespace syntax {
namespace pseudo {
/// Describes the structure of a source file, as seen by the preprocessor.
///
/// The structure is a tree, whose leaves are plain source code and directives,
/// and whose internal nodes are #if...#endif sections.
///
/// (root)
/// |-+ Directive #include <stdio.h>
/// |-+ Code int main() {
/// | ` printf("hello, ");
/// |-+ Conditional -+ Directive #ifndef NDEBUG
/// | |-+ Code printf("debug\n");
/// | |-+ Directive #else
/// | |-+ Code printf("production\n");
/// | `-+ Directive #endif
/// |-+ Code return 0;
/// ` }
///
/// Unlike the clang preprocessor, we model the full tree explicitly.
/// This class does not recognize macro usage, only directives.
struct PPStructure {
  /// A range of code (and possibly comments) containing no directives.
  struct Code {
    /// The tokens spanned by this chunk, as indices into the parsed stream.
    Token::Range Tokens;
  };
  /// A preprocessor directive.
  struct Directive {
    /// Raw tokens making up the directive, starting with `#`.
    Token::Range Tokens;
    /// The directive keyword (e.g. pp_if); pp_not_keyword if unrecognized.
    clang::tok::PPKeywordKind Kind = clang::tok::pp_not_keyword;
  };
  /// A preprocessor conditional section.
  ///
  /// This starts with an #if, #ifdef, #ifndef etc directive.
  /// It covers all #else branches, and spans until the matching #endif.
  struct Conditional {
    /// The sequence of directives that introduce top-level alternative parses.
    ///
    /// The first branch will have an #if type directive.
    /// Subsequent branches will have #else type directives.
    std::vector<std::pair<Directive, PPStructure>> Branches;
    /// The directive terminating the conditional, should be #endif.
    Directive End;
  };
  /// Some piece of the file. {One of Code, Directive, Conditional}.
  class Chunk; // Defined below.
  std::vector<Chunk> Chunks;
  /// Extract preprocessor structure by examining the raw tokens.
  static PPStructure parse(const TokenStream &);
  // FIXME: add heuristic selection of conditional branches.
  // FIXME: allow deriving a preprocessed stream
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure &);
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Chunk &);
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Code &);
llvm::raw_ostream &operator<<(llvm::raw_ostream &,
const PPStructure::Directive &);
llvm::raw_ostream &operator<<(llvm::raw_ostream &,
const PPStructure::Conditional &);
// FIXME: This approximates std::variant<Code, Directive, Conditional>.
// Switch once we can use C++17.
class PPStructure::Chunk {
public:
  /// Discriminator for the engaged alternative. K_Empty only occurs for
  /// objects whose variant members are all empty (e.g. after being moved
  /// from, depending on Optional's move semantics).
  enum Kind { K_Empty, K_Code, K_Directive, K_Conditional };
  Kind kind() const {
    return CodeVariant          ? K_Code
           : DirectiveVariant   ? K_Directive
           : ConditionalVariant ? K_Conditional
                                : K_Empty;
  }

  // Move-only: chunks own their payload and must be constructed from one of
  // the three alternatives below.
  Chunk() = delete;
  Chunk(const Chunk &) = delete;
  Chunk(Chunk &&) = default;
  Chunk &operator=(const Chunk &) = delete;
  Chunk &operator=(Chunk &&) = default;
  ~Chunk() = default;

  // T => Chunk constructor.
  Chunk(Code C) : CodeVariant(std::move(C)) {}
  Chunk(Directive C) : DirectiveVariant(std::move(C)) {}
  Chunk(Conditional C) : ConditionalVariant(std::move(C)) {}

  // Chunk => T& and const T& conversions.
  // NOTE: these dereference the Optional unchecked; consult kind() first.
#define CONVERSION(CONST, V)                                                   \
  explicit operator CONST V &() CONST { return *V##Variant; }
  CONVERSION(const, Code);
  CONVERSION(, Code);
  CONVERSION(const, Directive);
  CONVERSION(, Directive);
  CONVERSION(const, Conditional);
  CONVERSION(, Conditional);
#undef CONVERSION

private:
  // Wasteful, a union variant would be better!
  llvm::Optional<Code> CodeVariant;
  llvm::Optional<Directive> DirectiveVariant;
  llvm::Optional<Conditional> ConditionalVariant;
};
} // namespace pseudo
} // namespace syntax
} // namespace clang
#endif

View File

@ -0,0 +1,202 @@
//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Tokens are the first level of abstraction above bytes used in pseudoparsing.
// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
// Each token is wrapped into a pseudo::Token, along with line/indent info.
//
// Unlike clang, we make multiple passes over the whole file, out-of-order.
// Therefore we retain the whole token sequence in memory. (This is feasible as
// we process one file at a time). pseudo::TokenStream holds such a stream.
// The initial stream holds the raw tokens read from the file, later passes
// operate on derived TokenStreams (e.g. with directives stripped).
//
// Similar facilities from clang that are *not* used:
// - SourceManager: designed around multiple files and precise macro expansion.
// - clang::Token: coupled to SourceManager, doesn't retain layout info.
// (pseudo::Token is similar, but without SourceLocations).
// - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
// (pseudo::TokenStream is similar, but a flat token list).
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
#define LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
#include "clang/Basic/LLVM.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
namespace clang {
class LangOptions;
namespace syntax {
namespace pseudo {
/// A single C++ or preprocessor token.
///
/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
/// SourceManager - we are not dealing with multiple files.
struct Token {
  /// An Index identifies a token within a stream.
  using Index = uint32_t;
  /// A sentinel Index indicating no token.
  constexpr static Index Invalid = std::numeric_limits<Index>::max();
  struct Range;

  /// The token text.
  ///
  /// Typically from the original source file, but may have been synthesized.
  StringRef text() const { return StringRef(Data, Length); }
  // Non-owning pointer into the source buffer (or into synthesized storage
  // owned by the stream's payload).
  const char *Data = nullptr;
  uint32_t Length = 0;

  /// Zero-based line number for the start of the token.
  /// This refers to the original source file as written.
  uint32_t Line = 0;
  /// Width of whitespace before the first token on this line.
  uint8_t Indent = 0;
  /// Flags have some meaning defined by the function that produced this stream.
  uint8_t Flags = 0;
  // Helpers to get/set Flags based on `enum class`.
  template <class T> bool flag(T Mask) const {
    return Flags & uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
  }
  template <class T> void setFlag(T Mask) {
    Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
  }

  /// The type of token as determined by clang's lexer.
  clang::tok::TokenKind Kind = clang::tok::unknown;
};
static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
/// A half-open range of tokens within a stream: indices [Begin, End).
struct Token::Range {
  Index Begin = 0;
  Index End = 0;

  /// Number of tokens covered by the range.
  uint32_t size() const { return End - Begin; }
  /// A zero-width range positioned at \p Idx (e.g. a marker between tokens).
  // Note: the parameter was previously named `Index`, shadowing the type.
  static Range emptyAt(Index Idx) { return Range{Idx, Idx}; }
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
/// A complete sequence of Tokens representing a source file.
///
/// This may match a raw file from disk, or be derived from a previous stream.
/// For example, stripping comments from a TokenStream results in a new stream.
///
/// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes:
/// int main ( ) ;
/// eof kw_int ident l_paren r_paren semi eof
/// front() back()
/// 0 1 2 3 4 5
class TokenStream {
public:
  /// Create an empty stream.
  ///
  /// Initially, the stream is appendable and not finalized.
  /// The token sequence may only be accessed after finalize() is called.
  ///
  /// Payload is an opaque object which will be owned by the stream.
  /// e.g. an allocator to hold backing storage for synthesized token text.
  explicit TokenStream(std::shared_ptr<void> Payload = nullptr);

  /// Append a token to the stream, which must not be finalized.
  void push(Token T) {
    assert(!isFinalized());
    Storage.push_back(std::move(T));
  }

  /// Finalize the token stream, allowing tokens to be accessed.
  /// Tokens may no longer be appended.
  void finalize();
  bool isFinalized() const;

  /// Returns the index of T within the stream.
  ///
  /// T must be within the stream or the end sentinel (not the start sentinel).
  Token::Index index(const Token &T) const {
    assert(isFinalized());
    assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
    assert(&T != Storage.data() && "start sentinel");
    // Tokens starts at Storage[1], so the pointer difference is the index.
    return &T - Tokens.data();
  }

  /// All real tokens, excluding the eof sentinels at either end.
  ArrayRef<Token> tokens() const {
    assert(isFinalized());
    return Tokens;
  }
  /// The tokens covered by a half-open Range.
  ArrayRef<Token> tokens(Token::Range R) const {
    return tokens().slice(R.Begin, R.End - R.Begin);
  }

  /// May return the end sentinel if the stream is empty.
  const Token &front() const {
    assert(isFinalized());
    // Storage[0] is the start sentinel; the first real token follows it.
    return Storage[1];
  }

  /// Print the tokens in this stream to the output stream.
  ///
  /// The presence of newlines/spaces is preserved, but not the quantity.
  void print(llvm::raw_ostream &) const;

private:
  std::shared_ptr<void> Payload;

  // View of Storage without the sentinels; empty until finalize().
  MutableArrayRef<Token> Tokens;
  std::vector<Token> Storage; // eof + Tokens + eof
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
/// Extracts a raw token stream from the source code.
///
/// All tokens will reference the data of the provided string.
/// "word-like" tokens such as identifiers and keywords will be raw_identifier.
TokenStream lex(const std::string &, const clang::LangOptions &);
enum class LexFlags : uint8_t {
  /// Marks the token at the start of a logical preprocessor line.
  /// This is a position where a directive might start.
  ///
  /// Here, the first # is StartsPPLine, but second is not (same logical line):
  ///   #define X(error) \
  ///   #error // not a directive!
  ///
  /// Careful, the directive may not start exactly on the StartsPPLine token:
  ///   /*comment*/ #include <foo.h>
  StartsPPLine = 1 << 0,
  /// Marks tokens containing trigraphs, escaped newlines, UCNs etc.
  /// The text() of such tokens will contain the raw trigraph.
  NeedsCleaning = 1 << 1,
};
/// Derives a token stream by decoding escapes and interpreting raw_identifiers.
///
/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and
/// their backing data is owned by the returned stream.
/// raw_identifier tokens are assigned specific types (identifier, keyword etc).
///
/// The StartsPPLine flag is preserved.
///
/// Formally the identifier correctly happens before preprocessing, while we
/// should only cook raw_identifiers that survive preprocessing.
/// However, ignoring the Token::Kind of tokens in directives achieves the same.
/// (And having cooked token kinds in PP-disabled sections is useful for us).
TokenStream cook(const TokenStream &, const clang::LangOptions &);
} // namespace pseudo
} // namespace syntax
} // namespace clang
#endif

View File

@ -46,6 +46,15 @@ const char *tok::getKeywordSpelling(TokenKind Kind) {
return nullptr;
}
const char *tok::getPPKeywordSpelling(tok::PPKeywordKind Kind) {
  switch (Kind) {
// Expands to `case tok::pp_if: return "if";` etc for each directive keyword
// listed in TokenKinds.def.
#define PPKEYWORD(x) case tok::pp_##x: return #x;
#include "clang/Basic/TokenKinds.def"
  default: break;
  }
  // Not a recognized preprocessor keyword (e.g. pp_not_keyword).
  return nullptr;
}
bool tok::isAnnotation(TokenKind Kind) {
switch (Kind) {
#define ANNOTATION(X) case annot_ ## X: return true;

View File

@ -3,9 +3,12 @@ set(LLVM_LINK_COMPONENTS Support)
add_clang_library(clangToolingSyntaxPseudo
Grammar.cpp
GrammarBNF.cpp
Lex.cpp
LRGraph.cpp
LRTable.cpp
LRTableBuild.cpp
Preprocess.cpp
Token.cpp
LINK_LIBS
clangBasic

View File

@ -0,0 +1,114 @@
//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/LiteralSupport.h"
#include "clang/Tooling/Syntax/Pseudo/Token.h"
namespace clang {
namespace syntax {
namespace pseudo {
TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
  clang::SourceLocation Start;
  // Tokenize using clang's lexer in raw mode.
  // std::string guarantees null-termination, which the lexer needs.
  clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
                     Code.data() + Code.size());
  Lexer.SetCommentRetentionState(true); // Comments become tokens, not skipped.

  TokenStream Result;
  clang::Token CT;
  // Offset of the previous token's start; the gap [LastOffset, Offset) is
  // scanned below to count newlines crossed between tokens.
  unsigned LastOffset = 0;
  unsigned Line = 0;
  unsigned Indent = 0;
  for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
       Lexer.LexFromRawLexer(CT)) {
    // Raw-encoding difference yields the byte offset into Code, since the
    // lexer was constructed with Start as the buffer's location.
    unsigned Offset =
        CT.getLocation().getRawEncoding() - Start.getRawEncoding();

    Token Tok;
    Tok.Data = &Code[Offset];
    Tok.Length = CT.getLength();
    Tok.Kind = CT.getKind();

    // Update current line number and indentation from raw source code.
    // NewLineStart is the offset just past the last '\n' seen (0 if none).
    unsigned NewLineStart = 0;
    for (unsigned i = LastOffset; i < Offset; ++i) {
      if (Code[i] == '\n') {
        NewLineStart = i + 1;
        ++Line;
      }
    }
    // Recompute indent when this token begins a new line (or is the very
    // first token, LastOffset == 0); otherwise reuse the previous indent.
    if (NewLineStart || !LastOffset) {
      Indent = 0;
      for (char c : StringRef(Code).slice(NewLineStart, Offset)) {
        if (c == ' ')
          ++Indent;
        else if (c == '\t')
          Indent += 8; // A tab counts as 8 columns.
        else
          break;
      }
    }
    Tok.Indent = Indent;
    Tok.Line = Line;

    if (CT.isAtStartOfLine())
      Tok.setFlag(LexFlags::StartsPPLine);
    // Tokens with escaped newlines/trigraphs/UCNs are cleaned later by cook().
    if (CT.needsCleaning() || CT.hasUCN())
      Tok.setFlag(LexFlags::NeedsCleaning);

    Result.push(Tok);
    LastOffset = Offset;
  }
  Result.finalize();
  return Result;
}
TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
  // Owns the backing bytes of any token text rewritten below; passed to the
  // result stream as its payload so the text outlives this function.
  auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
  clang::IdentifierTable Identifiers(LangOpts);
  TokenStream Result(CleanedStorage);

  // Note: Tok is a copy, so mutating it does not affect the input stream.
  for (auto Tok : Code.tokens()) {
    if (Tok.flag(LexFlags::NeedsCleaning)) {
      // Remove escaped newlines and trigraphs.
      llvm::SmallString<64> CleanBuffer;
      const char *Pos = Tok.text().begin();
      while (Pos < Tok.text().end()) {
        unsigned CharSize = 0;
        CleanBuffer.push_back(
            clang::Lexer::getCharAndSizeNoWarn(Pos, CharSize, LangOpts));
        assert(CharSize != 0 && "no progress!");
        Pos += CharSize;
      }
      // Remove universal character names (UCN).
      llvm::SmallString<64> UCNBuffer;
      clang::expandUCNs(UCNBuffer, CleanBuffer);

      // Point the token at a stable copy of the cleaned text.
      llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage);
      Tok.Data = Text.data();
      Tok.Length = Text.size();
      Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
    }

    // Cook raw_identifiers into identifier, keyword, etc.
    if (Tok.Kind == tok::raw_identifier)
      Tok.Kind = Identifiers.get(Tok.text()).getTokenID();

    Result.push(std::move(Tok));
  }
  Result.finalize();
  return Result;
}
} // namespace pseudo
} // namespace syntax
} // namespace clang

View File

@ -0,0 +1,206 @@
//===--- Preprocess.cpp - Preprocess token streams ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/Support/FormatVariadic.h"
namespace clang {
namespace syntax {
namespace pseudo {
namespace {
/// Recursive-descent parser turning a token stream into a PPStructure tree.
/// A single forward pass; Tok is the cursor into the finalized stream.
class PPParser {
public:
  explicit PPParser(const TokenStream &Code) : Code(Code), Tok(&Code.front()) {}
  void parse(PPStructure *Result) { parse(Result, /*TopLevel=*/true); }

private:
  // Roles that a directive might take within a conditional block.
  enum class Cond { None, If, Else, End };
  static Cond classifyDirective(tok::PPKeywordKind K) {
    switch (K) {
    case clang::tok::pp_if:
    case clang::tok::pp_ifdef:
    case clang::tok::pp_ifndef:
      return Cond::If;
    case clang::tok::pp_elif:
    case clang::tok::pp_elifdef:
    case clang::tok::pp_elifndef:
    case clang::tok::pp_else:
      return Cond::Else;
    case clang::tok::pp_endif:
      return Cond::End;
    default:
      return Cond::None;
    }
  }

  // Parses tokens starting at Tok into PP.
  // If we reach an End or Else directive that ends PP, returns it.
  // If TopLevel is true, then we do not expect End and always return None.
  llvm::Optional<PPStructure::Directive> parse(PPStructure *PP, bool TopLevel) {
    // The mutable init-capture persists across calls: after a comment at the
    // start of a PP-line, the hash may appear one token later.
    auto StartsDirective =
        [&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
          if (Tok->flag(LexFlags::StartsPPLine)) {
            // If we considered a comment at the start of a PP-line, it doesn't
            // start a directive but the directive can still start after it.
            if (Tok->Kind == tok::comment)
              AllowDirectiveAt = Tok + 1;
            return Tok->Kind == tok::hash;
          }
          return Tok->Kind == tok::hash && AllowDirectiveAt == Tok;
        };
    // Each iteration adds one chunk (or returns, if we see #endif).
    while (Tok->Kind != tok::eof) {
      // If there's no directive here, we have a code chunk.
      if (!StartsDirective()) {
        const Token *Start = Tok;
        do
          ++Tok;
        while (Tok->Kind != tok::eof && !StartsDirective());
        PP->Chunks.push_back(PPStructure::Code{
            Token::Range{Code.index(*Start), Code.index(*Tok)}});
        continue;
      }

      // We have some kind of directive.
      PPStructure::Directive Directive;
      parseDirective(&Directive);
      Cond Kind = classifyDirective(Directive.Kind);
      if (Kind == Cond::If) {
        // #if or similar, starting a nested conditional block.
        PPStructure::Conditional Conditional;
        Conditional.Branches.emplace_back();
        Conditional.Branches.back().first = std::move(Directive);
        parseConditional(&Conditional);
        PP->Chunks.push_back(std::move(Conditional));
      } else if ((Kind == Cond::Else || Kind == Cond::End) && !TopLevel) {
        // #endif or similar, ending this PPStructure scope.
        // (#endif is unexpected at the top level, treat as simple directive).
        return std::move(Directive);
      } else {
        // #define or similar, a simple directive at the current scope.
        PP->Chunks.push_back(std::move(Directive));
      }
    }
    return None;
  }

  // Parse the rest of a conditional section, after seeing the If directive.
  // Returns after consuming the End directive.
  void parseConditional(PPStructure::Conditional *C) {
    assert(C->Branches.size() == 1 &&
           C->Branches.front().second.Chunks.empty() &&
           "Should be ready to parse first branch body");
    while (Tok->Kind != tok::eof) {
      auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false);
      if (!Terminator) {
        // Unterminated conditional: record an empty End range at eof.
        assert(Tok->Kind == tok::eof && "gave up parsing before eof?");
        C->End.Tokens = Token::Range::emptyAt(Code.index(*Tok));
        return;
      }
      if (classifyDirective(Terminator->Kind) == Cond::End) {
        C->End = std::move(*Terminator);
        return;
      }
      // An #else/#elif directive starts the next branch.
      assert(classifyDirective(Terminator->Kind) == Cond::Else &&
             "ended branch unexpectedly");
      C->Branches.emplace_back();
      C->Branches.back().first = std::move(*Terminator);
    }
  }

  // Parse a directive. Tok is the hash.
  void parseDirective(PPStructure::Directive *D) {
    assert(Tok->Kind == tok::hash);

    // Directive spans from the hash until the end of line or file.
    const Token *Begin = Tok++;
    while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine))
      ++Tok;
    ArrayRef<Token> Tokens{Begin, Tok};
    D->Tokens = {Code.index(*Tokens.begin()), Code.index(*Tokens.end())};

    // Directive name is the first non-comment token after the hash.
    Tokens = Tokens.drop_front().drop_while(
        [](const Token &T) { return T.Kind == tok::comment; });
    if (!Tokens.empty())
      D->Kind = PPKeywords.get(Tokens.front().text()).getPPKeywordID();
  }

  const TokenStream &Code;           // The stream being parsed.
  const Token *Tok;                  // Cursor: next unconsumed token.
  clang::IdentifierTable PPKeywords; // Recognizes directive keywords.
};
} // namespace
PPStructure PPStructure::parse(const TokenStream &Code) {
  // Delegate to the stateful single-pass parser, filling a fresh tree.
  PPStructure Tree;
  PPParser Parser(Code);
  Parser.parse(&Tree);
  return Tree;
}
static void dump(llvm::raw_ostream &OS, const PPStructure &, unsigned Indent);
// Render a directive summary, e.g. `#include (8 tokens)`.
static void dump(llvm::raw_ostream &OS, const PPStructure::Directive &D,
                 unsigned Indent) {
  const char *Name = tok::getPPKeywordSpelling(D.Kind);
  OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n", Name,
                                     D.Tokens.size());
}
// Render a plain-code chunk, summarized by its token count only.
static void dump(llvm::raw_ostream &OS, const PPStructure::Code &C,
                 unsigned Indent) {
  uint32_t NumTokens = C.Tokens.size();
  OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n", NumTokens);
}
// Render a conditional: each branch's directive, then its body nested two
// columns deeper; the terminating directive aligns with the branches.
static void dump(llvm::raw_ostream &OS,
                 const PPStructure::Conditional &Conditional, unsigned Indent) {
  for (const auto &B : Conditional.Branches) {
    dump(OS, B.first, Indent);      // #if/#elif/#else directive.
    dump(OS, B.second, Indent + 2); // Branch body.
  }
  dump(OS, Conditional.End, Indent); // #endif (or empty if unterminated).
}
// Dispatch on the chunk's runtime kind to the typed dump overload above.
// The casts are the Chunk's explicit conversion operators; safe because we
// checked kind() first.
static void dump(llvm::raw_ostream &OS, const PPStructure::Chunk &Chunk,
                 unsigned Indent) {
  switch (Chunk.kind()) {
  case PPStructure::Chunk::K_Empty:
    llvm_unreachable("invalid chunk");
  case PPStructure::Chunk::K_Code:
    return dump(OS, (const PPStructure::Code &)Chunk, Indent);
  case PPStructure::Chunk::K_Directive:
    return dump(OS, (const PPStructure::Directive &)Chunk, Indent);
  case PPStructure::Chunk::K_Conditional:
    return dump(OS, (const PPStructure::Conditional &)Chunk, Indent);
  }
}
// A structure prints as its chunks in order; it has no header of its own.
static void dump(llvm::raw_ostream &OS, const PPStructure &PP,
                 unsigned Indent) {
  for (const PPStructure::Chunk &C : PP.Chunks)
    dump(OS, C, Indent);
}
// Define operator<< in terms of dump() functions above.
// Each expansion emits the operator for one type, printing at indent 0.
#define OSTREAM_DUMP(Type)                                                     \
  llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Type &T) {        \
    dump(OS, T, 0);                                                            \
    return OS;                                                                 \
  }
OSTREAM_DUMP(PPStructure)
OSTREAM_DUMP(PPStructure::Chunk)
OSTREAM_DUMP(PPStructure::Directive)
OSTREAM_DUMP(PPStructure::Conditional)
OSTREAM_DUMP(PPStructure::Code)
// Keep the helper macro file-local.
#undef OSTREAM_DUMP
} // namespace pseudo
} // namespace syntax
} // namespace clang

View File

@ -0,0 +1,98 @@
//===--- Token.cpp - Tokens and token streams in the pseudoparser ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/Pseudo/Token.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormatVariadic.h"
namespace clang {
namespace syntax {
namespace pseudo {
// Render one token as: `<kind> <line>:<indent> "<escaped text>"[ flags=<hex>]`.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
  OS << llvm::formatv("{0} {1}:{2} ", clang::tok::getTokenName(T.Kind), T.Line,
                      T.Indent);
  // Quote and escape the text so the token stays on a single output line.
  OS << '"';
  llvm::printEscapedString(T.text(), OS);
  OS << '"';
  // Flag bits are producer-defined; print raw hex only when any are set.
  if (T.Flags)
    OS << llvm::format(" flags=%x", T.Flags);
  return OS;
}
// Render the whole stream as an aligned table, one token per row.
// The column widths here are matched by lit tests (e.g. test/Syntax/lex.c).
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const TokenStream &TS) {
  OS << "Index Kind Line Text\n";
  for (const auto &T : TS.tokens()) {
    OS << llvm::format("%5d: %16s %4d:%-2d ", TS.index(T),
                       clang::tok::getTokenName(T.Kind), T.Line, T.Indent);
    OS << '"';
    llvm::printEscapedString(T.text(), OS);
    OS << '"';
    if (T.Flags)
      OS << llvm::format(" flags=%x", T.Flags);
    OS << '\n';
  }
  return OS;
}
// Half-open interval notation, e.g. `[3,7)`.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token::Range &R) {
  return OS << llvm::formatv("[{0},{1})", R.Begin, R.End);
}
TokenStream::TokenStream(std::shared_ptr<void> Payload)
    : Payload(std::move(Payload)) {
  // Seed the stream with the leading eof sentinel; push() appends after it.
  Token BeginSentinel;
  BeginSentinel.Kind = clang::tok::eof;
  Storage.push_back(std::move(BeginSentinel));
}
void TokenStream::finalize() {
  assert(!isFinalized());
  // Read the last real token's line BEFORE appending the sentinel.
  unsigned LastLine = Storage.back().Line;
  Storage.emplace_back();
  Storage.back().Kind = tok::eof;
  // Place the trailing eof on its own line, past all real tokens.
  Storage.back().Line = LastLine + 1;

  // Tokens views Storage without the two sentinels.
  Tokens = Storage;
  Tokens = Tokens.drop_front().drop_back();
}
bool TokenStream::isFinalized() const {
  // The constructor always pushes the start sentinel, so Storage is never
  // empty and its first element is always eof.
  assert(!Storage.empty() && Storage.front().Kind == tok::eof);
  // finalize() appends a trailing eof; a lone sentinel means "still open".
  return Storage.size() > 1 && Storage.back().Kind == tok::eof;
}
void TokenStream::print(llvm::raw_ostream &OS) const {
bool FirstToken = true;
unsigned LastLine = -1;
StringRef LastText;
for (const auto &T : tokens()) {
StringRef Text = T.text();
if (FirstToken) {
FirstToken = false;
} else if (T.Line == LastLine) {
if (LastText.data() + LastText.size() != Text.data())
OS << ' ';
} else {
OS << '\n';
OS.indent(T.Indent);
}
OS << Text;
LastLine = T.Line;
LastText = Text;
}
if (!FirstToken)
OS << '\n';
}
} // namespace pseudo
} // namespace syntax
} // namespace clang

52
clang/test/Syntax/lex.c Normal file
View File

@ -0,0 +1,52 @@
int is_debug() {
#ifndef NDEBUG
return 1; // in debug mode
#else
return 0;
#endif
}
/* This comment gets lexed along with the input above! We just don't CHECK it.
RUN: clang-pseudo -source %s -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace
SOURCE: int is_debug() {
SOURCE-NEXT: #ifndef NDEBUG
SOURCE-NEXT: return 1; // in debug mode
SOURCE-NEXT: #else
SOURCE-NEXT: return 0;
 SOURCE-NEXT: #endif
SOURCE-NEXT: }
RUN: clang-pseudo -source %s -print-tokens | FileCheck %s -check-prefix=TOKEN
TOKEN: 0: raw_identifier 0:0 "int" flags=1
TOKEN-NEXT: raw_identifier 0:0 "is_debug"
TOKEN-NEXT: l_paren 0:0 "("
TOKEN-NEXT: r_paren 0:0 ")"
TOKEN-NEXT: l_brace 0:0 "{"
TOKEN-NEXT: hash 1:0 "#" flags=1
TOKEN-NEXT: raw_identifier 1:0 "ifndef"
TOKEN-NEXT: raw_identifier 1:0 "NDEBUG"
TOKEN-NEXT: raw_identifier 2:2 "return" flags=1
TOKEN-NEXT: numeric_constant 2:2 "1"
TOKEN-NEXT: semi 2:2 ";"
TOKEN-NEXT: comment 2:2 "// in debug mode"
TOKEN-NEXT: hash 3:0 "#" flags=1
TOKEN-NEXT: raw_identifier 3:0 "else"
TOKEN-NEXT: raw_identifier 4:2 "return" flags=1
TOKEN-NEXT: numeric_constant 4:2 "0"
TOKEN-NEXT: semi 4:2 ";"
TOKEN-NEXT: hash 5:0 "#" flags=1
TOKEN-NEXT: raw_identifier 5:0 "endif"
TOKEN-NEXT: r_brace 6:0 "}" flags=1
RUN: clang-pseudo -source %s -print-pp-structure | FileCheck %s -check-prefix=PPS --strict-whitespace
PPS: code (5 tokens)
PPS-NEXT: #ifndef (3 tokens)
PPS-NEXT: code (4 tokens)
PPS-NEXT: #else (2 tokens)
PPS-NEXT: code (3 tokens)
PPS-NEXT: #endif (2 tokens)
PPS-NEXT: code (2 tokens)
^ including this block comment
*******************************************************************************/

View File

@ -6,9 +6,12 @@
//
//===----------------------------------------------------------------------===//
#include "clang/Basic/LangOptions.h"
#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
#include "clang/Tooling/Syntax/Pseudo/Token.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
@ -25,13 +28,19 @@ static opt<bool> PrintGraph("print-graph",
desc("Print the LR graph for the grammar"));
static opt<bool> PrintTable("print-table",
desc("Print the LR table for the grammar"));
static opt<std::string> Source("source", desc("Source file"));
static opt<bool> PrintSource("print-source", desc("Print token stream"));
static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
static opt<bool>
PrintPPStructure("print-pp-structure",
desc("Print directive structure of source code"));
static std::string readOrDie(llvm::StringRef Path) {
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
llvm::MemoryBuffer::getFile(Path);
if (std::error_code EC = Text.getError()) {
llvm::errs() << "Error: can't read file '" << Path << "': " << EC.message()
<< "\n";
llvm::errs() << "Error: can't read grammar file '" << Path
<< "': " << EC.message() << "\n";
::exit(1);
}
return Text.get()->getBuffer().str();
@ -60,5 +69,19 @@ int main(int argc, char *argv[]) {
return 0;
}
if (Source.getNumOccurrences()) {
std::string Text = readOrDie(Source);
clang::LangOptions LangOpts; // FIXME: use real options.
auto Stream = clang::syntax::pseudo::lex(Text, LangOpts);
auto Structure = clang::syntax::pseudo::PPStructure::parse(Stream);
if (PrintPPStructure)
llvm::outs() << Structure;
if (PrintSource)
Stream.print(llvm::outs());
if (PrintTokens)
llvm::outs() << Stream;
}
return 0;
}

View File

@ -5,6 +5,8 @@ set(LLVM_LINK_COMPONENTS
add_clang_unittest(ClangPseudoTests
GrammarTest.cpp
LRTableTest.cpp
PreprocessTest.cpp
TokenTest.cpp
)
clang_target_link_libraries(ClangPseudoTests

View File

@ -0,0 +1,152 @@
//===--- TokenTest.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Tooling/Syntax/Pseudo/Token.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace clang {
namespace syntax {
namespace pseudo {
namespace {
using testing::_;
using testing::ElementsAre;
using testing::Matcher;
using testing::Pair;
using testing::StrEq;
using Chunk = PPStructure::Chunk;
// Matches any value with a `.Tokens` Token::Range whose token texts from
// stream TS, joined with single spaces, equal the expected string.
MATCHER_P2(tokensAre, TS, Tokens, "tokens are " + std::string(Tokens)) {
  std::vector<llvm::StringRef> Texts;
  for (const Token &Tok : TS.tokens(arg.Tokens))
    Texts.push_back(Tok.text());
  return Matcher<std::string>(StrEq(Tokens))
      .MatchAndExplain(llvm::join(Texts, " "), result_listener);
}

// Matches a PPStructure::Chunk by its runtime kind() discriminator.
MATCHER_P(chunkKind, K, "") { return arg.kind() == K; }
// End-to-end: lex+cook a snippet, then verify the parsed directive tree,
// including a nested #if/#else inside an #ifdef/#elif conditional.
TEST(PPStructure, Parse) {
  LangOptions Opts;
  std::string Code = R"cpp(
#include <foo.h>
int main() {
#ifdef HAS_FOO
#if HAS_BAR
foo(bar);
#else
foo(0)
#endif
#elif NEEDS_FOO
#error missing_foo
#endif
}
)cpp";

  TokenStream S = cook(lex(Code, Opts), Opts);
  PPStructure PP = PPStructure::parse(S);

  // Top level: #include, code, the #ifdef conditional, the closing brace.
  ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Directive),
                                     chunkKind(Chunk::K_Code),
                                     chunkKind(Chunk::K_Conditional),
                                     chunkKind(Chunk::K_Code)));

  EXPECT_THAT((const PPStructure::Directive &)PP.Chunks[0],
              tokensAre(S, "# include < foo . h >"));
  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[1],
              tokensAre(S, "int main ( ) {"));
  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[3], tokensAre(S, "}"));

  // The conditional has two branches (#ifdef, #elif) and a matching #endif.
  const PPStructure::Conditional &Ifdef(PP.Chunks[2]);
  EXPECT_THAT(Ifdef.Branches,
              ElementsAre(Pair(tokensAre(S, "# ifdef HAS_FOO"), _),
                          Pair(tokensAre(S, "# elif NEEDS_FOO"), _)));
  EXPECT_THAT(Ifdef.End, tokensAre(S, "# endif"));

  const PPStructure &HasFoo(Ifdef.Branches[0].second);
  const PPStructure &NeedsFoo(Ifdef.Branches[1].second);

  // First branch body is exactly the nested #if/#else conditional.
  EXPECT_THAT(HasFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Conditional)));
  const PPStructure::Conditional &If(HasFoo.Chunks[0]);
  EXPECT_THAT(If.Branches, ElementsAre(Pair(tokensAre(S, "# if HAS_BAR"), _),
                                       Pair(tokensAre(S, "# else"), _)));
  EXPECT_THAT(If.Branches[0].second.Chunks,
              ElementsAre(chunkKind(Chunk::K_Code)));
  EXPECT_THAT(If.Branches[1].second.Chunks,
              ElementsAre(chunkKind(Chunk::K_Code)));

  // Second branch holds only the #error directive, with its keyword resolved.
  EXPECT_THAT(NeedsFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Directive)));
  const PPStructure::Directive &Error(NeedsFoo.Chunks[0]);
  EXPECT_THAT(Error, tokensAre(S, "# error missing_foo"));
  EXPECT_EQ(Error.Kind, tok::pp_error);
}
// A #define split across line continuations and interleaved with block
// comments must still be recognized as one directive chunk.
TEST(PPStructure, ParseUgly) {
  LangOptions LO;
  std::string Source = R"cpp(
/*A*/ # /*B*/ \
/*C*/ \
define \
BAR /*D*/
/*E*/
)cpp";
  TokenStream Stream = cook(lex(Source, LO), LO);
  PPStructure Structure = PPStructure::parse(Stream);
  ASSERT_THAT(Structure.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
                                            chunkKind(Chunk::K_Directive),
                                            chunkKind(Chunk::K_Code)));
  // Code chunks on either side of the directive.
  EXPECT_THAT((const PPStructure::Code &)Structure.Chunks[0],
              tokensAre(Stream, "/*A*/"));
  EXPECT_THAT((const PPStructure::Code &)Structure.Chunks[2],
              tokensAre(Stream, "/*E*/"));
  // The directive spans everything from "#" through the trailing comment.
  const PPStructure::Directive &Define(Structure.Chunks[1]);
  EXPECT_THAT(Define, tokensAre(Stream, "# /*B*/ /*C*/ define BAR /*D*/"));
  EXPECT_EQ(Define.Kind, tok::pp_define);
}
// Mismatched directives: a stray #endif becomes a plain directive chunk, and
// an unterminated #if is closed implicitly at end-of-file.
TEST(PPStructure, ParseBroken) {
  LangOptions Opts;
  std::string Code = R"cpp(
a
#endif // mismatched
#if X
b
)cpp";
  TokenStream S = cook(lex(Code, Opts), Opts);
  PPStructure PP = PPStructure::parse(S);
  ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
                                     chunkKind(Chunk::K_Directive),
                                     chunkKind(Chunk::K_Conditional)));
  EXPECT_THAT((const PPStructure::Code &)PP.Chunks[0], tokensAre(S, "a"));
  // The unmatched #endif is kept as an ordinary directive.
  const PPStructure::Directive &Endif(PP.Chunks[1]);
  EXPECT_EQ(Endif.Kind, tok::pp_endif);
  EXPECT_THAT(Endif, tokensAre(S, "# endif // mismatched"));
  const PPStructure::Conditional &X(PP.Chunks[2]);
  // ASSERT (not EXPECT): the .front() accesses below are undefined behavior
  // if Branches is empty, so the test must stop here on failure.
  ASSERT_EQ(1u, X.Branches.size());
  // The (only) branch of the broken conditional section runs until eof.
  EXPECT_EQ(tok::pp_if, X.Branches.front().first.Kind);
  EXPECT_THAT(X.Branches.front().second.Chunks,
              ElementsAre(chunkKind(Chunk::K_Code)));
  // The missing terminating directive is marked as pp_not_keyword.
  EXPECT_EQ(tok::pp_not_keyword, X.End.Kind);
  EXPECT_EQ(0u, X.End.Tokens.size());
}
} // namespace
} // namespace pseudo
} // namespace syntax
} // namespace clang

View File

@ -0,0 +1,178 @@
//===--- TokenTest.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/Pseudo/Token.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/TokenKinds.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace clang {
namespace syntax {
namespace pseudo {
namespace {
using testing::AllOf;
using testing::ElementsAre;
using testing::ElementsAreArray;
using testing::Not;
// Matches a Token by kind and exact spelling.
MATCHER_P2(token, Text, Kind, "") {
  return arg.Kind == Kind && arg.text() == Text;
}
// Matches a Token carrying the given LexFlags bit.
MATCHER_P(hasFlag, Flag, "") { return arg.flag(Flag); }
// Matches a Token's recorded source line number and line indentation.
MATCHER_P2(lineIndent, Line, Indent, "") {
  return arg.Line == (unsigned)Line && arg.Indent == (unsigned)Indent;
}
// Lexes a small file and checks both the raw token stream and the cooked
// stream (raw identifiers resolved to keywords/identifiers).
TEST(TokenTest, Lex) {
  LangOptions Opts;
  std::string Code = R"cpp(
#include <stdio.h>
int main() {
return 42; // the answer
}
)cpp";
  TokenStream Raw = lex(Code, Opts);
  ASSERT_TRUE(Raw.isFinalized());
  EXPECT_THAT(Raw.tokens(),
              ElementsAreArray({
                  // Lexing of directives is weird, especially <angled> strings.
                  token("#", tok::hash),
                  token("include", tok::raw_identifier),
                  token("<", tok::less),
                  token("stdio", tok::raw_identifier),
                  token(".", tok::period),
                  token("h", tok::raw_identifier),
                  token(">", tok::greater),
                  token("int", tok::raw_identifier),
                  token("main", tok::raw_identifier),
                  token("(", tok::l_paren),
                  token(")", tok::r_paren),
                  token("{", tok::l_brace),
                  token("return", tok::raw_identifier),
                  token("42", tok::numeric_constant),
                  token(";", tok::semi),
                  token("// the answer", tok::comment),
                  token("}", tok::r_brace),
              }));
  TokenStream Cooked = cook(Raw, Opts);
  ASSERT_TRUE(Cooked.isFinalized());
  EXPECT_THAT(Cooked.tokens(),
              ElementsAreArray({
                  // Cooked identifier types in directives are not meaningful.
                  token("#", tok::hash),
                  token("include", tok::identifier),
                  token("<", tok::less),
                  token("stdio", tok::identifier),
                  token(".", tok::period),
                  token("h", tok::identifier),
                  token(">", tok::greater),
                  // Outside directives, keywords are recognized by cook().
                  token("int", tok::kw_int),
                  token("main", tok::identifier),
                  token("(", tok::l_paren),
                  token(")", tok::r_paren),
                  token("{", tok::l_brace),
                  token("return", tok::kw_return),
                  token("42", tok::numeric_constant),
                  token(";", tok::semi),
                  token("// the answer", tok::comment),
                  token("}", tok::r_brace),
              }));
  // Check raw tokens point back into original source code.
  EXPECT_EQ(Raw.tokens().front().text().begin(), &Code[Code.find('#')]);
}
// Backslash-newline splices: the raw stream keeps the original spelling and
// flags tokens that need cleaning; cook() yields the spliced spellings.
TEST(TokenTest, LineContinuation) {
  LangOptions Opts;
  std::string Code = R"cpp(
one_\
token
two \
tokens
)cpp";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(
      Raw.tokens(),
      // The splice inside an identifier keeps "one_\\\ntoken" as one raw
      // token that needs cleaning; the splice between tokens attaches the
      // "\\\n" to the following token instead.
      ElementsAre(AllOf(token("one_\\\ntoken", tok::raw_identifier),
                        hasFlag(LexFlags::StartsPPLine),
                        hasFlag(LexFlags::NeedsCleaning), lineIndent(1, 0)),
                  AllOf(token("two", tok::raw_identifier),
                        hasFlag(LexFlags::StartsPPLine),
                        Not(hasFlag(LexFlags::NeedsCleaning))),
                  AllOf(token("\\\ntokens", tok::raw_identifier),
                        Not(hasFlag(LexFlags::StartsPPLine)),
                        hasFlag(LexFlags::NeedsCleaning))));
  TokenStream Cooked = cook(Raw, Opts);
  EXPECT_THAT(
      Cooked.tokens(),
      ElementsAre(AllOf(token("one_token", tok::identifier), lineIndent(1, 0)),
                  token("two", tok::identifier),
                  token("tokens", tok::identifier)));
}
// Trigraphs, digraphs, alternative operator spellings, and UCNs: the raw
// stream records interpreted kinds but original spellings; cook() performs
// the textual substitutions.
TEST(TokenTest, EncodedCharacters) {
  LangOptions Opts;
  Opts.Trigraphs = true;
  Opts.Digraphs = true;
  Opts.C99 = true; // UCNs
  Opts.CXXOperatorNames = true;
  std::string Code = R"(and <: ??! '??=' \u00E9)";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(
      Raw.tokens(),
      ElementsAre( // and is not recognized as && until cook().
          AllOf(token("and", tok::raw_identifier),
                Not(hasFlag(LexFlags::NeedsCleaning))),
          // Digraphs are just different spellings of tokens.
          AllOf(token("<:", tok::l_square),
                Not(hasFlag(LexFlags::NeedsCleaning))),
          // Trigraps are interpreted, still need text cleaning.
          AllOf(token(R"(??!)", tok::pipe), hasFlag(LexFlags::NeedsCleaning)),
          // Trigraphs must be substituted inside constants too.
          AllOf(token(R"('??=')", tok::char_constant),
                hasFlag(LexFlags::NeedsCleaning)),
          // UCNs need substitution.
          AllOf(token(R"(\u00E9)", tok::raw_identifier),
                hasFlag(LexFlags::NeedsCleaning))));
  TokenStream Cooked = cook(Raw, Opts);
  EXPECT_THAT(
      Cooked.tokens(),
      ElementsAre(token("and", tok::ampamp), // alternate spelling recognized
                  token("<:", tok::l_square),
                  token("|", tok::pipe), // trigraph substituted
                  token("'#'", tok::char_constant), // trigraph substituted
                  token("é", tok::identifier))); // UCN substituted
}
// Line/indent bookkeeping: every token records its line number and that
// line's indentation; a continued line keeps its own indent.
// NOTE(review): the expected indents below (e.g. lineIndent(0, 3) and
// lineIndent(2, 2)) imply leading spaces inside the raw string that this
// listing appears to have stripped — confirm against the original source.
TEST(TokenTest, Indentation) {
  LangOptions Opts;
  std::string Code = R"cpp( hello world
no_indent \
line_was_continued
)cpp";
  TokenStream Raw = lex(Code, Opts);
  EXPECT_THAT(Raw.tokens(), ElementsAreArray({
                                lineIndent(0, 3), // hello
                                lineIndent(0, 3), // world
                                lineIndent(1, 0), // no_indent
                                lineIndent(2, 2), // line_was_continued
                            }));
}
} // namespace
} // namespace pseudo
} // namespace syntax
} // namespace clang