[Pseudo] Token/TokenStream, PP directive parser.
The TokenStream class is the representation of the source code that will be fed into the GLR parser. This patch allows a "raw" TokenStream to be built by reading source code. It also supports scanning a TokenStream to find the directive structure. Next steps (with placeholders in the code): heuristically choosing a path through #ifs, preprocessing the code by stripping directives and comments. These will produce a suitable stream to feed into the parser proper. Differential Revision: https://reviews.llvm.org/D119162
This commit is contained in:
parent
70ff6fbeb9
commit
7c1ee5e95f
|
@ -68,6 +68,9 @@ const char *getPunctuatorSpelling(TokenKind Kind) LLVM_READNONE;
|
|||
/// tokens like 'int' and 'dynamic_cast'. Returns NULL for other token kinds.
|
||||
const char *getKeywordSpelling(TokenKind Kind) LLVM_READNONE;
|
||||
|
||||
/// Returns the spelling of preprocessor keywords, such as "else".
|
||||
const char *getPPKeywordSpelling(PPKeywordKind Kind) LLVM_READNONE;
|
||||
|
||||
/// Return true if this is a raw identifier or an identifier kind.
|
||||
inline bool isAnyIdentifier(TokenKind K) {
|
||||
return (K == tok::identifier) || (K == tok::raw_identifier);
|
||||
|
|
|
@ -0,0 +1,148 @@
|
|||
//===--- Preprocess.h - Preprocess token streams -----------------*- C++-*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// The pseudoparser tries to match a token stream to the C++ grammar.
|
||||
// Preprocessor #defines and other directives are not part of this grammar, and
|
||||
// should be removed before the file can be parsed.
|
||||
//
|
||||
// Conditional blocks like #if...#else...#endif are particularly tricky, as
|
||||
// simply stripping the directives may not produce a grammatical result:
|
||||
//
|
||||
// return
|
||||
// #ifndef DEBUG
|
||||
// 1
|
||||
// #else
|
||||
// 0
|
||||
// #endif
|
||||
// ;
|
||||
//
|
||||
// This header supports analyzing and removing the directives in a source file.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
|
||||
#define LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
|
||||
|
||||
#include "clang/Basic/TokenKinds.h"
|
||||
#include "clang/Tooling/Syntax/Pseudo/Token.h"
|
||||
#include <vector>
|
||||
|
||||
namespace clang {
|
||||
class LangOptions;
|
||||
namespace syntax {
|
||||
namespace pseudo {
|
||||
|
||||
/// Describes the structure of a source file, as seen by the preprocessor.
|
||||
///
|
||||
/// The structure is a tree, whose leaves are plain source code and directives,
|
||||
/// and whose internal nodes are #if...#endif sections.
|
||||
///
|
||||
/// (root)
|
||||
/// |-+ Directive #include <stdio.h>
|
||||
/// |-+ Code int main() {
|
||||
/// | ` printf("hello, ");
|
||||
/// |-+ Conditional -+ Directive #ifndef NDEBUG
|
||||
/// | |-+ Code printf("debug\n");
|
||||
/// | |-+ Directive #else
|
||||
/// | |-+ Code printf("production\n");
|
||||
/// | `-+ Directive #endif
|
||||
/// |-+ Code return 0;
|
||||
/// ` }
|
||||
///
|
||||
/// Unlike the clang preprocessor, we model the full tree explicitly.
|
||||
/// This class does not recognize macro usage, only directives.
|
||||
struct PPStructure {
|
||||
/// A range of code (and possibly comments) containing no directives.
|
||||
struct Code {
|
||||
Token::Range Tokens;
|
||||
};
|
||||
/// A preprocessor directive.
|
||||
struct Directive {
|
||||
/// Raw tokens making up the directive, starting with `#`.
|
||||
Token::Range Tokens;
|
||||
clang::tok::PPKeywordKind Kind = clang::tok::pp_not_keyword;
|
||||
};
|
||||
/// A preprocessor conditional section.
|
||||
///
|
||||
/// This starts with an #if, #ifdef, #ifndef etc directive.
|
||||
/// It covers all #else branches, and spans until the matching #endif.
|
||||
struct Conditional {
|
||||
/// The sequence of directives that introduce top-level alternative parses.
|
||||
///
|
||||
/// The first branch will have an #if type directive.
|
||||
/// Subsequent branches will have #else type directives.
|
||||
std::vector<std::pair<Directive, PPStructure>> Branches;
|
||||
/// The directive terminating the conditional, should be #endif.
|
||||
Directive End;
|
||||
};
|
||||
|
||||
/// Some piece of the file. {One of Code, Directive, Conditional}.
|
||||
class Chunk; // Defined below.
|
||||
std::vector<Chunk> Chunks;
|
||||
|
||||
/// Extract preprocessor structure by examining the raw tokens.
|
||||
static PPStructure parse(const TokenStream &);
|
||||
|
||||
// FIXME: add heuristically selection of conditional branches.
|
||||
// FIXME: allow deriving a preprocessed stream
|
||||
};
|
||||
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure &);
|
||||
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Chunk &);
|
||||
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Code &);
|
||||
llvm::raw_ostream &operator<<(llvm::raw_ostream &,
|
||||
const PPStructure::Directive &);
|
||||
llvm::raw_ostream &operator<<(llvm::raw_ostream &,
|
||||
const PPStructure::Conditional &);
|
||||
|
||||
// FIXME: This approximates std::variant<Code, Directive, Conditional>.
|
||||
// Switch once we can use C++17.
|
||||
class PPStructure::Chunk {
|
||||
public:
|
||||
enum Kind { K_Empty, K_Code, K_Directive, K_Conditional };
|
||||
Kind kind() const {
|
||||
return CodeVariant ? K_Code
|
||||
: DirectiveVariant ? K_Directive
|
||||
: ConditionalVariant ? K_Conditional
|
||||
: K_Empty;
|
||||
}
|
||||
|
||||
Chunk() = delete;
|
||||
Chunk(const Chunk &) = delete;
|
||||
Chunk(Chunk &&) = default;
|
||||
Chunk &operator=(const Chunk &) = delete;
|
||||
Chunk &operator=(Chunk &&) = default;
|
||||
~Chunk() = default;
|
||||
|
||||
// T => Chunk constructor.
|
||||
Chunk(Code C) : CodeVariant(std::move(C)) {}
|
||||
Chunk(Directive C) : DirectiveVariant(std::move(C)) {}
|
||||
Chunk(Conditional C) : ConditionalVariant(std::move(C)) {}
|
||||
|
||||
// Chunk => T& and const T& conversions.
|
||||
#define CONVERSION(CONST, V) \
|
||||
explicit operator CONST V &() CONST { return *V##Variant; }
|
||||
CONVERSION(const, Code);
|
||||
CONVERSION(, Code);
|
||||
CONVERSION(const, Directive);
|
||||
CONVERSION(, Directive);
|
||||
CONVERSION(const, Conditional);
|
||||
CONVERSION(, Conditional);
|
||||
#undef CONVERSION
|
||||
|
||||
private:
|
||||
// Wasteful, a union variant would be better!
|
||||
llvm::Optional<Code> CodeVariant;
|
||||
llvm::Optional<Directive> DirectiveVariant;
|
||||
llvm::Optional<Conditional> ConditionalVariant;
|
||||
};
|
||||
|
||||
} // namespace pseudo
|
||||
} // namespace syntax
|
||||
} // namespace clang
|
||||
|
||||
#endif
|
|
@ -0,0 +1,202 @@
|
|||
//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Tokens are the first level of abstraction above bytes used in pseudoparsing.
|
||||
// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
|
||||
// The tokens are wrapped into pseudo::Token, along with line/indent info.
|
||||
//
|
||||
// Unlike clang, we make multiple passes over the whole file, out-of-order.
|
||||
// Therefore we retain the whole token sequence in memory. (This is feasible as
|
||||
// we process one file at a time). pseudo::TokenStream holds such a stream.
|
||||
// The initial stream holds the raw tokens read from the file, later passes
|
||||
// operate on derived TokenStreams (e.g. with directives stripped).
|
||||
//
|
||||
// Similar facilities from clang that are *not* used:
|
||||
// - SourceManager: designed around multiple files and precise macro expansion.
|
||||
// - clang::Token: coupled to SourceManager, doesn't retain layout info.
|
||||
// (pseudo::Token is similar, but without SourceLocations).
|
||||
// - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
|
||||
// (pseudo::TokenStream is similar, but a flat token list).
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
|
||||
#define LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
|
||||
|
||||
#include "clang/Basic/LLVM.h"
|
||||
#include "clang/Basic/TokenKinds.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace clang {
|
||||
class LangOptions;
|
||||
namespace syntax {
|
||||
namespace pseudo {
|
||||
|
||||
/// A single C++ or preprocessor token.
|
||||
///
|
||||
/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
|
||||
/// SourceManager - we are not dealing with multiple files.
|
||||
struct Token {
|
||||
/// An Index identifies a token within a stream.
|
||||
using Index = uint32_t;
|
||||
/// A sentinel Index indicating no token.
|
||||
constexpr static Index Invalid = std::numeric_limits<Index>::max();
|
||||
struct Range;
|
||||
|
||||
/// The token text.
|
||||
///
|
||||
/// Typically from the original source file, but may have been synthesized.
|
||||
StringRef text() const { return StringRef(Data, Length); }
|
||||
const char *Data = nullptr;
|
||||
uint32_t Length = 0;
|
||||
|
||||
/// Zero-based line number for the start of the token.
|
||||
/// This refers to the original source file as written.
|
||||
uint32_t Line = 0;
|
||||
/// Width of whitespace before the first token on this line.
|
||||
uint8_t Indent = 0;
|
||||
/// Flags have some meaning defined by the function that produced this stream.
|
||||
uint8_t Flags = 0;
|
||||
// Helpers to get/set Flags based on `enum class`.
|
||||
template <class T> bool flag(T Mask) const {
|
||||
return Flags & uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
|
||||
}
|
||||
template <class T> void setFlag(T Mask) {
|
||||
Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
|
||||
}
|
||||
|
||||
/// The type of token as determined by clang's lexer.
|
||||
clang::tok::TokenKind Kind = clang::tok::unknown;
|
||||
};
|
||||
static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
|
||||
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
|
||||
|
||||
/// A half-open range of tokens within a stream.
|
||||
struct Token::Range {
|
||||
Index Begin = 0;
|
||||
Index End = 0;
|
||||
|
||||
uint32_t size() const { return End - Begin; }
|
||||
static Range emptyAt(Index Index) { return Range{Index, Index}; }
|
||||
};
|
||||
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
|
||||
|
||||
/// A complete sequence of Tokens representing a source file.
|
||||
///
|
||||
/// This may match a raw file from disk, or be derived from a previous stream.
|
||||
/// For example, stripping comments from a TokenStream results in a new stream.
|
||||
///
|
||||
/// A stream has sentinel 'eof' tokens at each end, e.g `int main();` becomes:
|
||||
/// int main ( ) ;
|
||||
/// eof kw_int ident l_paren r_paren semi eof
|
||||
/// front() back()
|
||||
/// 0 1 2 3 4 5
|
||||
class TokenStream {
|
||||
public:
|
||||
/// Create an empty stream.
|
||||
///
|
||||
/// Initially, the stream is appendable and not finalized.
|
||||
/// The token sequence may only be accessed after finalize() is called.
|
||||
///
|
||||
/// Payload is an opaque object which will be owned by the stream.
|
||||
/// e.g. an allocator to hold backing storage for synthesized token text.
|
||||
explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
|
||||
|
||||
/// Append a token to the stream, which must not be finalized.
|
||||
void push(Token T) {
|
||||
assert(!isFinalized());
|
||||
Storage.push_back(std::move(T));
|
||||
}
|
||||
|
||||
/// Finalize the token stream, allowing tokens to be accessed.
|
||||
/// Tokens may no longer be appended.
|
||||
void finalize();
|
||||
bool isFinalized() const;
|
||||
|
||||
/// Returns the index of T within the stream.
|
||||
///
|
||||
/// T must be within the stream or the end sentinel (not the start sentinel).
|
||||
Token::Index index(const Token &T) const {
|
||||
assert(isFinalized());
|
||||
assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
|
||||
assert(&T != Storage.data() && "start sentinel");
|
||||
return &T - Tokens.data();
|
||||
}
|
||||
|
||||
ArrayRef<Token> tokens() const {
|
||||
assert(isFinalized());
|
||||
return Tokens;
|
||||
}
|
||||
ArrayRef<Token> tokens(Token::Range R) const {
|
||||
return tokens().slice(R.Begin, R.End - R.Begin);
|
||||
}
|
||||
|
||||
/// May return the end sentinel if the stream is empty.
|
||||
const Token &front() const {
|
||||
assert(isFinalized());
|
||||
return Storage[1];
|
||||
}
|
||||
|
||||
/// Print the tokens in this stream to the output stream.
|
||||
///
|
||||
/// The presence of newlines/spaces is preserved, but not the quantity.
|
||||
void print(llvm::raw_ostream &) const;
|
||||
|
||||
private:
|
||||
std::shared_ptr<void> Payload;
|
||||
|
||||
MutableArrayRef<Token> Tokens;
|
||||
std::vector<Token> Storage; // eof + Tokens + eof
|
||||
};
|
||||
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
|
||||
|
||||
/// Extracts a raw token stream from the source code.
|
||||
///
|
||||
/// All tokens will reference the data of the provided string.
|
||||
/// "word-like" tokens such as identifiers and keywords will be raw_identifier.
|
||||
TokenStream lex(const std::string &, const clang::LangOptions &);
|
||||
enum class LexFlags : uint8_t {
  /// Set on the token that begins a logical preprocessor line.
  /// Such a position is one where a directive might start.
  ///
  /// In this example the first # is StartsPPLine, while the second one is not
  /// (it is on the same logical line):
  ///   #define X(error) \
  ///   #error // not a directive!
  ///
  /// Beware that a directive need not begin exactly on the StartsPPLine token:
  ///   /*comment*/ #include <foo.h>
  StartsPPLine = 0x1,
  /// Set on tokens that contain trigraphs, escaped newlines, UCNs etc.
  /// The text() of such a token still contains the raw trigraph.
  NeedsCleaning = 0x2,
};
|
||||
|
||||
/// Derives a token stream by decoding escapes and interpreting raw_identifiers.
|
||||
///
|
||||
/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and
|
||||
/// their backing data is owned by the returned stream.
|
||||
/// raw_identifier tokens are assigned specific types (identifier, keyword etc).
|
||||
///
|
||||
/// The StartsPPLine flag is preserved.
|
||||
///
|
||||
/// Formally the identifier correctly happens before preprocessing, while we
|
||||
/// should only cook raw_identifiers that survive preprocessing.
|
||||
/// However, ignoring the Token::Kind of tokens in directives achieves the same.
|
||||
/// (And having cooked token kinds in PP-disabled sections is useful for us).
|
||||
TokenStream cook(const TokenStream &, const clang::LangOptions &);
|
||||
|
||||
} // namespace pseudo
|
||||
} // namespace syntax
|
||||
} // namespace clang
|
||||
|
||||
#endif
|
|
@ -46,6 +46,15 @@ const char *tok::getKeywordSpelling(TokenKind Kind) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
const char *tok::getPPKeywordSpelling(tok::PPKeywordKind Kind) {
  // Each PPKEYWORD(x) entry of TokenKinds.def expands to one case that returns
  // the stringized keyword name.
#define PPKEYWORD(x)                                                           \
  case tok::pp_##x:                                                            \
    return #x;
  switch (Kind) {
#include "clang/Basic/TokenKinds.def"
  default:
    // Not a PPKEYWORD entry: there is no spelling to report.
    return nullptr;
  }
}
|
||||
|
||||
bool tok::isAnnotation(TokenKind Kind) {
|
||||
switch (Kind) {
|
||||
#define ANNOTATION(X) case annot_ ## X: return true;
|
||||
|
|
|
@ -3,9 +3,12 @@ set(LLVM_LINK_COMPONENTS Support)
|
|||
add_clang_library(clangToolingSyntaxPseudo
|
||||
Grammar.cpp
|
||||
GrammarBNF.cpp
|
||||
Lex.cpp
|
||||
LRGraph.cpp
|
||||
LRTable.cpp
|
||||
LRTableBuild.cpp
|
||||
Preprocess.cpp
|
||||
Token.cpp
|
||||
|
||||
LINK_LIBS
|
||||
clangBasic
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang/Basic/SourceLocation.h"
|
||||
#include "clang/Basic/TokenKinds.h"
|
||||
#include "clang/Lex/Lexer.h"
|
||||
#include "clang/Lex/LiteralSupport.h"
|
||||
#include "clang/Tooling/Syntax/Pseudo/Token.h"
|
||||
|
||||
namespace clang {
|
||||
namespace syntax {
|
||||
namespace pseudo {
|
||||
|
||||
/// Lexes \p Code with clang's raw lexer into a finalized TokenStream.
///
/// Comments are retained as tokens. Every token points into \p Code, and is
/// annotated with its zero-based line, the indentation of that line, and the
/// StartsPPLine / NeedsCleaning flags.
TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
  clang::SourceLocation Start;
  // Tokenize using clang's lexer in raw mode.
  // std::string guarantees null-termination, which the lexer needs.
  clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
                     Code.data() + Code.size());
  Lexer.SetCommentRetentionState(true);

  // Token::Indent is a narrow integer. Very deep indentation (tabs count as 8)
  // saturates at its maximum instead of silently wrapping around.
  constexpr unsigned MaxIndent =
      std::numeric_limits<decltype(Token::Indent)>::max();

  TokenStream Result;
  clang::Token CT;
  unsigned LastOffset = 0;
  unsigned Line = 0;
  unsigned Indent = 0;
  for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
       Lexer.LexFromRawLexer(CT)) {
    unsigned Offset =
        CT.getLocation().getRawEncoding() - Start.getRawEncoding();

    Token Tok;
    Tok.Data = &Code[Offset];
    Tok.Length = CT.getLength();
    Tok.Kind = CT.getKind();

    // Update current line number and indentation from raw source code.
    // The scan starts at the previous token, so newlines inside that token
    // (e.g. a block comment) are counted too.
    unsigned NewLineStart = 0;
    for (unsigned i = LastOffset; i < Offset; ++i) {
      if (Code[i] == '\n') {
        NewLineStart = i + 1;
        ++Line;
      }
    }
    // Recompute the indentation when a new line began (or while still on the
    // first line): count the whitespace leading up to the first other char.
    if (NewLineStart || !LastOffset) {
      Indent = 0;
      for (char c : StringRef(Code).slice(NewLineStart, Offset)) {
        if (c == ' ')
          ++Indent;
        else if (c == '\t')
          Indent += 8;
        else
          break;
      }
    }
    Tok.Indent = Indent < MaxIndent ? Indent : MaxIndent;
    Tok.Line = Line;

    if (CT.isAtStartOfLine())
      Tok.setFlag(LexFlags::StartsPPLine);
    if (CT.needsCleaning() || CT.hasUCN())
      Tok.setFlag(LexFlags::NeedsCleaning);

    Result.push(Tok);
    LastOffset = Offset;
  }
  Result.finalize();
  return Result;
}
|
||||
|
||||
/// Derives a finalized stream from \p Code in which tokens flagged
/// NeedsCleaning have their text decoded and raw_identifier tokens are
/// resolved to their specific kind via an IdentifierTable for \p LangOpts.
///
/// Decoded text is allocated from an allocator owned by the returned stream.
TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
  auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
  clang::IdentifierTable Identifiers(LangOpts);
  TokenStream Result(CleanedStorage);

  // Tok is deliberately a copy: it is modified and then pushed to Result.
  for (auto Tok : Code.tokens()) {
    if (Tok.flag(LexFlags::NeedsCleaning)) {
      // Remove escaped newlines and trigraphs.
      llvm::SmallString<64> CleanBuffer;
      const char *Pos = Tok.text().begin();
      while (Pos < Tok.text().end()) {
        unsigned CharSize = 0;
        CleanBuffer.push_back(
            clang::Lexer::getCharAndSizeNoWarn(Pos, CharSize, LangOpts));
        assert(CharSize != 0 && "no progress!");
        Pos += CharSize;
      }
      llvm::StringRef Text = CleanBuffer;

      // Expand universal character names (UCN), but only in raw_identifiers.
      // Literals and comments may hold arbitrary backslash sequences (such as
      // "\n") that must not be treated as UCNs here; any UCNs inside them are
      // left as written. expandUCNs is only meant for identifier-like input.
      llvm::SmallString<64> UCNBuffer;
      if (Tok.Kind == tok::raw_identifier) {
        clang::expandUCNs(UCNBuffer, CleanBuffer);
        Text = UCNBuffer;
      }

      // Move the cleaned text into storage that the result stream keeps alive.
      Text = Text.copy(*CleanedStorage);
      Tok.Data = Text.data();
      Tok.Length = Text.size();
      Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
    }
    // Cook raw_identifiers into identifier, keyword, etc.
    if (Tok.Kind == tok::raw_identifier)
      Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
    Result.push(std::move(Tok));
  }

  Result.finalize();
  return Result;
}
|
||||
|
||||
} // namespace pseudo
|
||||
} // namespace syntax
|
||||
} // namespace clang
|
|
@ -0,0 +1,206 @@
|
|||
//===--- Preprocess.cpp - Preprocess token streams ------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
|
||||
#include "clang/Basic/IdentifierTable.h"
|
||||
#include "clang/Basic/TokenKinds.h"
|
||||
#include "llvm/Support/FormatVariadic.h"
|
||||
|
||||
namespace clang {
|
||||
namespace syntax {
|
||||
namespace pseudo {
|
||||
namespace {
|
||||
|
||||
class PPParser {
|
||||
public:
|
||||
explicit PPParser(const TokenStream &Code) : Code(Code), Tok(&Code.front()) {}
|
||||
void parse(PPStructure *Result) { parse(Result, /*TopLevel=*/true); }
|
||||
|
||||
private:
|
||||
// Roles that a directive might take within a conditional block.
|
||||
enum class Cond { None, If, Else, End };
|
||||
static Cond classifyDirective(tok::PPKeywordKind K) {
|
||||
switch (K) {
|
||||
case clang::tok::pp_if:
|
||||
case clang::tok::pp_ifdef:
|
||||
case clang::tok::pp_ifndef:
|
||||
return Cond::If;
|
||||
case clang::tok::pp_elif:
|
||||
case clang::tok::pp_elifdef:
|
||||
case clang::tok::pp_elifndef:
|
||||
case clang::tok::pp_else:
|
||||
return Cond::Else;
|
||||
case clang::tok::pp_endif:
|
||||
return Cond::End;
|
||||
default:
|
||||
return Cond::None;
|
||||
}
|
||||
}
|
||||
|
||||
// Parses tokens starting at Tok into PP.
|
||||
// If we reach an End or Else directive that ends PP, returns it.
|
||||
// If TopLevel is true, then we do not expect End and always return None.
|
||||
llvm::Optional<PPStructure::Directive> parse(PPStructure *PP, bool TopLevel) {
|
||||
auto StartsDirective =
|
||||
[&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
|
||||
if (Tok->flag(LexFlags::StartsPPLine)) {
|
||||
// If we considered a comment at the start of a PP-line, it doesn't
|
||||
// start a directive but the directive can still start after it.
|
||||
if (Tok->Kind == tok::comment)
|
||||
AllowDirectiveAt = Tok + 1;
|
||||
return Tok->Kind == tok::hash;
|
||||
}
|
||||
return Tok->Kind == tok::hash && AllowDirectiveAt == Tok;
|
||||
};
|
||||
// Each iteration adds one chunk (or returns, if we see #endif).
|
||||
while (Tok->Kind != tok::eof) {
|
||||
// If there's no directive here, we have a code chunk.
|
||||
if (!StartsDirective()) {
|
||||
const Token *Start = Tok;
|
||||
do
|
||||
++Tok;
|
||||
while (Tok->Kind != tok::eof && !StartsDirective());
|
||||
PP->Chunks.push_back(PPStructure::Code{
|
||||
Token::Range{Code.index(*Start), Code.index(*Tok)}});
|
||||
continue;
|
||||
}
|
||||
|
||||
// We have some kind of directive.
|
||||
PPStructure::Directive Directive;
|
||||
parseDirective(&Directive);
|
||||
Cond Kind = classifyDirective(Directive.Kind);
|
||||
if (Kind == Cond::If) {
|
||||
// #if or similar, starting a nested conditional block.
|
||||
PPStructure::Conditional Conditional;
|
||||
Conditional.Branches.emplace_back();
|
||||
Conditional.Branches.back().first = std::move(Directive);
|
||||
parseConditional(&Conditional);
|
||||
PP->Chunks.push_back(std::move(Conditional));
|
||||
} else if ((Kind == Cond::Else || Kind == Cond::End) && !TopLevel) {
|
||||
// #endif or similar, ending this PPStructure scope.
|
||||
// (#endif is unexpected at the top level, treat as simple directive).
|
||||
return std::move(Directive);
|
||||
} else {
|
||||
// #define or similar, a simple directive at the current scope.
|
||||
PP->Chunks.push_back(std::move(Directive));
|
||||
}
|
||||
}
|
||||
return None;
|
||||
}
|
||||
|
||||
// Parse the rest of a conditional section, after seeing the If directive.
|
||||
// Returns after consuming the End directive.
|
||||
void parseConditional(PPStructure::Conditional *C) {
|
||||
assert(C->Branches.size() == 1 &&
|
||||
C->Branches.front().second.Chunks.empty() &&
|
||||
"Should be ready to parse first branch body");
|
||||
while (Tok->Kind != tok::eof) {
|
||||
auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false);
|
||||
if (!Terminator) {
|
||||
assert(Tok->Kind == tok::eof && "gave up parsing before eof?");
|
||||
C->End.Tokens = Token::Range::emptyAt(Code.index(*Tok));
|
||||
return;
|
||||
}
|
||||
if (classifyDirective(Terminator->Kind) == Cond::End) {
|
||||
C->End = std::move(*Terminator);
|
||||
return;
|
||||
}
|
||||
assert(classifyDirective(Terminator->Kind) == Cond::Else &&
|
||||
"ended branch unexpectedly");
|
||||
C->Branches.emplace_back();
|
||||
C->Branches.back().first = std::move(*Terminator);
|
||||
}
|
||||
}
|
||||
|
||||
// Parse a directive. Tok is the hash.
|
||||
void parseDirective(PPStructure::Directive *D) {
|
||||
assert(Tok->Kind == tok::hash);
|
||||
|
||||
// Directive spans from the hash until the end of line or file.
|
||||
const Token *Begin = Tok++;
|
||||
while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine))
|
||||
++Tok;
|
||||
ArrayRef<Token> Tokens{Begin, Tok};
|
||||
D->Tokens = {Code.index(*Tokens.begin()), Code.index(*Tokens.end())};
|
||||
|
||||
// Directive name is the first non-comment token after the hash.
|
||||
Tokens = Tokens.drop_front().drop_while(
|
||||
[](const Token &T) { return T.Kind == tok::comment; });
|
||||
if (!Tokens.empty())
|
||||
D->Kind = PPKeywords.get(Tokens.front().text()).getPPKeywordID();
|
||||
}
|
||||
|
||||
const TokenStream &Code;
|
||||
const Token *Tok;
|
||||
clang::IdentifierTable PPKeywords;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Entry point: runs a PPParser over the whole stream and returns the tree.
PPStructure PPStructure::parse(const TokenStream &Code) {
  PPStructure Result;
  PPParser Parser(Code);
  Parser.parse(&Result);
  return Result;
}
|
||||
|
||||
// The dump() overloads print one node of the tree per line, indented by
// Indent columns. The PPStructure overload is declared early because
// conditionals and structures are mutually recursive.
static void dump(llvm::raw_ostream &OS, const PPStructure &, unsigned Indent);

// Prints a directive as its keyword and token count.
static void dump(llvm::raw_ostream &OS, const PPStructure::Directive &Directive,
                 unsigned Indent) {
  const char *Spelling = tok::getPPKeywordSpelling(Directive.Kind);
  OS.indent(Indent);
  OS << llvm::formatv("#{0} ({1} tokens)\n", Spelling,
                      Directive.Tokens.size());
}

// Prints a code run as its token count.
static void dump(llvm::raw_ostream &OS, const PPStructure::Code &Code,
                 unsigned Indent) {
  OS.indent(Indent);
  OS << llvm::formatv("code ({0} tokens)\n", Code.Tokens.size());
}

// Prints each branch directive with its body nested two columns deeper,
// followed by the closing directive.
static void dump(llvm::raw_ostream &OS,
                 const PPStructure::Conditional &Conditional, unsigned Indent) {
  const unsigned BodyIndent = Indent + 2;
  for (const auto &DirectiveAndBody : Conditional.Branches) {
    dump(OS, DirectiveAndBody.first, Indent);
    dump(OS, DirectiveAndBody.second, BodyIndent);
  }
  dump(OS, Conditional.End, Indent);
}

// Dispatches to the overload for whichever alternative the chunk holds.
static void dump(llvm::raw_ostream &OS, const PPStructure::Chunk &Chunk,
                 unsigned Indent) {
  switch (Chunk.kind()) {
  case PPStructure::Chunk::K_Empty:
    llvm_unreachable("invalid chunk");
  case PPStructure::Chunk::K_Code:
    dump(OS, static_cast<const PPStructure::Code &>(Chunk), Indent);
    return;
  case PPStructure::Chunk::K_Directive:
    dump(OS, static_cast<const PPStructure::Directive &>(Chunk), Indent);
    return;
  case PPStructure::Chunk::K_Conditional:
    dump(OS, static_cast<const PPStructure::Conditional &>(Chunk), Indent);
    return;
  }
}

// Prints every chunk of the structure in order, all at the same indentation.
static void dump(llvm::raw_ostream &OS, const PPStructure &PP,
                 unsigned Indent) {
  for (const PPStructure::Chunk &Piece : PP.Chunks)
    dump(OS, Piece, Indent);
}
|
||||
|
||||
// Define operator<< in terms of dump() functions above.
|
||||
#define OSTREAM_DUMP(Type) \
|
||||
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Type &T) { \
|
||||
dump(OS, T, 0); \
|
||||
return OS; \
|
||||
}
|
||||
OSTREAM_DUMP(PPStructure)
|
||||
OSTREAM_DUMP(PPStructure::Chunk)
|
||||
OSTREAM_DUMP(PPStructure::Directive)
|
||||
OSTREAM_DUMP(PPStructure::Conditional)
|
||||
OSTREAM_DUMP(PPStructure::Code)
|
||||
#undef OSTREAM_DUMP
|
||||
|
||||
} // namespace pseudo
|
||||
} // namespace syntax
|
||||
} // namespace clang
|
|
@ -0,0 +1,98 @@
|
|||
//===--- Token.cpp - Tokens and token streams in the pseudoparser ---------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang/Tooling/Syntax/Pseudo/Token.h"
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/Support/Format.h"
|
||||
#include "llvm/Support/FormatVariadic.h"
|
||||
|
||||
namespace clang {
|
||||
namespace syntax {
|
||||
namespace pseudo {
|
||||
|
||||
// Prints one token as: kind, line:indent, quoted escaped text, and the flags
// in hex when any are set.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
  const char *KindName = clang::tok::getTokenName(T.Kind);
  OS << llvm::formatv("{0} {1}:{2} ", KindName, T.Line, T.Indent) << '"';
  llvm::printEscapedString(T.text(), OS);
  OS << '"';
  if (T.Flags != 0)
    OS << llvm::format(" flags=%x", T.Flags);
  return OS;
}
|
||||
|
||||
// Prints the stream as a table: a header line, then one row per token with
// its index, kind, line:indent, quoted escaped text, and any flags in hex.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const TokenStream &TS) {
  OS << "Index Kind Line Text\n";
  for (const Token &Tok : TS.tokens()) {
    const char *KindName = clang::tok::getTokenName(Tok.Kind);
    OS << llvm::format("%5d: %16s %4d:%-2d ", TS.index(Tok), KindName, Tok.Line,
                       Tok.Indent)
       << '"';
    llvm::printEscapedString(Tok.text(), OS);
    OS << '"';
    if (Tok.Flags != 0)
      OS << llvm::format(" flags=%x", Tok.Flags);
    OS << '\n';
  }
  return OS;
}
|
||||
|
||||
// Prints the range in half-open interval notation, e.g. "[2,5)".
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token::Range &R) {
  return OS << llvm::formatv("[{0},{1})", R.Begin, R.End);
}
|
||||
|
||||
// Takes ownership of the payload and seeds the storage with the leading eof
// sentinel.
TokenStream::TokenStream(std::shared_ptr<void> Payload)
    : Payload(std::move(Payload)) {
  Token StartSentinel;
  StartSentinel.Kind = clang::tok::eof;
  Storage.push_back(StartSentinel);
}
|
||||
|
||||
void TokenStream::finalize() {
|
||||
assert(!isFinalized());
|
||||
unsigned LastLine = Storage.back().Line;
|
||||
Storage.emplace_back();
|
||||
Storage.back().Kind = tok::eof;
|
||||
Storage.back().Line = LastLine + 1;
|
||||
|
||||
Tokens = Storage;
|
||||
Tokens = Tokens.drop_front().drop_back();
|
||||
}
|
||||
|
||||
// A stream counts as finalized once something follows the leading sentinel
// and the last stored token is an eof.
bool TokenStream::isFinalized() const {
  assert(!Storage.empty() && Storage.front().Kind == tok::eof);
  // With only the leading sentinel stored, nothing has been finalized yet.
  return Storage.size() != 1 && Storage.back().Kind == tok::eof;
}
|
||||
|
||||
// Writes the token text back out. A token on a new line starts that line at
// its recorded indent. Tokens sharing a line are separated by a single space
// unless their text is contiguous in memory. Non-empty output ends with a
// newline.
void TokenStream::print(llvm::raw_ostream &OS) const {
  const Token *Previous = nullptr;
  for (const Token &Current : tokens()) {
    StringRef Text = Current.text();
    if (Previous) {
      if (Current.Line != Previous->Line) {
        OS << '\n';
        OS.indent(Current.Indent);
      } else if (Previous->text().end() != Text.data()) {
        // Something separated the two tokens in the underlying buffer.
        OS << ' ';
      }
    }
    OS << Text;
    Previous = &Current;
  }
  if (Previous)
    OS << '\n';
}
|
||||
|
||||
} // namespace pseudo
|
||||
} // namespace syntax
|
||||
} // namespace clang
|
|
@ -0,0 +1,52 @@
|
|||
int is_debug() {
|
||||
#ifndef NDEBUG
|
||||
return 1; // in debug mode
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* This comment gets lexed along with the input above! We just don't CHECK it.
|
||||
|
||||
RUN: clang-pseudo -source %s -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace
|
||||
SOURCE: int is_debug() {
|
||||
SOURCE-NEXT: #ifndef NDEBUG
|
||||
SOURCE-NEXT: return 1; // in debug mode
|
||||
SOURCE-NEXT: #else
|
||||
SOURCE-NEXT: return 0;
|
||||
SOURCE-NEXT: #endif
|
||||
SOURCE-NEXT: }
|
||||
|
||||
RUN: clang-pseudo -source %s -print-tokens | FileCheck %s -check-prefix=TOKEN
|
||||
TOKEN: 0: raw_identifier 0:0 "int" flags=1
|
||||
TOKEN-NEXT: raw_identifier 0:0 "is_debug"
|
||||
TOKEN-NEXT: l_paren 0:0 "("
|
||||
TOKEN-NEXT: r_paren 0:0 ")"
|
||||
TOKEN-NEXT: l_brace 0:0 "{"
|
||||
TOKEN-NEXT: hash 1:0 "#" flags=1
|
||||
TOKEN-NEXT: raw_identifier 1:0 "ifndef"
|
||||
TOKEN-NEXT: raw_identifier 1:0 "NDEBUG"
|
||||
TOKEN-NEXT: raw_identifier 2:2 "return" flags=1
|
||||
TOKEN-NEXT: numeric_constant 2:2 "1"
|
||||
TOKEN-NEXT: semi 2:2 ";"
|
||||
TOKEN-NEXT: comment 2:2 "// in debug mode"
|
||||
TOKEN-NEXT: hash 3:0 "#" flags=1
|
||||
TOKEN-NEXT: raw_identifier 3:0 "else"
|
||||
TOKEN-NEXT: raw_identifier 4:2 "return" flags=1
|
||||
TOKEN-NEXT: numeric_constant 4:2 "0"
|
||||
TOKEN-NEXT: semi 4:2 ";"
|
||||
TOKEN-NEXT: hash 5:0 "#" flags=1
|
||||
TOKEN-NEXT: raw_identifier 5:0 "endif"
|
||||
TOKEN-NEXT: r_brace 6:0 "}" flags=1
|
||||
|
||||
RUN: clang-pseudo -source %s -print-pp-structure | FileCheck %s -check-prefix=PPS --strict-whitespace
|
||||
PPS: code (5 tokens)
|
||||
PPS-NEXT: #ifndef (3 tokens)
|
||||
PPS-NEXT: code (4 tokens)
|
||||
PPS-NEXT: #else (2 tokens)
|
||||
PPS-NEXT: code (3 tokens)
|
||||
PPS-NEXT: #endif (2 tokens)
|
||||
PPS-NEXT: code (2 tokens)
|
||||
^ including this block comment
|
||||
|
||||
*******************************************************************************/
|
|
@ -6,9 +6,12 @@
|
|||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang/Basic/LangOptions.h"
|
||||
#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
|
||||
#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
|
||||
#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
|
||||
#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
|
||||
#include "clang/Tooling/Syntax/Pseudo/Token.h"
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/FormatVariadic.h"
|
||||
|
@ -25,13 +28,19 @@ static opt<bool> PrintGraph("print-graph",
|
|||
desc("Print the LR graph for the grammar"));
|
||||
static opt<bool> PrintTable("print-table",
|
||||
desc("Print the LR table for the grammar"));
|
||||
static opt<std::string> Source("source", desc("Source file"));
|
||||
static opt<bool> PrintSource("print-source", desc("Print token stream"));
|
||||
static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
|
||||
static opt<bool>
|
||||
PrintPPStructure("print-pp-structure",
|
||||
desc("Print directive structure of source code"));
|
||||
|
||||
static std::string readOrDie(llvm::StringRef Path) {
|
||||
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
|
||||
llvm::MemoryBuffer::getFile(Path);
|
||||
if (std::error_code EC = Text.getError()) {
|
||||
llvm::errs() << "Error: can't read file '" << Path << "': " << EC.message()
|
||||
<< "\n";
|
||||
llvm::errs() << "Error: can't read grammar file '" << Path
|
||||
<< "': " << EC.message() << "\n";
|
||||
::exit(1);
|
||||
}
|
||||
return Text.get()->getBuffer().str();
|
||||
|
@ -60,5 +69,19 @@ int main(int argc, char *argv[]) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
if (Source.getNumOccurrences()) {
|
||||
std::string Text = readOrDie(Source);
|
||||
clang::LangOptions LangOpts; // FIXME: use real options.
|
||||
auto Stream = clang::syntax::pseudo::lex(Text, LangOpts);
|
||||
auto Structure = clang::syntax::pseudo::PPStructure::parse(Stream);
|
||||
|
||||
if (PrintPPStructure)
|
||||
llvm::outs() << Structure;
|
||||
if (PrintSource)
|
||||
Stream.print(llvm::outs());
|
||||
if (PrintTokens)
|
||||
llvm::outs() << Stream;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -5,6 +5,8 @@ set(LLVM_LINK_COMPONENTS
|
|||
add_clang_unittest(ClangPseudoTests
|
||||
GrammarTest.cpp
|
||||
LRTableTest.cpp
|
||||
PreprocessTest.cpp
|
||||
TokenTest.cpp
|
||||
)
|
||||
|
||||
clang_target_link_libraries(ClangPseudoTests
|
||||
|
|
|
@ -0,0 +1,152 @@
|
|||
//===--- PreprocessTest.cpp -----------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
|
||||
|
||||
#include "clang/Basic/LangOptions.h"
|
||||
#include "clang/Basic/TokenKinds.h"
|
||||
#include "clang/Tooling/Syntax/Pseudo/Token.h"
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace clang {
|
||||
namespace syntax {
|
||||
namespace pseudo {
|
||||
namespace {
|
||||
|
||||
using testing::_;
|
||||
using testing::ElementsAre;
|
||||
using testing::Matcher;
|
||||
using testing::Pair;
|
||||
using testing::StrEq;
|
||||
using Chunk = PPStructure::Chunk;
|
||||
|
||||
// Matches an object whose `Tokens` range, looked up in the stream TS, spells
// out the string `Tokens` once the token texts are joined with single spaces.
MATCHER_P2(tokensAre, TS, Tokens, "tokens are " + std::string(Tokens)) {
  std::vector<llvm::StringRef> Spellings;
  for (const Token &T : TS.tokens(arg.Tokens))
    Spellings.push_back(T.text());
  const std::string Joined = llvm::join(Spellings, " ");
  return Matcher<std::string>(StrEq(Tokens))
      .MatchAndExplain(Joined, result_listener);
}
|
||||
|
||||
// Matches a chunk whose kind() equals K.
MATCHER_P(chunkKind, K, "") { return arg.kind() == K; }
|
||||
|
||||
TEST(PPStructure, Parse) {
|
||||
LangOptions Opts;
|
||||
std::string Code = R"cpp(
|
||||
#include <foo.h>
|
||||
|
||||
int main() {
|
||||
#ifdef HAS_FOO
|
||||
#if HAS_BAR
|
||||
foo(bar);
|
||||
#else
|
||||
foo(0)
|
||||
#endif
|
||||
#elif NEEDS_FOO
|
||||
#error missing_foo
|
||||
#endif
|
||||
}
|
||||
)cpp";
|
||||
|
||||
TokenStream S = cook(lex(Code, Opts), Opts);
|
||||
PPStructure PP = PPStructure::parse(S);
|
||||
|
||||
ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Directive),
|
||||
chunkKind(Chunk::K_Code),
|
||||
chunkKind(Chunk::K_Conditional),
|
||||
chunkKind(Chunk::K_Code)));
|
||||
|
||||
EXPECT_THAT((const PPStructure::Directive &)PP.Chunks[0],
|
||||
tokensAre(S, "# include < foo . h >"));
|
||||
EXPECT_THAT((const PPStructure::Code &)PP.Chunks[1],
|
||||
tokensAre(S, "int main ( ) {"));
|
||||
EXPECT_THAT((const PPStructure::Code &)PP.Chunks[3], tokensAre(S, "}"));
|
||||
|
||||
const PPStructure::Conditional &Ifdef(PP.Chunks[2]);
|
||||
EXPECT_THAT(Ifdef.Branches,
|
||||
ElementsAre(Pair(tokensAre(S, "# ifdef HAS_FOO"), _),
|
||||
Pair(tokensAre(S, "# elif NEEDS_FOO"), _)));
|
||||
EXPECT_THAT(Ifdef.End, tokensAre(S, "# endif"));
|
||||
|
||||
const PPStructure &HasFoo(Ifdef.Branches[0].second);
|
||||
const PPStructure &NeedsFoo(Ifdef.Branches[1].second);
|
||||
|
||||
EXPECT_THAT(HasFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Conditional)));
|
||||
const PPStructure::Conditional &If(HasFoo.Chunks[0]);
|
||||
EXPECT_THAT(If.Branches, ElementsAre(Pair(tokensAre(S, "# if HAS_BAR"), _),
|
||||
Pair(tokensAre(S, "# else"), _)));
|
||||
EXPECT_THAT(If.Branches[0].second.Chunks,
|
||||
ElementsAre(chunkKind(Chunk::K_Code)));
|
||||
EXPECT_THAT(If.Branches[1].second.Chunks,
|
||||
ElementsAre(chunkKind(Chunk::K_Code)));
|
||||
|
||||
EXPECT_THAT(NeedsFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Directive)));
|
||||
const PPStructure::Directive &Error(NeedsFoo.Chunks[0]);
|
||||
EXPECT_THAT(Error, tokensAre(S, "# error missing_foo"));
|
||||
EXPECT_EQ(Error.Kind, tok::pp_error);
|
||||
}
|
||||
|
||||
TEST(PPStructure, ParseUgly) {
|
||||
LangOptions Opts;
|
||||
std::string Code = R"cpp(
|
||||
/*A*/ # /*B*/ \
|
||||
/*C*/ \
|
||||
define \
|
||||
BAR /*D*/
|
||||
/*E*/
|
||||
)cpp";
|
||||
TokenStream S = cook(lex(Code, Opts), Opts);
|
||||
PPStructure PP = PPStructure::parse(S);
|
||||
|
||||
ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
|
||||
chunkKind(Chunk::K_Directive),
|
||||
chunkKind(Chunk::K_Code)));
|
||||
EXPECT_THAT((const PPStructure::Code &)PP.Chunks[0], tokensAre(S, "/*A*/"));
|
||||
const PPStructure::Directive &Define(PP.Chunks[1]);
|
||||
EXPECT_EQ(Define.Kind, tok::pp_define);
|
||||
EXPECT_THAT(Define, tokensAre(S, "# /*B*/ /*C*/ define BAR /*D*/"));
|
||||
EXPECT_THAT((const PPStructure::Code &)PP.Chunks[2], tokensAre(S, "/*E*/"));
|
||||
}
|
||||
|
||||
TEST(PPStructure, ParseBroken) {
|
||||
LangOptions Opts;
|
||||
std::string Code = R"cpp(
|
||||
a
|
||||
#endif // mismatched
|
||||
#if X
|
||||
b
|
||||
)cpp";
|
||||
TokenStream S = cook(lex(Code, Opts), Opts);
|
||||
PPStructure PP = PPStructure::parse(S);
|
||||
|
||||
ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
|
||||
chunkKind(Chunk::K_Directive),
|
||||
chunkKind(Chunk::K_Conditional)));
|
||||
EXPECT_THAT((const PPStructure::Code &)PP.Chunks[0], tokensAre(S, "a"));
|
||||
const PPStructure::Directive &Endif(PP.Chunks[1]);
|
||||
EXPECT_EQ(Endif.Kind, tok::pp_endif);
|
||||
EXPECT_THAT(Endif, tokensAre(S, "# endif // mismatched"));
|
||||
|
||||
const PPStructure::Conditional &X(PP.Chunks[2]);
|
||||
EXPECT_EQ(1u, X.Branches.size());
|
||||
// The (only) branch of the broken conditional section runs until eof.
|
||||
EXPECT_EQ(tok::pp_if, X.Branches.front().first.Kind);
|
||||
EXPECT_THAT(X.Branches.front().second.Chunks,
|
||||
ElementsAre(chunkKind(Chunk::K_Code)));
|
||||
// The missing terminating directive is marked as pp_not_keyword.
|
||||
EXPECT_EQ(tok::pp_not_keyword, X.End.Kind);
|
||||
EXPECT_EQ(0u, X.End.Tokens.size());
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace pseudo
|
||||
} // namespace syntax
|
||||
} // namespace clang
|
|
@ -0,0 +1,178 @@
|
|||
//===--- TokenTest.cpp ----------------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang/Tooling/Syntax/Pseudo/Token.h"
|
||||
#include "clang/Basic/LangOptions.h"
|
||||
#include "clang/Basic/TokenKinds.h"
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace clang {
|
||||
namespace syntax {
|
||||
namespace pseudo {
|
||||
namespace {
|
||||
|
||||
using testing::AllOf;
|
||||
using testing::ElementsAre;
|
||||
using testing::ElementsAreArray;
|
||||
using testing::Not;
|
||||
|
||||
// Matches a token that has the given Kind and whose text() equals Text.
MATCHER_P2(token, Text, Kind, "") {
  if (arg.Kind != Kind)
    return false;
  return arg.text() == Text;
}
|
||||
|
||||
// Matches a token for which flag(Flag) returns true.
MATCHER_P(hasFlag, Flag, "") { return arg.flag(Flag); }
|
||||
|
||||
// Matches a token whose Line and Indent fields equal the given values.
// The expected values are converted to unsigned before comparing.
MATCHER_P2(lineIndent, Line, Indent, "") {
  if (arg.Line != static_cast<unsigned>(Line))
    return false;
  return arg.Indent == static_cast<unsigned>(Indent);
}
|
||||
|
||||
TEST(TokenTest, Lex) {
|
||||
LangOptions Opts;
|
||||
std::string Code = R"cpp(
|
||||
#include <stdio.h>
|
||||
int main() {
|
||||
return 42; // the answer
|
||||
}
|
||||
)cpp";
|
||||
TokenStream Raw = lex(Code, Opts);
|
||||
ASSERT_TRUE(Raw.isFinalized());
|
||||
EXPECT_THAT(Raw.tokens(),
|
||||
ElementsAreArray({
|
||||
// Lexing of directives is weird, especially <angled> strings.
|
||||
token("#", tok::hash),
|
||||
token("include", tok::raw_identifier),
|
||||
token("<", tok::less),
|
||||
token("stdio", tok::raw_identifier),
|
||||
token(".", tok::period),
|
||||
token("h", tok::raw_identifier),
|
||||
token(">", tok::greater),
|
||||
|
||||
token("int", tok::raw_identifier),
|
||||
token("main", tok::raw_identifier),
|
||||
token("(", tok::l_paren),
|
||||
token(")", tok::r_paren),
|
||||
token("{", tok::l_brace),
|
||||
token("return", tok::raw_identifier),
|
||||
token("42", tok::numeric_constant),
|
||||
token(";", tok::semi),
|
||||
token("// the answer", tok::comment),
|
||||
token("}", tok::r_brace),
|
||||
}));
|
||||
|
||||
TokenStream Cooked = cook(Raw, Opts);
|
||||
ASSERT_TRUE(Cooked.isFinalized());
|
||||
EXPECT_THAT(Cooked.tokens(),
|
||||
ElementsAreArray({
|
||||
// Cooked identifier types in directives are not meaningful.
|
||||
token("#", tok::hash),
|
||||
token("include", tok::identifier),
|
||||
token("<", tok::less),
|
||||
token("stdio", tok::identifier),
|
||||
token(".", tok::period),
|
||||
token("h", tok::identifier),
|
||||
token(">", tok::greater),
|
||||
|
||||
token("int", tok::kw_int),
|
||||
token("main", tok::identifier),
|
||||
token("(", tok::l_paren),
|
||||
token(")", tok::r_paren),
|
||||
token("{", tok::l_brace),
|
||||
token("return", tok::kw_return),
|
||||
token("42", tok::numeric_constant),
|
||||
token(";", tok::semi),
|
||||
token("// the answer", tok::comment),
|
||||
token("}", tok::r_brace),
|
||||
}));
|
||||
// Check raw tokens point back into original source code.
|
||||
EXPECT_EQ(Raw.tokens().front().text().begin(), &Code[Code.find('#')]);
|
||||
}
|
||||
|
||||
TEST(TokenTest, LineContinuation) {
|
||||
LangOptions Opts;
|
||||
std::string Code = R"cpp(
|
||||
one_\
|
||||
token
|
||||
two \
|
||||
tokens
|
||||
)cpp";
|
||||
TokenStream Raw = lex(Code, Opts);
|
||||
EXPECT_THAT(
|
||||
Raw.tokens(),
|
||||
ElementsAre(AllOf(token("one_\\\ntoken", tok::raw_identifier),
|
||||
hasFlag(LexFlags::StartsPPLine),
|
||||
hasFlag(LexFlags::NeedsCleaning), lineIndent(1, 0)),
|
||||
AllOf(token("two", tok::raw_identifier),
|
||||
hasFlag(LexFlags::StartsPPLine),
|
||||
Not(hasFlag(LexFlags::NeedsCleaning))),
|
||||
AllOf(token("\\\ntokens", tok::raw_identifier),
|
||||
Not(hasFlag(LexFlags::StartsPPLine)),
|
||||
hasFlag(LexFlags::NeedsCleaning))));
|
||||
|
||||
TokenStream Cooked = cook(Raw, Opts);
|
||||
EXPECT_THAT(
|
||||
Cooked.tokens(),
|
||||
ElementsAre(AllOf(token("one_token", tok::identifier), lineIndent(1, 0)),
|
||||
token("two", tok::identifier),
|
||||
token("tokens", tok::identifier)));
|
||||
}
|
||||
|
||||
TEST(TokenTest, EncodedCharacters) {
|
||||
LangOptions Opts;
|
||||
Opts.Trigraphs = true;
|
||||
Opts.Digraphs = true;
|
||||
Opts.C99 = true; // UCNs
|
||||
Opts.CXXOperatorNames = true;
|
||||
std::string Code = R"(and <: ??! '??=' \u00E9)";
|
||||
TokenStream Raw = lex(Code, Opts);
|
||||
EXPECT_THAT(
|
||||
Raw.tokens(),
|
||||
ElementsAre( // and is not recognized as && until cook().
|
||||
AllOf(token("and", tok::raw_identifier),
|
||||
Not(hasFlag(LexFlags::NeedsCleaning))),
|
||||
// Digraphs are just different spellings of tokens.
|
||||
AllOf(token("<:", tok::l_square),
|
||||
Not(hasFlag(LexFlags::NeedsCleaning))),
|
||||
// Trigraphs are interpreted, still need text cleaning.
|
||||
AllOf(token(R"(??!)", tok::pipe), hasFlag(LexFlags::NeedsCleaning)),
|
||||
// Trigraphs must be substituted inside constants too.
|
||||
AllOf(token(R"('??=')", tok::char_constant),
|
||||
hasFlag(LexFlags::NeedsCleaning)),
|
||||
// UCNs need substitution.
|
||||
AllOf(token(R"(\u00E9)", tok::raw_identifier),
|
||||
hasFlag(LexFlags::NeedsCleaning))));
|
||||
|
||||
TokenStream Cooked = cook(Raw, Opts);
|
||||
EXPECT_THAT(
|
||||
Cooked.tokens(),
|
||||
ElementsAre(token("and", tok::ampamp), // alternate spelling recognized
|
||||
token("<:", tok::l_square),
|
||||
token("|", tok::pipe), // trigraph substituted
|
||||
token("'#'", tok::char_constant), // trigraph substituted
|
||||
token("é", tok::identifier))); // UCN substituted
|
||||
}
|
||||
|
||||
TEST(TokenTest, Indentation) {
|
||||
LangOptions Opts;
|
||||
std::string Code = R"cpp( hello world
|
||||
no_indent \
|
||||
line_was_continued
|
||||
)cpp";
|
||||
TokenStream Raw = lex(Code, Opts);
|
||||
EXPECT_THAT(Raw.tokens(), ElementsAreArray({
|
||||
lineIndent(0, 3), // hello
|
||||
lineIndent(0, 3), // world
|
||||
lineIndent(1, 0), // no_indent
|
||||
lineIndent(2, 2), // line_was_continued
|
||||
}));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace pseudo
|
||||
} // namespace syntax
|
||||
} // namespace clang
|
Loading…
Reference in New Issue