[AsmParser][ms][X86] Fix possible misbehaviour in parsing of special tokens at start of string.
- Previously, https://reviews.llvm.org/D72680 introduced a new attribute called `AllowSymbolAtNameStart` (in relation to the MAsmParser changes) in `MCAsmInfo.h` which (according to the comment in the header) allows the following behaviour: ``` /// This is true if the assembler allows $ @ ? characters at the start of /// symbol names. Defaults to false. ``` - However, the usage of this field in AsmLexer.cpp doesn't seem completely accurate* for a couple of reasons. ``` default: if (MAI.doesAllowSymbolAtNameStart()) { // Handle Microsoft-style identifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]* if (!isDigit(CurChar) && isIdentifierChar(CurChar, MAI.doesAllowAtInName(), AllowHashInIdentifier)) return LexIdentifier(); } ``` 1. The Dollar and At tokens, when occurring at the start of the string, are treated as separate tokens (AsmToken::Dollar and AsmToken::At respectively) and not lexed as an Identifier. 2. I'm not too sure why `MAI.doesAllowAtInName()` is used when `AllowAtInIdentifier` could be used. For X86 platforms, afaict, this shouldn't be an issue, since the `CommentString` attribute isn't "@". (alternatively the call to the setter can be set anywhere else as needed). The `AllowAtInName` does have an additional important meaning, but in the context of AsmLexer, shouldn't mean anything different compared to `AllowAtInIdentifier` My proposal is the following: - Introduce 3 new fields called `AllowQuestionTokenAtStartOfString`, `AllowDollarTokenAtStartOfString` and `AllowAtTokenAtStartOfString` in MCAsmInfo.h which will encapsulate the previously documented behaviour of "allowing $, @, ? characters at the start of symbol names") - Introduce these fields where "$", "@" are lexed, and treat them as identifiers depending on whether `Allow[Dollar|At]TokenAtStartOfString` is set. - For the sole case of "?", append it to the existing logic for treating a "default" token as an Identifier. z/OS (HLASM) will also make use of some of these fields in follow up patches. completely accurate* - This was based on the comments and the intended behaviour the code. I might have completely misinterpreted it, and if that is the case my sincere apologies. We can close this patch if necessary, if there are no changes to be made :) Depends on https://reviews.llvm.org/D99374 Reviewed By: Jonathan.Crowther Differential Revision: https://reviews.llvm.org/D99889
This commit is contained in:
parent
ded18708f9
commit
8f6185c713
|
@ -181,9 +181,26 @@ protected:
|
||||||
/// Defaults to false.
|
/// Defaults to false.
|
||||||
bool AllowAtInName = false;
|
bool AllowAtInName = false;
|
||||||
|
|
||||||
/// This is true if the assembler allows $ @ ? characters at the start of
|
/// This is true if the assembler allows the "?" character at the start of
|
||||||
/// symbol names. Defaults to false.
|
/// of a string to be lexed as an AsmToken::Identifier.
|
||||||
bool AllowSymbolAtNameStart = false;
|
/// If the CommentString is also set to "?", setting this option will have
|
||||||
|
/// no effect, and the string will be lexed as a comment.
|
||||||
|
/// Defaults to false.
|
||||||
|
bool AllowQuestionAtStartOfIdentifier = false;
|
||||||
|
|
||||||
|
/// This is true if the assembler allows the "$" character at the start of
|
||||||
|
/// of a string to be lexed as an AsmToken::Identifier.
|
||||||
|
/// If the CommentString is also set to "$", setting this option will have
|
||||||
|
/// no effect, and the string will be lexed as a comment.
|
||||||
|
/// Defaults to false.
|
||||||
|
bool AllowDollarAtStartOfIdentifier = false;
|
||||||
|
|
||||||
|
/// This is true if the assembler allows the "@" character at the start of
|
||||||
|
/// a string to be lexed as an AsmToken::Identifier.
|
||||||
|
/// If the CommentString is also set to "@", setting this option will have
|
||||||
|
/// no effect, and the string will be lexed as a comment.
|
||||||
|
/// Defaults to false.
|
||||||
|
bool AllowAtAtStartOfIdentifier = false;
|
||||||
|
|
||||||
/// If this is true, symbol names with invalid characters will be printed in
|
/// If this is true, symbol names with invalid characters will be printed in
|
||||||
/// quotes.
|
/// quotes.
|
||||||
|
@ -600,7 +617,15 @@ public:
|
||||||
const char *getCode64Directive() const { return Code64Directive; }
|
const char *getCode64Directive() const { return Code64Directive; }
|
||||||
unsigned getAssemblerDialect() const { return AssemblerDialect; }
|
unsigned getAssemblerDialect() const { return AssemblerDialect; }
|
||||||
bool doesAllowAtInName() const { return AllowAtInName; }
|
bool doesAllowAtInName() const { return AllowAtInName; }
|
||||||
bool doesAllowSymbolAtNameStart() const { return AllowSymbolAtNameStart; }
|
bool doesAllowQuestionAtStartOfIdentifier() const {
|
||||||
|
return AllowQuestionAtStartOfIdentifier;
|
||||||
|
}
|
||||||
|
bool doesAllowAtAtStartOfIdentifier() const {
|
||||||
|
return AllowAtAtStartOfIdentifier;
|
||||||
|
}
|
||||||
|
bool doesAllowDollarAtStartOfIdentifier() const {
|
||||||
|
return AllowDollarAtStartOfIdentifier;
|
||||||
|
}
|
||||||
bool supportsNameQuoting() const { return SupportsQuotedNames; }
|
bool supportsNameQuoting() const { return SupportsQuotedNames; }
|
||||||
|
|
||||||
bool doesSupportDataRegionDirectives() const {
|
bool doesSupportDataRegionDirectives() const {
|
||||||
|
|
|
@ -144,7 +144,7 @@ AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
|
||||||
return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
|
return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@#?]*
|
/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
|
||||||
static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
|
static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
|
||||||
return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
|
return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
|
||||||
(AllowAt && C == '@') || (AllowHash && C == '#');
|
(AllowAt && C == '@') || (AllowHash && C == '#');
|
||||||
|
@ -769,17 +769,10 @@ AsmToken AsmLexer::LexToken() {
|
||||||
IsAtStartOfStatement = false;
|
IsAtStartOfStatement = false;
|
||||||
switch (CurChar) {
|
switch (CurChar) {
|
||||||
default:
|
default:
|
||||||
if (MAI.doesAllowSymbolAtNameStart()) {
|
// Handle identifier: [a-zA-Z_.?][a-zA-Z0-9_$.@#?]*
|
||||||
// Handle Microsoft-style identifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
|
if (isalpha(CurChar) || CurChar == '_' || CurChar == '.' ||
|
||||||
if (!isDigit(CurChar) &&
|
(MAI.doesAllowQuestionAtStartOfIdentifier() && CurChar == '?'))
|
||||||
isIdentifierChar(CurChar, MAI.doesAllowAtInName(),
|
return LexIdentifier();
|
||||||
AllowHashInIdentifier))
|
|
||||||
return LexIdentifier();
|
|
||||||
} else {
|
|
||||||
// Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
|
|
||||||
if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
|
|
||||||
return LexIdentifier();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Unknown character, emit an error.
|
// Unknown character, emit an error.
|
||||||
return ReturnError(TokStart, "invalid character in input");
|
return ReturnError(TokStart, "invalid character in input");
|
||||||
|
@ -823,13 +816,18 @@ AsmToken AsmLexer::LexToken() {
|
||||||
case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
|
case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
|
||||||
case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
|
case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
|
||||||
case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
|
case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
|
||||||
case '$':
|
case '$': {
|
||||||
if (LexMotorolaIntegers && isHexDigit(*CurPtr)) {
|
if (LexMotorolaIntegers && isHexDigit(*CurPtr))
|
||||||
return LexDigit();
|
return LexDigit();
|
||||||
}
|
if (MAI.doesAllowDollarAtStartOfIdentifier())
|
||||||
|
return LexIdentifier();
|
||||||
return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
|
return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
|
||||||
case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
|
}
|
||||||
|
case '@': {
|
||||||
|
if (MAI.doesAllowAtAtStartOfIdentifier())
|
||||||
|
return LexIdentifier();
|
||||||
|
return AsmToken(AsmToken::At, StringRef(TokStart, 1));
|
||||||
|
}
|
||||||
case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
|
case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
|
||||||
case '=':
|
case '=':
|
||||||
if (*CurPtr == '=') {
|
if (*CurPtr == '=') {
|
||||||
|
|
|
@ -144,7 +144,9 @@ X86MCAsmInfoMicrosoftMASM::X86MCAsmInfoMicrosoftMASM(const Triple &Triple)
|
||||||
DollarIsPC = true;
|
DollarIsPC = true;
|
||||||
SeparatorString = "\n";
|
SeparatorString = "\n";
|
||||||
CommentString = ";";
|
CommentString = ";";
|
||||||
AllowSymbolAtNameStart = true;
|
AllowQuestionAtStartOfIdentifier = true;
|
||||||
|
AllowDollarAtStartOfIdentifier = true;
|
||||||
|
AllowAtAtStartOfIdentifier = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void X86MCAsmInfoGNUCOFF::anchor() { }
|
void X86MCAsmInfoGNUCOFF::anchor() { }
|
||||||
|
|
|
@ -35,6 +35,15 @@ public:
|
||||||
void setAllowAdditionalComments(bool Value) {
|
void setAllowAdditionalComments(bool Value) {
|
||||||
AllowAdditionalComments = Value;
|
AllowAdditionalComments = Value;
|
||||||
}
|
}
|
||||||
|
void setAllowQuestionAtStartOfIdentifier(bool Value) {
|
||||||
|
AllowQuestionAtStartOfIdentifier = Value;
|
||||||
|
}
|
||||||
|
void setAllowAtAtStartOfIdentifier(bool Value) {
|
||||||
|
AllowAtAtStartOfIdentifier = Value;
|
||||||
|
}
|
||||||
|
void setAllowDollarAtStartOfIdentifier(bool Value) {
|
||||||
|
AllowDollarAtStartOfIdentifier = Value;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Setup a testing class that the GTest framework can call.
|
// Setup a testing class that the GTest framework can call.
|
||||||
|
@ -454,4 +463,112 @@ TEST_F(SystemZAsmLexerTest, CheckDefaultFloats) {
|
||||||
ExpectedTokens.push_back(AsmToken::Eof);
|
ExpectedTokens.push_back(AsmToken::Eof);
|
||||||
lexAndCheckTokens(AsmStr, ExpectedTokens);
|
lexAndCheckTokens(AsmStr, ExpectedTokens);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(SystemZAsmLexerTest, CheckDefaultQuestionAtStartOfIdentifier) {
|
||||||
|
StringRef AsmStr = "?lh1?23";
|
||||||
|
|
||||||
|
// Setup.
|
||||||
|
setupCallToAsmParser(AsmStr);
|
||||||
|
|
||||||
|
// Lex initially to get the string.
|
||||||
|
Parser->getLexer().Lex();
|
||||||
|
|
||||||
|
SmallVector<AsmToken::TokenKind> ExpectedTokens(
|
||||||
|
{AsmToken::Error, AsmToken::Identifier, AsmToken::EndOfStatement,
|
||||||
|
AsmToken::Eof});
|
||||||
|
lexAndCheckTokens(AsmStr, ExpectedTokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(SystemZAsmLexerTest, CheckAcceptQuestionAtStartOfIdentifier) {
|
||||||
|
StringRef AsmStr = "?????lh1?23";
|
||||||
|
|
||||||
|
// Setup.
|
||||||
|
MUPMAI->setAllowQuestionAtStartOfIdentifier(true);
|
||||||
|
setupCallToAsmParser(AsmStr);
|
||||||
|
|
||||||
|
// Lex initially to get the string.
|
||||||
|
Parser->getLexer().Lex();
|
||||||
|
|
||||||
|
SmallVector<AsmToken::TokenKind> ExpectedTokens(
|
||||||
|
{AsmToken::Identifier, AsmToken::EndOfStatement, AsmToken::Eof});
|
||||||
|
lexAndCheckTokens(AsmStr, ExpectedTokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(SystemZAsmLexerTest, CheckDefaultAtAtStartOfIdentifier) {
|
||||||
|
StringRef AsmStr = "@@lh1?23";
|
||||||
|
|
||||||
|
// Setup.
|
||||||
|
MUPMAI->setAllowQuestionAtStartOfIdentifier(true);
|
||||||
|
setupCallToAsmParser(AsmStr);
|
||||||
|
|
||||||
|
// Lex initially to get the string.
|
||||||
|
Parser->getLexer().Lex();
|
||||||
|
|
||||||
|
SmallVector<AsmToken::TokenKind> ExpectedTokens(
|
||||||
|
{AsmToken::At, AsmToken::At, AsmToken::Identifier,
|
||||||
|
AsmToken::EndOfStatement, AsmToken::Eof});
|
||||||
|
lexAndCheckTokens(AsmStr, ExpectedTokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(SystemZAsmLexerTest, CheckAcceptAtAtStartOfIdentifier) {
|
||||||
|
StringRef AsmStr = "@@lh1?23";
|
||||||
|
|
||||||
|
// Setup.
|
||||||
|
MUPMAI->setAllowAtAtStartOfIdentifier(true);
|
||||||
|
setupCallToAsmParser(AsmStr);
|
||||||
|
|
||||||
|
// Lex initially to get the string.
|
||||||
|
Parser->getLexer().Lex();
|
||||||
|
|
||||||
|
SmallVector<AsmToken::TokenKind> ExpectedTokens(
|
||||||
|
{AsmToken::Identifier, AsmToken::EndOfStatement, AsmToken::Eof});
|
||||||
|
lexAndCheckTokens(AsmStr, ExpectedTokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(SystemZAsmLexerTest, CheckAccpetAtAtStartOfIdentifier2) {
|
||||||
|
StringRef AsmStr = "@@lj1?23";
|
||||||
|
|
||||||
|
// Setup.
|
||||||
|
MUPMAI->setCommentString("@");
|
||||||
|
MUPMAI->setAllowAtAtStartOfIdentifier(true);
|
||||||
|
setupCallToAsmParser(AsmStr);
|
||||||
|
|
||||||
|
// Lex initially to get the string.
|
||||||
|
Parser->getLexer().Lex();
|
||||||
|
|
||||||
|
// "@@lj1?23" -> still lexed as a comment as that takes precedence.
|
||||||
|
SmallVector<AsmToken::TokenKind> ExpectedTokens(
|
||||||
|
{AsmToken::EndOfStatement, AsmToken::Eof});
|
||||||
|
lexAndCheckTokens(AsmStr, ExpectedTokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(SystemZAsmLexerTest, CheckDefaultDollarAtStartOfIdentifier) {
|
||||||
|
StringRef AsmStr = "$$ac$c";
|
||||||
|
|
||||||
|
// Setup.
|
||||||
|
setupCallToAsmParser(AsmStr);
|
||||||
|
|
||||||
|
// Lex initially to get the string.
|
||||||
|
Parser->getLexer().Lex();
|
||||||
|
|
||||||
|
SmallVector<AsmToken::TokenKind> ExpectedTokens(
|
||||||
|
{AsmToken::Dollar, AsmToken::Dollar, AsmToken::Identifier,
|
||||||
|
AsmToken::EndOfStatement, AsmToken::Eof});
|
||||||
|
lexAndCheckTokens(AsmStr, ExpectedTokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(SystemZAsmLexerTest, CheckAcceptDollarAtStartOfIdentifier) {
|
||||||
|
StringRef AsmStr = "$$ab$c";
|
||||||
|
|
||||||
|
// Setup.
|
||||||
|
MUPMAI->setAllowDollarAtStartOfIdentifier(true);
|
||||||
|
setupCallToAsmParser(AsmStr);
|
||||||
|
|
||||||
|
// Lex initially to get the string.
|
||||||
|
Parser->getLexer().Lex();
|
||||||
|
|
||||||
|
SmallVector<AsmToken::TokenKind> ExpectedTokens(
|
||||||
|
{AsmToken::Identifier, AsmToken::EndOfStatement, AsmToken::Eof});
|
||||||
|
lexAndCheckTokens(AsmStr, ExpectedTokens);
|
||||||
|
}
|
||||||
} // end anonymous namespace
|
} // end anonymous namespace
|
||||||
|
|
Loading…
Reference in New Issue