SwiftSoup/Sources/parser/TokeniserState.swift

1690 lines
58 KiB
Swift

//
// TokeniserState.swift
// SwiftSoup
//
// Created by Nabil Chatbi on 12/10/16.
// Copyright © 2016 Nabil Chatbi.. All rights reserved.
//
import Foundation
/// Contract adopted by `TokeniserState`: a state knows how to process input
/// from a reader on behalf of a tokeniser.
protocol TokeniserStateProtocol {
    /// Runs this state once against the supplied tokeniser and reader.
    ///
    /// - Parameters:
    ///   - t: The tokeniser the state reports to (emitting tokens, recording
    ///        errors and requesting state transitions).
    ///   - r: The character reader the state takes its input from.
    /// - Throws: Any error raised while the state is being processed.
    func read(_ t: Tokeniser, _ r: CharacterReader) throws
}
/// Constants shared by the tokeniser states: sentinel scalars and the sorted
/// scalar sets handed to the character reader while consuming attributes.
public class TokeniserStateVars {
    // MARK: - Sentinel scalars

    /// The NUL scalar (U+0000).
    public static let nullScalr: UnicodeScalar = "\u{0000}"

    /// End-of-input marker, taken from `CharacterReader.EOF`.
    static let eof: UnicodeScalar = CharacterReader.EOF

    /// Replacement scalar, taken from `Tokeniser.replacementChar`; the states
    /// emit it in place of a NUL scalar.
    static let replacementChar: UnicodeScalar = Tokeniser.replacementChar

    /// `String` form of `Tokeniser.replacementChar`, for APIs that append strings.
    static let replacementStr: String = String(Tokeniser.replacementChar)

    // MARK: - Sorted scalar sets

    /// Sorted scalars passed to the reader while consuming a single-quoted
    /// attribute value: the closing quote, `&` and NUL.
    static let attributeSingleValueCharsSorted: [UnicodeScalar] = [
        "'", "&", nullScalr
    ].sorted()

    /// Sorted scalars passed to the reader while consuming a double-quoted
    /// attribute value: the closing quote, `&` and NUL.
    static let attributeDoubleValueCharsSorted: [UnicodeScalar] = [
        "\"", "&", nullScalr
    ].sorted()

    /// Sorted scalars passed to `consumeToAnySorted` while consuming an
    /// attribute name.
    static let attributeNameCharsSorted: [UnicodeScalar] = [
        "\t", "\n", "\r", UnicodeScalar.BackslashF, " ",
        "/", "=", ">", nullScalr, "\"", "'", "<"
    ].sorted()

    /// Sorted scalars passed to `consumeToAnySorted` while consuming an
    /// unquoted attribute value.
    static let attributeValueUnquoted: [UnicodeScalar] = [
        "\t", "\n", "\r", UnicodeScalar.BackslashF, " ",
        "&", ">", nullScalr, "\"", "'", "<", "=", "`"
    ].sorted()
}
enum TokeniserState: TokeniserStateProtocol
{
case Data
case CharacterReferenceInData
case Rcdata
case CharacterReferenceInRcdata
case Rawtext
case ScriptData
case PLAINTEXT
case TagOpen
case EndTagOpen
case TagName
case RcdataLessthanSign
case RCDATAEndTagOpen
case RCDATAEndTagName
case RawtextLessthanSign
case RawtextEndTagOpen
case RawtextEndTagName
case ScriptDataLessthanSign
case ScriptDataEndTagOpen
case ScriptDataEndTagName
case ScriptDataEscapeStart
case ScriptDataEscapeStartDash
case ScriptDataEscaped
case ScriptDataEscapedDash
case ScriptDataEscapedDashDash
case ScriptDataEscapedLessthanSign
case ScriptDataEscapedEndTagOpen
case ScriptDataEscapedEndTagName
case ScriptDataDoubleEscapeStart
case ScriptDataDoubleEscaped
case ScriptDataDoubleEscapedDash
case ScriptDataDoubleEscapedDashDash
case ScriptDataDoubleEscapedLessthanSign
case ScriptDataDoubleEscapeEnd
case BeforeAttributeName
case AttributeName
case AfterAttributeName
case BeforeAttributeValue
case AttributeValue_doubleQuoted
case AttributeValue_singleQuoted
case AttributeValue_unquoted
case AfterAttributeValue_quoted
case SelfClosingStartTag
case BogusComment
case MarkupDeclarationOpen
case CommentStart
case CommentStartDash
case Comment
case CommentEndDash
case CommentEnd
case CommentEndBang
case Doctype
case BeforeDoctypeName
case DoctypeName
case AfterDoctypeName
case AfterDoctypePublicKeyword
case BeforeDoctypePublicIdentifier
case DoctypePublicIdentifier_doubleQuoted
case DoctypePublicIdentifier_singleQuoted
case AfterDoctypePublicIdentifier
case BetweenDoctypePublicAndSystemIdentifiers
case AfterDoctypeSystemKeyword
case BeforeDoctypeSystemIdentifier
case DoctypeSystemIdentifier_doubleQuoted
case DoctypeSystemIdentifier_singleQuoted
case AfterDoctypeSystemIdentifier
case BogusDoctype
case CdataSection
internal func read(_ t: Tokeniser, _ r: CharacterReader)throws
{
switch self
{
case .Data:
switch (r.current()) {
case "&":
t.advanceTransition(.CharacterReferenceInData);
break;
case "<":
t.advanceTransition(.TagOpen);
break;
case TokeniserStateVars.nullScalr:
t.error(self); // NOT replacement character (oddly?)
t.emit(r.consume());
break;
case TokeniserStateVars.eof:
try t.emit(Token.EOF());
break;
default:
let data: String = r.consumeData();
t.emit(data);
break;
}
break
case .CharacterReferenceInData:
try TokeniserState.readCharRef(t, .Data)
break
case .Rcdata:
switch (r.current()) {
case "&":
t.advanceTransition(.CharacterReferenceInRcdata);
break;
case "<":
t.advanceTransition(.RcdataLessthanSign);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
r.advance();
t.emit(TokeniserStateVars.replacementChar);
break;
case TokeniserStateVars.eof:
try t.emit(Token.EOF());
break;
default:
let data = r.consumeToAny("&", "<", TokeniserStateVars.nullScalr);
t.emit(data);
break;
}
break
case .CharacterReferenceInRcdata:
try TokeniserState.readCharRef(t, .Rcdata);
break
case .Rawtext:
try TokeniserState.readData(t, r, self, .RawtextLessthanSign);
break
case .ScriptData:
try TokeniserState.readData(t, r, self, .ScriptDataLessthanSign);
break
case .PLAINTEXT:
switch (r.current()) {
case TokeniserStateVars.nullScalr:
t.error(self);
r.advance();
t.emit(TokeniserStateVars.replacementChar);
break;
case TokeniserStateVars.eof:
try t.emit(Token.EOF());
break;
default:
let data = r.consumeTo(TokeniserStateVars.nullScalr);
t.emit(data);
break;
}
break
case .TagOpen:
// from < in data
switch (r.current()) {
case "!":
t.advanceTransition(.MarkupDeclarationOpen);
break;
case "/":
t.advanceTransition(.EndTagOpen);
break;
case "?":
t.advanceTransition(.BogusComment);
break;
default:
if (r.matchesLetter()) {
t.createTagPending(true);
t.transition(.TagName);
} else {
t.error(self);
t.emit("<"); // char that got us here
t.transition(.Data);
}
break;
}
break
case .EndTagOpen:
if (r.isEmpty()) {
t.eofError(self);
t.emit("</");
t.transition(.Data);
} else if (r.matchesLetter()) {
t.createTagPending(false);
t.transition(.TagName);
} else if (r.matches(">")) {
t.error(self);
t.advanceTransition(.Data);
} else {
t.error(self);
t.advanceTransition(.BogusComment);
}
break
case .TagName:
// from < or </ in data, will have start or end tag pending
// previous TagOpen state did NOT consume, will have a letter char in current
//String tagName = r.consumeToAnySorted(tagCharsSorted).toLowerCase();
let tagName = r.consumeTagName();
t.tagPending.appendTagName(tagName);
switch (r.consume()) {
case "\t":
t.transition(.BeforeAttributeName)
break
case "\n":
t.transition(.BeforeAttributeName)
break
case "\r":
t.transition(.BeforeAttributeName)
break
case UnicodeScalar.BackslashF:
t.transition(.BeforeAttributeName)
break
case " ":
t.transition(.BeforeAttributeName)
break
case "/":
t.transition(.SelfClosingStartTag);
break;
case ">":
try t.emitTagPending();
t.transition(.Data);
break;
case TokeniserStateVars.nullScalr: // replacement
t.tagPending.appendTagName(TokeniserStateVars.replacementStr);
break;
case TokeniserStateVars.eof: // should emit pending tag?
t.eofError(self);
t.transition(.Data);
// no default, as covered with above consumeToAny
default:
break
}
case .RcdataLessthanSign:
if (r.matches("/")) {
t.createTempBuffer();
t.advanceTransition(.RCDATAEndTagOpen);
} else if (r.matchesLetter() && t.appropriateEndTagName() != nil && !r.containsIgnoreCase("</" + t.appropriateEndTagName()!)) {
// diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than
// consuming to EOF; break out here
t.tagPending = t.createTagPending(false).name(t.appropriateEndTagName()!);
try t.emitTagPending();
r.unconsume(); // undo "<"
t.transition(.Data);
} else {
t.emit("<");
t.transition(.Rcdata);
}
break
case .RCDATAEndTagOpen:
if (r.matchesLetter()) {
t.createTagPending(false);
t.tagPending.appendTagName(r.current());
t.dataBuffer.append(r.current());
t.advanceTransition(.RCDATAEndTagName);
} else {
t.emit("</");
t.transition(.Rcdata);
}
break
case .RCDATAEndTagName:
if (r.matchesLetter()) {
let name = r.consumeLetterSequence();
t.tagPending.appendTagName(name);
t.dataBuffer.append(name);
return;
}
func anythingElse(_ t: Tokeniser, _ r: CharacterReader)
{
t.emit("</" + t.dataBuffer.toString());
r.unconsume();
t.transition(.Rcdata);
}
let c = r.consume();
switch (c) {
case "\t":
if (try t.isAppropriateEndTagToken()){
t.transition(.BeforeAttributeName);
}else{
anythingElse(t, r);
}
break;
case "\n":
if (try t.isAppropriateEndTagToken()){
t.transition(.BeforeAttributeName);
}else{
anythingElse(t, r);
}
break;
case "\r":
if (try t.isAppropriateEndTagToken()){
t.transition(.BeforeAttributeName);
}else{
anythingElse(t, r);
}
break;
case UnicodeScalar.BackslashF:
if (try t.isAppropriateEndTagToken()){
t.transition(.BeforeAttributeName);
}else{
anythingElse(t, r);
}
break;
case " ":
if (try t.isAppropriateEndTagToken()){
t.transition(.BeforeAttributeName);
}else{
anythingElse(t, r);
}
break;
case "/":
if (try t.isAppropriateEndTagToken()){
t.transition(.SelfClosingStartTag);
}else{
anythingElse(t, r);
}
break;
case ">":
if (try t.isAppropriateEndTagToken()) {
try t.emitTagPending();
t.transition(.Data);
}
else{anythingElse(t, r);}
break;
default:
anythingElse(t, r);
break
}
break
case .RawtextLessthanSign:
if (r.matches("/")) {
t.createTempBuffer();
t.advanceTransition(.RawtextEndTagOpen);
} else {
t.emit("<");
t.transition(.Rawtext);
}
break
case .RawtextEndTagOpen:
TokeniserState.readEndTag(t, r, .RawtextEndTagName, .Rawtext);
break
case .RawtextEndTagName:
try TokeniserState.handleDataEndTag(t, r, .Rawtext);
break
case .ScriptDataLessthanSign:
switch (r.consume()) {
case "/":
t.createTempBuffer();
t.transition(.ScriptDataEndTagOpen);
break;
case "!":
t.emit("<!");
t.transition(.ScriptDataEscapeStart);
break;
default:
t.emit("<");
r.unconsume();
t.transition(.ScriptData);
}
break
case .ScriptDataEndTagOpen:
TokeniserState.readEndTag(t, r, .ScriptDataEndTagName, .ScriptData);
break
case .ScriptDataEndTagName:
try TokeniserState.handleDataEndTag(t, r, .ScriptData);
break
case .ScriptDataEscapeStart:
if (r.matches("-")) {
t.emit("-");
t.advanceTransition(.ScriptDataEscapeStartDash);
} else {
t.transition(.ScriptData);
}
break
case .ScriptDataEscapeStartDash:
if (r.matches("-")) {
t.emit("-");
t.advanceTransition(.ScriptDataEscapedDashDash);
} else {
t.transition(.ScriptData);
}
break
case .ScriptDataEscaped:
if (r.isEmpty()) {
t.eofError(self);
t.transition(.Data);
return;
}
switch (r.current()) {
case "-":
t.emit("-");
t.advanceTransition(.ScriptDataEscapedDash);
break;
case "<":
t.advanceTransition(.ScriptDataEscapedLessthanSign);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
r.advance();
t.emit(TokeniserStateVars.replacementChar);
break;
default:
let data = r.consumeToAny("-", "<", TokeniserStateVars.nullScalr);
t.emit(data);
}
break
case .ScriptDataEscapedDash:
if (r.isEmpty()) {
t.eofError(self);
t.transition(.Data);
return;
}
let c = r.consume();
switch (c) {
case "-":
t.emit(c);
t.transition(.ScriptDataEscapedDashDash);
break;
case "<":
t.transition(.ScriptDataEscapedLessthanSign);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.emit(TokeniserStateVars.replacementChar);
t.transition(.ScriptDataEscaped);
break;
default:
t.emit(c);
t.transition(.ScriptDataEscaped);
}
break
case .ScriptDataEscapedDashDash:
if (r.isEmpty()) {
t.eofError(self);
t.transition(.Data);
return;
}
let c = r.consume();
switch (c) {
case "-":
t.emit(c);
break;
case "<":
t.transition(.ScriptDataEscapedLessthanSign);
break;
case ">":
t.emit(c);
t.transition(.ScriptData);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.emit(TokeniserStateVars.replacementChar);
t.transition(.ScriptDataEscaped);
break;
default:
t.emit(c);
t.transition(.ScriptDataEscaped);
}
break
case .ScriptDataEscapedLessthanSign:
if (r.matchesLetter()) {
t.createTempBuffer();
t.dataBuffer.append(r.current());
t.emit("<" + String(r.current()));
t.advanceTransition(.ScriptDataDoubleEscapeStart);
} else if (r.matches("/")) {
t.createTempBuffer();
t.advanceTransition(.ScriptDataEscapedEndTagOpen);
} else {
t.emit("<");
t.transition(.ScriptDataEscaped);
}
break
case .ScriptDataEscapedEndTagOpen:
if (r.matchesLetter()) {
t.createTagPending(false);
t.tagPending.appendTagName(r.current());
t.dataBuffer.append(r.current());
t.advanceTransition(.ScriptDataEscapedEndTagName);
} else {
t.emit("</");
t.transition(.ScriptDataEscaped);
}
break
case .ScriptDataEscapedEndTagName:
try TokeniserState.handleDataEndTag(t, r, .ScriptDataEscaped);
break
case .ScriptDataDoubleEscapeStart:
TokeniserState.handleDataDoubleEscapeTag(t, r, .ScriptDataDoubleEscaped, .ScriptDataEscaped);
break
case .ScriptDataDoubleEscaped:
let c = r.current();
switch (c) {
case "-":
t.emit(c);
t.advanceTransition(.ScriptDataDoubleEscapedDash);
break;
case "<":
t.emit(c);
t.advanceTransition(.ScriptDataDoubleEscapedLessthanSign);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
r.advance();
t.emit(TokeniserStateVars.replacementChar);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.transition(.Data);
break;
default:
let data = r.consumeToAny("-", "<", TokeniserStateVars.nullScalr);
t.emit(data);
}
break
case .ScriptDataDoubleEscapedDash:
let c = r.consume();
switch (c) {
case "-":
t.emit(c);
t.transition(.ScriptDataDoubleEscapedDashDash);
break;
case "<":
t.emit(c);
t.transition(.ScriptDataDoubleEscapedLessthanSign);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.emit(TokeniserStateVars.replacementChar);
t.transition(.ScriptDataDoubleEscaped);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.transition(.Data);
break;
default:
t.emit(c);
t.transition(.ScriptDataDoubleEscaped);
}
break
case .ScriptDataDoubleEscapedDashDash:
let c = r.consume();
switch (c) {
case "-":
t.emit(c);
break;
case "<":
t.emit(c);
t.transition(.ScriptDataDoubleEscapedLessthanSign);
break;
case ">":
t.emit(c);
t.transition(.ScriptData);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.emit(TokeniserStateVars.replacementChar);
t.transition(.ScriptDataDoubleEscaped);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.transition(.Data);
break;
default:
t.emit(c);
t.transition(.ScriptDataDoubleEscaped);
}
break
case .ScriptDataDoubleEscapedLessthanSign:
if (r.matches("/")) {
t.emit("/");
t.createTempBuffer();
t.advanceTransition(.ScriptDataDoubleEscapeEnd);
} else {
t.transition(.ScriptDataDoubleEscaped);
}
break
case .ScriptDataDoubleEscapeEnd:
TokeniserState.handleDataDoubleEscapeTag(t,r, .ScriptDataEscaped, .ScriptDataDoubleEscaped);
break
case .BeforeAttributeName:
// from tagname <xxx
let c = r.consume();
switch (c) {
case "\t":
t.transition(.SelfClosingStartTag)
break
case "\n":
t.transition(.SelfClosingStartTag)
break
case "\r":
t.transition(.SelfClosingStartTag)
break
case UnicodeScalar.BackslashF:
t.transition(.SelfClosingStartTag)
break
case " ":
break; // ignore whitespace
case "/":
t.transition(.SelfClosingStartTag)
break
case ">":
try t.emitTagPending();
t.transition(.Data);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
try t.tagPending.newAttribute();
r.unconsume();
t.transition(.AttributeName);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.transition(.Data);
break;
case "\"":
t.error(self);
try t.tagPending.newAttribute();
t.tagPending.appendAttributeName(c);
t.transition(.AttributeName);
break;
case "'":
t.error(self);
try t.tagPending.newAttribute();
t.tagPending.appendAttributeName(c);
t.transition(.AttributeName);
break;
case "<":
t.error(self);
try t.tagPending.newAttribute();
t.tagPending.appendAttributeName(c);
t.transition(.AttributeName);
break;
case "=":
t.error(self);
try t.tagPending.newAttribute();
t.tagPending.appendAttributeName(c);
t.transition(.AttributeName);
break;
default: // A-Z, anything else
try t.tagPending.newAttribute();
r.unconsume();
t.transition(.AttributeName);
}
break
case .AttributeName:
let name = r.consumeToAnySorted(TokeniserStateVars.attributeNameCharsSorted);
t.tagPending.appendAttributeName(name);
let c = r.consume();
switch (c) {
case "\t":
t.transition(.AfterAttributeName);
break;
case "\n":
t.transition(.AfterAttributeName);
break;
case "\r":
t.transition(.AfterAttributeName);
break;
case UnicodeScalar.BackslashF:
t.transition(.AfterAttributeName);
break;
case " ":
t.transition(.AfterAttributeName);
break;
case "/":
t.transition(.SelfClosingStartTag);
break;
case "=":
t.transition(.BeforeAttributeValue);
break;
case ">":
try t.emitTagPending();
t.transition(.Data);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.tagPending.appendAttributeName(TokeniserStateVars.replacementChar);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.transition(.Data);
break;
case "\"":
t.error(self);
t.tagPending.appendAttributeName(c);
case "'":
t.error(self);
t.tagPending.appendAttributeName(c);
case "<":
t.error(self);
t.tagPending.appendAttributeName(c);
// no default, as covered in consumeToAny
default:
break
}
break
case .AfterAttributeName:
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
// ignore
break;
case "/":
t.transition(.SelfClosingStartTag);
break;
case "=":
t.transition(.BeforeAttributeValue);
break;
case ">":
try t.emitTagPending();
t.transition(.Data);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.tagPending.appendAttributeName(TokeniserStateVars.replacementChar);
t.transition(.AttributeName);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.transition(.Data);
break;
case "\"","'","<":
t.error(self);
try t.tagPending.newAttribute();
t.tagPending.appendAttributeName(c);
t.transition(.AttributeName);
break;
default: // A-Z, anything else
try t.tagPending.newAttribute();
r.unconsume();
t.transition(.AttributeName);
}
break
case .BeforeAttributeValue:
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
// ignore
break;
case "\"":
t.transition(.AttributeValue_doubleQuoted);
break;
case "&":
r.unconsume();
t.transition(.AttributeValue_unquoted);
break;
case "'":
t.transition(.AttributeValue_singleQuoted);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.tagPending.appendAttributeValue(TokeniserStateVars.replacementChar);
t.transition(.AttributeValue_unquoted);
break;
case TokeniserStateVars.eof:
t.eofError(self);
try t.emitTagPending();
t.transition(.Data);
break;
case ">":
t.error(self);
try t.emitTagPending();
t.transition(.Data);
break;
case "<","=","`":
t.error(self);
t.tagPending.appendAttributeValue(c);
t.transition(.AttributeValue_unquoted);
break;
default:
r.unconsume();
t.transition(.AttributeValue_unquoted);
}
break
case .AttributeValue_doubleQuoted:
let value = r.consumeToAny(TokeniserStateVars.attributeDoubleValueCharsSorted);
if (value.characters.count > 0){
t.tagPending.appendAttributeValue(value);
}else{
t.tagPending.setEmptyAttributeValue();
}
let c = r.consume();
switch (c) {
case "\"":
t.transition(.AfterAttributeValue_quoted);
break;
case "&":
if let ref = try t.consumeCharacterReference("\"", true){
t.tagPending.appendAttributeValue(ref);
}else{
t.tagPending.appendAttributeValue("&");
}
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.tagPending.appendAttributeValue(TokeniserStateVars.replacementChar);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.transition(.Data);
break;
// no default, handled in consume to any above
default:
break
}
break
case .AttributeValue_singleQuoted:
let value = r.consumeToAny(TokeniserStateVars.attributeSingleValueCharsSorted);
if (value.characters.count > 0){
t.tagPending.appendAttributeValue(value);
}else{
t.tagPending.setEmptyAttributeValue();
}
let c = r.consume();
switch (c) {
case "'":
t.transition(.AfterAttributeValue_quoted);
break;
case "&":
if let ref = try t.consumeCharacterReference("'", true){
t.tagPending.appendAttributeValue(ref);
}else{
t.tagPending.appendAttributeValue("&");
}
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.tagPending.appendAttributeValue(TokeniserStateVars.replacementChar);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.transition(.Data);
break;
// no default, handled in consume to any above
default:
break
}
break
case .AttributeValue_unquoted:
let value = r.consumeToAnySorted(TokeniserStateVars.attributeValueUnquoted);
if (value.characters.count > 0){
t.tagPending.appendAttributeValue(value);
}
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
t.transition(.BeforeAttributeName);
break;
case "&":
if let ref = try t.consumeCharacterReference(">", true){
t.tagPending.appendAttributeValue(ref);
}else{
t.tagPending.appendAttributeValue("&");
}
break;
case ">":
try t.emitTagPending();
t.transition(.Data);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.tagPending.appendAttributeValue(TokeniserStateVars.replacementChar);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.transition(.Data);
break;
case "\"","'","<","=","`":
t.error(self);
t.tagPending.appendAttributeValue(c);
break;
// no default, handled in consume to any above
default:
break
}
break
case .AfterAttributeValue_quoted:
// CharacterReferenceInAttributeValue state handled inline
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
t.transition(.BeforeAttributeName);
break;
case "/":
t.transition(.SelfClosingStartTag);
break;
case ">":
try t.emitTagPending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.transition(.Data);
break;
default:
t.error(self);
r.unconsume();
t.transition(.BeforeAttributeName);
}
break
case .SelfClosingStartTag:
let c = r.consume();
switch (c) {
case ">":
t.tagPending._selfClosing = true;
try t.emitTagPending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.transition(.Data);
break;
default:
t.error(self);
r.unconsume();
t.transition(.BeforeAttributeName);
}
break
case .BogusComment:
// todo: handle bogus comment starting from eof. when does that trigger?
// rewind to capture character that lead us here
r.unconsume();
let comment : Token.Comment = Token.Comment();
comment.bogus = true;
comment.data.append(r.consumeTo(">"));
// todo: replace nullChar with replaceChar
try t.emit(comment);
t.advanceTransition(.Data);
break
case .MarkupDeclarationOpen:
if (r.matchConsume("--")) {
t.createCommentPending();
t.transition(.CommentStart);
} else if (r.matchConsumeIgnoreCase("DOCTYPE")) {
t.transition(.Doctype);
} else if (r.matchConsume("[CDATA[")) {
// todo: should actually check current namespace, and only non-html allows cdata. until namespace
// is implemented properly, keep handling as cdata
//} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) {
t.transition(.CdataSection);
} else {
t.error(self);
t.advanceTransition(.BogusComment); // advance so self character gets in bogus comment data's rewind
}
break
case .CommentStart:
let c = r.consume();
switch (c) {
case "-":
t.transition(.CommentStartDash);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.commentPending.data.append(TokeniserStateVars.replacementChar);
t.transition(.Comment);
break;
case ">":
t.error(self);
try t.emitCommentPending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
t.eofError(self);
try t.emitCommentPending();
t.transition(.Data);
break;
default:
t.commentPending.data.append(c);
t.transition(.Comment);
}
break
case .CommentStartDash:
let c = r.consume();
switch (c) {
case "-":
t.transition(.CommentStartDash);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.commentPending.data.append(TokeniserStateVars.replacementChar);
t.transition(.Comment);
break;
case ">":
t.error(self);
try t.emitCommentPending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
t.eofError(self);
try t.emitCommentPending();
t.transition(.Data);
break;
default:
t.commentPending.data.append(c);
t.transition(.Comment);
}
break
case .Comment:
let c = r.current();
switch (c) {
case "-":
t.advanceTransition(.CommentEndDash);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
r.advance();
t.commentPending.data.append(TokeniserStateVars.replacementChar);
break;
case TokeniserStateVars.eof:
t.eofError(self);
try t.emitCommentPending();
t.transition(.Data);
break;
default:
t.commentPending.data.append(r.consumeToAny("-", TokeniserStateVars.nullScalr));
}
break
case .CommentEndDash:
let c = r.consume();
switch (c) {
case "-":
t.transition(.CommentEnd);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.commentPending.data.append("-").append(TokeniserStateVars.replacementChar);
t.transition(.Comment);
break;
case TokeniserStateVars.eof:
t.eofError(self);
try t.emitCommentPending();
t.transition(.Data);
break;
default:
t.commentPending.data.append("-").append(c);
t.transition(.Comment);
}
break
case .CommentEnd:
let c = r.consume();
switch (c) {
case ">":
try t.emitCommentPending();
t.transition(.Data);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.commentPending.data.append("--").append(TokeniserStateVars.replacementChar);
t.transition(.Comment);
break;
case "!":
t.error(self);
t.transition(.CommentEndBang);
break;
case "-":
t.error(self);
t.commentPending.data.append("-");
break;
case TokeniserStateVars.eof:
t.eofError(self);
try t.emitCommentPending();
t.transition(.Data);
break;
default:
t.error(self);
t.commentPending.data.append("--").append(c);
t.transition(.Comment);
}
break
case .CommentEndBang:
let c = r.consume();
switch (c) {
case "-":
t.commentPending.data.append("--!");
t.transition(.CommentEndDash);
break;
case ">":
try t.emitCommentPending();
t.transition(.Data);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.commentPending.data.append("--!").append(TokeniserStateVars.replacementChar);
t.transition(.Comment);
break;
case TokeniserStateVars.eof:
t.eofError(self);
try t.emitCommentPending();
t.transition(.Data);
break;
default:
t.commentPending.data.append("--!").append(c);
t.transition(.Comment);
}
break
case .Doctype:
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
t.transition(.BeforeDoctypeName);
break;
case TokeniserStateVars.eof:
t.eofError(self);
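// NOTE(review): the intent is to continue into the ">" case below, but Swift does not fall
// through implicitly and there is no `fallthrough` here, so only eofError runs for EOF.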
case ">": // catch invalid <!DOCTYPE>
t.error(self);
t.createDoctypePending();
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.error(self);
t.transition(.BeforeDoctypeName);
}
break
case .BeforeDoctypeName:
if (r.matchesLetter()) {
t.createDoctypePending();
t.transition(.DoctypeName);
return;
}
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
break; // ignore whitespace
case TokeniserStateVars.nullScalr:
t.error(self);
t.createDoctypePending();
t.doctypePending.name.append(TokeniserStateVars.replacementChar);
t.transition(.DoctypeName);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.createDoctypePending();
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.createDoctypePending();
t.doctypePending.name.append(c);
t.transition(.DoctypeName);
}
break
case .DoctypeName:
if (r.matchesLetter()) {
let name = r.consumeLetterSequence();
t.doctypePending.name.append(name);
return;
}
let c = r.consume();
switch (c) {
case ">":
try t.emitDoctypePending();
t.transition(.Data);
break;
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
t.transition(.AfterDoctypeName);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.doctypePending.name.append(TokeniserStateVars.replacementChar);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.doctypePending.name.append(c);
}
break
case .AfterDoctypeName:
if (r.isEmpty()) {
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
return;
}
if (r.matchesAny("\t", "\n", "\r", UnicodeScalar.BackslashF, " ")){
r.advance(); // ignore whitespace
}else if (r.matches(">")) {
try t.emitDoctypePending();
t.advanceTransition(.Data);
} else if (r.matchConsumeIgnoreCase("PUBLIC")) {
t.transition(.AfterDoctypePublicKeyword);
} else if (r.matchConsumeIgnoreCase("SYSTEM")) {
t.transition(.AfterDoctypeSystemKeyword);
} else {
t.error(self);
t.doctypePending.forceQuirks = true;
t.advanceTransition(.BogusDoctype);
}
break
case .AfterDoctypePublicKeyword:
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
t.transition(.BeforeDoctypePublicIdentifier);
break;
case "\"":
t.error(self);
// set public id to empty string
t.transition(.DoctypePublicIdentifier_doubleQuoted);
break;
case "'":
t.error(self);
// set public id to empty string
t.transition(.DoctypePublicIdentifier_singleQuoted);
break;
case ">":
t.error(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.error(self);
t.doctypePending.forceQuirks = true;
t.transition(.BogusDoctype);
}
break
case .BeforeDoctypePublicIdentifier:
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
break;
case "\"":
// set public id to empty string
t.transition(.DoctypePublicIdentifier_doubleQuoted);
break;
case "'":
// set public id to empty string
t.transition(.DoctypePublicIdentifier_singleQuoted);
break;
case ">":
t.error(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.error(self);
t.doctypePending.forceQuirks = true;
t.transition(.BogusDoctype);
}
break
case .DoctypePublicIdentifier_doubleQuoted:
let c = r.consume();
switch (c) {
case "\"":
t.transition(.AfterDoctypePublicIdentifier);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.doctypePending.publicIdentifier.append(TokeniserStateVars.replacementChar);
break;
case ">":
t.error(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.doctypePending.publicIdentifier.append(c);
}
break
case .DoctypePublicIdentifier_singleQuoted:
let c = r.consume();
switch (c) {
case "'":
t.transition(.AfterDoctypePublicIdentifier);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.doctypePending.publicIdentifier.append(TokeniserStateVars.replacementChar);
break;
case ">":
t.error(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.doctypePending.publicIdentifier.append(c);
}
break
case .AfterDoctypePublicIdentifier:
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
t.transition(.BetweenDoctypePublicAndSystemIdentifiers);
break;
case ">":
try t.emitDoctypePending();
t.transition(.Data);
break;
case "\"":
t.error(self);
// system id empty
t.transition(.DoctypeSystemIdentifier_doubleQuoted);
break;
case "'":
t.error(self);
// system id empty
t.transition(.DoctypeSystemIdentifier_singleQuoted);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.error(self);
t.doctypePending.forceQuirks = true;
t.transition(.BogusDoctype);
}
break
case .BetweenDoctypePublicAndSystemIdentifiers:
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
break;
case ">":
try t.emitDoctypePending();
t.transition(.Data);
break;
case "\"":
t.error(self);
// system id empty
t.transition(.DoctypeSystemIdentifier_doubleQuoted);
break;
case "'":
t.error(self);
// system id empty
t.transition(.DoctypeSystemIdentifier_singleQuoted);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.error(self);
t.doctypePending.forceQuirks = true;
t.transition(.BogusDoctype);
}
break
case .AfterDoctypeSystemKeyword:
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
t.transition(.BeforeDoctypeSystemIdentifier);
break;
case ">":
t.error(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
case "\"":
t.error(self);
// system id empty
t.transition(.DoctypeSystemIdentifier_doubleQuoted);
break;
case "'":
t.error(self);
// system id empty
t.transition(.DoctypeSystemIdentifier_singleQuoted);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.error(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
}
break
case .BeforeDoctypeSystemIdentifier:
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
break;
case "\"":
// set system id to empty string
t.transition(.DoctypeSystemIdentifier_doubleQuoted);
break;
case "'":
// set system id to empty string
t.transition(.DoctypeSystemIdentifier_singleQuoted);
break;
case ">":
t.error(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.error(self);
t.doctypePending.forceQuirks = true;
t.transition(.BogusDoctype);
}
break
case .DoctypeSystemIdentifier_doubleQuoted:
let c = r.consume();
switch (c) {
case "\"":
t.transition(.AfterDoctypeSystemIdentifier);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.doctypePending.systemIdentifier.append(TokeniserStateVars.replacementChar);
break;
case ">":
t.error(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.doctypePending.systemIdentifier.append(c);
}
break
case .DoctypeSystemIdentifier_singleQuoted:
let c = r.consume();
switch (c) {
case "'":
t.transition(.AfterDoctypeSystemIdentifier);
break;
case TokeniserStateVars.nullScalr:
t.error(self);
t.doctypePending.systemIdentifier.append(TokeniserStateVars.replacementChar);
break;
case ">":
t.error(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.doctypePending.systemIdentifier.append(c);
}
break
case .AfterDoctypeSystemIdentifier:
let c = r.consume();
switch (c) {
case "\t","\n","\r",UnicodeScalar.BackslashF," ":
break;
case ">":
try t.emitDoctypePending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
t.eofError(self);
t.doctypePending.forceQuirks = true;
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
t.error(self);
t.transition(.BogusDoctype);
// NOT force quirks
}
break
case .BogusDoctype:
let c = r.consume();
switch (c) {
case ">":
try t.emitDoctypePending();
t.transition(.Data);
break;
case TokeniserStateVars.eof:
try t.emitDoctypePending();
t.transition(.Data);
break;
default:
// ignore char
break;
}
break
case .CdataSection:
let data = r.consumeTo("]]>");
t.emit(data);
r.matchConsume("]]>");
t.transition(.Data);
break
}
}
/// Textual description used for this state, built by describing the dynamic type of `self`.
/// NOTE(review): this describes the type rather than the individual case; confirm that is intended.
var description: String {
    let dynamicType = type(of: self)
    return String(describing: dynamicType)
}
/**
 * Shared body for RawtextEndTagName, ScriptDataEndTagName, and ScriptDataEscapedEndTagName.
 * The three states differ only in the state they fall back to.
 *
 * - Parameters:
 *   - t: tokeniser whose pending tag and data buffer are updated.
 *   - r: reader the characters are taken from.
 *   - elseTransition: state entered when the input does not finish an end tag here.
 * - Throws: whatever `isAppropriateEndTagToken()` or `emitTagPending()` throws.
 */
private static func handleDataEndTag(_ t: Tokeniser, _ r: CharacterReader, _ elseTransition: TokeniserState)throws {
    // A run of letters extends the tag name; the data buffer keeps a copy as well.
    guard !r.matchesLetter() else {
        let letters = r.consumeLetterSequence()
        t.tagPending.appendTagName(letters)
        t.dataBuffer.append(letters)
        return
    }

    // Becomes true only when the next character is one of the recognised terminators.
    var recognised = false
    if try t.isAppropriateEndTagToken() && !r.isEmpty() {
        let next = r.consume()
        switch next {
        case "\t", "\n", "\r", UnicodeScalar.BackslashF, " ":
            t.transition(.BeforeAttributeName)
            recognised = true
        case "/":
            t.transition(.SelfClosingStartTag)
            recognised = true
        case ">":
            try t.emitTagPending()
            t.transition(.Data)
            recognised = true
        default:
            // Not a terminator: keep the character so it is emitted with the buffer below.
            t.dataBuffer.append(next)
        }
    }
    guard !recognised else { return }

    // Fallback: emit "</" followed by the buffered text and hand over to the caller's state.
    t.emit("</" + t.dataBuffer.toString())
    t.transition(elseTransition)
}
/**
 * Handles one step of character data, dispatching on the reader's current character.
 *
 * - "<": calls `advanceTransition` with `advance`.
 * - null: reports an error against `current`, advances the reader and emits the replacement character.
 * - EOF: emits an EOF token.
 * - anything else: consumes up to the next "<" or null and emits that text.
 *
 * - Throws: whatever emitting the EOF token throws.
 */
private static func readData(_ t: Tokeniser, _ r: CharacterReader, _ current: TokeniserState, _ advance: TokeniserState)throws {
    let next = r.current()
    if next == "<" {
        t.advanceTransition(advance)
    } else if next == TokeniserStateVars.nullScalr {
        t.error(current)
        r.advance()
        t.emit(TokeniserStateVars.replacementChar)
    } else if next == TokeniserStateVars.eof {
        try t.emit(Token.EOF())
    } else {
        let text = r.consumeToAny("<", TokeniserStateVars.nullScalr)
        t.emit(text)
    }
}
/**
 * Asks the tokeniser to consume a character reference and emits the result, or a
 * literal "&" when nothing is returned. Always transitions to `advance` afterwards.
 *
 * - Throws: whatever `consumeCharacterReference` throws.
 */
private static func readCharRef(_ t: Tokeniser, _ advance: TokeniserState)throws {
    if let reference = try t.consumeCharacterReference(nil, false) {
        t.emit(reference)
    } else {
        t.emit("&")
    }
    t.transition(advance)
}
/**
 * Decides how to continue after "</" has been seen.
 *
 * When the reader is positioned at a letter, a pending tag is created (with `false`
 * as the argument) and the tokeniser moves to `a`. Otherwise "</" is emitted as text
 * and the tokeniser moves to `b`.
 */
private static func readEndTag(_ t: Tokeniser, _ r: CharacterReader, _ a: TokeniserState, _ b: TokeniserState) {
    guard r.matchesLetter() else {
        t.emit("</")
        t.transition(b)
        return
    }
    t.createTagPending(false)
    t.transition(a)
}
/**
 * Shared body for the states that test whether the buffered name is "script".
 *
 * A run of letters is appended to the data buffer and emitted as-is. On whitespace,
 * "/" or ">", the tokeniser moves to `primary` when the buffer equals "script" and to
 * `fallback` otherwise, then emits the character. Any other character is pushed back
 * to the reader and the tokeniser moves to `fallback`.
 */
private static func handleDataDoubleEscapeTag(_ t: Tokeniser, _ r: CharacterReader, _ primary: TokeniserState, _ fallback: TokeniserState) {
    guard !r.matchesLetter() else {
        let letters = r.consumeLetterSequence()
        t.dataBuffer.append(letters)
        t.emit(letters)
        return
    }

    let next = r.consume()
    switch next {
    case "\t", "\n", "\r", UnicodeScalar.BackslashF, " ", "/", ">":
        let sawScript = t.dataBuffer.toString() == "script"
        t.transition(sawScript ? primary : fallback)
        t.emit(next)
    default:
        r.unconsume()
        t.transition(fallback)
    }
}
}