321 lines
11 KiB
Swift
321 lines
11 KiB
Swift
//
|
|
// Tokeniser.swift
|
|
// SwiftSoup
|
|
//
|
|
// Created by Nabil Chatbi on 19/10/16.
|
|
// Copyright © 2016 Nabil Chatbi.. All rights reserved.
|
|
//
|
|
|
|
import Foundation
|
|
|
|
final class Tokeniser
|
|
{
|
|
static let replacementChar : UnicodeScalar = "\u{FFFD}" // replaces null character
|
|
private static let notCharRefCharsSorted : [UnicodeScalar] = ["\t", "\n", "\r",UnicodeScalar.BackslashF, " ", "<", "&"].sorted()
|
|
|
|
private let reader : CharacterReader; // html input
|
|
private let errors : ParseErrorList?; // errors found while tokenising
|
|
|
|
private var state: TokeniserState = TokeniserState.Data; // current tokenisation state
|
|
private var emitPending: Token? ; // the token we are about to emit on next read
|
|
private var isEmitPending : Bool = false;
|
|
private var charsString : String? = nil; // characters pending an emit. Will fall to charsBuilder if more than one
|
|
private let charsBuilder : StringBuilder = StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read
|
|
let dataBuffer : StringBuilder = StringBuilder(1024); // buffers data looking for </script>
|
|
|
|
var tagPending : Token.Tag = Token.Tag() // tag we are building up
|
|
let startPending : Token.StartTag = Token.StartTag();
|
|
let endPending: Token.EndTag = Token.EndTag();
|
|
let charPending: Token.Char = Token.Char();
|
|
let doctypePending: Token.Doctype = Token.Doctype(); // doctype building up
|
|
let commentPending: Token.Comment = Token.Comment(); // comment building up
|
|
private var lastStartTag: String? // the last start tag emitted, to test appropriate end tag
|
|
private var selfClosingFlagAcknowledged: Bool = true;
|
|
|
|
|
|
init(_ reader: CharacterReader, _ errors: ParseErrorList?) {
|
|
self.reader = reader;
|
|
self.errors = errors;
|
|
}
|
|
|
|
func read()throws->Token {
|
|
if (!selfClosingFlagAcknowledged) {
|
|
error("Self closing flag not acknowledged");
|
|
selfClosingFlagAcknowledged = true;
|
|
}
|
|
|
|
while (!isEmitPending){
|
|
try state.read(self, reader);
|
|
}
|
|
|
|
// if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read:
|
|
if (charsBuilder.length > 0) {
|
|
let str: String = charsBuilder.toString();
|
|
charsBuilder.clear()
|
|
charsString = nil
|
|
return charPending.data(str)
|
|
} else if (charsString != nil) {
|
|
let token : Token = charPending.data(charsString!);
|
|
charsString = nil;
|
|
return token;
|
|
} else {
|
|
isEmitPending = false;
|
|
return emitPending!;
|
|
}
|
|
}
|
|
|
|
func emit(_ token: Token)throws {
|
|
try Validate.isFalse(val: isEmitPending, msg: "There is an unread token pending!");
|
|
|
|
emitPending = token;
|
|
isEmitPending = true;
|
|
|
|
if (token.type == Token.TokenType.StartTag) {
|
|
let startTag : Token.StartTag = token as! Token.StartTag;
|
|
lastStartTag = startTag._tagName!;
|
|
if (startTag._selfClosing){
|
|
selfClosingFlagAcknowledged = false;
|
|
}
|
|
} else if (token.type == Token.TokenType.EndTag) {
|
|
let endTag : Token.EndTag = token as! Token.EndTag;
|
|
if (endTag._attributes.size() != 0){
|
|
error("Attributes incorrectly present on end tag");
|
|
}
|
|
}
|
|
}
|
|
|
|
func emit(_ str: String ) {
|
|
// buffer strings up until last string token found, to emit only one token for a run of character refs etc.
|
|
// does not set isEmitPending; read checks that
|
|
if (charsString == nil) {
|
|
charsString = str;
|
|
}
|
|
else {
|
|
if (charsBuilder.length == 0) { // switching to string builder as more than one emit before read
|
|
charsBuilder.append(charsString!);
|
|
}
|
|
charsBuilder.append(str);
|
|
}
|
|
}
|
|
|
|
func emit(_ chars: [UnicodeScalar]) {
|
|
emit(String(chars.map{Character($0)}))
|
|
}
|
|
|
|
// func emit(_ codepoints: [Int]) {
|
|
// emit(String(codepoints, 0, codepoints.length));
|
|
// }
|
|
|
|
func emit(_ c: UnicodeScalar) {
|
|
emit(String(c));
|
|
}
|
|
|
|
func getState()->TokeniserState {
|
|
return state;
|
|
}
|
|
|
|
func transition(_ state: TokeniserState) {
|
|
self.state = state;
|
|
}
|
|
|
|
func advanceTransition(_ state: TokeniserState) {
|
|
reader.advance();
|
|
self.state = state;
|
|
}
|
|
|
|
func acknowledgeSelfClosingFlag() {
|
|
selfClosingFlagAcknowledged = true;
|
|
}
|
|
|
|
private var codepointHolder: [UnicodeScalar] = [UnicodeScalar(0)!]; // holder to not have to keep creating arrays
|
|
private var multipointHolder: [UnicodeScalar] = [UnicodeScalar(0)!,UnicodeScalar(0)!];
|
|
|
|
func consumeCharacterReference(_ additionalAllowedCharacter: UnicodeScalar?, _ inAttribute: Bool)throws->[UnicodeScalar]? {
|
|
if (reader.isEmpty()){
|
|
return nil;
|
|
}
|
|
if (additionalAllowedCharacter != nil && additionalAllowedCharacter == reader.current()){
|
|
return nil;
|
|
}
|
|
if (reader.matchesAnySorted(Tokeniser.notCharRefCharsSorted)){
|
|
return nil;
|
|
}
|
|
|
|
var codeRef: [UnicodeScalar] = codepointHolder;
|
|
reader.markPos();
|
|
if (reader.matchConsume("#")) { // numbered
|
|
let isHexMode: Bool = reader.matchConsumeIgnoreCase("X");
|
|
let numRef: String = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
|
|
if (numRef.unicodeScalars.count == 0) { // didn't match anything
|
|
characterReferenceError("numeric reference with no numerals");
|
|
reader.rewindToMark();
|
|
return nil;
|
|
}
|
|
if (!reader.matchConsume(";")){
|
|
characterReferenceError("missing semicolon"); // missing semi
|
|
}
|
|
var charval : Int = -1;
|
|
|
|
let base: Int = isHexMode ? 16 : 10;
|
|
if let num = Int(numRef,radix: base)
|
|
{
|
|
charval = num
|
|
}
|
|
|
|
|
|
if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
|
|
characterReferenceError("character outside of valid range");
|
|
codeRef[0] = Tokeniser.replacementChar;
|
|
return codeRef;
|
|
} else {
|
|
// todo: implement number replacement table
|
|
// todo: check for extra illegal unicode points as parse errors
|
|
codeRef[0] = UnicodeScalar(charval)!;
|
|
return codeRef;
|
|
}
|
|
} else { // named
|
|
// get as many letters as possible, and look for matching entities.
|
|
let nameRef : String = reader.consumeLetterThenDigitSequence();
|
|
let looksLegit: Bool = reader.matches(";");
|
|
// found if a base named entity without a ;, or an extended entity with the ;.
|
|
let found: Bool = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));
|
|
|
|
if (!found) {
|
|
reader.rewindToMark();
|
|
if (looksLegit){ // named with semicolon
|
|
characterReferenceError(String(format:"invalid named referenece '%@'", nameRef));
|
|
}
|
|
return nil;
|
|
}
|
|
if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny("=", "-", "_"))) {
|
|
// don't want that to match
|
|
reader.rewindToMark();
|
|
return nil;
|
|
}
|
|
if (!reader.matchConsume(";")){
|
|
characterReferenceError("missing semicolon"); // missing semi
|
|
}
|
|
let numChars: Int = Entities.codepointsForName(nameRef, codepoints: &multipointHolder);
|
|
if (numChars == 1) {
|
|
codeRef[0] = multipointHolder[0];
|
|
return codeRef;
|
|
} else if (numChars == 2) {
|
|
return multipointHolder;
|
|
} else {
|
|
try Validate.fail(msg: "Unexpected characters returned for " + nameRef);
|
|
return multipointHolder;
|
|
}
|
|
}
|
|
}
|
|
|
|
@discardableResult
|
|
func createTagPending(_ start: Bool)->Token.Tag {
|
|
tagPending = start ? startPending.reset() : endPending.reset();
|
|
return tagPending;
|
|
}
|
|
|
|
func emitTagPending()throws {
|
|
try tagPending.finaliseTag();
|
|
try emit(tagPending);
|
|
}
|
|
|
|
func createCommentPending() {
|
|
commentPending.reset();
|
|
}
|
|
|
|
func emitCommentPending()throws {
|
|
try emit(commentPending);
|
|
}
|
|
|
|
func createDoctypePending() {
|
|
doctypePending.reset();
|
|
}
|
|
|
|
func emitDoctypePending()throws {
|
|
try emit(doctypePending);
|
|
}
|
|
|
|
func createTempBuffer() {
|
|
Token.reset(dataBuffer);
|
|
}
|
|
|
|
func isAppropriateEndTagToken()throws->Bool {
|
|
if(lastStartTag != nil){
|
|
let s = try tagPending.name()
|
|
return s.equalsIgnoreCase(string: lastStartTag!)
|
|
}
|
|
return false
|
|
}
|
|
|
|
func appropriateEndTagName()->String? {
|
|
if (lastStartTag == nil){
|
|
return nil;
|
|
}
|
|
return lastStartTag;
|
|
}
|
|
|
|
func error(_ state: TokeniserState) {
|
|
if (errors != nil && errors!.canAddError()){
|
|
errors?.add(ParseError(reader.getPos(), "Unexpected character '%@' in input state [%@]", String(reader.current()), state.description));
|
|
}
|
|
}
|
|
|
|
func eofError(_ state: TokeniserState) {
|
|
if (errors != nil && errors!.canAddError()){
|
|
errors?.add(ParseError(reader.getPos(), "Unexpectedly reached end of file (EOF) in input state [%@]", state.description));
|
|
}
|
|
}
|
|
|
|
private func characterReferenceError(_ message: String) {
|
|
if (errors != nil && errors!.canAddError()){
|
|
errors?.add(ParseError(reader.getPos(), "Invalid character reference: %@", message));
|
|
}
|
|
}
|
|
|
|
private func error(_ errorMsg: String) {
|
|
if (errors != nil && errors!.canAddError()){
|
|
errors?.add(ParseError(reader.getPos(), errorMsg));
|
|
}
|
|
}
|
|
|
|
func currentNodeInHtmlNS()->Bool {
|
|
// todo: implement namespaces correctly
|
|
return true;
|
|
// Element currentNode = currentNode();
|
|
// return currentNode != null && currentNode.namespace().equals("HTML");
|
|
}
|
|
|
|
/**
|
|
* Utility method to consume reader and unescape entities found within.
|
|
* @param inAttribute
|
|
* @return unescaped string from reader
|
|
*/
|
|
func unescapeEntities(_ inAttribute: Bool)throws->String {
|
|
let builder : StringBuilder = StringBuilder();
|
|
while (!reader.isEmpty()) {
|
|
builder.append(reader.consumeTo("&"));
|
|
if (reader.matches("&")) {
|
|
reader.consume();
|
|
if let c = try consumeCharacterReference(nil, inAttribute)
|
|
{
|
|
if (c.count==0){
|
|
builder.append("&");
|
|
}else {
|
|
builder.appendCodePoint(c[0]);
|
|
if (c.count == 2){
|
|
builder.appendCodePoint(c[1]);
|
|
}
|
|
}
|
|
}else {
|
|
builder.append("&");
|
|
}
|
|
}
|
|
}
|
|
return builder.toString();
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|