// SwiftSoup/Sources/parser/CharacterReader.swift
//
// 462 lines
// 13 KiB
// Swift

//
// CharacterReader.swift
// SwiftSoup
//
// Created by Nabil Chatbi on 10/10/16.
// Copyright © 2016 Nabil Chatbi.. All rights reserved.
//
import Foundation
/**
CharacterReader consumes tokens off a string. It is intended to replace the old TokenQueue.
*/
public final class CharacterReader {
    /** Sentinel scalar returned by `current()` and `consume()` once the input is exhausted (65535). */
    public static let EOF: UnicodeScalar = "\u{FFFF}"
    /** Runs longer than this many scalars are never cached. */
    private static let maxCacheLen: Int = 12

    private let input: String
    /** Number of unicode scalars in `input`. Every position handled by this class is a scalar offset. */
    private let length: Int
    private var pos: Int = 0
    private var mark: Int = 0
    /**
     * Holds reused strings in this doc, to lessen garbage.
     * Must be mutable: `cacheString` stores new entries here so that later lookups can hit.
     */
    private var stringCache: [String?]

    public init(_ input: String) {
        self.input = input
        self.length = input.unicodeScalars.count
        self.stringCache = Array(repeating: nil, count: 512)
    }

    // MARK: - Position handling

    /** Returns the current read position, as a unicode-scalar offset. */
    public func getPos() -> Int {
        return pos
    }

    /** Returns `true` once the read position has reached (or passed) the end of the input. */
    public func isEmpty() -> Bool {
        return pos >= length
    }

    /** Returns the scalar at the read position without consuming it, or `EOF` at the end of input. */
    public func current() -> UnicodeScalar {
        return (pos >= length) ? CharacterReader.EOF : input.unicodeScalar(pos)
    }

    /**
     * Returns the scalar at the read position (or `EOF` at the end of input) and advances.
     * The position is advanced even when `EOF` is returned, so `pos` may exceed `length`.
     */
    @discardableResult
    public func consume() -> UnicodeScalar {
        let val = current()
        pos += 1
        return val
    }

    /** Steps the read position back by one scalar. No bounds check is performed. */
    public func unconsume() {
        pos -= 1
    }

    /** Moves the read position forward by one scalar. No bounds check is performed. */
    public func advance() {
        pos += 1
    }

    /** Remembers the current read position so that `rewindToMark()` can return to it. */
    public func markPos() {
        mark = pos
    }

    /** Restores the read position saved by the last `markPos()` call. */
    public func rewindToMark() {
        pos = mark
    }

    /**
     * Consumes one scalar and returns it as a `String`.
     *
     * The scalar is fetched with the same scalar-offset helper as every other read in this class,
     * so the result stays consistent with `pos` when the input contains multi-scalar characters.
     * NOTE(review): unlike `consume()`, there is no end-of-input guard here; what happens past
     * the end depends on the project's `unicodeScalar(_:)` helper.
     */
    public func consumeAsString() -> String {
        let index = pos
        pos += 1
        return String(Character(input.unicodeScalar(index)))
    }

    // MARK: - Searching

    /**
     * Returns the number of characters between the current position and the next instance of the input char.
     * - Parameter c: scan target
     * - Returns: offset between current position and next instance of target. -1 if not found.
     */
    public func nextIndexOf(_ c: UnicodeScalar) -> Int {
        // doesn't handle scanning for surrogates
        // A `while` loop (rather than `pos..<length`) stays safe when `pos` has moved past `length`.
        var i = pos
        while i < length {
            if c == input.unicodeScalar(i) {
                return i - pos
            }
            i += 1
        }
        return -1
    }

    /**
     * Returns the number of characters between the current position and the next instance of the input sequence.
     * - Parameter seq: scan target
     * - Returns: offset between current position and next instance of target. -1 if not found,
     *   or if `seq` is empty.
     */
    public func nextIndexOf(_ seq: String) -> Int {
        // doesn't handle scanning for surrogates
        if seq.isEmpty { return -1 }
        let startChar: UnicodeScalar = seq.unicodeScalar(0)
        let seqLength = seq.unicodeScalars.count

        // `offset` is an ordinary variable so that the inner skip-ahead really carries over to the
        // next round instead of being discarded by a `for ... in` loop.
        var offset = pos
        while offset < length {
            // scan to first instance of startChar:
            if startChar != input.unicodeScalar(offset) {
                offset += 1
                while offset < length && startChar != input.unicodeScalar(offset) {
                    offset += 1
                }
            }
            var i = offset + 1
            let last = i + seqLength - 1
            if offset < length && last <= length {
                var j = 1
                while i < last && seq.unicodeScalar(j) == input.unicodeScalar(i) {
                    j += 1
                    i += 1
                }
                // found full sequence
                if i == last {
                    return offset - pos
                }
            }
            offset += 1
        }
        return -1
    }

    // MARK: - Consuming

    /**
     * Consumes and returns everything up to (but not including) the next `c`.
     * Consumes to the end of input when `c` does not occur.
     */
    public func consumeTo(_ c: UnicodeScalar) -> String {
        let offset = nextIndexOf(c)
        if offset == -1 {
            return consumeToEnd()
        }
        let consumed = cacheString(pos, offset)
        pos += offset
        return consumed
    }

    /**
     * Consumes and returns everything up to (but not including) the next occurrence of `seq`.
     * Consumes to the end of input when `seq` does not occur.
     */
    public func consumeTo(_ seq: String) -> String {
        let offset = nextIndexOf(seq)
        if offset == -1 {
            return consumeToEnd()
        }
        let consumed = cacheString(pos, offset)
        pos += offset
        return consumed
    }

    /** Variadic convenience for `consumeToAny(_:)`. */
    public func consumeToAny(_ chars: UnicodeScalar...) -> String {
        return consumeToAny(chars)
    }

    /** Consumes and returns everything up to the first scalar contained in `chars`, or to the end of input. */
    public func consumeToAny(_ chars: [UnicodeScalar]) -> String {
        let start = pos
        skip(while: { (c: UnicodeScalar) -> Bool in
            return !chars.contains(c)
        })
        return pos > start ? cacheString(start, pos - start) : ""
    }

    /** Variadic convenience for `consumeToAnySorted(_:)`. */
    public func consumeToAnySorted(_ chars: UnicodeScalar...) -> String {
        return consumeToAnySorted(chars)
    }

    /**
     * Same contract as `consumeToAny(_:)`, but membership is tested with a binary search.
     * NOTE(review): presumably `chars` must already be sorted for the search to be valid; confirm
     * against the `binarySearch` helper.
     */
    public func consumeToAnySorted(_ chars: [UnicodeScalar]) -> String {
        let start = pos
        skip(while: { (c: UnicodeScalar) -> Bool in
            return chars.binarySearch(chars, c) < 0
        })
        return pos > start ? cacheString(start, pos - start) : ""
    }

    /** Consumes and returns everything up to the next `&`, `<` or null scalar. */
    public func consumeData() -> String {
        // &, <, null
        let start = pos
        skip(while: { (c: UnicodeScalar) -> Bool in
            return !(c == "&" || c == "<" || c == TokeniserStateVars.nullScalr)
        })
        return pos > start ? cacheString(start, pos - start) : ""
    }

    /** Consumes and returns everything up to the next tag-name terminator. */
    public func consumeTagName() -> String {
        // '\t', '\n', '\r', '\f', ' ', '/', '>', nullChar
        let start = pos
        skip(while: { (c: UnicodeScalar) -> Bool in
            return !(c == "\t" || c == "\n" || c == "\r" || c == UnicodeScalar.BackslashF || c == " " || c == "/" || c == ">" || c == TokeniserStateVars.nullScalr)
        })
        return pos > start ? cacheString(start, pos - start) : ""
    }

    /** Consumes and returns the rest of the input. */
    public func consumeToEnd() -> String {
        let data = cacheString(pos, length - pos)
        pos = length
        return data
    }

    /** Consumes and returns the run of letters at the read position (possibly empty). */
    public func consumeLetterSequence() -> String {
        let start = pos
        skip(while: CharacterReader.isLetter)
        return cacheString(start, pos - start)
    }

    /** Consumes a run of letters followed by a run of ASCII digits and returns both together. */
    public func consumeLetterThenDigitSequence() -> String {
        let start = pos
        skip(while: CharacterReader.isLetter)
        skip(while: CharacterReader.isDigit)
        return cacheString(start, pos - start)
    }

    /** Consumes and returns the run of hexadecimal digits (`0-9`, `A-F`, `a-f`) at the read position. */
    public func consumeHexSequence() -> String {
        let start = pos
        skip(while: { (c: UnicodeScalar) -> Bool in
            return (c >= "0" && c <= "9") || (c >= "A" && c <= "F") || (c >= "a" && c <= "f")
        })
        return cacheString(start, pos - start)
    }

    /** Consumes and returns the run of ASCII digits at the read position. */
    public func consumeDigitSequence() -> String {
        let start = pos
        skip(while: CharacterReader.isDigit)
        return cacheString(start, pos - start)
    }

    // MARK: - Matching

    /** Returns `true` when the scalar at the read position equals `c`. */
    public func matches(_ c: UnicodeScalar) -> Bool {
        return !isEmpty() && input.unicodeScalar(pos) == c
    }

    /** Returns `true` when the input at the read position starts with `seq` (case-sensitive). */
    public func matches(_ seq: String) -> Bool {
        let scanLength = seq.unicodeScalars.count
        if scanLength > length - pos {
            return false
        }
        for offset in 0..<scanLength {
            if seq.unicodeScalar(offset) != input.unicodeScalar(pos + offset) {
                return false
            }
        }
        return true
    }

    /**
     * Returns `true` when the input at the read position starts with `seq`, comparing the
     * uppercased form of each scalar. An empty `seq` never matches.
     */
    public func matchesIgnoreCase(_ seq: String) -> Bool {
        let scanLength = seq.unicodeScalars.count
        if scanLength == 0 {
            return false
        }
        if scanLength > length - pos {
            return false
        }
        for offset in 0..<scanLength {
            let upScan: UnicodeScalar = seq.unicodeScalar(offset).uppercase
            let upTarget: UnicodeScalar = input.unicodeScalar(pos + offset).uppercase
            if upScan != upTarget {
                return false
            }
        }
        return true
    }

    /** Returns `true` when the scalar at the read position is one of `seq`. */
    public func matchesAny(_ seq: UnicodeScalar...) -> Bool {
        if isEmpty() {
            return false
        }
        return seq.contains(input.unicodeScalar(pos))
    }

    /**
     * Returns `true` when a binary search finds the scalar at the read position in `seq`.
     * NOTE(review): presumably `seq` must already be sorted; confirm against the `binarySearch` helper.
     */
    public func matchesAnySorted(_ seq: [UnicodeScalar]) -> Bool {
        return !isEmpty() && seq.binarySearch(seq, input.unicodeScalar(pos)) >= 0
    }

    /** Returns `true` when the scalar at the read position is a letter. */
    public func matchesLetter() -> Bool {
        if isEmpty() {
            return false
        }
        return CharacterReader.isLetter(input.unicodeScalar(pos))
    }

    /** Returns `true` when the scalar at the read position is an ASCII digit. */
    public func matchesDigit() -> Bool {
        if isEmpty() {
            return false
        }
        return CharacterReader.isDigit(input.unicodeScalar(pos))
    }

    /** Consumes `seq` when the input starts with it (case-sensitive) and reports whether it did. */
    @discardableResult
    public func matchConsume(_ seq: String) -> Bool {
        if matches(seq) {
            pos += seq.unicodeScalars.count
            return true
        }
        return false
    }

    /** Consumes `seq` when the input starts with it (ignoring case) and reports whether it did. */
    @discardableResult
    public func matchConsumeIgnoreCase(_ seq: String) -> Bool {
        if matchesIgnoreCase(seq) {
            pos += seq.unicodeScalars.count
            return true
        }
        return false
    }

    /**
     * Reports whether the all-lowercase or the all-uppercase form of `seq` occurs at or after the
     * read position. Mixed-case occurrences are not found.
     */
    public func containsIgnoreCase(_ seq: String) -> Bool {
        // used to check presence of </title>, </style>. only finds consistent case.
        // Both case conversions use the same fixed locale.
        let locale = Locale(identifier: "en")
        let loScan = seq.lowercased(with: locale)
        let hiScan = seq.uppercased(with: locale)
        return (nextIndexOf(loScan) > -1) || (nextIndexOf(hiScan) > -1)
    }

    /** Returns the unconsumed remainder of the input without consuming it. */
    public func toString() -> String {
        return input.string(pos, length - pos)
    }

    // MARK: - Private helpers

    /** `true` for `A-Z`, `a-z`, or any member of `CharacterSet.letters`. */
    private static func isLetter(_ c: UnicodeScalar) -> Bool {
        return (c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)
    }

    /** `true` for the ASCII digits `0-9`. */
    private static func isDigit(_ c: UnicodeScalar) -> Bool {
        return c >= "0" && c <= "9"
    }

    /** Advances the read position while it is inside the input and `predicate` accepts the current scalar. */
    private func skip(while predicate: (UnicodeScalar) -> Bool) {
        while pos < length && predicate(input.unicodeScalar(pos)) {
            pos += 1
        }
    }

    /**
     * Caches short strings, as a flywheel pattern, to reduce GC load. Just for this doc, to prevent leaks.
     *
     * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
     * That saves both having to create objects as hash keys, and running through the entry list, at the expense of
     * some more duplicates.
     *
     * - Parameter start: scalar offset of the first scalar of the run.
     * - Parameter count: number of scalars in the run; values below 1 yield an empty string.
     * - Returns: the text of the run.
     */
    private func cacheString(_ start: Int, _ count: Int) -> String {
        // Nothing to extract (also covers a negative count once `pos` has run past `length`).
        guard count > 0 else { return "" }
        // limit (no cache):
        if count > CharacterReader.maxCacheLen {
            return input.string(start, count)
        }

        // calculate hash over the scalars of the run itself; wrapping arithmetic is intentional.
        var hash = 0
        for offset in start..<(start + count) {
            hash = 31 &* hash &+ Int(input.unicodeScalar(offset).value)
        }

        // Reinterpreting the bits as unsigned gives a non-negative bucket without `abs`,
        // which would trap for `Int.min`.
        let index = Int(UInt(bitPattern: hash) % UInt(stringCache.count))

        if let cached = stringCache[index], rangeEquals(start, count, cached) {
            // hit
            return cached
        }
        // miss or hashcode conflict: store the new string, as recently used strings are more
        // likely to show up again.
        let fresh = input.string(start, count)
        stringCache[index] = fresh
        return fresh
    }

    /**
     * Check if the value of the provided range equals the string.
     * - Parameter start: scalar offset in the input where the comparison begins.
     * - Parameter count: number of scalars to compare.
     * - Parameter cached: the string to compare against.
     * - Returns: `true` when `cached` has exactly `count` scalars and each equals the
     *   corresponding input scalar.
     */
    public func rangeEquals(_ start: Int, _ count: Int, _ cached: String) -> Bool {
        guard count == cached.unicodeScalars.count else {
            return false
        }
        var i = start
        for scalar in cached.unicodeScalars {
            if input.unicodeScalar(i) != scalar {
                return false
            }
            i += 1
        }
        return true
    }
}