485 lines
14 KiB
Swift
485 lines
14 KiB
Swift
//
|
|
// CharacterReader.swift
|
|
// SwiftSoup
|
|
//
|
|
// Created by Nabil Chatbi on 10/10/16.
|
|
// Copyright © 2016 Nabil Chatbi.. All rights reserved.
|
|
//
|
|
|
|
import Foundation
|
|
|
|
/**
|
|
CharacterReader consumes tokens off a string. To replace the old TokenQueue.
|
|
*/
|
|
public final class CharacterReader {
|
|
private static let empty = ""
|
|
public static let EOF: UnicodeScalar = "\u{FFFF}"//65535
|
|
private let input: Bytes
|
|
private let length: Int
|
|
private var pos: Int = 0
|
|
private var mark: Int = 0
|
|
//private let stringCache: Array<String?> // holds reused strings in this doc, to lessen garbage
|
|
//let bytes: Bytes
|
|
//var scanner: Scanner<Bytes>
|
|
|
|
public init(_ input: String) {
|
|
self.input = input.makeBytes()
|
|
self.length = self.input.count
|
|
//bytes = input.makeBytes()
|
|
//scanner = Scanner(bytes)
|
|
//stringCache = Array(repeating:nil, count:512)
|
|
|
|
|
|
}
|
|
|
|
public func getPos() -> Int {
|
|
return self.pos
|
|
}
|
|
|
|
public func isEmpty() -> Bool {
|
|
return pos >= length
|
|
}
|
|
|
|
public func current() -> Byte {
|
|
return (pos >= length) ? Byte.EOF : input[pos]
|
|
}
|
|
|
|
@discardableResult
|
|
public func consume() -> Byte {
|
|
let val = (pos >= length) ? Byte.EOF : input[pos]
|
|
pos += 1
|
|
return val
|
|
}
|
|
|
|
public func unconsume() {
|
|
pos -= 1
|
|
}
|
|
|
|
public func advance() {
|
|
pos += 1
|
|
}
|
|
|
|
public func markPos() {
|
|
mark = pos
|
|
}
|
|
|
|
public func rewindToMark() {
|
|
pos = mark
|
|
}
|
|
|
|
|
|
/**
|
|
* Returns the number of characters between the current position and the next instance of the input char
|
|
* @param c scan target
|
|
* @return offset between current position and next instance of target. -1 if not found.
|
|
*/
|
|
public func nextIndexOf(_ c: Byte) -> Int {
|
|
// doesn't handle scanning for surrogates
|
|
for i in pos..<length {
|
|
if (c == input[i]) {
|
|
return i - pos
|
|
}
|
|
}
|
|
return -1
|
|
}
|
|
|
|
/**
|
|
* Returns the number of characters between the current position and the next instance of the input sequence
|
|
*
|
|
* @param seq scan target
|
|
* @return offset between current position and next instance of target. -1 if not found.
|
|
*/
|
|
public func nextIndexOf(_ seq: String) -> Int {
|
|
let seq = seq.makeBytes()
|
|
// doesn't handle scanning for surrogates
|
|
if(seq.isEmpty) {return -1}
|
|
let startChar = seq[0]
|
|
for var offset in pos..<length {
|
|
// scan to first instance of startchar:
|
|
if (startChar != input[offset]) {
|
|
offset+=1
|
|
while(offset < length && startChar != input[offset]) { offset+=1 }
|
|
}
|
|
var i = offset + 1
|
|
let last = i + seq.count-1
|
|
if (offset < length && last <= length) {
|
|
var j = 1
|
|
while i < last && seq[j] == input[i] {
|
|
j+=1
|
|
i+=1
|
|
}
|
|
// found full sequence
|
|
if (i == last) {
|
|
return offset - pos
|
|
}
|
|
}
|
|
}
|
|
return -1
|
|
}
|
|
|
|
public func consumeTo(_ c: Byte) -> String {
|
|
let offset = nextIndexOf(c)
|
|
if (offset != -1) {
|
|
let consumed = cacheString(pos, offset)
|
|
pos += offset
|
|
return consumed
|
|
} else {
|
|
return consumeToEnd()
|
|
}
|
|
}
|
|
|
|
public func consumeTo(_ seq: String) -> String {
|
|
let offset = nextIndexOf(seq)
|
|
if (offset != -1) {
|
|
let consumed = cacheString(pos, offset)
|
|
pos += offset
|
|
return consumed
|
|
} else {
|
|
return consumeToEnd()
|
|
}
|
|
}
|
|
|
|
public func consumeToAny(_ chars: Byte...) -> String {
|
|
return consumeToAny(chars)
|
|
}
|
|
public func consumeToAny(_ chars: [Byte]) -> String {
|
|
let start: Int = pos
|
|
let remaining: Int = length
|
|
let val = input
|
|
OUTER: while (pos < remaining) {
|
|
if chars.contains(val[pos]) {
|
|
break OUTER
|
|
}
|
|
// for c in chars {
|
|
// if (val[pos] == c){
|
|
// break OUTER
|
|
// }
|
|
// }
|
|
pos += 1
|
|
}
|
|
|
|
return pos > start ? cacheString(start, pos-start) : CharacterReader.empty
|
|
}
|
|
|
|
public func consumeToAnySorted(_ chars: Byte...) -> String {
|
|
return consumeToAnySorted(chars)
|
|
}
|
|
public func consumeToAnySorted(_ chars: [Byte]) -> String {
|
|
let start = pos
|
|
let remaining = length
|
|
let val = input
|
|
|
|
while (pos < remaining) {
|
|
|
|
if chars.contains(val[pos]) {
|
|
break
|
|
}
|
|
pos += 1
|
|
}
|
|
|
|
return pos > start ? cacheString(start, pos-start) : CharacterReader.empty
|
|
}
|
|
|
|
public func consumeData() -> String {
|
|
// &, <, null
|
|
let start = pos
|
|
let remaining = length
|
|
let val = input
|
|
|
|
while (pos < remaining) {
|
|
let c = val[pos]
|
|
if (c == Byte.ampersand || c == Byte.lessThan || c == Byte.null) {
|
|
break
|
|
}
|
|
pos += 1
|
|
}
|
|
|
|
return pos > start ? cacheString(start, pos-start) : CharacterReader.empty
|
|
}
|
|
|
|
public func consumeTagName() -> String {
|
|
// '\t', '\n', '\r', '\f', ' ', '/', '>', nullChar
|
|
let start = pos
|
|
let remaining = length
|
|
let val = input
|
|
|
|
while (pos < remaining) {
|
|
let c = val[pos]
|
|
if (c == Byte.horizontalTab || c == Byte.newLine || c == Byte.carriageReturn || c == Byte.formfeed || c == Byte.space || c == Byte.forwardSlash || c == Byte.greaterThan || c == Byte.null) {
|
|
break
|
|
}
|
|
pos += 1
|
|
}
|
|
return pos > start ? cacheString(start, pos-start) : CharacterReader.empty
|
|
}
|
|
|
|
public func consumeToEnd() -> String {
|
|
let data = cacheString(pos, length-pos)
|
|
pos = length
|
|
return data
|
|
}
|
|
|
|
public func consumeLetterSequence() -> String {
|
|
let start = pos
|
|
while (pos < length) {
|
|
let c = input[pos]
|
|
if ((c >= Byte.A && c <= Byte.Z) || (c >= Byte.a && c <= Byte.z) || c.isLetter) {
|
|
pos += 1
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
return cacheString(start, pos - start)
|
|
}
|
|
|
|
public func consumeLetterThenDigitSequence() -> String {
|
|
let start = pos
|
|
while (pos < length) {
|
|
let c = input[pos]
|
|
if ((c >= Byte.A && c <= Byte.Z) || (c >= Byte.a && c <= Byte.z) || c.isLetter) {
|
|
pos += 1
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
while (!isEmpty()) {
|
|
let c = input[pos]
|
|
if (c >= Byte.zero && c <= Byte.nine) {
|
|
pos += 1
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
|
|
return cacheString(start, pos - start)
|
|
}
|
|
|
|
public func consumeHexSequence() -> String {
|
|
let start = pos
|
|
while (pos < length) {
|
|
let c = input[pos]
|
|
if ((c >= Byte.zero && c <= Byte.nine) || (c >= Byte.A && c <= Byte.F) || (c >= Byte.a && c <= Byte.f)) {
|
|
pos+=1
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
return cacheString(start, pos - start)
|
|
}
|
|
|
|
public func consumeDigitSequence() -> String {
|
|
let start = pos
|
|
while (pos < length) {
|
|
let c = input[pos]
|
|
if (c >= Byte.zero && c <= Byte.nine) {
|
|
pos+=1
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
return cacheString(start, pos - start)
|
|
}
|
|
|
|
public func matches(_ c: Byte) -> Bool {
|
|
return !isEmpty() && input[pos] == c
|
|
|
|
}
|
|
|
|
public func matches(_ seq: Bytes) -> Bool {
|
|
let scanLength = seq.count
|
|
if (scanLength > length - pos) {
|
|
return false
|
|
}
|
|
|
|
for offset in 0..<scanLength {
|
|
if (seq[offset] != input[pos+offset]) {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
public func matchesIgnoreCase(_ seq: String ) -> Bool {
|
|
let seq = seq.makeBytes()
|
|
let scanLength = seq.count
|
|
if(scanLength == 0) {
|
|
return false
|
|
}
|
|
if (scanLength > length - pos) {
|
|
return false
|
|
}
|
|
|
|
for offset in 0..<scanLength {
|
|
let upScan = seq[offset].uppercase
|
|
let upTarget = input[pos+offset].uppercase
|
|
if (upScan != upTarget) {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
public func matchesAny(_ seq: Byte...) -> Bool {
|
|
if (isEmpty()) {
|
|
return false
|
|
}
|
|
|
|
let c = input[pos]
|
|
for seek in seq {
|
|
if (seek == c) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
public func matchesAnySorted(_ seq: [Byte]) -> Bool {
|
|
return !isEmpty() && seq.contains(input[pos])
|
|
}
|
|
|
|
public func matchesLetter() -> Bool {
|
|
if (isEmpty()) {
|
|
return false
|
|
}
|
|
let c = input[pos]
|
|
return (c >= Byte.A && c <= Byte.Z) || (c >= Byte.a && c <= Byte.z) || c.isLetter
|
|
}
|
|
|
|
public func matchesDigit() -> Bool {
|
|
if (isEmpty()) {
|
|
return false
|
|
}
|
|
let c = input[pos]
|
|
return (c >= Byte.zero && c <= Byte.nine)
|
|
}
|
|
|
|
@discardableResult
|
|
public func matchConsume(_ seq: Bytes) -> Bool {
|
|
if (matches(seq)) {
|
|
pos += seq.count
|
|
return true
|
|
} else {
|
|
return false
|
|
}
|
|
}
|
|
|
|
@discardableResult
|
|
public func matchConsume(_ seq: Byte) -> Bool {
|
|
if (matches(seq)) {
|
|
pos += 1
|
|
return true
|
|
} else {
|
|
return false
|
|
}
|
|
}
|
|
|
|
@discardableResult
|
|
public func matchConsumeIgnoreCase(_ seq: String) -> Bool {
|
|
if (matchesIgnoreCase(seq)) {
|
|
pos += seq.unicodeScalars.count
|
|
return true
|
|
} else {
|
|
return false
|
|
}
|
|
}
|
|
|
|
@discardableResult
|
|
public func matchConsumeIgnoreCase(_ seq: Byte) -> Bool {
|
|
if (containsIgnoreCase(seq)) {
|
|
pos += 1
|
|
return true
|
|
} else {
|
|
return false
|
|
}
|
|
}
|
|
|
|
///TODO: provare RIMUOVERE
|
|
public func containsIgnoreCase(_ seq: String ) -> Bool {
|
|
// used to check presence of </title>, </style>. only finds consistent case.
|
|
let loScan = seq.lowercased(with: Locale(identifier: "en"))
|
|
let hiScan = seq.uppercased(with: Locale(identifier: "eng"))
|
|
return (nextIndexOf(loScan) > -1) || (nextIndexOf(hiScan) > -1)
|
|
}
|
|
|
|
public func containsIgnoreCase(_ seq: Byte ) -> Bool {
|
|
// used to check presence of </title>, </style>. only finds consistent case.
|
|
return (nextIndexOf(seq.uppercase) > -1) || (nextIndexOf(seq.lowercase) > -1)
|
|
}
|
|
|
|
public func toString() -> String {
|
|
return input[pos..<length].makeString()
|
|
}
|
|
|
|
/**
|
|
* Caches short strings, as a flywheel pattern, to reduce GC load. Just for this doc, to prevent leaks.
|
|
* <p />
|
|
* Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
|
|
* That saves both having to create objects as hash keys, and running through the entry list, at the expense of
|
|
* some more duplicates.
|
|
*/
|
|
private func cacheString(_ start: Int, _ count: Int) -> String {
|
|
let ar = input[start..<start+count]
|
|
return ar.makeString()
|
|
//return String(input[start..<start+count].flatMap { Character($0) })
|
|
// Too Slow
|
|
// var cache: [String?] = stringCache
|
|
//
|
|
// // limit (no cache):
|
|
// if (count > CharacterReader.maxCacheLen) {
|
|
// return String(val[start..<start+count].flatMap { Character($0) })
|
|
// }
|
|
//
|
|
// // calculate hash:
|
|
// var hash: Int = 0
|
|
// var offset = start
|
|
// for _ in 0..<count {
|
|
// let ch = val[offset].value
|
|
// hash = Int.addWithOverflow(Int.multiplyWithOverflow(31, hash).0, Int(ch)).0
|
|
// offset+=1
|
|
// }
|
|
//
|
|
// // get from cache
|
|
// hash = abs(hash)
|
|
// let i = hash % cache.count
|
|
// let index: Int = abs(i) //Int(hash & Int(cache.count) - 1)
|
|
// var cached = cache[index]
|
|
//
|
|
// if (cached == nil) { // miss, add
|
|
// cached = String(val[start..<start+count].flatMap { Character($0) })
|
|
// //cached = val.string(start, count)
|
|
// cache[Int(index)] = cached
|
|
// } else { // hashcode hit, check equality
|
|
// if (rangeEquals(start, count, cached!)) { // hit
|
|
// return cached!
|
|
// } else { // hashcode conflict
|
|
// cached = String(val[start..<start+count].flatMap { Character($0) })
|
|
// //cached = val.string(start, count)
|
|
// cache[index] = cached // update the cache, as recently used strings are more likely to show up again
|
|
// }
|
|
// }
|
|
// return cached!
|
|
}
|
|
|
|
// /**
|
|
// * Check if the value of the provided range equals the string.
|
|
// */
|
|
// public func rangeEquals(_ start: Int, _ count: Int, _ cached: String) -> Bool {
|
|
// if (count == cached.unicodeScalars.count) {
|
|
// var count = count
|
|
// let one = input
|
|
// var i = start
|
|
// var j = 0
|
|
// while (count != 0) {
|
|
// count -= 1
|
|
// if (one[i] != cached.unicodeScalar(j) ) {
|
|
// return false
|
|
// }
|
|
// j += 1
|
|
// i += 1
|
|
// }
|
|
// return true
|
|
// }
|
|
// return false
|
|
// }
|
|
}
|