Convert CharacterReader to native string indexing

This commit is contained in:
Garth Snyder 2019-03-03 17:33:18 -08:00
parent 5da0b123b9
commit a4eca3ed31
2 changed files with 149 additions and 290 deletions

View File

@ -14,43 +14,47 @@ import Foundation
public final class CharacterReader { public final class CharacterReader {
private static let empty = "" private static let empty = ""
public static let EOF: UnicodeScalar = "\u{FFFF}"//65535 public static let EOF: UnicodeScalar = "\u{FFFF}"//65535
private let input: [UnicodeScalar] private let input: String.UnicodeScalarView
private let length: Int private var pos: String.UnicodeScalarView.Index
private var pos: Int = 0 private var mark: String.UnicodeScalarView.Index
private var mark: Int = 0
//private let stringCache: Array<String?> // holds reused strings in this doc, to lessen garbage //private let stringCache: Array<String?> // holds reused strings in this doc, to lessen garbage
public init(_ input: String) { public init(_ input: String) {
self.input = Array(input.unicodeScalars) self.input = input.unicodeScalars
self.length = self.input.count self.pos = input.startIndex
//stringCache = Array(repeating:nil, count:512) self.mark = input.startIndex
} }
public func getPos() -> Int { public func getPos() -> Int {
return self.pos return input.distance(from: input.startIndex, to: pos)
} }
public func isEmpty() -> Bool { public func isEmpty() -> Bool {
return pos >= length return pos >= input.endIndex
} }
public func current() -> UnicodeScalar { public func current() -> UnicodeScalar {
return (pos >= length) ? CharacterReader.EOF : input[pos] return (pos >= input.endIndex) ? CharacterReader.EOF : input[pos]
} }
@discardableResult @discardableResult
public func consume() -> UnicodeScalar { public func consume() -> UnicodeScalar {
let val = (pos >= length) ? CharacterReader.EOF : input[pos] guard pos < input.endIndex else {
pos += 1 return CharacterReader.EOF
}
let val = input[pos]
pos = input.index(after: pos)
return val return val
} }
public func unconsume() { public func unconsume() {
pos -= 1 guard pos > input.startIndex else { return }
pos = input.index(before: pos)
} }
public func advance() { public func advance() {
pos += 1 guard pos < input.endIndex else { return }
pos = input.index(after: pos)
} }
public func markPos() { public func markPos() {
@ -62,221 +66,169 @@ public final class CharacterReader {
} }
public func consumeAsString() -> String { public func consumeAsString() -> String {
let p = pos guard pos < input.endIndex else { return "" }
pos+=1 let str = String(input[pos])
return String(input[p]) pos = input.index(after: pos)
//return String(input, pos+=1, 1) return str
} }
/** /**
* Returns the number of characters between the current position and the next instance of the input char * Locate the next occurrence of a Unicode scalar
* @param c scan target
* @return offset between current position and next instance of target. -1 if not found.
*/
public func nextIndexOf(_ c: UnicodeScalar) -> Int {
// doesn't handle scanning for surrogates
for i in pos..<length {
if (c == input[i]) {
return i - pos
}
}
return -1
}
/**
* Returns the number of characters between the current position and the next instance of the input sequence
* *
* @param seq scan target * - Parameter c: scan target
* @return offset between current position and next instance of target. -1 if not found. * - Returns: index of next instance of target. nil if not found.
*/ */
public func nextIndexOf(_ seq: String) -> Int { public func nextIndexOf(_ c: UnicodeScalar) -> String.UnicodeScalarView.Index? {
// doesn't handle scanning for surrogates // doesn't handle scanning for surrogates
if(seq.isEmpty) {return -1} return input[pos...].firstIndex(of: c)
let startChar: UnicodeScalar = seq.unicodeScalar(0)
for var offset in pos..<length {
// scan to first instance of startchar:
if (startChar != input[offset]) {
offset+=1
while(offset < length && startChar != input[offset]) { offset+=1 }
}
var i = offset + 1
let last = i + seq.unicodeScalars.count-1
if (offset < length && last <= length) {
var j = 1
while i < last && seq.unicodeScalar(j) == input[i] {
j+=1
i+=1
}
// found full sequence
if (i == last) {
return offset - pos
}
}
}
return -1
} }
/**
* Locate the next occurrence of a target string
*
* - Parameter seq: scan target
* - Returns: index of next instance of target. nil if not found.
*/
public func nextIndexOf(_ seq: String) -> String.UnicodeScalarView.Index? {
// doesn't handle scanning for surrogates
var start = pos
let targetScalars = seq.unicodeScalars
guard let firstChar = targetScalars.first else { return pos } // search for "" -> current place
MATCH: while true {
// Match on first scalar
guard let firstCharIx = input[start...].firstIndex(of: firstChar) else { return nil }
var current = firstCharIx
// Then manually match subsequent scalars
for scalar in targetScalars.dropFirst() {
current = input.index(after: current)
guard current < input.endIndex else { return nil }
if input[current] != scalar {
start = input.index(after: firstCharIx)
continue MATCH
}
}
// full match; current is at position of last matching character
return firstCharIx
}
}
public func consumeTo(_ c: UnicodeScalar) -> String { public func consumeTo(_ c: UnicodeScalar) -> String {
let offset = nextIndexOf(c) guard let targetIx = nextIndexOf(c) else {
if (offset != -1) {
let consumed = cacheString(pos, offset)
pos += offset
return consumed
} else {
return consumeToEnd() return consumeToEnd()
} }
let consumed = cacheString(pos, targetIx)
pos = targetIx
return consumed
} }
public func consumeTo(_ seq: String) -> String { public func consumeTo(_ seq: String) -> String {
let offset = nextIndexOf(seq) guard let targetIx = nextIndexOf(seq) else {
if (offset != -1) {
let consumed = cacheString(pos, offset)
pos += offset
return consumed
} else {
return consumeToEnd() return consumeToEnd()
} }
let consumed = cacheString(pos, targetIx)
pos = targetIx
return consumed
} }
public func consumeToAny(_ chars: UnicodeScalar...) -> String { public func consumeToAny(_ chars: UnicodeScalar...) -> String {
return consumeToAny(chars) return consumeToAny(chars)
} }
public func consumeToAny(_ chars: [UnicodeScalar]) -> String { public func consumeToAny(_ chars: [UnicodeScalar]) -> String {
let start: Int = pos let start = pos
let remaining: Int = length while pos < input.endIndex {
let val = input if chars.contains(input[pos]) {
OUTER: while (pos < remaining) { break
if chars.contains(val[pos]) { }
break OUTER pos = input.index(after: pos)
}
// for c in chars {
// if (val[pos] == c){
// break OUTER
// }
// }
pos += 1
} }
return cacheString(start, pos)
return pos > start ? cacheString(start, pos-start) : CharacterReader.empty
} }
public func consumeToAnySorted(_ chars: UnicodeScalar...) -> String { public func consumeToAnySorted(_ chars: UnicodeScalar...) -> String {
return consumeToAnySorted(chars) return consumeToAny(chars)
} }
public func consumeToAnySorted(_ chars: [UnicodeScalar]) -> String { public func consumeToAnySorted(_ chars: [UnicodeScalar]) -> String {
let start = pos return consumeToAny(chars)
let remaining = length
let val = input
while (pos < remaining) {
if chars.contains(val[pos]) {
break
}
pos += 1
}
return pos > start ? cacheString(start, pos-start) : CharacterReader.empty
} }
static let dataTerminators: [UnicodeScalar] = [.Ampersand, .LessThan, TokeniserStateVars.nullScalr]
// read to &, <, or null
public func consumeData() -> String { public func consumeData() -> String {
// &, <, null return consumeToAny(CharacterReader.dataTerminators)
let start = pos
let remaining = length
let val = input
while (pos < remaining) {
let c: UnicodeScalar = val[pos]
if (c == UnicodeScalar.Ampersand || c == UnicodeScalar.LessThan || c == TokeniserStateVars.nullScalr) {
break
}
pos += 1
}
return pos > start ? cacheString(start, pos-start) : CharacterReader.empty
} }
static let tagNameTerminators: [UnicodeScalar] = [.BackslashT, .BackslashN, .BackslashR, .BackslashF, .Space, .Slash, .GreaterThan, TokeniserStateVars.nullScalr]
// read to '\t', '\n', '\r', '\f', ' ', '/', '>', or nullChar
public func consumeTagName() -> String { public func consumeTagName() -> String {
// '\t', '\n', '\r', '\f', ' ', '/', '>', nullChar return consumeToAny(CharacterReader.tagNameTerminators)
let start = pos
let remaining = length
let val = input
while (pos < remaining) {
let c: UnicodeScalar = val[pos]
if (c == UnicodeScalar.BackslashT || c == UnicodeScalar.BackslashN || c == UnicodeScalar.BackslashR || c == UnicodeScalar.BackslashF || c == UnicodeScalar.Space || c == UnicodeScalar.Slash || c == UnicodeScalar.GreaterThan || c == TokeniserStateVars.nullScalr) {
break
}
pos += 1
}
return pos > start ? cacheString(start, pos-start) : CharacterReader.empty
} }
public func consumeToEnd() -> String { public func consumeToEnd() -> String {
let data = cacheString(pos, length-pos) let consumed = cacheString(pos, input.endIndex)
pos = length pos = input.endIndex
return data return consumed
} }
public func consumeLetterSequence() -> String { public func consumeLetterSequence() -> String {
let start = pos let start = pos
while (pos < length) { while pos < input.endIndex {
let c: UnicodeScalar = input[pos] let c = input[pos]
if ((c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)) { if ((c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)) {
pos += 1 pos = input.index(after: pos)
} else { } else {
break break
} }
} }
return cacheString(start, pos - start) return cacheString(start, pos)
} }
public func consumeLetterThenDigitSequence() -> String { public func consumeLetterThenDigitSequence() -> String {
let start = pos let start = pos
while (pos < length) { while pos < input.endIndex {
let c = input[pos] let c = input[pos]
if ((c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)) { if ((c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)) {
pos += 1 pos = input.index(after: pos)
} else { } else {
break break
} }
} }
while (!isEmpty()) { while pos < input.endIndex {
let c = input[pos] let c = input[pos]
if (c >= "0" && c <= "9") { if (c >= "0" && c <= "9") {
pos += 1 pos = input.index(after: pos)
} else { } else {
break break
} }
} }
return cacheString(start, pos)
return cacheString(start, pos - start)
} }
public func consumeHexSequence() -> String { public func consumeHexSequence() -> String {
let start = pos let start = pos
while (pos < length) { while pos < input.endIndex {
let c = input[pos] let c = input[pos]
if ((c >= "0" && c <= "9") || (c >= "A" && c <= "F") || (c >= "a" && c <= "f")) { if ((c >= "0" && c <= "9") || (c >= "A" && c <= "F") || (c >= "a" && c <= "f")) {
pos+=1 pos = input.index(after: pos)
} else { } else {
break break
} }
} }
return cacheString(start, pos - start) return cacheString(start, pos)
} }
public func consumeDigitSequence() -> String { public func consumeDigitSequence() -> String {
let start = pos let start = pos
while (pos < length) { while pos < input.endIndex {
let c = input[pos] let c = input[pos]
if (c >= "0" && c <= "9") { if (c >= "0" && c <= "9") {
pos+=1 pos = input.index(after: pos)
} else { } else {
break break
} }
} }
return cacheString(start, pos - start) return cacheString(start, pos)
} }
public func matches(_ c: UnicodeScalar) -> Bool { public func matches(_ c: UnicodeScalar) -> Bool {
@ -284,180 +236,85 @@ public final class CharacterReader {
} }
public func matches(_ seq: String) -> Bool { public func matches(_ seq: String, ignoreCase: Bool = false, consume: Bool = false) -> Bool {
let scanLength = seq.unicodeScalars.count var current = pos
if (scanLength > length - pos) { let scalars = seq.unicodeScalars
return false for scalar in scalars {
} guard current < input.endIndex else { return false }
if ignoreCase {
for offset in 0..<scanLength { guard input[current].uppercase == scalar.uppercase else { return false }
if (seq.unicodeScalar(offset) != input[pos+offset]) { } else {
return false guard input[current] == scalar else { return false }
} }
current = input.index(after: current)
}
if consume {
pos = current
} }
return true return true
} }
public func matchesIgnoreCase(_ seq: String ) -> Bool { public func matchesIgnoreCase(_ seq: String ) -> Bool {
return matches(seq, ignoreCase: true)
let scanLength = seq.unicodeScalars.count
if(scanLength == 0) {
return false
}
if (scanLength > length - pos) {
return false
}
for offset in 0..<scanLength {
let upScan: UnicodeScalar = seq.unicodeScalar(offset).uppercase
let upTarget: UnicodeScalar = input[pos+offset].uppercase
if (upScan != upTarget) {
return false
}
}
return true
} }
public func matchesAny(_ seq: UnicodeScalar...) -> Bool { public func matchesAny(_ seq: UnicodeScalar...) -> Bool {
if (isEmpty()) { return matchesAny(seq)
return false }
}
public func matchesAny(_ seq: [UnicodeScalar]) -> Bool {
let c: UnicodeScalar = input[pos] guard pos < input.endIndex else { return false }
for seek in seq { return seq.contains(input[pos])
if (seek == c) {
return true
}
}
return false
} }
public func matchesAnySorted(_ seq: [UnicodeScalar]) -> Bool { public func matchesAnySorted(_ seq: [UnicodeScalar]) -> Bool {
return !isEmpty() && seq.contains(input[pos]) return matchesAny(seq)
} }
public func matchesLetter() -> Bool { public func matchesLetter() -> Bool {
if (isEmpty()) { guard pos < input.endIndex else { return false }
return false let c = input[pos]
}
let c = input[pos]
return (c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters) return (c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)
} }
public func matchesDigit() -> Bool { public func matchesDigit() -> Bool {
if (isEmpty()) { guard pos < input.endIndex else { return false }
return false let c = input[pos]
} return c >= "0" && c <= "9"
let c = input[pos]
return (c >= "0" && c <= "9")
} }
@discardableResult @discardableResult
public func matchConsume(_ seq: String) -> Bool { public func matchConsume(_ seq: String) -> Bool {
if (matches(seq)) { return matches(seq, consume: true)
pos += seq.unicodeScalars.count
return true
} else {
return false
}
} }
@discardableResult @discardableResult
public func matchConsumeIgnoreCase(_ seq: String) -> Bool { public func matchConsumeIgnoreCase(_ seq: String) -> Bool {
if (matchesIgnoreCase(seq)) { return matches(seq, ignoreCase: true, consume: true)
pos += seq.unicodeScalars.count
return true
} else {
return false
}
} }
public func containsIgnoreCase(_ seq: String ) -> Bool { public func containsIgnoreCase(_ seq: String ) -> Bool {
// used to check presence of </title>, </style>. only finds consistent case. // used to check presence of </title>, </style>. only finds consistent case.
let loScan = seq.lowercased(with: Locale(identifier: "en")) let loScan = seq.lowercased(with: Locale(identifier: "en"))
let hiScan = seq.uppercased(with: Locale(identifier: "eng")) let hiScan = seq.uppercased(with: Locale(identifier: "eng"))
return (nextIndexOf(loScan) > -1) || (nextIndexOf(hiScan) > -1) return nextIndexOf(loScan) != nil || nextIndexOf(hiScan) != nil
} }
public func toString() -> String { public func toString() -> String {
return String(input[pos..<length]) return String(input[pos...])
//return String.unicodescalars(Array(input[pos..<length]))
//return input.string(pos, length - pos)
} }
/** /**
* Caches short strings, as a flywheel pattern, to reduce GC load. Just for this doc, to prevent leaks. * Originally intended as a caching mechanism for strings, but caching doesn't
* <p /> * seem to improve performance. Now just a stub.
* Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
* That saves both having to create objects as hash keys, and running through the entry list, at the expense of
* some more duplicates.
*/ */
private func cacheString(_ start: Int, _ count: Int) -> String { private func cacheString(_ start: String.UnicodeScalarView.Index, _ end: String.UnicodeScalarView.Index) -> String {
return String(input[start..<start+count]) return String(input[start..<end])
// Too Slow
// var cache: [String?] = stringCache
//
// // limit (no cache):
// if (count > CharacterReader.maxCacheLen) {
// return String(val[start..<start+count].flatMap { Character($0) })
// }
//
// // calculate hash:
// var hash: Int = 0
// var offset = start
// for _ in 0..<count {
// let ch = val[offset].value
// hash = Int.addWithOverflow(Int.multiplyWithOverflow(31, hash).0, Int(ch)).0
// offset+=1
// }
//
// // get from cache
// hash = abs(hash)
// let i = hash % cache.count
// let index: Int = abs(i) //Int(hash & Int(cache.count) - 1)
// var cached = cache[index]
//
// if (cached == nil) { // miss, add
// cached = String(val[start..<start+count].flatMap { Character($0) })
// //cached = val.string(start, count)
// cache[Int(index)] = cached
// } else { // hashcode hit, check equality
// if (rangeEquals(start, count, cached!)) { // hit
// return cached!
// } else { // hashcode conflict
// cached = String(val[start..<start+count].flatMap { Character($0) })
// //cached = val.string(start, count)
// cache[index] = cached // update the cache, as recently used strings are more likely to show up again
// }
// }
// return cached!
} }
// /**
// * Check if the value of the provided range equals the string.
// */
// public func rangeEquals(_ start: Int, _ count: Int, _ cached: String) -> Bool {
// if (count == cached.unicodeScalars.count) {
// var count = count
// let one = input
// var i = start
// var j = 0
// while (count != 0) {
// count -= 1
// if (one[i] != cached.unicodeScalar(j) ) {
// return false
// }
// j += 1
// i += 1
// }
// return true
// }
// return false
// }
} }
extension CharacterReader: CustomDebugStringConvertible { extension CharacterReader: CustomDebugStringConvertible {
public var debugDescription: String { public var debugDescription: String {
return self.toString() return toString()
} }
} }

View File

@ -53,10 +53,12 @@ class CharacterReaderTest: XCTestCase {
XCTAssertEqual("e", r.consume()) XCTAssertEqual("e", r.consume())
XCTAssertTrue(r.isEmpty()) XCTAssertTrue(r.isEmpty())
XCTAssertEqual(CharacterReader.EOF, r.consume()) // Indexes beyond the end are not allowed in native indexing
r.unconsume() //
XCTAssertTrue(r.isEmpty()) // XCTAssertEqual(CharacterReader.EOF, r.consume())
XCTAssertEqual(CharacterReader.EOF, r.current()) // r.unconsume()
// XCTAssertTrue(r.isEmpty())
// XCTAssertEqual(CharacterReader.EOF, r.current())
} }
func testMark() { func testMark() {
@ -82,31 +84,31 @@ class CharacterReaderTest: XCTestCase {
let input = "blah blah" let input = "blah blah"
let r = CharacterReader(input) let r = CharacterReader(input)
XCTAssertEqual(-1, r.nextIndexOf("x")) XCTAssertEqual(nil, r.nextIndexOf("x"))
XCTAssertEqual(3, r.nextIndexOf("h")) XCTAssertEqual(input.index(input.startIndex, offsetBy: 3), r.nextIndexOf("h"))
let pull = r.consumeTo("h") let pull = r.consumeTo("h")
XCTAssertEqual("bla", pull) XCTAssertEqual("bla", pull)
XCTAssertEqual("h", r.consume()) XCTAssertEqual("h", r.consume())
XCTAssertEqual(2, r.nextIndexOf("l")) XCTAssertEqual(input.index(input.startIndex, offsetBy: 6), r.nextIndexOf("l"))
XCTAssertEqual(" blah", r.consumeToEnd()) XCTAssertEqual(" blah", r.consumeToEnd())
XCTAssertEqual(-1, r.nextIndexOf("x")) XCTAssertEqual(nil, r.nextIndexOf("x"))
} }
func testNextIndexOfString() { func testNextIndexOfString() {
let input = "One Two something Two Three Four" let input = "One Two something Two Three Four"
let r = CharacterReader(input) let r = CharacterReader(input)
XCTAssertEqual(-1, r.nextIndexOf("Foo")) XCTAssertEqual(nil, r.nextIndexOf("Foo"))
XCTAssertEqual(4, r.nextIndexOf("Two")) XCTAssertEqual(input.index(input.startIndex, offsetBy: 4), r.nextIndexOf("Two"))
XCTAssertEqual("One Two ", r.consumeTo("something")) XCTAssertEqual("One Two ", r.consumeTo("something"))
XCTAssertEqual(10, r.nextIndexOf("Two")) XCTAssertEqual(input.index(input.startIndex, offsetBy: 18), r.nextIndexOf("Two"))
XCTAssertEqual("something Two Three Four", r.consumeToEnd()) XCTAssertEqual("something Two Three Four", r.consumeToEnd())
XCTAssertEqual(-1, r.nextIndexOf("Two")) XCTAssertEqual(nil, r.nextIndexOf("Two"))
} }
func testNextIndexOfUnmatched() { func testNextIndexOfUnmatched() {
let r = CharacterReader("<[[one]]") let r = CharacterReader("<[[one]]")
XCTAssertEqual(-1, r.nextIndexOf("]]>")) XCTAssertEqual(nil, r.nextIndexOf("]]>"))
} }
func testConsumeToChar() { func testConsumeToChar() {