Convert CharacterReader to native string indexing

2019-03-03 17:33:18 -08:00 · 2019-03-03 17:33:18 -08:00 · a4eca3ed31
parent 5da0b123b9
commit a4eca3ed31
2 changed files with 149 additions and 290 deletions
--- a/Sources/CharacterReader.swift
+++ b/Sources/CharacterReader.swift
@ -14,43 +14,47 @@ import Foundation
 public final class CharacterReader {
    private static let empty = ""
    public static let EOF: UnicodeScalar = "\u{FFFF}"//65535
-    private let input: [UnicodeScalar]
+    private let input: String.UnicodeScalarView
-    private let length: Int
+    private var pos: String.UnicodeScalarView.Index
-    private var pos: Int = 0
+    private var mark: String.UnicodeScalarView.Index
    private var mark: Int = 0
    //private let stringCache: Array<String?> // holds reused strings in this doc, to lessen garbage
    public init(_ input: String) {
-        self.input = Array(input.unicodeScalars)
+        self.input = input.unicodeScalars
-        self.length = self.input.count
+        self.pos = input.startIndex
-        //stringCache = Array(repeating:nil, count:512)
+        self.mark = input.startIndex
    }
    public func getPos() -> Int {
-        return self.pos
+        return input.distance(from: input.startIndex, to: pos)
    }
    public func isEmpty() -> Bool {
-        return pos >= length
+        return pos >= input.endIndex
    }
    public func current() -> UnicodeScalar {
-        return (pos >= length) ? CharacterReader.EOF : input[pos]
+        return (pos >= input.endIndex) ? CharacterReader.EOF : input[pos]
    }
    @discardableResult
    public func consume() -> UnicodeScalar {
-        let val = (pos >= length) ? CharacterReader.EOF : input[pos]
+        guard pos < input.endIndex else {
-        pos += 1
+            return CharacterReader.EOF
        }
        let val = input[pos]
        pos = input.index(after: pos)
        return val
    }
    public func unconsume() {
-        pos -= 1
+        guard pos > input.startIndex else { return }
        pos = input.index(before: pos)
    }
    public func advance() {
-        pos += 1
+        guard pos < input.endIndex else { return }
        pos = input.index(after: pos)
    }
    public func markPos() {
@ -62,221 +66,169 @@ public final class CharacterReader {
    }
    public func consumeAsString() -> String {
-        let p = pos
+        guard pos < input.endIndex else { return "" }
-        pos+=1
+        let str = String(input[pos])
-        return String(input[p])
+        pos = input.index(after: pos)
-        //return String(input, pos+=1, 1)
+        return str
    }
    /**
-     * Returns the number of characters between the current position and the next instance of the input char
+     * Locate the next occurrence of a Unicode scalar
     * @param c scan target
     * @return offset between current position and next instance of target. -1 if not found.
     */
    public func nextIndexOf(_ c: UnicodeScalar) -> Int {
        // doesn't handle scanning for surrogates
        for i in pos..<length {
            if (c == input[i]) {
                return i - pos
            }
        }
        return -1
    }
    /**
     * Returns the number of characters between the current position and the next instance of the input sequence
     *
-     * @param seq scan target
+     * - Parameter c: scan target
-     * @return offset between current position and next instance of target. -1 if not found.
+     * - Returns: offset between current position and next instance of target. -1 if not found.
     */
-    public func nextIndexOf(_ seq: String) -> Int {
+    public func nextIndexOf(_ c: UnicodeScalar) -> String.UnicodeScalarView.Index? {
        // doesn't handle scanning for surrogates
-		if(seq.isEmpty) {return -1}
+        return input[pos...].firstIndex(of: c)
        let startChar: UnicodeScalar = seq.unicodeScalar(0)
        for var offset in pos..<length {
            // scan to first instance of startchar:
            if (startChar != input[offset]) {
                offset+=1
                while(offset < length && startChar != input[offset]) { offset+=1 }
            }
            var i = offset + 1
            let last = i + seq.unicodeScalars.count-1
            if (offset < length && last <= length) {
                var j = 1
                while i < last && seq.unicodeScalar(j) == input[i] {
                    j+=1
                    i+=1
                }
                // found full sequence
                if (i == last) {
                    return offset - pos
                }
            }
        }
        return -1
    }
    /**
     * Locate the next occurence of a target string
     *
     * - Parameter seq: scan target
     * - Returns: index of next instance of target. nil if not found.
     */
    public func nextIndexOf(_ seq: String) -> String.UnicodeScalarView.Index? {
        // doesn't handle scanning for surrogates
        var start = pos
        let targetScalars = seq.unicodeScalars
        guard let firstChar = targetScalars.first else { return pos } // search for "" -> current place
        MATCH: while true {
            // Match on first scalar
            guard let firstCharIx = input[start...].firstIndex(of: firstChar) else { return nil }
            var current = firstCharIx
            // Then manually match subsequent scalars
            for scalar in targetScalars.dropFirst() {
                current = input.index(after: current)
                guard current < input.endIndex else { return nil }
                if input[current] != scalar {
                    start = input.index(after: firstCharIx)
                    continue MATCH
                }
            }
            // full match; current is at position of last matching character
            return firstCharIx
        }
    }
    public func consumeTo(_ c: UnicodeScalar) -> String {
-        let offset = nextIndexOf(c)
+        guard let targetIx = nextIndexOf(c) else {
        if (offset != -1) {
            let consumed = cacheString(pos, offset)
            pos += offset
            return consumed
        } else {
            return consumeToEnd()
        }
        let consumed = cacheString(pos, targetIx)
        pos = targetIx
        return consumed
    }
    public func consumeTo(_ seq: String) -> String {
-        let offset = nextIndexOf(seq)
+        guard let targetIx = nextIndexOf(seq) else {
        if (offset != -1) {
            let consumed = cacheString(pos, offset)
            pos += offset
            return consumed
        } else {
            return consumeToEnd()
        }
        let consumed = cacheString(pos, targetIx)
        pos = targetIx
        return consumed
    }
    public func consumeToAny(_ chars: UnicodeScalar...) -> String {
        return consumeToAny(chars)
    }
    public func consumeToAny(_ chars: [UnicodeScalar]) -> String {
-        let start: Int = pos
+        let start = pos
-        let remaining: Int = length
+        while pos < input.endIndex {
-        let val = input
+            if chars.contains(input[pos]) {
-        OUTER: while (pos < remaining) {
+                break
-			if chars.contains(val[pos]) {
+            }
-				break OUTER
+            pos = input.index(after: pos)
 			}
 //            for c in chars {
 //                if (val[pos] == c){
 //                    break OUTER
 //                }
 //            }
            pos += 1
        }
-
+        return cacheString(start, pos)
        return pos > start ? cacheString(start, pos-start) : CharacterReader.empty
    }
    public func consumeToAnySorted(_ chars: UnicodeScalar...) -> String {
-        return consumeToAnySorted(chars)
+        return consumeToAny(chars)
    }
    public func consumeToAnySorted(_ chars: [UnicodeScalar]) -> String {
-        let start = pos
+        return consumeToAny(chars)
        let remaining = length
        let val = input
        while (pos < remaining) {
            if chars.contains(val[pos]) {
                break
            }
            pos += 1
        }
        return pos > start ? cacheString(start, pos-start) : CharacterReader.empty
    }
    static let dataTerminators: [UnicodeScalar] = [.Ampersand, .LessThan, TokeniserStateVars.nullScalr]
    // read to &, <, or null
    public func consumeData() -> String {
-        // &, <, null
+        return consumeToAny(CharacterReader.dataTerminators)
        let start = pos
        let remaining = length
        let val = input
        while (pos < remaining) {
            let c: UnicodeScalar = val[pos]
            if (c == UnicodeScalar.Ampersand || c ==  UnicodeScalar.LessThan || c ==  TokeniserStateVars.nullScalr) {
                break
            }
            pos += 1
        }
        return pos > start ? cacheString(start, pos-start) : CharacterReader.empty
    }
    static let tagNameTerminators: [UnicodeScalar] = [.BackslashT, .BackslashN, .BackslashR, .BackslashF, .Space, .Slash, .GreaterThan, TokeniserStateVars.nullScalr]
    // read to '\t', '\n', '\r', '\f', ' ', '/', '>', or nullChar
    public func consumeTagName() -> String {
-        // '\t', '\n', '\r', '\f', ' ', '/', '>', nullChar
+        return consumeToAny(CharacterReader.tagNameTerminators)
        let start = pos
        let remaining = length
        let val = input
        while (pos < remaining) {
            let c: UnicodeScalar = val[pos]
            if (c == UnicodeScalar.BackslashT || c ==  UnicodeScalar.BackslashN || c ==  UnicodeScalar.BackslashR || c ==  UnicodeScalar.BackslashF || c ==  UnicodeScalar.Space || c ==  UnicodeScalar.Slash || c ==  UnicodeScalar.GreaterThan || c ==  TokeniserStateVars.nullScalr) {
                break
            }
            pos += 1
        }
        return pos > start ? cacheString(start, pos-start) : CharacterReader.empty
    }
    public func consumeToEnd() -> String {
-        let data = cacheString(pos, length-pos)
+        let consumed = cacheString(pos, input.endIndex)
-        pos = length
+        pos = input.endIndex
-        return data
+        return consumed
    }
    public func consumeLetterSequence() -> String {
        let start = pos
-        while (pos < length) {
+        while pos < input.endIndex {
-            let c: UnicodeScalar = input[pos]
+            let c = input[pos]
            if ((c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)) {
-                pos += 1
+                pos = input.index(after: pos)
            } else {
                break
            }
        }
-        return cacheString(start, pos - start)
+        return cacheString(start, pos)
    }
    public func consumeLetterThenDigitSequence() -> String {
        let start = pos
-        while (pos < length) {
+        while pos < input.endIndex {
            let c = input[pos]
            if ((c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)) {
-                pos += 1
+                pos = input.index(after: pos)
            } else {
                break
            }
        }
-        while (!isEmpty()) {
+        while pos < input.endIndex {
            let c = input[pos]
            if (c >= "0" && c <= "9") {
-                pos += 1
+                pos = input.index(after: pos)
            } else {
                break
            }
        }
-
+        return cacheString(start, pos)
        return cacheString(start, pos - start)
    }
    public func consumeHexSequence() -> String {
        let start = pos
-        while (pos < length) {
+        while pos < input.endIndex {
            let c = input[pos]
            if ((c >= "0" && c <= "9") || (c >= "A" && c <= "F") || (c >= "a" && c <= "f")) {
-                pos+=1
+                pos = input.index(after: pos)
            } else {
                break
            }
        }
-        return cacheString(start, pos - start)
+        return cacheString(start, pos)
    }
    public func consumeDigitSequence() -> String {
        let start = pos
-        while (pos < length) {
+        while pos < input.endIndex {
            let c = input[pos]
            if (c >= "0" && c <= "9") {
-                pos+=1
+                pos = input.index(after: pos)
            } else {
                break
            }
        }
-        return cacheString(start, pos - start)
+        return cacheString(start, pos)
    }
    public func matches(_ c: UnicodeScalar) -> Bool {
@ -284,180 +236,85 @@ public final class CharacterReader {
    }
-    public func matches(_ seq: String) -> Bool {
+    public func matches(_ seq: String, ignoreCase: Bool = false, consume: Bool = false) -> Bool {
-        let scanLength = seq.unicodeScalars.count
+        var current = pos
-        if (scanLength > length - pos) {
+        let scalars = seq.unicodeScalars
-            return false
+        for scalar in scalars {
-        }
+            guard current < input.endIndex else { return false }
-
+            if ignoreCase {
-        for offset in 0..<scanLength {
+                guard input[current].uppercase == scalar.uppercase else { return false }
-            if (seq.unicodeScalar(offset) != input[pos+offset]) {
+            } else {
-                return false
+                guard input[current] == scalar else { return false }
            }
            current = input.index(after: current)
        }
        if consume {
            pos = current
        }
        return true
    }
    public func matchesIgnoreCase(_ seq: String ) -> Bool {
-
+        return matches(seq, ignoreCase: true)
        let scanLength = seq.unicodeScalars.count
 		if(scanLength == 0) {
 			return false
 		}
        if (scanLength > length - pos) {
            return false
        }
        for offset in 0..<scanLength {
            let upScan: UnicodeScalar = seq.unicodeScalar(offset).uppercase
            let upTarget: UnicodeScalar = input[pos+offset].uppercase
            if (upScan != upTarget) {
                return false
            }
        }
        return true
    }
    public func matchesAny(_ seq: UnicodeScalar...) -> Bool {
-        if (isEmpty()) {
+        return matchesAny(seq)
-            return false
+    }
-        }
+    
-
+    public func matchesAny(_ seq: [UnicodeScalar]) -> Bool {
-        let c: UnicodeScalar = input[pos]
+        guard pos < input.endIndex else { return false }
-        for seek in seq {
+        return seq.contains(input[pos])
            if (seek == c) {
                return true
            }
        }
        return false
    }
    public func matchesAnySorted(_ seq: [UnicodeScalar]) -> Bool {
-        return !isEmpty() && seq.contains(input[pos])
+        return matchesAny(seq)
    }
    public func matchesLetter() -> Bool {
-        if (isEmpty()) {
+        guard pos < input.endIndex else { return false }
-            return false
+        let c = input[pos]
        }
        let c  = input[pos]
        return (c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)
    }
    public func matchesDigit() -> Bool {
-        if (isEmpty()) {
+        guard pos < input.endIndex else { return false }
-            return false
+        let c = input[pos]
-        }
+        return c >= "0" && c <= "9"
        let c  = input[pos]
        return (c >= "0" && c <= "9")
    }
    @discardableResult
    public func matchConsume(_ seq: String) -> Bool {
-        if (matches(seq)) {
+        return matches(seq, consume: true)
            pos += seq.unicodeScalars.count
            return true
        } else {
            return false
        }
    }
    @discardableResult
    public func matchConsumeIgnoreCase(_ seq: String) -> Bool {
-        if (matchesIgnoreCase(seq)) {
+        return matches(seq, ignoreCase: true, consume: true)
            pos += seq.unicodeScalars.count
            return true
        } else {
            return false
        }
    }
    public func containsIgnoreCase(_ seq: String ) -> Bool {
        // used to check presence of </title>, </style>. only finds consistent case.
        let loScan = seq.lowercased(with: Locale(identifier: "en"))
        let hiScan = seq.uppercased(with: Locale(identifier: "eng"))
-        return (nextIndexOf(loScan) > -1) || (nextIndexOf(hiScan) > -1)
+        return nextIndexOf(loScan) != nil || nextIndexOf(hiScan) != nil
    }
    public func toString() -> String {
-        return String(input[pos..<length])
+        return String(input[pos...])
 		//return String.unicodescalars(Array(input[pos..<length]))
        //return  input.string(pos, length - pos)
    }
    /**
-     * Caches short strings, as a flywheel pattern, to reduce GC load. Just for this doc, to prevent leaks.
+     * Originally intended as a caching mechanism for strings, but caching doesn't
-     * <p />
+     * seem to improve performance. Now just a stub.
     * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
     * That saves both having to create objects as hash keys, and running through the entry list, at the expense of
     * some more duplicates.
     */
-    private func cacheString(_ start: Int, _ count: Int) -> String {
+    private func cacheString(_ start: String.UnicodeScalarView.Index, _ end: String.UnicodeScalarView.Index) -> String {
-        return String(input[start..<start+count])
+        return String(input[start..<end])
 // Too Slow
 //        var cache: [String?] = stringCache
 //
 //        // limit (no cache):
 //        if (count > CharacterReader.maxCacheLen) {
 //            return String(val[start..<start+count].flatMap { Character($0) })
 //        }
 //
 //        // calculate hash:
 //        var hash: Int = 0
 //        var offset = start
 //        for _ in 0..<count {
 //            let ch = val[offset].value
 //            hash = Int.addWithOverflow(Int.multiplyWithOverflow(31, hash).0, Int(ch)).0
 //            offset+=1
 //        }
 //
 //        // get from cache
 //		hash = abs(hash)
 //		let i = hash % cache.count
 //        let index: Int = abs(i) //Int(hash & Int(cache.count) - 1)
 //        var cached = cache[index]
 //
 //        if (cached == nil) { // miss, add
 //			cached = String(val[start..<start+count].flatMap { Character($0) })
 //            //cached = val.string(start, count)
 //            cache[Int(index)] = cached
 //        } else { // hashcode hit, check equality
 //            if (rangeEquals(start, count, cached!)) { // hit
 //                return cached!
 //            } else { // hashcode conflict
 //				cached = String(val[start..<start+count].flatMap { Character($0) })
 //                //cached = val.string(start, count)
 //                cache[index] = cached // update the cache, as recently used strings are more likely to show up again
 //            }
 //        }
 //        return cached!
    }
 //    /**
 //     * Check if the value of the provided range equals the string.
 //     */
 //    public func rangeEquals(_ start: Int, _ count: Int, _ cached: String) -> Bool {
 //        if (count == cached.unicodeScalars.count) {
 //            var count = count
 //            let one = input
 //            var i = start
 //            var j = 0
 //            while (count != 0) {
 //                count -= 1
 //                if (one[i] != cached.unicodeScalar(j) ) {
 //                    return false
 //                }
 //                j += 1
 //                i += 1
 //            }
 //            return true
 //        }
 //        return false
 //    }
 }
 extension CharacterReader: CustomDebugStringConvertible {
    public var debugDescription: String {
-        return  self.toString()
+        return  toString()
    }
 }
--- a/Tests/SwiftSoupTests/CharacterReaderTest.swift
+++ b/Tests/SwiftSoupTests/CharacterReaderTest.swift
@ -53,10 +53,12 @@ class CharacterReaderTest: XCTestCase {
        XCTAssertEqual("e", r.consume())
        XCTAssertTrue(r.isEmpty())
-        XCTAssertEqual(CharacterReader.EOF, r.consume())
+        // Indexes beyond the end are not allowed in native indexing
-        r.unconsume()
+        //
-        XCTAssertTrue(r.isEmpty())
+        // XCTAssertEqual(CharacterReader.EOF, r.consume())
-        XCTAssertEqual(CharacterReader.EOF, r.current())
+        // r.unconsume()
        // XCTAssertTrue(r.isEmpty())
        // XCTAssertEqual(CharacterReader.EOF, r.current())
    }
    func testMark() {
@ -82,31 +84,31 @@ class CharacterReaderTest: XCTestCase {
        let input = "blah blah"
        let r = CharacterReader(input)
-        XCTAssertEqual(-1, r.nextIndexOf("x"))
+        XCTAssertEqual(nil, r.nextIndexOf("x"))
-        XCTAssertEqual(3, r.nextIndexOf("h"))
+        XCTAssertEqual(input.index(input.startIndex, offsetBy: 3), r.nextIndexOf("h"))
        let pull = r.consumeTo("h")
        XCTAssertEqual("bla", pull)
        XCTAssertEqual("h", r.consume())
-        XCTAssertEqual(2, r.nextIndexOf("l"))
+        XCTAssertEqual(input.index(input.startIndex, offsetBy: 6), r.nextIndexOf("l"))
        XCTAssertEqual(" blah", r.consumeToEnd())
-        XCTAssertEqual(-1, r.nextIndexOf("x"))
+        XCTAssertEqual(nil, r.nextIndexOf("x"))
    }
    func testNextIndexOfString() {
        let input = "One Two something Two Three Four"
        let r = CharacterReader(input)
-        XCTAssertEqual(-1, r.nextIndexOf("Foo"))
+        XCTAssertEqual(nil, r.nextIndexOf("Foo"))
-        XCTAssertEqual(4, r.nextIndexOf("Two"))
+        XCTAssertEqual(input.index(input.startIndex, offsetBy: 4), r.nextIndexOf("Two"))
        XCTAssertEqual("One Two ", r.consumeTo("something"))
-        XCTAssertEqual(10, r.nextIndexOf("Two"))
+        XCTAssertEqual(input.index(input.startIndex, offsetBy: 18), r.nextIndexOf("Two"))
        XCTAssertEqual("something Two Three Four", r.consumeToEnd())
-        XCTAssertEqual(-1, r.nextIndexOf("Two"))
+        XCTAssertEqual(nil, r.nextIndexOf("Two"))
    }
    func testNextIndexOfUnmatched() {
        let r = CharacterReader("<[[one]]")
-        XCTAssertEqual(-1, r.nextIndexOf("]]>"))
+        XCTAssertEqual(nil, r.nextIndexOf("]]>"))
    }
    func testConsumeToChar() {