SwiftSoup/Sources/parser/CharacterReader.swift

//
//  CharacterReader.swift
//  SwiftSoup
//
//  Created by Nabil Chatbi on 10/10/16.
//  Copyright © 2016 Nabil Chatbi.. All rights reserved.
//

import Foundation

/**
 CharacterReader consumes tokens off a string. It replaces the old TokenQueue.

 All positions, offsets and lengths are counted in Unicode scalars of the input
 (`length` is `input.unicodeScalars.count`), not in `Character`s.
 */
public final class CharacterReader
{
    // Sentinel scalar returned by current()/consume() at or past the end of input.
    public static let EOF : UnicodeScalar = "\u{FFFF}"//65535
    // Runs longer than this many scalars bypass the string cache.
    private static let maxCacheLen : Int = 12
    private let input : String
    private let length : Int
    private var pos : Int = 0
    private var mark : Int = 0
    // Holds reused strings in this doc. Must be `var`: arrays are value types, so
    // entries written through a local copy of a `let` array would never persist.
    private var stringCache : [String?]

    /**
     * Creates a reader positioned at the start of `input`.
     * - parameter input: the text to read
     */
    public init(_ input: String)
    {
        self.input = input
        self.length = input.unicodeScalars.count
        self.stringCache = Array(repeating: nil, count: 512)
    }

    /** Returns the current read position (a scalar offset into the input). */
    public func getPos() -> Int {
        return pos
    }

    /** Returns true when the read position is at or past the end of the input. */
    public func isEmpty() -> Bool {
        return pos >= length
    }

    /** Returns the scalar at the current position without consuming it, or `EOF` at the end. */
    public func current() -> UnicodeScalar {
        if pos >= length {
            return CharacterReader.EOF
        }
        return input.unicodeScalar(pos)
    }

    /**
     * Returns the scalar at the current position (or `EOF` at the end) and advances by one.
     * The position is advanced even when `EOF` is returned, so it may move past `length`.
     */
    @discardableResult
    public func consume() -> UnicodeScalar {
        let val = current()
        pos += 1
        return val
    }

    /** Moves the read position back by one. No lower bound is enforced. */
    public func unconsume() {
        pos -= 1
    }

    /** Moves the read position forward by one. No upper bound is enforced. */
    public func advance() {
        pos += 1
    }

    /** Remembers the current position so that `rewindToMark()` can return to it. */
    public func markPos() {
        mark = pos
    }

    /** Resets the read position to the one saved by the last `markPos()` (0 if never marked). */
    public func rewindToMark() {
        pos = mark
    }

    /**
     * Consumes one scalar and returns it as a string.
     *
     * Uses the same scalar-based `string(_:_:)` helper as the rest of the class so that
     * the returned text stays in step with `pos`, which counts scalars.
     * NOTE(review): behaviour at end of input depends on that helper — confirm.
     */
    public func consumeAsString() -> String {
        let p = pos
        pos += 1
        return input.string(p, 1)
    }

    /**
     * Returns the number of characters between the current position and the next instance of the input char
     * - parameter c: scan target
     * - returns: offset between current position and next instance of target. -1 if not found.
     */
    public func nextIndexOf(_ c : UnicodeScalar) -> Int {
        // A while loop (rather than pos..<length) also copes with pos > length.
        var i = pos
        while i < length {
            if input.unicodeScalar(i) == c {
                return i - pos
            }
            i += 1
        }
        return -1
    }

    /**
     * Returns the number of characters between the current position and the next instance of the input sequence
     *
     * - parameter seq: scan target
     * - returns: offset between current position and next instance of target. -1 if not found
     *   or if `seq` is empty.
     */
    public func nextIndexOf(_ seq: String) -> Int {
        let needle = Array(seq.unicodeScalars)
        guard let startChar = needle.first else { return -1 }

        var offset = pos
        while offset < length {
            // scan to the next instance of the first scalar
            while offset < length && input.unicodeScalar(offset) != startChar {
                offset += 1
            }
            let last = offset + needle.count
            if offset >= length || last > length {
                // not enough input left for a full match here or anywhere later
                return -1
            }
            // compare the remainder of the sequence
            var i = offset + 1
            var j = 1
            while i < last && needle[j] == input.unicodeScalar(i) {
                i += 1
                j += 1
            }
            if i == last {
                return offset - pos
            }
            offset += 1
        }
        return -1
    }

    /**
     * Consumes and returns everything up to (but excluding) the next `c`.
     * If `c` does not occur, the rest of the input is consumed.
     */
    public func consumeTo(_ c : UnicodeScalar) -> String {
        return consumeToOffset(nextIndexOf(c))
    }

    /**
     * Consumes and returns everything up to (but excluding) the next occurrence of `seq`.
     * If `seq` does not occur (or is empty), the rest of the input is consumed.
     */
    public func consumeTo(_ seq: String) -> String {
        return consumeToOffset(nextIndexOf(seq))
    }

    /** Variadic convenience for `consumeToAny(_: [UnicodeScalar])`. */
    public func consumeToAny(_ chars: UnicodeScalar...)->String {
        return consumeToAny(chars)
    }

    /**
     * Consumes and returns everything up to (but excluding) the first scalar contained in `chars`,
     * or the rest of the input if none occurs. Returns "" if nothing was consumed.
     */
    public func consumeToAny(_ chars: [UnicodeScalar])->String {
        let start = pos
        advanceWhile { !chars.contains($0) }
        return pos > start ? cacheString(start, pos - start) : ""
    }

    /** Variadic convenience for `consumeToAnySorted(_: [UnicodeScalar])`. */
    public func consumeToAnySorted(_ chars: UnicodeScalar...)->String {
        return consumeToAnySorted(chars)
    }

    /**
     * Like `consumeToAny`, but looks scalars up with `binarySearch`.
     * Presumably `chars` must be sorted for the search to be valid — confirm against the helper.
     */
    public func consumeToAnySorted(_ chars: [UnicodeScalar])->String {
        let start = pos
        advanceWhile { chars.binarySearch(chars, $0) < 0 }
        return pos > start ? cacheString(start, pos - start) : ""
    }

    /**
     * Consumes and returns everything up to (but excluding) the next `&`, `<` or null scalar.
     * Returns "" if nothing was consumed.
     */
    public func consumeData() -> String {
        let start = pos
        advanceWhile { (c: UnicodeScalar) -> Bool in
            return !(c == "&" || c == "<" || c == TokeniserStateVars.nullScalr)
        }
        return pos > start ? cacheString(start, pos - start) : ""
    }

    /**
     * Consumes and returns everything up to (but excluding) the next tag-name terminator:
     * '\t', '\n', '\r', '\f', ' ', '/', '>' or the null scalar. Returns "" if nothing was consumed.
     */
    public func consumeTagName()-> String {
        let start = pos
        advanceWhile { (c: UnicodeScalar) -> Bool in
            return !(c == "\t" || c ==  "\n" || c ==  "\r" || c ==  UnicodeScalar.BackslashF || c ==  " " || c ==  "/" || c ==  ">" || c ==  TokeniserStateVars.nullScalr)
        }
        return pos > start ? cacheString(start, pos - start) : ""
    }

    /** Consumes and returns the rest of the input, leaving the position at the end. */
    public func consumeToEnd()-> String {
        let data = cacheString(pos, length - pos)
        pos = length
        return data
    }

    /** Consumes and returns the run of letters at the current position (possibly empty). */
    public func consumeLetterSequence()-> String {
        let start = pos
        advanceWhile(CharacterReader.isLetter)
        return cacheString(start, pos - start)
    }

    /** Consumes and returns a run of letters followed by a run of ASCII digits (either may be empty). */
    public func consumeLetterThenDigitSequence()-> String {
        let start = pos
        advanceWhile(CharacterReader.isLetter)
        advanceWhile(CharacterReader.isDigit)
        return cacheString(start, pos - start)
    }

    /** Consumes and returns the run of hexadecimal digits (0-9, A-F, a-f) at the current position. */
    public func consumeHexSequence()-> String {
        let start = pos
        advanceWhile(CharacterReader.isHexDigit)
        return cacheString(start, pos - start)
    }

    /** Consumes and returns the run of ASCII digits (0-9) at the current position. */
    public func consumeDigitSequence() -> String {
        let start = pos
        advanceWhile(CharacterReader.isDigit)
        return cacheString(start, pos - start)
    }

    /** Returns true if the scalar at the current position equals `c`. */
    public func matches(_ c: UnicodeScalar) -> Bool {
        return !isEmpty() && input.unicodeScalar(pos) == c
    }

    /**
     * Returns true if the input at the current position starts with `seq` (case-sensitive).
     * An empty `seq` matches whenever the position is not past the end.
     */
    public func matches(_ seq: String)-> Bool {
        let needle = Array(seq.unicodeScalars)
        if needle.count > length - pos {
            return false
        }
        for (offset, scalar) in needle.enumerated() {
            if scalar != input.unicodeScalar(pos + offset) {
                return false
            }
        }
        return true
    }

    /**
     * Returns true if the input at the current position starts with `seq`, comparing the
     * `uppercase` form of each scalar. An empty `seq` never matches.
     */
    public func matchesIgnoreCase(_ seq: String )->Bool {
        let needle = Array(seq.unicodeScalars)
        if needle.isEmpty || needle.count > length - pos {
            return false
        }
        for (offset, scalar) in needle.enumerated() {
            if scalar.uppercase != input.unicodeScalar(pos + offset).uppercase {
                return false
            }
        }
        return true
    }

    /** Returns true if the scalar at the current position equals any of `seq`. */
    public func matchesAny(_ seq: UnicodeScalar...)->Bool {
        if isEmpty() {
            return false
        }
        return seq.contains(input.unicodeScalar(pos))
    }

    /**
     * Returns true if `binarySearch` finds the scalar at the current position in `seq`.
     * Presumably `seq` must be sorted — confirm against the helper.
     */
    public func matchesAnySorted(_ seq : [UnicodeScalar]) -> Bool {
        return !isEmpty() && seq.binarySearch(seq, input.unicodeScalar(pos)) >= 0
    }

    /** Returns true if the scalar at the current position is a letter. */
    public func matchesLetter()-> Bool {
        return !isEmpty() && CharacterReader.isLetter(input.unicodeScalar(pos))
    }

    /** Returns true if the scalar at the current position is an ASCII digit (0-9). */
    public func matchesDigit()->Bool {
        return !isEmpty() && CharacterReader.isDigit(input.unicodeScalar(pos))
    }

    /**
     * If the input at the current position starts with `seq`, consumes it and returns true;
     * otherwise leaves the position unchanged and returns false.
     */
    @discardableResult
    public func matchConsume(_ seq: String)->Bool {
        guard matches(seq) else { return false }
        pos += seq.unicodeScalars.count
        return true
    }

    /** Case-insensitive variant of `matchConsume`, based on `matchesIgnoreCase`. */
    @discardableResult
    public func matchConsumeIgnoreCase(_ seq: String)->Bool {
        guard matchesIgnoreCase(seq) else { return false }
        pos += seq.unicodeScalars.count
        return true
    }

    /**
     * Returns true if the remaining input contains `seq` in all-lowercase or all-uppercase form.
     * Used to check presence of </title>, </style>; only finds consistent case.
     */
    public func containsIgnoreCase(_ seq: String )->Bool {
        // one fixed locale for both conversions so results do not depend on the user's locale
        let locale = Locale(identifier: "en")
        return nextIndexOf(seq.lowercased(with: locale)) > -1
            || nextIndexOf(seq.uppercased(with: locale)) > -1
    }

    /** Returns the unconsumed remainder of the input without changing the position. */
    public func toString()->String {
        return input.string(pos, length - pos)
    }

    /**
     * Check if the value of the provided range equals the string.
     * - parameter start: scalar offset of the range in the input
     * - parameter count: number of scalars in the range
     * - parameter cached: string to compare against
     * - returns: true if `cached` has exactly `count` scalars and they equal the input range
     */
    public func rangeEquals(_ start: Int, _ count: Int, _ cached: String) -> Bool {
        let other = Array(cached.unicodeScalars)
        guard count == other.count else { return false }
        for j in 0..<count {
            if input.unicodeScalar(start + j) != other[j] {
                return false
            }
        }
        return true
    }

    // MARK: - Private helpers

    /** Advances `pos` while it is inside the input and `accept` approves the scalar at `pos`. */
    private func advanceWhile(_ accept: (UnicodeScalar) -> Bool) {
        while pos < length && accept(input.unicodeScalar(pos)) {
            pos += 1
        }
    }

    /** Consumes `offset` scalars from the current position, or the whole remainder when `offset` is -1. */
    private func consumeToOffset(_ offset: Int) -> String {
        guard offset != -1 else { return consumeToEnd() }
        let consumed = cacheString(pos, offset)
        pos += offset
        return consumed
    }

    /** True for A-Z, a-z, or any member of `CharacterSet.letters`. */
    private static func isLetter(_ c: UnicodeScalar) -> Bool {
        return (c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)
    }

    /** True for the ASCII digits 0-9. */
    private static func isDigit(_ c: UnicodeScalar) -> Bool {
        return c >= "0" && c <= "9"
    }

    /** True for 0-9, A-F and a-f. */
    private static func isHexDigit(_ c: UnicodeScalar) -> Bool {
        return (c >= "0" && c <= "9") || (c >= "A" && c <= "F") || (c >= "a" && c <= "f")
    }

    /**
     * Caches short strings, as a flywheel pattern, to reduce repeated allocations.
     * The cache belongs to this reader only.
     *
     * Simplistic: on a hash collision the slot is simply overwritten with the new string,
     * instead of keeping a full map with entry lists. That avoids creating key objects and
     * walking entry lists, at the expense of some more duplicates.
     *
     * - parameter start: scalar offset of the range in the input
     * - parameter count: number of scalars in the range
     * - returns: the text of the range; "" when `count` is not positive
     */
    private func cacheString(_ start: Int, _ count: Int) -> String {
        // Nothing to extract. Also covers a negative count, which can occur once
        // consume() has pushed pos past length.
        guard count > 0 else { return "" }

        // limit (no cache):
        if count > CharacterReader.maxCacheLen {
            return input.string(start, count)
        }

        // calculate hash over the scalars of the range itself; wrapping arithmetic
        // because long runs of wide scalars can exceed Int.max
        var hash = 0
        for offset in start..<(start + count) {
            hash = 31 &* hash &+ Int(input.unicodeScalar(offset).value)
        }

        // Map through the unsigned bit pattern: abs() would trap on Int.min.
        let index = Int(UInt(bitPattern: hash) % UInt(stringCache.count))

        if let cached = stringCache[index], rangeEquals(start, count, cached) {
            return cached // hit
        }

        // miss or hashcode conflict: store the new string, as recently used strings
        // are more likely to show up again
        let fresh = input.string(start, count)
        stringCache[index] = fresh
        return fresh
    }
}