Use binary search when handling HTML named entities

This commit is contained in:
Garth Snyder 2019-03-03 19:03:34 -08:00
parent 2cfe21cb27
commit 904ebe27fa
5 changed files with 181 additions and 161 deletions

View File

@ -0,0 +1,95 @@
//
// BinarySearch.swift
// SwiftSoup-iOS
//
// Created by Garth Snyder on 2/28/19.
// Copyright © 2019 Nabil Chatbi. All rights reserved.
//
// Adapted from https://stackoverflow.com/questions/31904396/swift-binary-search-for-standard-array
//
import Foundation
extension Collection {
    /// Generalized binary search (partition point) for ordered Collections.
    ///
    /// Behavior is undefined if the collection is not partitioned with respect
    /// to `predicate`, i.e. if an element for which the predicate is false
    /// precedes one for which it is true.
    ///
    /// This is only O(logN) for RandomAccessCollections; Collections in
    /// general may implement offsetting of indexes as an O(K) operation. (E.g.,
    /// Strings are like this).
    ///
    /// - Note: If you are using this for searching only (not insertion), you
    ///   must always test the element at the returned index to ensure that
    ///   it's a genuine match. If the element is not present in the collection,
    ///   you will still get an index back that represents the location
    ///   where it should be inserted. That index may be `endIndex`, which must
    ///   not be subscripted, so check for it first.
    ///
    /// - Parameter predicate: Reports the ordering of a given Element relative
    ///   to the desired Element. Typically, this is `<`. The predicate may
    ///   throw; any error it throws is propagated to the caller.
    ///
    /// - Returns: Index N such that the predicate is true for all elements up to
    ///   but not including N, and is false for all elements N and beyond.
    func binarySearch(predicate: (Element) throws -> Bool) rethrows -> Index {
        var low = startIndex
        var high = endIndex
        // Invariant: predicate is true for everything before `low` and false
        // for everything at or after `high`.
        while low != high {
            let mid = index(low, offsetBy: distance(from: low, to: high) / 2)
            if try predicate(self[mid]) {
                low = index(after: mid)
            } else {
                high = mid
            }
        }
        return low
    }

    /// Binary search lookup for ordered Collections using a KeyPath
    /// relative to Element.
    ///
    /// Behavior is undefined if the collection is not sorted in ascending
    /// order of the value at `keyPath`.
    ///
    /// This is only O(logN) for RandomAccessCollections; Collections in
    /// general may implement offsetting of indexes as an O(K) operation. (E.g.,
    /// Strings are like this).
    ///
    /// - Parameters:
    ///   - value: The key value to look for.
    ///   - keyPath: KeyPath that extracts the Element value on which
    ///     the Collection is presorted. Ordering is presumed to be `<`,
    ///     however that is defined for the type.
    ///
    /// - Returns: The index of the first matching element, or nil if not
    ///   found. If the return value is non-nil, it is always a valid index.
    func indexOfElement<T>(withValue value: T, atKeyPath keyPath: KeyPath<Element, T>) -> Index? where T: Comparable {
        let candidate = binarySearch { $0[keyPath: keyPath] < value }
        // The partition point is only an insertion position; confirm that it
        // is inside the collection and really holds the requested value.
        guard candidate < endIndex, self[candidate][keyPath: keyPath] == value else {
            return nil
        }
        return candidate
    }

    /// Binary search lookup that returns the matching element itself.
    ///
    /// Same preconditions as `indexOfElement(withValue:atKeyPath:)`.
    ///
    /// - Parameters:
    ///   - value: The key value to look for.
    ///   - keyPath: KeyPath that extracts the value the Collection is sorted on.
    ///
    /// - Returns: The first element whose value at `keyPath` equals `value`,
    ///   or nil if there is none.
    func element<T>(withValue value: T, atKeyPath keyPath: KeyPath<Element, T>) -> Element? where T: Comparable {
        return indexOfElement(withValue: value, atKeyPath: keyPath).map { self[$0] }
    }

    /// Binary search lookup that returns every element matching a key value.
    ///
    /// The first match is located by binary search; the end of the run of
    /// equal keys is then found by stepping forward one element at a time, so
    /// the cost grows with the number of matches.
    ///
    /// Same preconditions as `indexOfElement(withValue:atKeyPath:)`.
    ///
    /// - Parameters:
    ///   - value: The key value to look for.
    ///   - keyPath: KeyPath that extracts the value the Collection is sorted on.
    ///
    /// - Returns: All consecutive elements whose value at `keyPath` equals
    ///   `value`, in collection order; empty if there is no match.
    func elements<T>(withValue value: T, atKeyPath keyPath: KeyPath<Element, T>) -> [Element] where T: Comparable {
        guard let start = indexOfElement(withValue: value, atKeyPath: keyPath) else { return [] }
        var end = index(after: start)
        while end < endIndex && self[end][keyPath: keyPath] == value {
            end = index(after: end)
        }
        return Array(self[start..<end])
    }
}

View File

@ -18,7 +18,7 @@ public class Entities {
private static let emptyName = ""
private static let codepointRadix: Int = 36
public struct EscapeMode: Equatable {
public class EscapeMode: Equatable {
/** Restricted entities suitable for XHTML output: lt, gt, amp, and quot only. */
public static let xhtml: EscapeMode = EscapeMode(string: Entities.xhtml, size: 4, id: 0)
@ -29,13 +29,19 @@ public class Entities {
fileprivate let value: Int
// table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
fileprivate var nameKeys: [String]
fileprivate var codeVals: [Int] // limitation is the few references with multiple characters; those go into multipoints.
struct NamedCodepoint {
let scalar: UnicodeScalar
let name: String
}
// Array of named references, sorted by name for binary search. built by BuildEntities.
// The few entities that map to a multi-codepoint sequence go into multipoints.
fileprivate var entitiesByName: [NamedCodepoint] = []
// table of codepoints to named entities.
fileprivate var codeKeys: [Int] // we don' support multicodepoints to single named value currently
fileprivate var nameVals: [String]
// Array of entities in first-codepoint order. We don't currently support
// mapping multiple codepoints to a single named value. Lazy because this index
// is used only when generating HTML text.
fileprivate lazy var entitiesByCodepoint = entitiesByName.sorted() { a, b in a.scalar < b.scalar }
public static func == (left: EscapeMode, right: EscapeMode) -> Bool {
return left.value == right.value
@ -46,23 +52,14 @@ public class Entities {
}
private static let codeDelims: [UnicodeScalar] = [",", ";"]
init(string: String, size: Int, id: Int) {
nameKeys = [String](repeating: "", count: size)
codeVals = [Int](repeating: 0, count: size)
codeKeys = [Int](repeating: 0, count: size)
nameVals = [String](repeating: "", count: size)
value = id
//Load()
var i = 0
value = id
let reader: CharacterReader = CharacterReader(string)
while (!reader.isEmpty()) {
// NotNestedLessLess=10913,824;1887
entitiesByName.reserveCapacity(size)
while !reader.isEmpty() {
let name: String = reader.consumeTo("=")
reader.advance()
let cp1: Int = Int(reader.consumeToAny(EscapeMode.codeDelims), radix: codepointRadix) ?? 0
@ -75,100 +72,46 @@ public class Entities {
} else {
cp2 = empty
}
let index: Int = Int(reader.consumeTo("\n"), radix: codepointRadix) ?? 0
let _ = Int(reader.consumeTo("\n"), radix: codepointRadix) ?? 0
reader.advance()
nameKeys[i] = name
codeVals[i] = cp1
codeKeys[index] = cp1
nameVals[index] = name
entitiesByName.append(NamedCodepoint(scalar: UnicodeScalar(cp1)!, name: name))
if (cp2 != empty) {
var s = String()
s.append(Character(UnicodeScalar(cp1)!))
s.append(Character(UnicodeScalar(cp2)!))
multipoints[name] = s
}
i = i + 1
}
}
// init(string: String, size: Int, id: Int) {
// nameKeys = [String](repeating: "", count: size)
// codeVals = [Int](repeating: 0, count: size)
// codeKeys = [Int](repeating: 0, count: size)
// nameVals = [String](repeating: "", count: size)
// value = id
//
// let components = string.components(separatedBy: "\n")
//
// var i = 0
// for entry in components {
// let match = Entities.entityPattern.matcher(in: entry)
// if (match.find()) {
// let name = match.group(1)!
// let cp1 = Int(match.group(2)!, radix: codepointRadix)
// //let cp2 = Int(Int.parseInt(s: match.group(3), radix: codepointRadix))
// let cp2 = match.group(3) != nil ? Int(match.group(3)!, radix: codepointRadix) : empty
// let index = Int(match.group(4)!, radix: codepointRadix)
//
// nameKeys[i] = name
// codeVals[i] = cp1!
// codeKeys[index!] = cp1!
// nameVals[index!] = name
//
// if (cp2 != empty) {
// var s = String()
// s.append(Character(UnicodeScalar(cp1!)!))
// s.append(Character(UnicodeScalar(cp2!)!))
// multipoints[name] = s
// }
// i += 1
// }
// }
// }
public func codepointForName(_ name: String) -> Int {
// for s in nameKeys {
// if s == name {
// return codeVals[nameKeys.index(of: s)!]
// }
// }
guard let index = nameKeys.firstIndex(of: name) else {
return empty
}
return codeVals[index]
}
public func nameForCodepoint(_ codepoint: Int ) -> String {
//let ss = codeKeys.index(of: codepoint)
var index = -1
for s in codeKeys {
if s == codepoint {
index = codeKeys.firstIndex(of: codepoint)!
multipoints[name] = [UnicodeScalar(cp1)!, UnicodeScalar(cp2)!]
}
}
// Entities should start in name order, but better safe than sorry...
entitiesByName.sort() { a, b in a.name < b.name }
}
if (index >= 0) {
// the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
// (and binary search for same item with multi results is undefined
return (index < nameVals.count-1 && codeKeys[index+1] == codepoint) ?
nameVals[index+1] : nameVals[index]
// Only returns the first of potentially multiple codepoints
public func codepointForName(_ name: String) -> UnicodeScalar? {
let ix = entitiesByName.binarySearch { $0.name < name }
guard ix < entitiesByName.endIndex else { return nil }
let entity = entitiesByName[ix]
guard entity.name == name else { return nil }
return entity.scalar
}
// Search by first codepoint only
public func nameForCodepoint(_ codepoint: UnicodeScalar ) -> String? {
var ix = entitiesByCodepoint.binarySearch { $0.scalar < codepoint }
var matches: [String] = []
while ix < entitiesByCodepoint.endIndex && entitiesByCodepoint[ix].scalar == codepoint {
matches.append(entitiesByCodepoint[ix].name)
ix = entitiesByCodepoint.index(after: ix)
}
return emptyName
return matches.isEmpty ? nil : matches.sorted().last!
}
private func size() -> Int {
return nameKeys.count
return entitiesByName.count
}
}
private static var multipoints: Dictionary<String, String> = Dictionary<String, String>() // name -> multiple character references
private init() {
}
private static var multipoints: [String: [UnicodeScalar]] = [:] // name -> multiple character references
/**
* Check if the input is a known named entity
@ -176,7 +119,7 @@ public class Entities {
* @return true if a known named entity
*/
public static func isNamedEntity(_ name: String ) -> Bool {
return (EscapeMode.extended.codepointForName(name) != empty)
return (EscapeMode.extended.codepointForName(name) != nil)
}
/**
@ -186,17 +129,7 @@ public class Entities {
* @see #isNamedEntity(String)
*/
public static func isBaseNamedEntity(_ name: String) -> Bool {
return EscapeMode.base.codepointForName(name) != empty
}
/**
* Get the Character value of the named entity
* @param name named entity (e.g. "lt" or "amp")
* @return the Character value of the named entity (e.g. '{@literal <}' or '{@literal &}')
* @deprecated does not support characters outside the BMP or multiple character names
*/
public static func getCharacterByName(name: String) -> Character {
return Character.convertFromIntegerLiteral(value: EscapeMode.extended.codepointForName(name))
return EscapeMode.base.codepointForName(name) != nil
}
/**
@ -204,30 +137,20 @@ public class Entities {
* @param name entity (e.g. "lt" or "amp")
* @return the string value of the character(s) represented by this entity, or "" if not defined
*/
public static func getByName(name: String) -> String {
let val = multipoints[name]
if (val != nil) {return val!}
let codepoint = EscapeMode.extended.codepointForName(name)
if (codepoint != empty) {
return String(Character(UnicodeScalar(codepoint)!))
public static func getByName(name: String) -> String? {
if let scalars = codepointsForName(name) {
return String(String.UnicodeScalarView(scalars))
}
return emptyName
return nil
}
public static func codepointsForName(_ name: String, codepoints: inout [UnicodeScalar]) -> Int {
if let val: String = multipoints[name] {
codepoints[0] = val.unicodeScalar(0)
codepoints[1] = val.unicodeScalar(1)
return 2
public static func codepointsForName(_ name: String) -> [UnicodeScalar]? {
if let scalars = multipoints[name] {
return scalars
} else if let scalar = EscapeMode.extended.codepointForName(name) {
return [scalar]
}
let codepoint = EscapeMode.extended.codepointForName(name)
if (codepoint != empty) {
codepoints[0] = UnicodeScalar(codepoint)!
return 1
}
return 0
return nil
}
public static func escape(_ string: String, _ encode: String.Encoding = .utf8 ) -> String {
@ -326,9 +249,9 @@ public class Entities {
}
private static func appendEncoded(accum: StringBuilder, escapeMode: EscapeMode, codePoint: UnicodeScalar) {
let name = escapeMode.nameForCodepoint(Int(codePoint.value))
if (name != emptyName) // ok for identity check
{accum.append(UnicodeScalar.Ampersand).append(name).append(";")
if let name = escapeMode.nameForCodepoint(codePoint) {
// ok for identity check
accum.append(UnicodeScalar.Ampersand).append(name).append(";")
} else {
accum.append("&#x").append(String.toHexString(n: Int(codePoint.value)) ).append(";")
}

View File

@ -124,9 +124,6 @@ final class Tokeniser {
selfClosingFlagAcknowledged = true
}
private var codepointHolder: [UnicodeScalar] = [UnicodeScalar(0)!] // holder to not have to keep creating arrays
private var multipointHolder: [UnicodeScalar] = [UnicodeScalar(0)!, UnicodeScalar(0)!]
func consumeCharacterReference(_ additionalAllowedCharacter: UnicodeScalar?, _ inAttribute: Bool)throws->[UnicodeScalar]? {
if (reader.isEmpty()) {
return nil
@ -138,7 +135,6 @@ final class Tokeniser {
return nil
}
var codeRef: [UnicodeScalar] = codepointHolder
reader.markPos()
if (reader.matchConsume("#")) { // numbered
let isHexMode: Bool = reader.matchConsumeIgnoreCase("X")
@ -160,13 +156,11 @@ final class Tokeniser {
if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
characterReferenceError("character outside of valid range")
codeRef[0] = Tokeniser.replacementChar
return codeRef
return [Tokeniser.replacementChar]
} else {
// todo: implement number replacement table
// todo: check for extra illegal unicode points as parse errors
codeRef[0] = UnicodeScalar(charval)!
return codeRef
return [UnicodeScalar(charval)!]
}
} else { // named
// get as many letters as possible, and look for matching entities.
@ -190,16 +184,14 @@ final class Tokeniser {
if (!reader.matchConsume(";")) {
characterReferenceError("missing semicolon") // missing semi
}
let numChars: Int = Entities.codepointsForName(nameRef, codepoints: &multipointHolder)
if (numChars == 1) {
codeRef[0] = multipointHolder[0]
return codeRef
} else if (numChars == 2) {
return multipointHolder
} else {
try Validate.fail(msg: "Unexpected characters returned for \(nameRef) num: \(numChars)")
return multipointHolder
if let points = Entities.codepointsForName(nameRef) {
if points.count > 2 {
try Validate.fail(msg: "Unexpected characters returned for \(nameRef) num: \(points.count)")
}
return points
}
try Validate.fail(msg: "Entity name not found: \(nameRef)")
return []
}
}

View File

@ -96,6 +96,10 @@
BB57C2E5222CB0E3008933AA /* Wirecutter.html in Resources */ = {isa = PBXBuildFile; fileRef = BB57C2DF222CB0E2008933AA /* Wirecutter.html */; };
BB57C2E6222CB0E3008933AA /* GitHub.html in Resources */ = {isa = PBXBuildFile; fileRef = BB57C2E0222CB0E3008933AA /* GitHub.html */; };
BB57C2E7222CB0E3008933AA /* Amazon.html in Resources */ = {isa = PBXBuildFile; fileRef = BB57C2E1222CB0E3008933AA /* Amazon.html */; };
BB57C2EA222CCCB6008933AA /* BinarySearch.swift in Sources */ = {isa = PBXBuildFile; fileRef = BB57C2E9222CCCB5008933AA /* BinarySearch.swift */; };
BB57C2EB222CCCC3008933AA /* BinarySearch.swift in Sources */ = {isa = PBXBuildFile; fileRef = BB57C2E9222CCCB5008933AA /* BinarySearch.swift */; };
BB57C2EC222CCCC5008933AA /* BinarySearch.swift in Sources */ = {isa = PBXBuildFile; fileRef = BB57C2E9222CCCB5008933AA /* BinarySearch.swift */; };
BB57C2ED222CCCC6008933AA /* BinarySearch.swift in Sources */ = {isa = PBXBuildFile; fileRef = BB57C2E9222CCCB5008933AA /* BinarySearch.swift */; };
BD3B5B6B1FBED933001FDB3B /* Whitelist.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8CC2FD8C1DB12382002CB469 /* Whitelist.swift */; };
BD3B5B6C1FBED933001FDB3B /* Tokeniser.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8C19C82E1DB7E5D200B8FC22 /* Tokeniser.swift */; };
BD3B5B6D1FBED933001FDB3B /* Pattern.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8CE418541DAA568600240B42 /* Pattern.swift */; };
@ -378,6 +382,7 @@
BB57C2DF222CB0E2008933AA /* Wirecutter.html */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.html; path = Wirecutter.html; sourceTree = "<group>"; };
BB57C2E0222CB0E3008933AA /* GitHub.html */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.html; path = GitHub.html; sourceTree = "<group>"; };
BB57C2E1222CB0E3008933AA /* Amazon.html */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.html; path = Amazon.html; sourceTree = "<group>"; };
BB57C2E9222CCCB5008933AA /* BinarySearch.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BinarySearch.swift; sourceTree = "<group>"; };
BD36975B20135EBB00D8FAC6 /* SwiftSoup.podspec */ = {isa = PBXFileReference; lastKnownFileType = text; path = SwiftSoup.podspec; sourceTree = "<group>"; };
BD3B5BA91FBED933001FDB3B /* SwiftSoup.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = SwiftSoup.framework; sourceTree = BUILT_PRODUCTS_DIR; };
BD3B5BAA1FBED934001FDB3B /* InfoMac.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; name = InfoMac.plist; path = /Users/nabil/Documents/nabil/SwiftSoup/Sources/InfoMac.plist; sourceTree = "<absolute>"; };
@ -447,6 +452,7 @@
8C7ED6731E00B0690032A27C /* shared */ = {
isa = PBXGroup;
children = (
BB57C2E9222CCCB5008933AA /* BinarySearch.swift */,
8CE418501DAA568600240B42 /* ArrayExt.swift */,
8CE418511DAA568600240B42 /* CharacterExt.swift */,
8CE418541DAA568600240B42 /* Pattern.swift */,
@ -972,6 +978,7 @@
8CE4186D1DAA568700240B42 /* FormElement.swift in Sources */,
8C73DB4B1DDA605900233A68 /* UnicodeScalar.swift in Sources */,
8CE418601DAA568600240B42 /* Validate.swift in Sources */,
BB57C2EA222CCCB6008933AA /* BinarySearch.swift in Sources */,
8C3617C11DBAC2AE00E00CFE /* Selector.swift in Sources */,
8CE418711DAA568700240B42 /* Parser.swift in Sources */,
8CE418701DAA568700240B42 /* XmlDeclaration.swift in Sources */,
@ -1073,6 +1080,7 @@
BD3B5B881FBED933001FDB3B /* FormElement.swift in Sources */,
BD3B5B891FBED933001FDB3B /* UnicodeScalar.swift in Sources */,
BD3B5B8A1FBED933001FDB3B /* Validate.swift in Sources */,
BB57C2EB222CCCC3008933AA /* BinarySearch.swift in Sources */,
BD3B5B8B1FBED933001FDB3B /* Selector.swift in Sources */,
BD3B5B8C1FBED933001FDB3B /* Parser.swift in Sources */,
BD3B5B8D1FBED933001FDB3B /* XmlDeclaration.swift in Sources */,
@ -1135,6 +1143,7 @@
BD3B5BCB1FC063BD001FDB3B /* FormElement.swift in Sources */,
BD3B5BCC1FC063BD001FDB3B /* UnicodeScalar.swift in Sources */,
BD3B5BCD1FC063BD001FDB3B /* Validate.swift in Sources */,
BB57C2EC222CCCC5008933AA /* BinarySearch.swift in Sources */,
BD3B5BCE1FC063BD001FDB3B /* Selector.swift in Sources */,
BD3B5BCF1FC063BD001FDB3B /* Parser.swift in Sources */,
BD3B5BD01FC063BD001FDB3B /* XmlDeclaration.swift in Sources */,
@ -1197,6 +1206,7 @@
BD3B5C0E1FC06423001FDB3B /* FormElement.swift in Sources */,
BD3B5C0F1FC06423001FDB3B /* UnicodeScalar.swift in Sources */,
BD3B5C101FC06423001FDB3B /* Validate.swift in Sources */,
BB57C2ED222CCCC6008933AA /* BinarySearch.swift in Sources */,
BD3B5C111FC06423001FDB3B /* Selector.swift in Sources */,
BD3B5C121FC06423001FDB3B /* Parser.swift in Sources */,
BD3B5C131FC06423001FDB3B /* XmlDeclaration.swift in Sources */,

View File

@ -50,15 +50,15 @@ class EntitiesTest: XCTestCase {
func testXhtml() {
//let text = "&amp; &gt; &lt; &quot;";
XCTAssertEqual(38, Entities.EscapeMode.xhtml.codepointForName("amp"))
XCTAssertEqual(62, Entities.EscapeMode.xhtml.codepointForName("gt"))
XCTAssertEqual(60, Entities.EscapeMode.xhtml.codepointForName("lt"))
XCTAssertEqual(34, Entities.EscapeMode.xhtml.codepointForName("quot"))
XCTAssertEqual(UnicodeScalar(38), Entities.EscapeMode.xhtml.codepointForName("amp"))
XCTAssertEqual(UnicodeScalar(62), Entities.EscapeMode.xhtml.codepointForName("gt"))
XCTAssertEqual(UnicodeScalar(60), Entities.EscapeMode.xhtml.codepointForName("lt"))
XCTAssertEqual(UnicodeScalar(34), Entities.EscapeMode.xhtml.codepointForName("quot"))
XCTAssertEqual("amp", Entities.EscapeMode.xhtml.nameForCodepoint(38))
XCTAssertEqual("gt", Entities.EscapeMode.xhtml.nameForCodepoint(62))
XCTAssertEqual("lt", Entities.EscapeMode.xhtml.nameForCodepoint(60))
XCTAssertEqual("quot", Entities.EscapeMode.xhtml.nameForCodepoint(34))
XCTAssertEqual("amp", Entities.EscapeMode.xhtml.nameForCodepoint(UnicodeScalar(38)!))
XCTAssertEqual("gt", Entities.EscapeMode.xhtml.nameForCodepoint(UnicodeScalar(62)!))
XCTAssertEqual("lt", Entities.EscapeMode.xhtml.nameForCodepoint(UnicodeScalar(60)!))
XCTAssertEqual("quot", Entities.EscapeMode.xhtml.nameForCodepoint(UnicodeScalar(34)!))
}
func testGetByName() {