SwiftSoup/Sources/nodes/Entities.swift

338 lines
13 KiB
Swift

//
// Entities.swift
// SwiftSoup
//
// Created by Nabil Chatbi on 29/09/16.
// Copyright © 2016 Nabil Chatbi.. All rights reserved.
//
import Foundation
/**
* HTML entities, and escape routines.
* Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C HTML
* named character references</a>.
*/
public class Entities {
/** Matches one line of an entities properties file, e.g. `NotNestedLessLess=10913,824;1887`. */
static let entityPattern: Pattern = Pattern("^(\\w+)=(\\w+)(?:,(\\w+))?;(\\w+)$")
/** Sentinel codepoint meaning "no such entity". */
static let empty: Int = -1
/** Sentinel name meaning "no entity for this codepoint". */
static let emptyName: String = ""
/** Radix in which the numeric fields of the properties files are written. */
static let codepointRadix = 36
public struct EscapeMode : Equatable{
/** Restricted entities suitable for XHTML output: lt, gt, amp, and quot only. */
public static let xhtml = EscapeMode(file: "entities-xhtml.properties", size: 4, id: 0)
/** Default HTML output entities. */
public static let base = EscapeMode(file: "entities-base.properties", size: 106, id: 1)
/** Complete HTML entities. */
public static let extended = EscapeMode(file: "entities-full.properties", size: 2125, id: 2)

/** Identifier of this mode; the only state compared by `==`. */
fileprivate let value: Int

// Named references and their codepoints, held at matching indices.
// `nameKeys` is sorted so that it can be binary searched.
fileprivate var nameKeys: [String]
// Holds a single codepoint per reference; the few references made of
// several characters are additionally recorded in `multipoints`.
fileprivate var codeVals: [Int]

// Codepoints and their entity names, held at matching indices.
// Mapping several codepoints to one named value is currently not supported.
fileprivate var codeKeys: [Int]
fileprivate var nameVals: [String]
/** Two escape modes are equal when they carry the same identifier. */
public static func == (left: EscapeMode, right: EscapeMode) -> Bool {
    let sameIdentifier = left.value == right.value
    return sameIdentifier
}
/** Two escape modes differ exactly when they are not equal. */
static func != (left: EscapeMode, right: EscapeMode) -> Bool {
    return !(left == right)
}
/**
 * Builds the lookup tables of one escape mode from a bundled properties file.
 *
 * Every line has the form `name=codepoint[,codepoint2];index`, for example
 * `NotNestedLessLess=10913,824;1887`, with all numbers written in base
 * `codepointRadix`. Lines that do not match the pattern, whose numbers cannot
 * be parsed, or that would fall outside the tables are skipped rather than
 * crashing the process.
 *
 * - Parameters:
 *   - file: name of the properties resource inside the framework bundle
 *   - size: number of slots allocated in each table
 *   - id: identifier used when comparing modes for equality
 */
init(file: String, size:Int ,id:Int) {
    nameKeys = [String](repeating: "", count: size)
    codeVals = [Int](repeating: 0, count: size)
    codeKeys = [Int](repeating: 0, count: size)
    nameVals = [String](repeating: "", count: size)
    value = id

    let frameworkBundle = Bundle(for: Entities.self)
    // Without the resource (or a reader for it) the tables keep their placeholder contents.
    guard let path = frameworkBundle.path(forResource: file, ofType: "") else {
        return
    }
    guard let reader = StreamReader(path: path) else {
        return
    }
    defer {
        reader.close()
    }

    // `i` counts the entries accepted so far and is the slot used in the name-ordered tables.
    var i = 0
    while let entry = reader.nextLine() {
        let match = Entities.entityPattern.matcher(in: entry)
        guard match.find() else {
            continue
        }
        guard let name = match.group(1),
            let firstText = match.group(2),
            let cp1 = Int(firstText, radix: codepointRadix),
            let indexText = match.group(4),
            let index = Int(indexText, radix: codepointRadix) else {
            continue
        }
        // The second codepoint is optional; `empty` marks its absence.
        var cp2 = empty
        if let secondText = match.group(3) {
            guard let parsed = Int(secondText, radix: codepointRadix) else {
                continue
            }
            cp2 = parsed
        }
        // Never write outside the tables, even if the file holds more entries than `size`.
        guard i < size, index >= 0, index < size else {
            continue
        }

        nameKeys[i] = name
        codeVals[i] = cp1
        codeKeys[index] = cp1
        nameVals[index] = name

        // Two-codepoint references are also stored, as a string, in the shared multipoints table.
        if cp2 != empty, let first = UnicodeScalar(cp1), let second = UnicodeScalar(cp2) {
            var s = String()
            s.append(Character(first))
            s.append(Character(second))
            multipoints[name] = s
        }
        i += 1
    }
}
/**
 * Looks up the codepoint stored for an entity name.
 * - Parameter name: entity name (e.g. "lt" or "amp")
 * - Returns: the codepoint stored for `name`, or `empty` when the search finds no match
 */
public func codepointForName(_ name: String) -> Int {
    let position = nameKeys.binarySearch(nameKeys, name)
    guard position >= 0 else {
        return empty
    }
    return codeVals[position]
}
/**
 * Looks up the entity name stored for a codepoint.
 * - Parameter codepoint: the codepoint to search for
 * - Returns: the entity name, or `emptyName` when the search finds no match
 */
public func nameForCodepoint(_ codepoint: Int) -> String {
    let found = codeKeys.binarySearch(codeKeys, codepoint)
    if found < 0 {
        return emptyName
    }
    // Entries are ordered so that the lower case name of a codepoint follows the
    // upper case one, and the lower case form is the one we prefer to emit. Which
    // of several equal keys a binary search lands on is undefined, so peek at the
    // following slot.
    let next = found + 1
    if next < nameVals.count && codeKeys[next] == codepoint {
        return nameVals[next]
    }
    return nameVals[found]
}
/** Number of slots in the name table. */
private func size() -> Int {
    let slots = nameKeys.count
    return slots
}
}
/** Entity name -> replacement text, for references that stand for multiple characters. */
private static var multipoints: [String: String] = [:]
/** Private so that no instance can be created from outside this class. */
private init() {}
/**
 * Checks whether the input is a known named entity in the extended entity set.
 * - Parameter name: the possible entity name (e.g. "lt" or "amp")
 * - Returns: `true` if a known named entity
 */
open static func isNamedEntity(_ name: String )->Bool {
    let codepoint = EscapeMode.extended.codepointForName(name)
    return codepoint != empty
}
/**
 * Checks whether the input is a known named entity in the base entity set.
 * - Parameter name: the possible entity name (e.g. "lt" or "amp")
 * - Returns: `true` if a known named entity in the base set
 * - SeeAlso: `isNamedEntity(_:)`
 */
open static func isBaseNamedEntity(_ name: String) -> Bool {
    let codepoint = EscapeMode.base.codepointForName(name)
    if codepoint == empty {
        return false
    }
    return true
}
/**
 * Gets the Character value of the named entity.
 *
 * Deprecated: does not support characters outside the BMP or multiple character names.
 *
 * - Parameter name: named entity (e.g. "lt" or "amp")
 * - Returns: the Character value of the named entity (e.g. `<` or `&`)
 */
open static func getCharacterByName(name: String) -> Character {
    // NOTE(review): for an unknown name `empty` (-1) is handed to the conversion
    // unchanged; confirm how that helper treats it.
    let codepoint = EscapeMode.extended.codepointForName(name)
    return Character.convertFromIntegerLiteral(value: codepoint)
}
/**
 * Gets the character(s) represented by the named entity.
 * - Parameter name: entity (e.g. "lt" or "amp")
 * - Returns: the string value of the character(s) represented by this entity, or "" if not defined
 */
open static func getByName(name: String)-> String {
    // Multi-character references take precedence over the single codepoint table.
    if let multi = multipoints[name] {
        return multi
    }
    let codepoint = EscapeMode.extended.codepointForName(name)
    guard codepoint != empty else {
        return emptyName
    }
    return String(Character(UnicodeScalar(codepoint)!))
}
/**
 * Resolves a named entity into its Unicode scalars.
 *
 * The scalars are written to the front of `codepoints` (index 0, then index 1).
 * If the array is too short to hold them it is extended instead of trapping
 * with an out-of-range index.
 *
 * - Parameters:
 *   - name: entity name (e.g. "lt" or "amp")
 *   - codepoints: buffer receiving the scalars
 * - Returns: the number of scalars written: 2 for a multi-character reference,
 *   1 for a single codepoint, 0 when the name is unknown or its codepoint is
 *   not a valid Unicode scalar
 */
open static func codepointsForName(_ name: String , codepoints: inout [UnicodeScalar]) -> Int {
    if let val: String = multipoints[name] {
        store(val.unicodeScalar(0), at: 0, in: &codepoints)
        store(val.unicodeScalar(1), at: 1, in: &codepoints)
        return 2
    }
    let codepoint = EscapeMode.extended.codepointForName(name)
    if codepoint != empty, let scalar = UnicodeScalar(codepoint) {
        store(scalar, at: 0, in: &codepoints)
        return 1
    }
    return 0
}

/** Writes `scalar` at `position`, appending when the buffer ends before that position. */
private static func store(_ scalar: UnicodeScalar, at position: Int, in buffer: inout [UnicodeScalar]) {
    if position < buffer.count {
        buffer[position] = scalar
    } else {
        buffer.append(scalar)
    }
}
/**
 * Escapes `string` with the settings carried by `out`.
 * - Parameters:
 *   - string: the text to escape
 *   - out: output settings handed on to the full escape routine
 * - Returns: the escaped text
 */
open static func escape(_ string: String,_ out: OutputSettings) -> String {
    let buffer = StringBuilder()
    // Not inside an attribute, no whitespace normalisation, leading whitespace kept.
    escape(buffer, string, out, false, false, false)
    return buffer.toString()
}
/**
 * Escapes `string` into `accum`, one Unicode scalar at a time.
 *
 * `&` and U+00A0 are always escaped; `<`, `>` and `"` depend on `inAttribute`
 * and on the escape mode. Any other scalar the output encoding cannot
 * represent is written as a named or hexadecimal character reference.
 *
 * This method does a lot, but splitting it up would mean rescanning the input
 * and creating extra string builders.
 *
 * - Parameters:
 *   - accum: builder receiving the escaped output
 *   - string: the text to escape
 *   - out: supplies the escape mode and the output encoding
 *   - inAttribute: `true` when escaping an attribute value: `"` is escaped, `>` is
 *     not, and `<` is escaped only in xhtml mode
 *   - normaliseWhite: when `true`, every run of whitespace becomes a single space
 *   - stripLeadingWhite: with `normaliseWhite`, drops whitespace that precedes the
 *     first non-whitespace scalar
 */
static func escape(_ accum: StringBuilder ,_ string: String,_ out: OutputSettings,_ inAttribute: Bool,_ normaliseWhite: Bool,_ stripLeadingWhite: Bool ) {
    var lastWasWhite = false
    var reachedNonWhite = false
    let escapeMode: EscapeMode = out.escapeMode()
    let encoder: String.Encoding = out.encoder()

    // Walk Unicode scalars, not Characters: one Character can consist of several
    // scalars (CR-LF, a base letter plus combining marks, emoji sequences), and
    // reducing it to a single scalar would silently drop the others.
    for codePoint in string.unicodeScalars {
        if normaliseWhite {
            if codePoint.isWhitespace {
                if (stripLeadingWhite && !reachedNonWhite) || lastWasWhite {
                    continue
                }
                accum.append(" ")
                lastWasWhite = true
                continue
            } else {
                lastWasWhite = false
                reachedNonWhite = true
            }
        }

        if codePoint.value < Character.MIN_SUPPLEMENTARY_CODE_POINT {
            // html specific and required escapes:
            switch codePoint {
            case "&":
                accum.append("&amp;")
            case "\u{A0}":
                if escapeMode != EscapeMode.xhtml {
                    accum.append("&nbsp;")
                } else {
                    accum.append("&#xa0;")
                }
            case "<":
                // escape when in character data or when in an xml attribute value;
                // not needed in an html attribute value
                if !inAttribute || escapeMode == EscapeMode.xhtml {
                    accum.append("&lt;")
                } else {
                    accum.append(codePoint)
                }
            case ">":
                if !inAttribute {
                    accum.append("&gt;")
                } else {
                    accum.append(codePoint)
                }
            case "\"":
                if inAttribute {
                    accum.append("&quot;")
                } else {
                    accum.append(codePoint)
                }
            default:
                if canEncode(codePoint, encoder) {
                    accum.append(codePoint)
                } else {
                    appendEncoded(accum: accum, escapeMode: escapeMode, codePoint: codePoint)
                }
            }
        } else {
            // Supplementary code points ask the encoding directly, for simplicity.
            let text = String(codePoint)
            if encoder.canEncode(text) {
                accum.append(text)
            } else {
                appendEncoded(accum: accum, escapeMode: escapeMode, codePoint: codePoint)
            }
        }
    }
}
/**
 * Appends a character reference for `codePoint` to `accum`: `&name;` when the
 * escape mode has a name for it, otherwise the `&#x…;` form built with
 * `String.toHexString`.
 */
private static func appendEncoded(accum: StringBuilder, escapeMode: EscapeMode, codePoint: UnicodeScalar) {
    let scalarValue = Int(codePoint.value)
    let name = escapeMode.nameForCodepoint(scalarValue)
    if name == emptyName {
        accum.append("&#x").append(String.toHexString(n: scalarValue)).append(";")
    } else {
        accum.append("&").append(name).append(";")
    }
}
/**
 * Unescapes the input string in non-strict mode.
 * - Parameter string: the text to un-HTML-escape
 * - Returns: unescaped string
 */
static func unescape(_ string: String)throws-> String {
    let strict = false
    return try unescape(string: string, strict: strict)
}
/**
 * Unescapes the input string.
 * - Parameters:
 *   - string: the text to un-HTML-escape
 *   - strict: if `true`, the trailing `;` char is required; otherwise it is optional
 * - Returns: unescaped string
 */
public static func unescape(string: String, strict: Bool)throws -> String {
    let unescaped = try Parser.unescapeEntities(string, strict)
    return unescaped
}
/*
 * Fast path in front of the encoding's own canEncode check: ASCII and UTF-8
 * are answered directly, every other encoding is asked through
 * `fallback.canEncode`.
 *
 * todo: add more charset tests if other encodings turn out to be slow.
 */
private static func canEncode(_ c: UnicodeScalar, _ fallback: String.Encoding)->Bool {
    if fallback == String.Encoding.ascii {
        return c.value < 0x80
    }
    if fallback == String.Encoding.utf8 {
        // Always encodable; surrogates were the only exception in the original
        // check and are noted there as already handled by the caller.
        return true
    }
    return fallback.canEncode(String(Character(c)))
}
}