feat: Cleaner can clean head and body
This commit is contained in:
parent
d91752f45d
commit
9511b86ecd
|
@ -9,131 +9,132 @@
|
|||
import Foundation
|
||||
|
||||
open class Cleaner {
    /// Whitelist applied to the `<head>`; when `nil`, `clean(_:)` does not copy anything from the head.
    fileprivate let headWhitelist: Whitelist?
    /// Whitelist applied to the `<body>`.
    fileprivate let bodyWhitelist: Whitelist

    /// Create a new cleaner, that sanitizes documents' `<head>` and `<body>` using the supplied whitelists.
    /// - Parameters:
    ///   - headWhitelist: Whitelist to clean the head with, or `nil` to skip copying the head
    ///   - bodyWhitelist: Whitelist to clean the body with
    public init(headWhitelist: Whitelist?, bodyWhitelist: Whitelist) {
        self.headWhitelist = headWhitelist
        self.bodyWhitelist = bodyWhitelist
    }

    /// Create a new cleaner, that sanitizes documents' `<body>` using the supplied whitelist.
    ///
    /// Kept `public` so existing callers of `Cleaner(whitelist)` outside the module keep compiling.
    /// - Parameter whitelist: Whitelist to clean the body with
    public convenience init(_ whitelist: Whitelist) {
        self.init(headWhitelist: nil, bodyWhitelist: whitelist)
    }

    /// Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelists.
    ///
    /// The original document is not modified. Elements from the dirty document's `<body>` are always used;
    /// elements from its `<head>` are used only when a head whitelist was supplied.
    /// - Parameter dirtyDocument: Untrusted base document to clean.
    /// - Returns: A cleaned document.
    public func clean(_ dirtyDocument: Document) throws -> Document {
        let clean = Document.createShell(dirtyDocument.getBaseUri())

        // The head is copied only when a head whitelist exists and both documents expose a `<head>`.
        if let headWhitelist, let dirtyHead = dirtyDocument.head(), let cleanHead = clean.head() {
            try copySafeNodes(dirtyHead, cleanHead, whitelist: headWhitelist)
        }

        // Frameset documents won't have a body; in that case nothing is copied into the clean body.
        if let dirtyBody = dirtyDocument.body(), let cleanBody = clean.body() {
            try copySafeNodes(dirtyBody, cleanBody, whitelist: bodyWhitelist)
        }

        return clean
    }

    /// Determines if the input document is valid, against the body whitelist. It is considered valid if all the tags
    /// and attributes in the input document's `<body>` are allowed by the whitelist.
    ///
    /// This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully
    /// using the ``clean(_:)`` method. If using as a validator, it is recommended to still clean the document
    /// to ensure enforced attributes are set correctly, and that the output is tidied.
    ///
    /// Only the `<body>` is inspected here; the head whitelist is not consulted.
    /// - Parameter dirtyDocument: document to test
    /// - Returns: true if no tags or attributes need to be removed; false if they do, or if the document
    ///   has no `<body>` to validate (e.g. a frameset document)
    public func isValid(_ dirtyDocument: Document) throws -> Bool {
        let clean = Document.createShell(dirtyDocument.getBaseUri())

        // Fail closed instead of crashing on a force-unwrap: a document without a body cannot be
        // checked against the body whitelist, so it is reported as not valid.
        guard let dirtyBody = dirtyDocument.body(), let cleanBody = clean.body() else {
            return false
        }

        let numDiscarded = try copySafeNodes(dirtyBody, cleanBody, whitelist: bodyWhitelist)
        return numDiscarded == 0
    }

    /// Traverses `source` and appends the whitelisted nodes to `dest`.
    /// - Parameters:
    ///   - source: Root element of the untrusted tree.
    ///   - dest: Element receiving the sanitized copies.
    ///   - whitelist: Whitelist deciding which tags and attributes are kept.
    /// - Returns: The discard count reported by the cleaning visitor.
    @discardableResult
    fileprivate func copySafeNodes(_ source: Element, _ dest: Element, whitelist: Whitelist) throws -> Int {
        let cleaningVisitor = Cleaner.CleaningVisitor(source, dest, whitelist)
        try NodeTraversor(cleaningVisitor).traverse(source)
        return cleaningVisitor.numDiscarded
    }

}
|
||||
|
||||
extension Cleaner {
    /// Node visitor that copies whitelisted nodes from a source tree into a destination element,
    /// counting everything it had to drop.
    fileprivate final class CleaningVisitor: NodeVisitor {
        /// Number of elements, attributes and other nodes that were not copied.
        private(set) var numDiscarded = 0

        /// Root of the traversal; an unsafe root is not counted as discarded.
        private let root: Element
        /// Current element to append nodes to.
        private var destination: Element?

        /// Whitelist used for every tag and attribute decision made by this visitor.
        private let whitelist: Whitelist

        /// - Parameters:
        ///   - root: Root element of the untrusted tree being traversed.
        ///   - destination: Element that receives the sanitized copies.
        ///   - whitelist: Whitelist to check tags and attributes against.
        public init(_ root: Element, _ destination: Element, _ whitelist: Whitelist) {
            self.root = root
            self.destination = destination
            self.whitelist = whitelist
        }

        /// Called when a node is entered: copies it when allowed, otherwise counts it as discarded.
        public func head(_ source: Node, _ depth: Int) throws {
            if let sourceEl = source as? Element {
                if whitelist.isSafeTag(sourceEl.tagName()) { // safe, clone and copy safe attrs
                    let meta = try createSafeElement(sourceEl)
                    let destChild = meta.el
                    try destination?.appendChild(destChild)

                    numDiscarded += meta.numAttribsDiscarded
                    // Descend: children of this element are appended to its sanitized copy.
                    destination = destChild
                } else if source != root { // not a safe tag, so don't add. don't count root against discarded.
                    numDiscarded += 1
                }
            } else if let sourceText = source as? TextNode {
                let destText = TextNode(sourceText.getWholeText(), source.getBaseUri())
                try destination?.appendChild(destText)
            } else if let sourceData = source as? DataNode {
                // Data is kept only when its parent element is itself whitelisted.
                if let parent = sourceData.parent(), whitelist.isSafeTag(parent.nodeName()) {
                    let destData = DataNode(sourceData.getWholeData(), source.getBaseUri())
                    try destination?.appendChild(destData)
                } else {
                    numDiscarded += 1
                }
            } else { // else, we don't care about comments, xml proc instructions, etc
                numDiscarded += 1
            }
        }

        /// Called when a node is left: undoes the descent performed in `head(_:_:)` for safe elements.
        public func tail(_ source: Node, _ depth: Int) throws {
            guard let element = source as? Element, whitelist.isSafeTag(element.nodeName()) else {
                return
            }
            // would have descended, so pop destination stack
            destination = destination?.parent()
        }

        /// Builds a sanitized copy of `sourceEl` (same tag, no children).
        ///
        /// Only attributes accepted by the whitelist are copied; the whitelist's enforced attributes
        /// for the tag are then added.
        /// - Parameter sourceEl: Untrusted element to copy.
        /// - Returns: The new element together with the number of attributes that were dropped.
        private func createSafeElement(_ sourceEl: Element) throws -> ElementMeta {
            let sourceTag = sourceEl.tagName()
            let destAttrs = Attributes()
            var numDiscarded = 0

            if let sourceAttrs = sourceEl.getAttributes() {
                for sourceAttr in sourceAttrs {
                    if try whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr) {
                        destAttrs.put(attribute: sourceAttr)
                    } else {
                        numDiscarded += 1
                    }
                }
            }
            let enforcedAttrs = try whitelist.getEnforcedAttributes(sourceTag)
            destAttrs.addAll(incoming: enforcedAttrs)

            let dest = try Element(Tag.valueOf(sourceTag), sourceEl.getBaseUri(), destAttrs)
            return ElementMeta(dest, numDiscarded)
        }
    }
}
|
||||
|
||||
|
|
|
@ -1,90 +0,0 @@
|
|||
//
|
||||
// HeadCleaner.swift
|
||||
// SwiftSoup
|
||||
//
|
||||
// Created by Valentin Perignon on 25/04/2023.
|
||||
//
|
||||
|
||||
import Foundation
|
||||
|
||||
public enum HeadCleaner {
    /// Copies a sanitized version of the dirty document's `<head>` into the destination document's `<head>`.
    ///
    /// Nothing happens when either document has no `<head>`.
    /// - Parameters:
    ///   - dirtyDocument: Source document containing the `<head>` to sanitize
    ///   - destinationDocument: Document whose `<head>` receives the sanitized nodes
    public static func clean(dirtyDocument: Document, destinationDocument: Document) throws {
        if let dirtyHead = dirtyDocument.head(), let cleanedHead = destinationDocument.head() {
            try copySafeNodes(source: dirtyHead, destination: cleanedHead)
        }
    }

    /// Walks `source` with a cleaning visitor that appends the retained nodes to `destination`.
    static private func copySafeNodes(source: Element, destination: Element) throws {
        let visitor = CleaningVisitor(root: source, destination: destination)
        let traversor = NodeTraversor(visitor)
        try traversor.traverse(source)
    }
}
|
||||
|
||||
extension HeadCleaner {
    /// Node visitor that copies the allowed `<head>` children into a destination element and
    /// skips every other element together with its whole subtree.
    private final class CleaningVisitor: NodeVisitor {
        /// Lower-cased names of the only tags that are copied.
        private static let allowedTags = ["style", "meta", "base"]

        /// Root of the traversal; it is never copied and never skipped.
        private let root: Element
        /// Element that currently receives copied nodes.
        private var destination: Element

        /// Rejected element whose subtree is currently being skipped, if any.
        private var elementToSkip: Element?

        init(root: Element, destination: Element) {
            self.root = root
            self.destination = destination
        }

        /// Called when a node is entered: copies allowed nodes, or starts skipping a rejected element.
        public func head(_ node: SwiftSoup.Node, _ depth: Int) throws {
            // Everything below a rejected element is ignored until tail(_:_:) leaves that element.
            guard elementToSkip == nil else { return }

            if let elementNode = node as? Element {
                if isSafeTag(node: elementNode) {
                    let sourceTag = elementNode.nodeName()

                    // Always push a copy, even without source attributes, so that the pop done in
                    // tail(_:_:) for this element stays balanced.
                    let destinationAttributes = elementNode.attributes?.clone() ?? Attributes()
                    let destinationChild = Element(Tag(sourceTag), elementNode.baseUri ?? "", destinationAttributes)
                    try destination.appendChild(destinationChild)
                    destination = destinationChild
                } else if node != root {
                    elementToSkip = elementNode
                }
            } else if let textNode = node as? TextNode {
                let destinationText = TextNode(textNode.getWholeText(), textNode.getBaseUri())
                try destination.appendChild(destinationText)
            } else if let dataNode = node as? DataNode, let parent = node.parent(), isSafeTag(node: parent) {
                let destinationData = DataNode(dataNode.getWholeData(), dataNode.getBaseUri())
                try destination.appendChild(destinationData)
            }
        }

        /// Called when a node is left: ends a skip, or pops the destination pushed in `head(_:_:)`.
        public func tail(_ node: SwiftSoup.Node, _ depth: Int) throws {
            if let skipped = elementToSkip {
                // head(_:_:) pushed nothing for nodes inside a skipped subtree, so nothing may be
                // popped for them either, even when their tag is in the allowed list.
                if node == skipped {
                    elementToSkip = nil
                }
                return
            }

            guard let elementNode = node as? Element, isSafeTag(node: elementNode) else { return }

            guard let parent = destination.parent() else {
                throw Exception.Error(type: .IllegalArgumentException, Message: "Illegal state")
            }
            destination = parent
        }

        /// Returns true when the node's tag is in the allowed list and it is not a `<meta http-equiv="refresh">`.
        private func isSafeTag(node: Node) -> Bool {
            guard !isMetaRefresh(node: node) else { return false }

            let tag = node.nodeName().lowercased()
            return Self.allowedTags.contains(tag)
        }

        /// Returns true for a `<meta>` node whose `http-equiv` attribute, trimmed and lower-cased, is `refresh`.
        private func isMetaRefresh(node: Node) -> Bool {
            let tag = node.nodeName().lowercased()
            guard tag == "meta" else { return false }

            // A failed attribute lookup is treated the same as "not a refresh".
            let attributeValue = try? node.attributes?.getIgnoreCase(key: "http-equiv").trim().lowercased()
            return attributeValue == "refresh"
        }
    }
}
|
Loading…
Reference in New Issue