feat: Cleaner can clean head and body

This commit is contained in:
Valentin Perignon 2023-04-26 15:28:48 +02:00
parent d91752f45d
commit 9511b86ecd
2 changed files with 87 additions and 176 deletions

View File

@ -9,131 +9,132 @@
import Foundation import Foundation
open class Cleaner { open class Cleaner {
fileprivate let whitelist: Whitelist fileprivate let headWhitelist: Whitelist?
fileprivate let bodyWhitelist: Whitelist
/** /// Create a new cleaner, that sanitizes documents' `<head>` and `<body>` using the supplied whitelist.
Create a new cleaner, that sanitizes documents using the supplied whitelist. /// - Parameters:
@param whitelist white-list to clean with /// - headWhitelist: Whitelist to clean the head with
*/ /// - bodyWhitelist: Whitelist to clean the body with
public init(_ whitelist: Whitelist) { public init(headWhitelist: Whitelist?, bodyWhitelist: Whitelist) {
self.whitelist = whitelist self.headWhitelist = headWhitelist
self.bodyWhitelist = bodyWhitelist
} }
/** /// Create a new cleaner, that sanitizes documents' `<body>` using the supplied whitelist.
Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist. /// - Parameter whitelist: Whitelist to clean the body with
The original document is not modified. Only elements from the dirt document's <code>body</code> are used. convenience init(_ whitelist: Whitelist) {
@param dirtyDocument Untrusted base document to clean. self.init(headWhitelist: nil, bodyWhitelist: whitelist)
@return cleaned document. }
*/
public func clean(_ dirtyDocument: Document)throws->Document { /// Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist.
//Validate.notNull(dirtyDocument) /// The original document is not modified. Only elements from the dirt document's `<body>` are used.
let clean: Document = Document.createShell(dirtyDocument.getBaseUri()) /// - Parameter dirtyDocument: Untrusted base document to clean.
if (dirtyDocument.body() != nil && clean.body() != nil) // frameset documents won't have a body. the clean doc will have empty body. /// - Returns: A cleaned document.
{ public func clean(_ dirtyDocument: Document) throws -> Document {
try copySafeNodes(dirtyDocument.body()!, clean.body()!) let clean = Document.createShell(dirtyDocument.getBaseUri())
} if let headWhitelist, let dirtHead = dirtyDocument.head(), let cleanHead = clean.head() { // frameset documents won't have a head. the clean doc will have empty head.
try copySafeNodes(dirtHead, cleanHead, whitelist: headWhitelist)
}
if let dirtBody = dirtyDocument.body(), let cleanBody = clean.body() { // frameset documents won't have a body. the clean doc will have empty body.
try copySafeNodes(dirtBody, cleanBody, whitelist: bodyWhitelist)
}
return clean return clean
} }
/** /// Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes
Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes /// in the input HTML are allowed by the whitelist.
in the input HTML are allowed by the whitelist. ///
<p> /// This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully
This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully /// using the ``clean(_:)`` document. If using as a validator, it is recommended to still clean the document
using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document /// to ensure enforced attributes are set correctly, and that the output is tidied.
to ensure enforced attributes are set correctly, and that the output is tidied. /// - Parameter dirtyDocument: document to test
</p> /// - Returns: true if no tags or attributes need to be removed; false if they do
@param dirtyDocument document to test public func isValid(_ dirtyDocument: Document) throws -> Bool {
@return true if no tags or attributes need to be removed; false if they do let clean = Document.createShell(dirtyDocument.getBaseUri())
*/ let numDiscarded = try copySafeNodes(dirtyDocument.body()!, clean.body()!, whitelist: bodyWhitelist)
public func isValid(_ dirtyDocument: Document)throws->Bool { return numDiscarded == 0
//Validate.notNull(dirtyDocument)
let clean: Document = Document.createShell(dirtyDocument.getBaseUri())
let numDiscarded: Int = try copySafeNodes(dirtyDocument.body()!, clean.body()!)
return numDiscarded == 0
} }
@discardableResult @discardableResult
fileprivate func copySafeNodes(_ source: Element, _ dest: Element)throws->Int { fileprivate func copySafeNodes(_ source: Element, _ dest: Element, whitelist: Whitelist) throws -> Int {
let cleaningVisitor: Cleaner.CleaningVisitor = Cleaner.CleaningVisitor(source, dest, self) let cleaningVisitor = Cleaner.CleaningVisitor(source, dest, whitelist)
let traversor: NodeTraversor = NodeTraversor(cleaningVisitor) try NodeTraversor(cleaningVisitor).traverse(source)
try traversor.traverse(source)
return cleaningVisitor.numDiscarded return cleaningVisitor.numDiscarded
} }
fileprivate func createSafeElement(_ sourceEl: Element)throws->ElementMeta {
let sourceTag: String = sourceEl.tagName()
let destAttrs: Attributes = Attributes()
let dest: Element = try Element(Tag.valueOf(sourceTag), sourceEl.getBaseUri(), destAttrs)
var numDiscarded: Int = 0
if let sourceAttrs = sourceEl.getAttributes() {
for sourceAttr: Attribute in sourceAttrs {
if (try whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) {
destAttrs.put(attribute: sourceAttr)
} else {
numDiscarded+=1
}
}
}
let enforcedAttrs: Attributes = try whitelist.getEnforcedAttributes(sourceTag)
destAttrs.addAll(incoming: enforcedAttrs)
return ElementMeta(dest, numDiscarded)
}
} }
extension Cleaner { extension Cleaner {
fileprivate final class CleaningVisitor: NodeVisitor { fileprivate final class CleaningVisitor: NodeVisitor {
var numDiscarded: Int = 0 private(set) var numDiscarded = 0
let root: Element
var destination: Element? // current element to append nodes to
private var cleaner: Cleaner private let root: Element
private var destination: Element? // current element to append nodes to
public init(_ root: Element, _ destination: Element, _ cleaner: Cleaner) { private let whitelist: Whitelist
public init(_ root: Element, _ destination: Element, _ whitelist: Whitelist) {
self.root = root self.root = root
self.destination = destination self.destination = destination
self.cleaner = cleaner self.whitelist = whitelist
} }
public func head(_ source: Node, _ depth: Int)throws { public func head(_ source: Node, _ depth: Int) throws {
if let sourceEl = (source as? Element) { if let sourceEl = source as? Element {
if (cleaner.whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs if whitelist.isSafeTag(sourceEl.tagName()) { // safe, clone and copy safe attrs
let meta: Cleaner.ElementMeta = try cleaner.createSafeElement(sourceEl) let meta = try createSafeElement(sourceEl)
let destChild: Element = meta.el let destChild = meta.el
try destination?.appendChild(destChild) try destination?.appendChild(destChild)
numDiscarded += meta.numAttribsDiscarded numDiscarded += meta.numAttribsDiscarded
destination = destChild destination = destChild
} else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. } else if source != root { // not a safe tag, so don't add. don't count root against discarded.
numDiscarded+=1 numDiscarded += 1
} }
} else if let sourceText = (source as? TextNode) { } else if let sourceText = source as? TextNode {
let destText: TextNode = TextNode(sourceText.getWholeText(), source.getBaseUri()) let destText = TextNode(sourceText.getWholeText(), source.getBaseUri())
try destination?.appendChild(destText) try destination?.appendChild(destText)
} else if let sourceData = (source as? DataNode) { } else if let sourceData = source as? DataNode {
if sourceData.parent() != nil && cleaner.whitelist.isSafeTag(sourceData.parent()!.nodeName()) { if sourceData.parent() != nil && whitelist.isSafeTag(sourceData.parent()!.nodeName()) {
//let sourceData: DataNode = (DataNode) source let destData = DataNode(sourceData.getWholeData(), source.getBaseUri())
let destData: DataNode = DataNode(sourceData.getWholeData(), source.getBaseUri())
try destination?.appendChild(destData) try destination?.appendChild(destData)
} else { } else {
numDiscarded+=1 numDiscarded += 1
} }
} else { // else, we don't care about comments, xml proc instructions, etc } else { // else, we don't care about comments, xml proc instructions, etc
numDiscarded+=1 numDiscarded += 1
} }
} }
public func tail(_ source: Node, _ depth: Int)throws { public func tail(_ source: Node, _ depth: Int) throws {
if let x = (source as? Element) { if let x = source as? Element {
if cleaner.whitelist.isSafeTag(x.nodeName()) { if whitelist.isSafeTag(x.nodeName()) {
// would have descended, so pop destination stack // would have descended, so pop destination stack
destination = destination?.parent() destination = destination?.parent()
} }
} }
} }
private func createSafeElement(_ sourceEl: Element) throws -> ElementMeta {
let sourceTag = sourceEl.tagName()
let destAttrs = Attributes()
var numDiscarded = 0
if let sourceAttrs = sourceEl.getAttributes() {
for sourceAttr in sourceAttrs {
if try whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr) {
destAttrs.put(attribute: sourceAttr)
} else {
numDiscarded += 1
}
}
}
let enforcedAttrs = try whitelist.getEnforcedAttributes(sourceTag)
destAttrs.addAll(incoming: enforcedAttrs)
let dest = try Element(Tag.valueOf(sourceTag), sourceEl.getBaseUri(), destAttrs)
return ElementMeta(dest, numDiscarded)
}
} }
} }

View File

@ -1,90 +0,0 @@
//
// HeadCleaner.swift
// SwiftSoup
//
// Created by Valentin Perignon on 25/04/2023.
//
import Foundation
/// Namespace for sanitizing the `<head>` element of a document.
public enum HeadCleaner {
    /// Copies a sanitized version of the dirty document's `<head>` into the destination document's `<head>`.
    ///
    /// Nothing happens when either document has no `<head>`.
    /// - Parameters:
    ///   - dirtyDocument: Source document whose `<head>` is sanitized.
    ///   - destinationDocument: Document whose `<head>` receives the sanitized nodes.
    /// - Throws: Any error raised while traversing the source `<head>`.
    public static func clean(dirtyDocument: Document, destinationDocument: Document) throws {
        if let sourceHead = dirtyDocument.head(), let targetHead = destinationDocument.head() {
            // The visitor appends every accepted node below `targetHead` as the traversal walks `sourceHead`.
            let visitor = CleaningVisitor(root: sourceHead, destination: targetHead)
            try NodeTraversor(visitor).traverse(sourceHead)
        }
    }
}
extension HeadCleaner {
    /// Node visitor that copies the allowed parts of a source subtree below a destination element.
    ///
    /// Only `style`, `meta` and `base` elements are accepted, and a `meta` element whose `http-equiv`
    /// attribute is `refresh` is always rejected. A rejected element is dropped together with its subtree.
    private final class CleaningVisitor: NodeVisitor {
        /// Lowercased names of the tags that may be copied.
        private static let allowedTags = ["style", "meta", "base"]

        /// Node the traversal starts from; it is neither copied nor treated as a rejected element.
        private let root: Element
        /// Element that currently receives the copied nodes.
        private var destination: Element
        /// Rejected element whose subtree is currently being skipped, if any.
        private var elementToSkip: Element?

        /// - Parameters:
        ///   - root: Element the traversal starts from.
        ///   - destination: Element that receives the copied nodes.
        init(root: Element, destination: Element) {
            self.root = root
            self.destination = destination
        }

        /// Called when a node is entered: copies it when it is allowed, or starts skipping its subtree.
        public func head(_ node: SwiftSoup.Node, _ depth: Int) throws {
            // Everything below a rejected element is ignored until `tail` reaches that element again.
            guard elementToSkip == nil else { return }

            if let elementNode = node as? Element {
                if isSafeTag(node: elementNode) {
                    guard let destinationAttributes = elementNode.attributes?.clone() else {
                        // Nothing to clone from: drop the element together with its subtree. Marking it as
                        // skipped keeps `tail` from popping a destination that was never pushed and keeps
                        // the element's children from being appended to the wrong parent.
                        elementToSkip = elementNode
                        return
                    }
                    let sourceTag = elementNode.nodeName()
                    let destinationChild = Element(Tag(sourceTag), elementNode.baseUri ?? "", destinationAttributes)
                    try destination.appendChild(destinationChild)
                    // Push: children of this element are appended to its copy.
                    destination = destinationChild
                } else if node != root {
                    elementToSkip = elementNode
                }
            } else if let textNode = node as? TextNode {
                let destinationText = TextNode(textNode.getWholeText(), textNode.getBaseUri())
                try destination.appendChild(destinationText)
            } else if let dataNode = node as? DataNode, let parent = node.parent(), isSafeTag(node: parent) {
                let destinationData = DataNode(dataNode.getWholeData(), dataNode.getBaseUri())
                try destination.appendChild(destinationData)
            }
        }

        /// Called when a node is left: ends a skipped subtree, or pops the destination pushed in `head`.
        public func tail(_ node: SwiftSoup.Node, _ depth: Int) throws {
            if let skipped = elementToSkip {
                // Inside a skipped subtree `head` pushed nothing, so nothing may be popped here, even for
                // nested elements that would otherwise be allowed (e.g. a `style` inside a `noscript`).
                if node == skipped {
                    elementToSkip = nil
                }
                return
            }

            guard let elementNode = node as? Element, isSafeTag(node: elementNode) else { return }
            guard let parent = destination.parent() else {
                throw Exception.Error(type: .IllegalArgumentException, Message: "Illegal state")
            }
            // Pop: go back to the element that was the destination before this one was entered.
            destination = parent
        }

        /// Tells whether the node's tag is in the allow-list and the node is not a meta refresh.
        private func isSafeTag(node: Node) -> Bool {
            guard !isMetaRefresh(node: node) else { return false }

            let tag = node.nodeName().lowercased()
            return Self.allowedTags.contains(tag)
        }

        /// Tells whether the node is a `meta` element whose `http-equiv` attribute, once trimmed and
        /// lowercased, equals `refresh`.
        private func isMetaRefresh(node: Node) -> Bool {
            let tag = node.nodeName().lowercased()
            guard tag == "meta" else { return false }

            let attributeValue = try? node.attributes?.getIgnoreCase(key: "http-equiv").trim().lowercased()
            return attributeValue == "refresh"
        }
    }
}