From 9511b86ecd270ea0031f211f171abc5b24fdf891 Mon Sep 17 00:00:00 2001 From: Valentin Perignon Date: Wed, 26 Apr 2023 15:28:48 +0200 Subject: [PATCH] feat: Cleaner can clean head and body --- Sources/Cleaner.swift | 173 +++++++++++++++++++------------------- Sources/HeadCleaner.swift | 90 -------------------- 2 files changed, 87 insertions(+), 176 deletions(-) delete mode 100644 Sources/HeadCleaner.swift diff --git a/Sources/Cleaner.swift b/Sources/Cleaner.swift index 6c16c76..381ddae 100644 --- a/Sources/Cleaner.swift +++ b/Sources/Cleaner.swift @@ -9,131 +9,132 @@ import Foundation open class Cleaner { - fileprivate let whitelist: Whitelist + fileprivate let headWhitelist: Whitelist? + fileprivate let bodyWhitelist: Whitelist - /** - Create a new cleaner, that sanitizes documents using the supplied whitelist. - @param whitelist white-list to clean with - */ - public init(_ whitelist: Whitelist) { - self.whitelist = whitelist + /// Create a new cleaner, that sanitizes documents' `<head>` and `<body>` using the supplied whitelist. + /// - Parameters: + /// - headWhitelist: Whitelist to clean the head with + /// - bodyWhitelist: Whitelist to clean the body with + public init(headWhitelist: Whitelist?, bodyWhitelist: Whitelist) { + self.headWhitelist = headWhitelist + self.bodyWhitelist = bodyWhitelist } - /** - Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist. - The original document is not modified. Only elements from the dirt document's body are used. - @param dirtyDocument Untrusted base document to clean. - @return cleaned document. - */ - public func clean(_ dirtyDocument: Document)throws->Document { - //Validate.notNull(dirtyDocument) - let clean: Document = Document.createShell(dirtyDocument.getBaseUri()) - if (dirtyDocument.body() != nil && clean.body() != nil) // frameset documents won't have a body. the clean doc will have empty body. - { - try copySafeNodes(dirtyDocument.body()!, clean.body()!) 
- } + /// Create a new cleaner, that sanitizes documents' `<body>` using the supplied whitelist. + /// - Parameter whitelist: Whitelist to clean the body with + convenience init(_ whitelist: Whitelist) { + self.init(headWhitelist: nil, bodyWhitelist: whitelist) + } + + /// Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist. + /// The original document is not modified. Only elements from the dirt document's `<body>` are used. + /// - Parameter dirtyDocument: Untrusted base document to clean. + /// - Returns: A cleaned document. + public func clean(_ dirtyDocument: Document) throws -> Document { + let clean = Document.createShell(dirtyDocument.getBaseUri()) + if let headWhitelist, let dirtHead = dirtyDocument.head(), let cleanHead = clean.head() { // frameset documents won't have a head. the clean doc will have empty head. + try copySafeNodes(dirtHead, cleanHead, whitelist: headWhitelist) + } + if let dirtBody = dirtyDocument.body(), let cleanBody = clean.body() { // frameset documents won't have a body. the clean doc will have empty body. + try copySafeNodes(dirtBody, cleanBody, whitelist: bodyWhitelist) + } return clean } - /** - Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes - in the input HTML are allowed by the whitelist. -

- This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully - using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document - to ensure enforced attributes are set correctly, and that the output is tidied. -

- @param dirtyDocument document to test - @return true if no tags or attributes need to be removed; false if they do - */ - public func isValid(_ dirtyDocument: Document)throws->Bool { - //Validate.notNull(dirtyDocument) - let clean: Document = Document.createShell(dirtyDocument.getBaseUri()) - let numDiscarded: Int = try copySafeNodes(dirtyDocument.body()!, clean.body()!) - return numDiscarded == 0 + /// Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes + /// in the input HTML are allowed by the whitelist. + /// + /// This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully + /// using the ``clean(_:)`` document. If using as a validator, it is recommended to still clean the document + /// to ensure enforced attributes are set correctly, and that the output is tidied. + /// - Parameter dirtyDocument: document to test + /// - Returns: true if no tags or attributes need to be removed; false if they do + public func isValid(_ dirtyDocument: Document) throws -> Bool { + let clean = Document.createShell(dirtyDocument.getBaseUri()) + let numDiscarded = try copySafeNodes(dirtyDocument.body()!, clean.body()!, whitelist: bodyWhitelist) + return numDiscarded == 0 } @discardableResult - fileprivate func copySafeNodes(_ source: Element, _ dest: Element)throws->Int { - let cleaningVisitor: Cleaner.CleaningVisitor = Cleaner.CleaningVisitor(source, dest, self) - let traversor: NodeTraversor = NodeTraversor(cleaningVisitor) - try traversor.traverse(source) + fileprivate func copySafeNodes(_ source: Element, _ dest: Element, whitelist: Whitelist) throws -> Int { + let cleaningVisitor = Cleaner.CleaningVisitor(source, dest, whitelist) + try NodeTraversor(cleaningVisitor).traverse(source) return cleaningVisitor.numDiscarded } - - fileprivate func createSafeElement(_ sourceEl: Element)throws->ElementMeta { - let sourceTag: String = sourceEl.tagName() - let 
destAttrs: Attributes = Attributes() - let dest: Element = try Element(Tag.valueOf(sourceTag), sourceEl.getBaseUri(), destAttrs) - var numDiscarded: Int = 0 - - if let sourceAttrs = sourceEl.getAttributes() { - for sourceAttr: Attribute in sourceAttrs { - if (try whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) { - destAttrs.put(attribute: sourceAttr) - } else { - numDiscarded+=1 - } - } - } - let enforcedAttrs: Attributes = try whitelist.getEnforcedAttributes(sourceTag) - destAttrs.addAll(incoming: enforcedAttrs) - - return ElementMeta(dest, numDiscarded) - } - } extension Cleaner { fileprivate final class CleaningVisitor: NodeVisitor { - var numDiscarded: Int = 0 - let root: Element - var destination: Element? // current element to append nodes to + private(set) var numDiscarded = 0 - private var cleaner: Cleaner + private let root: Element + private var destination: Element? // current element to append nodes to - public init(_ root: Element, _ destination: Element, _ cleaner: Cleaner) { + private let whitelist: Whitelist + + public init(_ root: Element, _ destination: Element, _ whitelist: Whitelist) { self.root = root self.destination = destination - self.cleaner = cleaner + self.whitelist = whitelist } - public func head(_ source: Node, _ depth: Int)throws { - if let sourceEl = (source as? Element) { - if (cleaner.whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs - let meta: Cleaner.ElementMeta = try cleaner.createSafeElement(sourceEl) - let destChild: Element = meta.el + public func head(_ source: Node, _ depth: Int) throws { + if let sourceEl = source as? Element { + if whitelist.isSafeTag(sourceEl.tagName()) { // safe, clone and copy safe attrs + let meta = try createSafeElement(sourceEl) + let destChild = meta.el try destination?.appendChild(destChild) numDiscarded += meta.numAttribsDiscarded destination = destChild - } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. 
- numDiscarded+=1 + } else if source != root { // not a safe tag, so don't add. don't count root against discarded. + numDiscarded += 1 } - } else if let sourceText = (source as? TextNode) { - let destText: TextNode = TextNode(sourceText.getWholeText(), source.getBaseUri()) + } else if let sourceText = source as? TextNode { + let destText = TextNode(sourceText.getWholeText(), source.getBaseUri()) try destination?.appendChild(destText) - } else if let sourceData = (source as? DataNode) { - if sourceData.parent() != nil && cleaner.whitelist.isSafeTag(sourceData.parent()!.nodeName()) { - //let sourceData: DataNode = (DataNode) source - let destData: DataNode = DataNode(sourceData.getWholeData(), source.getBaseUri()) + } else if let sourceData = source as? DataNode { + if sourceData.parent() != nil && whitelist.isSafeTag(sourceData.parent()!.nodeName()) { + let destData = DataNode(sourceData.getWholeData(), source.getBaseUri()) try destination?.appendChild(destData) } else { - numDiscarded+=1 + numDiscarded += 1 } } else { // else, we don't care about comments, xml proc instructions, etc - numDiscarded+=1 + numDiscarded += 1 } } - public func tail(_ source: Node, _ depth: Int)throws { - if let x = (source as? Element) { - if cleaner.whitelist.isSafeTag(x.nodeName()) { + public func tail(_ source: Node, _ depth: Int) throws { + if let x = source as? 
Element { + if whitelist.isSafeTag(x.nodeName()) { // would have descended, so pop destination stack destination = destination?.parent() } } } + + private func createSafeElement(_ sourceEl: Element) throws -> ElementMeta { + let sourceTag = sourceEl.tagName() + let destAttrs = Attributes() + var numDiscarded = 0 + + if let sourceAttrs = sourceEl.getAttributes() { + for sourceAttr in sourceAttrs { + if try whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr) { + destAttrs.put(attribute: sourceAttr) + } else { + numDiscarded += 1 + } + } + } + let enforcedAttrs = try whitelist.getEnforcedAttributes(sourceTag) + destAttrs.addAll(incoming: enforcedAttrs) + + let dest = try Element(Tag.valueOf(sourceTag), sourceEl.getBaseUri(), destAttrs) + return ElementMeta(dest, numDiscarded) + } } } diff --git a/Sources/HeadCleaner.swift b/Sources/HeadCleaner.swift deleted file mode 100644 index 856064a..0000000 --- a/Sources/HeadCleaner.swift +++ /dev/null @@ -1,90 +0,0 @@ -// -// HeadCleaner.swift -// SwiftSoup -// -// Created by Valentin Perignon on 25/04/2023. -// - -import Foundation - -public enum HeadCleaner { - /// Adds to the destination document a sanitized version from the dirt document's `code`. - /// - Parameters: - /// - dirtyDocument: Source document containing the tag `<head>` to sanitize - /// - destinationDocument: Document with a cleaned body. 
- public static func clean(dirtyDocument: Document, destinationDocument: Document) throws { - guard let dirtHead = dirtyDocument.head(), let cleanedHead = destinationDocument.head() else { return } - try copySafeNodes(source: dirtHead, destination: cleanedHead) - } - - static private func copySafeNodes(source: Element, destination: Element) throws { - let cleaningVisitor = CleaningVisitor(root: source, destination: destination) - try NodeTraversor(cleaningVisitor).traverse(source) - } -} - -extension HeadCleaner { - private final class CleaningVisitor: NodeVisitor { - private static let allowedTags = ["style", "meta", "base"] - - private let root: Element - private var destination: Element - - private var elementToSkip: Element? - - init(root: Element, destination: Element) { - self.root = root - self.destination = destination - } - - public func head(_ node: SwiftSoup.Node, _ depth: Int) throws { - guard elementToSkip == nil else { return } - - if let elementNode = node as? Element { - if isSafeTag(node: elementNode) { - let sourceTag = elementNode.nodeName() - - guard let destinationAttributes = elementNode.attributes?.clone() else { return } - let destinationChild = Element(Tag(sourceTag), elementNode.baseUri ?? "", destinationAttributes) - try destination.appendChild(destinationChild) - destination = destinationChild - } else if node != root { - elementToSkip = elementNode - } - } else if let textNode = node as? TextNode { - let destinationText = TextNode(textNode.getWholeText(), textNode.getBaseUri()) - try destination.appendChild(destinationText) - } else if let dataNode = node as? DataNode, let parent = node.parent(), isSafeTag(node: parent) { - let destinationData = DataNode(dataNode.getWholeData(), dataNode.getBaseUri()) - try destination.appendChild(destinationData) - } - } - - public func tail(_ node: SwiftSoup.Node, _ depth: Int) throws { - if node == elementToSkip { - elementToSkip = nil - } else if let elementNode = node as? 
Element, isSafeTag(node: elementNode) { - if let parent = destination.parent() { - destination = parent - } else { - throw Exception.Error(type: .IllegalArgumentException, Message: "Illegal state") - } - } - } - - private func isSafeTag(node: Node) -> Bool { - guard !isMetaRefresh(node: node) else { return false } - - let tag = node.nodeName().lowercased() - return Self.allowedTags.contains(tag) - } - - private func isMetaRefresh(node: Node) -> Bool { - let tag = node.nodeName().lowercased() - guard tag == "meta" else { return false } - - let attributeValue = try? node.attributes?.getIgnoreCase(key: "http-equiv").trim().lowercased() - return attributeValue == "refresh" - } - } -}