Removed CSV namespacing from the Parser and SyncParser types. Removed the CSV.Delimiter struct.

This commit is contained in:
Caleb Kleveter 2019-04-18 12:19:08 -05:00
parent 1a12fe8bde
commit e4d21444fb
No known key found for this signature in database
GPG Key ID: B38DBD5CF2C98D69
8 changed files with 313 additions and 317 deletions

View File

@ -10,11 +10,4 @@ public struct CSV {
self.fields = fields
}
}
/// ASCII byte values of the characters that carry structural meaning in CSV data.
internal struct Delimiter {
    /// `,` — separates two cells on the same row.
    static let comma: UInt8 = 0x2C

    /// `"` — opens or closes a quoted cell.
    static let quote: UInt8 = 0x22

    /// `\n` — line feed.
    static let newLine: UInt8 = 0x0A

    /// `\r` — carriage return.
    static let carriageReturn: UInt8 = 0x0D
}
}

View File

@ -1,293 +0,0 @@
import Foundation
/// An error that bundles every error collected during one operation.
public struct ErrorList: Error {
    /// The collected errors, in the order they were appended.
    public var errors: [Error]

    /// Creates a list, by default without any errors.
    public init(errors: [Error] = []) {
        self.errors = errors
    }

    /// `.success` when nothing was collected, otherwise `.failure` wrapping this list.
    var result: Result<Void, ErrorList> {
        guard errors.isEmpty else { return .failure(self) }
        return .success(())
    }
}
extension CSV {
/// An event-driven CSV parser that works on raw bytes and can be fed a document in chunks.
///
/// Cells on the first line are reported through `onHeader`; every later cell is reported
/// through `onCell` together with the header of the column it belongs to. Parsing state is
/// kept between calls to `parse(_:length:)`, so an unfinished cell can be continued by the
/// next chunk.
public struct Parser {
    /// Receives the bytes of one header cell.
    public typealias HeaderHandler = (_ title: [UInt8])throws -> ()

    /// Receives the header of the current column and the bytes of one cell.
    public typealias CellHandler = (_ title: [UInt8], _ contents: [UInt8])throws -> ()

    /// Which part of the document the parser is currently reading.
    internal enum Position {
        case headers
        case cells
    }

    /// Everything that has to survive from one `parse(_:length:)` call to the next.
    private struct State {
        /// Header cells seen so far, in document order.
        var headers: [[UInt8]]
        /// Whether the header line or the data rows are being read.
        var position: Position
        /// `true` while the parser is inside a quoted cell.
        var inQuotes: Bool
        /// Bytes of a cell that was still unfinished when the previous chunk ended.
        var store: [UInt8]
        /// Running count of emitted data cells; wrapped with `%` to find the column.
        var headerIndex: Array<[UInt8]>.Index
        /// Remaining-byte estimate used to decide whether more chunks will follow.
        var bytesLeft: Int?

        init() {
            self.headers = []
            self.position = .headers
            self.inQuotes = false
            self.store = []
            self.headerIndex = Array<[UInt8]>().startIndex
            self.bytesLeft = nil
        }
    }

    /// Called for every header cell; errors it throws are collected, not rethrown.
    public var onHeader: HeaderHandler?

    /// Called for every data cell; errors it throws are collected, not rethrown.
    public var onCell: CellHandler?

    private var state: State

    /// The header of the column the next data cell belongs to.
    ///
    /// Traps when no header has been recorded yet (modulo by zero).
    internal var currentHeader: [UInt8] {
        return self.state.headers[self.state.headerIndex % self.state.headers.count]
    }

    /// Creates a parser with optional header and cell callbacks.
    public init(onHeader: HeaderHandler? = nil, onCell: CellHandler? = nil) {
        self.onHeader = onHeader
        self.onCell = onCell
        self.state = State()
    }

    /// Parses one chunk of CSV bytes, invoking `onHeader` / `onCell` for every finished cell.
    ///
    /// - Parameters:
    ///   - data: The bytes of this chunk.
    ///   - length: When given, used to track how many bytes are still expected; while more
    ///     bytes are expected the trailing unfinished cell is stored instead of emitted.
    ///     The chunked-parse test passes the summed size of all chunks here. When `nil`,
    ///     the trailing cell is always emitted at the end of the call.
    /// - Returns: `.success` when no handler threw, otherwise `.failure` with every error
    ///   thrown by the handlers during this call.
    @discardableResult
    public mutating func parse(_ data: [UInt8], length: Int? = nil) -> Result<Void, ErrorList> {
        // Resume the cell the previous chunk left unfinished.
        var currentCell: [UInt8] = self.state.store
        var index = data.startIndex
        var updateState = false
        var errors = ErrorList()

        // Half-open range of `data` belonging to the current cell but not yet copied into it.
        var slice: (start: Int, end: Int) = (index, index)

        while index < data.endIndex {
            let byte = data[index]
            switch byte {
            case Delimiter.quote:
                // Flush what was read so far; the quote byte itself is never copied.
                currentCell.append(contentsOf: data[slice.start..<slice.end])
                slice = (index + 1, index + 1)

                let isEscapedQuote = self.state.inQuotes
                    && index + 1 < data.endIndex
                    && data[index + 1] == Delimiter.quote
                if isEscapedQuote {
                    // `""` inside a quoted cell stands for one literal quote. Step over the
                    // second quote AND include it in the slice; without widening the slice
                    // it stays one byte short and the last byte before the next quote or
                    // delimiter is lost.
                    index += 1
                    slice.end += 1
                } else {
                    self.state.inQuotes.toggle()
                }
            case Delimiter.carriageReturn:
                if self.state.inQuotes {
                    slice.end += 1
                } else {
                    // Treat "\r\n" as a single line break.
                    if index + 1 < data.endIndex, data[index + 1] == Delimiter.newLine {
                        index += 1
                    }
                    fallthrough
                }
            case Delimiter.newLine:
                if self.state.inQuotes {
                    slice.end += 1
                } else {
                    // The header line ends here; switch to cells once this cell is emitted.
                    if self.state.position == .headers { updateState = true }
                    fallthrough
                }
            case Delimiter.comma:
                if self.state.inQuotes {
                    slice.end += 1
                } else {
                    currentCell.append(contentsOf: data[slice.start..<slice.end])
                    switch self.state.position {
                    case .headers:
                        self.state.headers.append(currentCell)
                        do { try self.onHeader?(currentCell) }
                        catch let error { errors.errors.append(error) }
                    case .cells:
                        do { try self.onCell?(self.currentHeader, currentCell) }
                        catch let error { errors.errors.append(error) }
                        self.state.headerIndex += 1
                    }
                    currentCell = []
                    slice = (index + 1, index + 1)
                }
            default:
                slice.end += 1
            }

            if updateState { self.state.position = .cells }
            index += 1
        }

        // Whatever is left in the slice belongs to the trailing cell.
        currentCell.append(contentsOf: data[slice.start..<slice.end])

        if let length = length {
            // Subtract the bytes that went into finished cells from the running total.
            if let left = self.state.bytesLeft {
                self.state.bytesLeft = left - ((self.state.store.count + data.count) - currentCell.count)
            } else {
                self.state.bytesLeft = length - ((self.state.store.count + data.count) - currentCell.count)
            }
            // More bytes are expected: keep the trailing cell for the next call.
            if (self.state.bytesLeft ?? 0) > currentCell.count {
                self.state.store = currentCell
                return errors.result
            }
        }

        // No more input expected: emit the trailing cell.
        // NOTE(review): input ending in a line break also emits one empty trailing cell
        // here — confirm callers expect that.
        switch self.state.position {
        case .headers:
            self.state.headers.append(currentCell)
            do { try self.onHeader?(currentCell) }
            catch let error { errors.errors.append(error) }
        case .cells:
            do { try self.onCell?(self.currentHeader, currentCell) }
            catch let error { errors.errors.append(error) }
        }
        return errors.result
    }
}
/// Parses a complete CSV document in one call and returns its columns keyed by header.
public final class SyncParser {
    public init() {}

    /// Parses raw CSV bytes.
    ///
    /// - Parameter data: The whole document.
    /// - Returns: One entry per header; each value lists that column's cells, with empty
    ///   cells stored as `nil`.
    public func parse(_ data: [UInt8]) -> [[UInt8]: [[UInt8]?]] {
        var columns: [[UInt8]: [[UInt8]?]] = [:]
        var parser = Parser(
            onHeader: { title in columns[title] = [] },
            onCell: { title, contents in
                columns[title, default: []].append(contents.isEmpty ? nil : contents)
            }
        )

        parser.parse(data)
        return columns
    }

    /// Parses a CSV string by way of its UTF-8 bytes.
    ///
    /// - Parameter data: The whole document.
    /// - Returns: One entry per header; each value lists that column's cells, with empty
    ///   cells stored as `nil`.
    public func parse(_ data: String) -> [String: [String?]] {
        var columns: [String: [String?]] = [:]
        var parser = Parser(
            onHeader: { title in
                let name = String(decoding: title, as: UTF8.self)
                columns[name] = []
            },
            onCell: { title, contents in
                let name = String(decoding: title, as: UTF8.self)
                let value: String? = contents.isEmpty ? nil : String(decoding: contents, as: UTF8.self)
                columns[name, default: []].append(value)
            }
        )

        parser.parse(Array(data.utf8))
        return columns
    }
}
}
extension CSV {
    /// Parses a complete CSV document into columns keyed by header title.
    ///
    /// Every quote byte is stripped from titles and cells. Commas and line breaks inside a
    /// quoted section stay part of the cell. Empty cells, and cells that are not valid
    /// UTF-8, are stored as `nil`.
    ///
    /// Compared with the previous implementation this version no longer traps on unquoted
    /// empty fields (`a,,b`), treats a lone `\r` as a line break without swallowing the
    /// byte after it, ignores cells beyond the last header column instead of indexing out
    /// of bounds, and treats a document without any line break as a header line.
    ///
    /// - Parameter csv: The whole document.
    /// - Returns: The parsed columns, or an empty dictionary when a header title is not
    ///   valid UTF-8.
    public static func parse(_ csv: Data) -> [String: [String?]] {
        let bytes = Array(csv)
        let estimatedRowCount = bytes.reduce(0) { $1 == Delimiter.newLine ? $0 + 1 : $0 }

        var columns: [(title: String, cells: [String?])] = []
        var columnIndex = 0
        var inHeader = true
        var inQuotes = false
        var cellStart = bytes.startIndex
        var index = bytes.startIndex

        // Stores the cell `bytes[cellStart..<end]` as a header title or as a data cell.
        // Returns `false` only when a header title is not valid UTF-8.
        func finishCell(endingAt end: Int) -> Bool {
            let cell = bytes[cellStart..<end].filter { $0 != Delimiter.quote }
            if inHeader {
                guard let title = String(bytes: cell, encoding: .utf8) else { return false }
                var cells: [String?] = []
                cells.reserveCapacity(estimatedRowCount)
                columns.append((title, cells))
            } else if columnIndex < columns.count {
                columns[columnIndex].cells.append(cell.isEmpty ? nil : String(bytes: cell, encoding: .utf8))
            }
            return true
        }

        while index < bytes.endIndex {
            let byte = bytes[index]
            var next = index + 1

            switch byte {
            case Delimiter.quote:
                inQuotes.toggle()
            case Delimiter.comma where !inQuotes:
                guard finishCell(endingAt: index) else { return [:] }
                if !inHeader { columnIndex += 1 }
                cellStart = next
            case Delimiter.newLine where !inQuotes, Delimiter.carriageReturn where !inQuotes:
                guard finishCell(endingAt: index) else { return [:] }
                // Consume the "\n" of a "\r\n" pair, but only if it is really there.
                if byte == Delimiter.carriageReturn, next < bytes.endIndex, bytes[next] == Delimiter.newLine {
                    next += 1
                }
                inHeader = false
                columnIndex = 0
                cellStart = next
            default:
                break
            }

            index = next
        }

        // A document that does not end in a delimiter still has one unfinished cell.
        if cellStart < bytes.endIndex {
            guard finishCell(endingAt: bytes.endIndex) else { return [:] }
        }

        return columns.reduce(into: [:]) { result, column in
            result[column.title] = column.cells
        }
    }

    /// Parses a complete CSV document into `Column` values keyed by header title.
    public static func parse(_ data: Data) -> [String: Column] {
        // The type annotation selects the `[String: [String?]]` overload.
        let elements: [String: [String?]] = self.parse(data)
        return elements.reduce(into: [:]) { columns, element in
            columns[element.key] = Column(header: element.key, fields: element.value)
        }
    }

    /// Parses a complete CSV document into a list of `Column` values.
    ///
    /// The list is built by iterating a dictionary, so its order is unspecified.
    public static func parse(_ data: Data) -> [Column] {
        // The type annotation selects the `[String: [String?]]` overload.
        let elements: [String: [String?]] = self.parse(data)
        return elements.reduce(into: []) { columns, element in
            columns.append(Column(header: element.key, fields: element.value))
        }
    }
}

View File

@ -53,8 +53,8 @@ public struct Serializer {
guard data.count > 0 else { return errors.result }
if !self.serializedHeaders {
let headers = data.keys.map { title in Array([[CSV.Delimiter.quote], title.bytes, [CSV.Delimiter.quote]].joined()) }
do { try self.onRow(Array(headers.joined(separator: [CSV.Delimiter.comma]))) }
let headers = data.keys.map { title in Array([[34], title.bytes, [34]].joined()) }
do { try self.onRow(Array(headers.joined(separator: [10]))) }
catch let error { errors.errors.append(error) }
self.serializedHeaders = true
}
@ -62,9 +62,9 @@ public struct Serializer {
guard let first = data.first?.value else { return errors.result }
(first.startIndex..<first.endIndex).forEach { index in
let cells = data.values.map { column -> [UInt8] in
return Array([[CSV.Delimiter.quote], column[index].bytes, [CSV.Delimiter.quote]].joined())
return Array([[34], column[index].bytes, [34]].joined())
}
do { try onRow(Array(cells.joined(separator: [CSV.Delimiter.comma]))) }
do { try onRow(Array(cells.joined(separator: [10]))) }
catch let error { errors.errors.append(error) }
}
@ -85,7 +85,7 @@ public struct SyncSerializer {
var serializer = Serializer { row in rows.append(row) }
serializer.serialize(data)
return Array(rows.joined(separator: [CSV.Delimiter.newLine]))
return Array(rows.joined(separator: [10]))
}
}

View File

@ -50,7 +50,7 @@ public final class CSVSyncEncoder {
}
try objects.forEach(encoder.encode)
return Data(rows.joined(separator: [CSV.Delimiter.newLine]))
return Data(rows.joined(separator: [10]))
}
}

View File

@ -75,7 +75,7 @@ internal final class _CSVAsyncDecoder: Decoder {
}
internal final class AsyncDecoderHandler {
var parser: CSV.Parser
var parser: Parser
var currentRow: [String: [UInt8]]
var onRow: ([String: [UInt8]])throws -> ()
@ -83,7 +83,7 @@ internal final class AsyncDecoderHandler {
private var currentColumn: Int
init(onRow: @escaping ([String: [UInt8]])throws -> ()) {
self.parser = CSV.Parser()
self.parser = Parser()
self.currentRow = [:]
self.onRow = onRow
self.columnCount = 0

296
Sources/CSV/Parser.swift Normal file
View File

@ -0,0 +1,296 @@
import Foundation
// '\n' => 10
// '\r' => 13
// '"' => 34
// ',' => 44
/// An error that bundles every error collected during one operation.
public struct ErrorList: Error {
    /// The collected errors, in the order they were appended.
    public var errors: [Error]

    /// Creates a list, by default without any errors.
    public init(errors: [Error] = []) {
        self.errors = errors
    }

    /// `.success` when nothing was collected, otherwise `.failure` wrapping this list.
    var result: Result<Void, ErrorList> {
        guard errors.isEmpty else { return .failure(self) }
        return .success(())
    }
}
/// An event-driven CSV parser that works on raw bytes and can be fed a document in chunks.
///
/// Cells on the first line are reported through `onHeader`; every later cell is reported
/// through `onCell` together with the header of the column it belongs to. Parsing state is
/// kept between calls to `parse(_:length:)`, so an unfinished cell can be continued by the
/// next chunk.
public struct Parser {
    /// Receives the bytes of one header cell.
    public typealias HeaderHandler = (_ title: [UInt8])throws -> ()

    /// Receives the header of the current column and the bytes of one cell.
    public typealias CellHandler = (_ title: [UInt8], _ contents: [UInt8])throws -> ()

    /// Which part of the document the parser is currently reading.
    internal enum Position {
        case headers
        case cells
    }

    /// Everything that has to survive from one `parse(_:length:)` call to the next.
    private struct State {
        /// Header cells seen so far, in document order.
        var headers: [[UInt8]]
        /// Whether the header line or the data rows are being read.
        var position: Position
        /// `true` while the parser is inside a quoted cell.
        var inQuotes: Bool
        /// Bytes of a cell that was still unfinished when the previous chunk ended.
        var store: [UInt8]
        /// Running count of emitted data cells; wrapped with `%` to find the column.
        var headerIndex: Array<[UInt8]>.Index
        /// Remaining-byte estimate used to decide whether more chunks will follow.
        var bytesLeft: Int?

        init() {
            self.headers = []
            self.position = .headers
            self.inQuotes = false
            self.store = []
            self.headerIndex = Array<[UInt8]>().startIndex
            self.bytesLeft = nil
        }
    }

    /// Called for every header cell; errors it throws are collected, not rethrown.
    public var onHeader: HeaderHandler?

    /// Called for every data cell; errors it throws are collected, not rethrown.
    public var onCell: CellHandler?

    private var state: State

    /// The header of the column the next data cell belongs to.
    ///
    /// Traps when no header has been recorded yet (modulo by zero).
    internal var currentHeader: [UInt8] {
        return self.state.headers[self.state.headerIndex % self.state.headers.count]
    }

    /// Creates a parser with optional header and cell callbacks.
    public init(onHeader: HeaderHandler? = nil, onCell: CellHandler? = nil) {
        self.onHeader = onHeader
        self.onCell = onCell
        self.state = State()
    }

    /// Parses one chunk of CSV bytes, invoking `onHeader` / `onCell` for every finished cell.
    ///
    /// - Parameters:
    ///   - data: The bytes of this chunk.
    ///   - length: When given, used to track how many bytes are still expected; while more
    ///     bytes are expected the trailing unfinished cell is stored instead of emitted.
    ///     The chunked-parse test passes the summed size of all chunks here. When `nil`,
    ///     the trailing cell is always emitted at the end of the call.
    /// - Returns: `.success` when no handler threw, otherwise `.failure` with every error
    ///   thrown by the handlers during this call.
    @discardableResult
    public mutating func parse(_ data: [UInt8], length: Int? = nil) -> Result<Void, ErrorList> {
        // Resume the cell the previous chunk left unfinished.
        var currentCell: [UInt8] = self.state.store
        var index = data.startIndex
        var updateState = false
        var errors = ErrorList()

        // Half-open range of `data` belonging to the current cell but not yet copied into it.
        var slice: (start: Int, end: Int) = (index, index)

        while index < data.endIndex {
            let byte = data[index]
            switch byte {
            case 34: // '"'
                // Flush what was read so far; the quote byte itself is never copied.
                currentCell.append(contentsOf: data[slice.start..<slice.end])
                slice = (index + 1, index + 1)

                let isEscapedQuote = self.state.inQuotes
                    && index + 1 < data.endIndex
                    && data[index + 1] == 34
                if isEscapedQuote {
                    // `""` inside a quoted cell stands for one literal quote. Step over the
                    // second quote AND include it in the slice; without widening the slice
                    // it stays one byte short and the last byte before the next quote or
                    // delimiter is lost.
                    index += 1
                    slice.end += 1
                } else {
                    self.state.inQuotes.toggle()
                }
            case 13: // '\r'
                if self.state.inQuotes {
                    slice.end += 1
                } else {
                    // Treat "\r\n" as a single line break.
                    if index + 1 < data.endIndex, data[index + 1] == 10 {
                        index += 1
                    }
                    fallthrough
                }
            case 10: // '\n'
                if self.state.inQuotes {
                    slice.end += 1
                } else {
                    // The header line ends here; switch to cells once this cell is emitted.
                    if self.state.position == .headers { updateState = true }
                    fallthrough
                }
            case 44: // ','
                if self.state.inQuotes {
                    slice.end += 1
                } else {
                    currentCell.append(contentsOf: data[slice.start..<slice.end])
                    switch self.state.position {
                    case .headers:
                        self.state.headers.append(currentCell)
                        do { try self.onHeader?(currentCell) }
                        catch let error { errors.errors.append(error) }
                    case .cells:
                        do { try self.onCell?(self.currentHeader, currentCell) }
                        catch let error { errors.errors.append(error) }
                        self.state.headerIndex += 1
                    }
                    currentCell = []
                    slice = (index + 1, index + 1)
                }
            default:
                slice.end += 1
            }

            if updateState { self.state.position = .cells }
            index += 1
        }

        // Whatever is left in the slice belongs to the trailing cell.
        currentCell.append(contentsOf: data[slice.start..<slice.end])

        if let length = length {
            // Subtract the bytes that went into finished cells from the running total.
            if let left = self.state.bytesLeft {
                self.state.bytesLeft = left - ((self.state.store.count + data.count) - currentCell.count)
            } else {
                self.state.bytesLeft = length - ((self.state.store.count + data.count) - currentCell.count)
            }
            // More bytes are expected: keep the trailing cell for the next call.
            if (self.state.bytesLeft ?? 0) > currentCell.count {
                self.state.store = currentCell
                return errors.result
            }
        }

        // No more input expected: emit the trailing cell.
        // NOTE(review): input ending in a line break also emits one empty trailing cell
        // here — confirm callers expect that.
        switch self.state.position {
        case .headers:
            self.state.headers.append(currentCell)
            do { try self.onHeader?(currentCell) }
            catch let error { errors.errors.append(error) }
        case .cells:
            do { try self.onCell?(self.currentHeader, currentCell) }
            catch let error { errors.errors.append(error) }
        }
        return errors.result
    }
}
/// Parses a complete CSV document in one call and returns its columns keyed by header.
public final class SyncParser {
    public init() {}

    /// Parses raw CSV bytes.
    ///
    /// - Parameter data: The whole document.
    /// - Returns: One entry per header; each value lists that column's cells, with empty
    ///   cells stored as `nil`.
    public func parse(_ data: [UInt8]) -> [[UInt8]: [[UInt8]?]] {
        var columns: [[UInt8]: [[UInt8]?]] = [:]
        var parser = Parser(
            onHeader: { title in columns[title] = [] },
            onCell: { title, contents in
                columns[title, default: []].append(contents.isEmpty ? nil : contents)
            }
        )

        parser.parse(data)
        return columns
    }

    /// Parses a CSV string by way of its UTF-8 bytes.
    ///
    /// - Parameter data: The whole document.
    /// - Returns: One entry per header; each value lists that column's cells, with empty
    ///   cells stored as `nil`.
    public func parse(_ data: String) -> [String: [String?]] {
        var columns: [String: [String?]] = [:]
        var parser = Parser(
            onHeader: { title in
                let name = String(decoding: title, as: UTF8.self)
                columns[name] = []
            },
            onCell: { title, contents in
                let name = String(decoding: title, as: UTF8.self)
                let value: String? = contents.isEmpty ? nil : String(decoding: contents, as: UTF8.self)
                columns[name, default: []].append(value)
            }
        )

        parser.parse(Array(data.utf8))
        return columns
    }
}
extension CSV {
    /// Parses a complete CSV document into columns keyed by header title.
    ///
    /// Every quote byte is stripped from titles and cells. Commas and line breaks inside a
    /// quoted section stay part of the cell. Empty cells, and cells that are not valid
    /// UTF-8, are stored as `nil`.
    ///
    /// This version restores the correct delimiter bytes: `,` (44) separates cells and
    /// both `\n` (10) and `\r` (13) end a row. It also no longer traps on unquoted empty
    /// fields (`a,,b`), treats a lone `\r` as a line break without swallowing the byte
    /// after it, ignores cells beyond the last header column instead of indexing out of
    /// bounds, and treats a document without any line break as a header line.
    ///
    /// - Parameter csv: The whole document.
    /// - Returns: The parsed columns, or an empty dictionary when a header title is not
    ///   valid UTF-8.
    public static func parse(_ csv: Data) -> [String: [String?]] {
        let comma: UInt8 = 44          // ','
        let quote: UInt8 = 34          // '"'
        let lineFeed: UInt8 = 10       // '\n'
        let carriageReturn: UInt8 = 13 // '\r'

        let bytes = Array(csv)
        let estimatedRowCount = bytes.reduce(0) { $1 == lineFeed ? $0 + 1 : $0 }

        var columns: [(title: String, cells: [String?])] = []
        var columnIndex = 0
        var inHeader = true
        var inQuotes = false
        var cellStart = bytes.startIndex
        var index = bytes.startIndex

        // Stores the cell `bytes[cellStart..<end]` as a header title or as a data cell.
        // Returns `false` only when a header title is not valid UTF-8.
        func finishCell(endingAt end: Int) -> Bool {
            let cell = bytes[cellStart..<end].filter { $0 != quote }
            if inHeader {
                guard let title = String(bytes: cell, encoding: .utf8) else { return false }
                var cells: [String?] = []
                cells.reserveCapacity(estimatedRowCount)
                columns.append((title, cells))
            } else if columnIndex < columns.count {
                columns[columnIndex].cells.append(cell.isEmpty ? nil : String(bytes: cell, encoding: .utf8))
            }
            return true
        }

        while index < bytes.endIndex {
            let byte = bytes[index]
            var next = index + 1

            switch byte {
            case quote:
                inQuotes.toggle()
            case comma where !inQuotes:
                guard finishCell(endingAt: index) else { return [:] }
                if !inHeader { columnIndex += 1 }
                cellStart = next
            case lineFeed where !inQuotes, carriageReturn where !inQuotes:
                guard finishCell(endingAt: index) else { return [:] }
                // Consume the "\n" of a "\r\n" pair, but only if it is really there.
                if byte == carriageReturn, next < bytes.endIndex, bytes[next] == lineFeed {
                    next += 1
                }
                inHeader = false
                columnIndex = 0
                cellStart = next
            default:
                break
            }

            index = next
        }

        // A document that does not end in a delimiter still has one unfinished cell.
        if cellStart < bytes.endIndex {
            guard finishCell(endingAt: bytes.endIndex) else { return [:] }
        }

        return columns.reduce(into: [:]) { result, column in
            result[column.title] = column.cells
        }
    }

    /// Parses a complete CSV document into `Column` values keyed by header title.
    public static func parse(_ data: Data) -> [String: Column] {
        // The type annotation selects the `[String: [String?]]` overload.
        let elements: [String: [String?]] = self.parse(data)
        return elements.reduce(into: [:]) { columns, element in
            columns[element.key] = Column(header: element.key, fields: element.value)
        }
    }

    /// Parses a complete CSV document into a list of `Column` values.
    ///
    /// The list is built by iterating a dictionary, so its order is unspecified.
    public static func parse(_ data: Data) -> [Column] {
        // The type annotation selects the `[String: [String?]]` overload.
        let elements: [String: [String?]] = self.parse(data)
        return elements.reduce(into: []) { columns, element in
            columns.append(Column(header: element.key, fields: element.value))
        }
    }
}

View File

@ -15,7 +15,7 @@ class CSVTests: XCTestCase {
func testAsyncParseSpeed()throws {
let url = URL(string: "file:/Users/calebkleveter/Development/developer_survey_2018.csv")!
let data = try Array(Data(contentsOf: url))
let parser = CSV.SyncParser()
let parser = SyncParser()
// 10.473
measure {
@ -26,7 +26,7 @@ class CSVTests: XCTestCase {
func testAsyncParseStringSpeed()throws {
let url = URL(string: "file:/Users/calebkleveter/Development/developer_survey_2018.csv")!
let data = try String(contentsOf: url)
let parser = CSV.SyncParser()
let parser = SyncParser()
// 18.083
measure {
@ -127,7 +127,7 @@ class CSVTests: XCTestCase {
let decodingOptions = CSVCodingOptions(boolCodingStrategy: .fuzzy, nilCodingStrategy: .custom("NA"))
let decoder = CSVDecoder(decodingOptions: decodingOptions)
// 20.948
// 19.767
measure {
do {
_ = try decoder.sync.decode(Response.self, from: data)
@ -156,7 +156,7 @@ class CSVTests: XCTestCase {
func testCSVSyncSeralizationSpeed() throws {
let url = URL(string: "file:/Users/calebkleveter/Development/developer_survey_2018.csv")!
let data = try Array(Data(contentsOf: url))
let parsed = CSV.SyncParser().parse(data)
let parsed = SyncParser().parse(data)
let serializer = SyncSerializer()
// 18.049

View File

@ -3,7 +3,7 @@ import XCTest
final class ParserTests: XCTestCase {
func testParserInit() {
let parser = CSV.Parser(onHeader: nil, onCell: nil)
let parser = Parser(onHeader: nil, onCell: nil)
XCTAssert(parser.onCell == nil)
XCTAssert(parser.onHeader == nil)
@ -13,7 +13,7 @@ final class ParserTests: XCTestCase {
var headers: [String] = []
var cells: [String: [String?]] = [:]
var parser = CSV.Parser(
var parser = Parser(
onHeader: { header in
if let title = String(bytes: header, encoding: .utf8) {
headers.append(title)
@ -49,7 +49,7 @@ final class ParserTests: XCTestCase {
var headers: [String] = []
var cells: [String: [String?]] = [:]
var parser = CSV.Parser(
var parser = Parser(
onHeader: { header in
if let title = String(bytes: header, encoding: .utf8) {
headers.append(title)
@ -86,7 +86,7 @@ final class ParserTests: XCTestCase {
}
func testMeasureFullParse() {
var parser = CSV.Parser(onHeader: { _ in return }, onCell: { _, _ in return })
var parser = Parser(onHeader: { _ in return }, onCell: { _, _ in return })
let csv = Array(data.utf8)
// 1.497
@ -98,7 +98,7 @@ final class ParserTests: XCTestCase {
}
func testMeasureChunkedParse() {
var parser = CSV.Parser(onHeader: { _ in return }, onCell: { _, _ in return })
var parser = Parser(onHeader: { _ in return }, onCell: { _, _ in return })
let chnks = chunks.map { Array($0.utf8) }
let length = chnks.reduce(0) { $0 + $1.count }