I retrieve a docx via a web service call. I then unzip it with SSZipArchive. From there, I parse it with SwiftSoup. See code below
if let xmlURL = self.extractDocxAttachment(data: data) {
let string = self.getStringFrom(docURL: xmlURL)
print(string)
} else {
print("Could not convert doc")
}
func extractDocxAttachment(data: Data) -> URL? {
print(#function)
do {
// Save ZIP data to a temporary file
let tempZipURL = URL(fileURLWithPath: NSTemporaryDirectory()).appendingPathComponent("temp.zip")
try data.write(to: tempZipURL)
// Extract ZIP archive using SSZipArchive
let destinationDir = URL(fileURLWithPath: NSTemporaryDirectory()).appendingPathComponent("extracted-docx")
let success = SSZipArchive.unzipFile(atPath: tempZipURL.path, toDestination: destinationDir.path)
// Clean up temporary ZIP file
try FileManager.default.removeItem(at: tempZipURL)
if success {
// Get URL of word/document.xml file
let documentXMLFileURL = destinationDir.appendingPathComponent("word").appendingPathComponent("document.xml")
return documentXMLFileURL
} else {
print("Failed to extract DOCX file.")
return nil
}
} catch {
print("Error extracting DOCX file: \(error)")
return nil
}
}
func getStringFrom(docURL: URL) -> String {
print(#function)
// Initialize an ordered set to store unique text content while preserving order
var uniqueTexts = OrderedSet<String>()
do {
// Read XML file as string
let xmlString = try String(contentsOf: docURL, encoding: .utf8)
// Parse the XML string using SwiftSoup
let document = try SwiftSoup.parse(xmlString)
// Extract text content from XML document, preserving newline characters
let elements = try document.select("body *").array() // Select all elements inside <body>
for element in elements {
// Get text content of the element
let elementText = try element.text()
// Insert non-empty element texts into the ordered set
if !elementText.isEmpty {
uniqueTexts.insert(elementText)
}
}
} catch {
// Handle any parsing or file reading errors and print an error message
print("Error parsing XML file \(docURL.lastPathComponent): \(error)")
}
// Return the concatenated XML text with newline characters
return uniqueTexts.arrayRepresentation().joined(separator: "\n")
}
struct OrderedSet<T: Hashable> {
private var array = [T]()
private var set = Set<T>()
mutating func insert(_ element: T) {
if !set.contains(element) {
array.append(element)
set.insert(element)
}
}
func arrayRepresentation() -> [T] {
return array
}
The problem I'm having is that the parsed data is being split up strangely because of something called "gramStart." When I look at the XML, I can see that it sometimes separates out the last word or words of a given line. For example, see below:
<w:t xml:space="preserve">4 cups cooked white </w:t>
</w:r>
<w:proofErr w:type="gramStart"/>
<w:r w:rsidRPr="0052776F">
<w:rPr>
<w:rFonts w:ascii="Helvetica" w:hAnsi="Helvetica"/>
<w:kern w:val="0"/>
<w:sz w:val="27"/>
<w:szCs w:val="27"/>
<w14:ligatures w14:val="none"/>
</w:rPr>
<w:t>rice</w:t>
</w:r>
<w:proofErr w:type="gramEnd"/>
</w:p>
From what I can tell from the documentation, that tag occurs when Word has flagged a phrase as needing grammar checking for some reason. This is ruining my parsing because it interprets those as separate lines - i.e. I receive:
4 cups cooked white rice
My question is, how can I avoid this? Can I either, 1). get SwiftSoup to ignore that tag, 2). get ZipArchive, when unzipping, to ignore that tag, or 3). use different extensions to do this instead?
Thanks!