Substitute all xml text with beautifulsoup library

78 views Asked by At

I need to replace all the text in an XML document using the BeautifulSoup library in Python. For example, I have this piece of XML:

<Paragraph>
Procedure general informations
<IntLink Target="il_0_mob_411" Type="MediaObject"/>
<Strong>DIFFICULTY: </Strong>
<IntLink Target="il_0_mob_231" Type="MediaObject"/>
<IntLink Target="il_0_mob_231" Type="MediaObject"/> - 
<Strong>DURATION:</Strong> 15 min.<br/>
<Strong>TOOLS REQUIRED:</Strong> 4mm Allen Key, Pin driver
</Paragraph>

And I need it to become like this:

<Paragraph>
0
<IntLink Target="il_0_mob_411" Type="MediaObject"/>
<Strong>1</Strong>
<IntLink Target="il_0_mob_231" Type="MediaObject"/>
<IntLink Target="il_0_mob_231" Type="MediaObject"/> - 
<Strong>2</Strong>3<br/>
<Strong>4</Strong>5
</Paragraph>

Thank you!

1

There is 1 answer

0
Francesco Pedrotti On

Here's the code:

# -*- coding: utf-8 -*-
import HTMLParser
import codecs
import os
import sys
from bs4 import BeautifulSoup


# Export every non-blank text node of export_2.xml to export.txt as
# "<counter>;<text>" lines, and write a copy of the document in which each of
# those text nodes is replaced by its counter to export_ph.xml.

# Parse the input and close the file as soon as parsing is done.
# NOTE(review): no parser is named, so BeautifulSoup falls back to its default
# parser. If that is an HTML parser, mixed-case tags such as <IntLink> may not
# round-trip unchanged -- confirm, and consider the "xml" feature (needs lxml).
with open("export_2.xml") as xml_doc:
    soup = BeautifulSoup(xml_doc)
pars = HTMLParser.HTMLParser()

counter = 0
# The text nodes are collected up front, so replacing nodes inside the loop
# below does not disturb the iteration.
all_texts = soup.find_all(text=True)

print("Inizio esportazione:")
# Open the text export once; mode "w" truncates the output of a previous run,
# which replaces the old truncate-then-append-per-node sequence.
with codecs.open("export.txt", "w", encoding="utf-8") as file_text:
    for text in all_texts:
        s = pars.unescape(text)
        # Test the node text itself. The previous check ran on the already
        # prefixed "N;text\n" line and therefore could never be true.
        if not (s == "" or s.isspace()):
            file_text.write(str(counter) + ";" + s + "\n")
            # Put the placeholder in the tree: the text node becomes its index.
            text.replace_with(str(counter))
            # Only exported nodes consume a number, so placeholders are
            # consecutive (0, 1, 2, ...).
            counter = counter + 1
        sys.stdout.write(". ")

## put placeholder in the xml
# Serialise the modified tree once, markup included, instead of appending the
# plain text of every element.
with codecs.open("export_ph.xml", "w", encoding="utf-8") as file_xml:
    file_xml.write(soup.decode())


file_xml_info = os.path.getsize('export_ph.xml')
file_txt_info = os.path.getsize('export.txt')
if file_txt_info > 0 and file_xml_info > 0:
    print("\nEsportazione completata: \nFile xml: " + str(file_xml_info) + "B" + "\nFile testo a 3 colonne: " + str(file_txt_info) + "B")