Creating one XML file for each row in Excel sheet, based on predefined XML structure

213 views Asked by At

I have the following XML file and would like to import data from an Excel spreadsheet to be placed between certain elements such as eadid and titleproper. I've tried the attached Python code but this produced an XML file that did not include the full schema. I want to save each row of data in the Excel file to a separate XML file.

<?xml version="1.0" encoding="UTF-8"?>
<ead xmlns="urn:isbn:1-931666-22-9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:isbn:1-931666-22-9 http://www.loc.gov/ead/ead.xsd">
<eadheader>
    <eadid></eadid>
    <filedesc>
    <titlestmt>
        <titleproper></titleproper>
    </titlestmt>
    </filedesc>
    <profiledesc>
        <langusage>
            <language langcode="eng" scriptcode="Latn"></language>
        </langusage>
    </profiledesc>
    </eadheader>
        <archdesc level="fonds">
            <did>
                <unitid></unitid>
                <langmaterial>
                    <language langcode="eng" scriptcode="Latn"></language>
                </langmaterial>
                <unittitle></unittitle>
                <unitdate normal=""></unitdate>
                <physdesc>
                    <extent>1 Cubic Feet</extent>
                </physdesc>
                <langmaterial>
                    <language langcode="eng" scriptcode="Latn">English</language>
                </langmaterial>
            </did>
            <dsc>
            </dsc>
        </archdesc>
</ead>

This is the code I have tried in Python:

import pandas as pd
pip install openpyxl
from lxml import etree as et
import xml.etree.ElementTree as Xet
tree = et.parse('test_resource.xml')
root = tree.getroot()
raw_data = pd.read_excel(r'/Users/smeyerkukan/Desktop/ArchivesSpace/Python Coding/aspace.xlsx')
tree = et.parse('test_resource.xml')
root = tree.getroot()
for row in raw_data.iterrows():
    root_tags = et.SubElement(root, 'ExportData')
    Column_heading_1 = et.SubElement(root_tags, 'titleproper')
    Column_heading_2 = et.SubElement(root_tags, 'unittitle')
    Column_heading_3 = et.SubElement(root_tags, 'eadid')
    Column_heading_4 = et.SubElement(root_tags, 'unitid')
    Column_heading_7 = et.SubElement(root_tags, 'unitdate')
    Column_heading_1.text = str(row[1]['<titleproper>'])
    Column_heading_2.text = str(row[1]['<unittitle>'])
    Column_heading_3.text = str(row[1]['<eadid>'])
    Column_heading_4.text = str(row[1]['<unitid>'])
    Column_heading_7.text = str(row[1]['<unitdate>'])
tree = et.ElementTree(root)
et.indent(tree, space="\t", level=0)
tree.write('output.xml', encoding="utf-8")

This is the output I would like to get:

<?xml version="1.0" encoding="UTF-8"?>
<ead xmlns="urn:isbn:1-931666-22-9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:isbn:1-931666-22-9 http://www.loc.gov/ead/ead.xsd">
<eadheader>
    <eadid>Church RG 095</eadid>
    <filedesc>
    <titlestmt>
        <titleproper>Salem Reformed Church (Philadelphia, PA)</titleproper>
    </titlestmt>
    </filedesc>
    <profiledesc>
        <langusage>
            <language langcode="eng" scriptcode="Latn"></language>
        </langusage>
    </profiledesc>
    </eadheader>
        <archdesc level="fonds">
            <did>
                <unitid>Church RG 095</unitid>
                <langmaterial>
                    <language langcode="eng" scriptcode="Latn"></language>
                </langmaterial>
                <unittitle>Salem Reformed Church (Philadelphia, PA)</unittitle>
                <unitdate normal="1811/2003"></unitdate>
                <physdesc>
                    <extent>1 Cubic Feet</extent>
                </physdesc>
                <langmaterial>
                    <language langcode="eng" scriptcode="Latn">English</language>
                </langmaterial>
            </did>
            <dsc>
            </dsc>
        </archdesc>
</ead>

This is the output I received:

    <ExportData>
    <titleproper>Salem Reformed Church (Philadelphia, PA)</titleproper>
    <unittitle>Salem Reformed Church (Philadelphia, PA)</unittitle>
    <eadid>Church RG 095</eadid>
    <unitid>Church RG 095</unitid>
    <unitdate>1811/2003</unitdate>
</ExportData>
<ExportData>
    <titleproper>First Zion Lehigh Union Church (Alburtis, Pa.)</titleproper>
    <unittitle>First Zion Lehigh Union Church (Alburtis, Pa.)</unittitle>
    <eadid>Church RG 001</eadid>
    <unitid>Church RG 001</unitid>
    <unitdate>1843/2019</unitdate>
</ExportData>
<ExportData>
    <titleproper>Christ Reformed Church (Allentown, PA)</titleproper>
    <unittitle>Christ Reformed Church (Allentown, PA)</unittitle>
    <eadid>Church RG 002</eadid>
    <unitid>Church RG 002</unitid>
    <unitdate>1876/1982</unitdate>
</ExportData>

Here is a screenshot of the Excel file I am using. There are 916 rows in total:

Screenshot of the Excel file

1

There are 1 answers

13
Hermann12 On

OPTION 1 with CSV:

If you can save your Excel as a csv-file you don't need pandas. I would suggest a function which fills your empty xml template and save it to disk with the row number as file id.

import xml.etree.ElementTree as ET
import csv

def create_XML(row, no):
    # Your Template ead.xml will be modified with row data.
    # Register namespaces
    # ns = dict(node for event, node in ET.iterparse('ead.xml', events = ['start-ns']))
    ns = {'': 'urn:isbn:1-931666-22-9', 'xsi': 'http://www.w3.org/2001/XMLSchema-instance'}
    for prefix, uri in ns.items():
        ET.register_namespace(prefix, uri)
        
    tree = ET.parse('ead.xml')
    root = tree.getroot()
    
    titleproper = root.find('.//titleproper', ns)
    titleproper.text = row[0]
  
    unittitle = root.find('.//unittitle', ns)
    unittitle.text = row[1]
    
    eadid = root.find('.//eadid', ns)
    eadid.text = row[2]
    
    unitid = root.find('.//unitid', ns)
    unitid.text = row[3]
    
    unitdate = root.find('.//unitdate', ns)
    unitdate.attrib['normal'] = row[4]
    # If you remove the hash sign on the next two code rows you get a format like: <unitdate normal="1811/2003 " unitdatetype="inclusive">1811-2003 </unitdate>
    # unitdate.attrib['unitdatetype'] = "inclusive"
    # unitdate.text = row[4].replace('/','-')
    
    
    new_tree = ET.ElementTree(root)
    ET.indent(root, space='  ')
    new_tree.write(f"{no}_book.xml", xml_declaration=True, encoding='UTF-8')
    print(f"{no}_book.xml written!")
    

# Export the Excel to csv with ';' - delimiter e.g:
# <titleproper>;<unittitle> ;<eadid> ;<unitid> ;BeginDate;EndDate;<unitdate>
# Row_No for filename of each CSV row
no = 1
with open("Biblio.csv", newline='') as csvfile:
    book = csv.reader(csvfile, delimiter=' ', quotechar='|')
    next(book, None)
    for row in book:
        line = ' '.join(row).split(';')
        row_xml = [line[0], line[1], line[2], line[3], line[6]]
        create_XML(row_xml, no)
        no += 1

Output example file from 2nd row:

<?xml version='1.0' encoding='UTF-8'?>
<ead xmlns="urn:isbn:1-931666-22-9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:isbn:1-931666-22-9 http://www.loc.gov/ead/ead.xsd">
  <eadheader>
    <eadid>Church RG 001</eadid>
    <filedesc>
      <titlestmt>
        <titleproper>First Zion Lehigh Union Church (Alburtis, PA.)</titleproper>
      </titlestmt>
    </filedesc>
    <profiledesc>
      <langusage>
        <language langcode="eng" scriptcode="Latn" />
      </langusage>
    </profiledesc>
  </eadheader>
  <archdesc level="fonds">
    <did>
      <unitid>Church RG 001</unitid>
      <langmaterial>
        <language langcode="eng" scriptcode="Latn" />
      </langmaterial>
      <unittitle>First Zion Lehigh Union Church (Alburtis, PA.)</unittitle>
      <unitdate normal="1843/2019" />
      <physdesc>
        <extent>1 Cubic Feet</extent>
      </physdesc>
      <langmaterial>
        <language langcode="eng" scriptcode="Latn">English</language>
      </langmaterial>
    </did>
    <dsc />
  </archdesc>
</ead>

My CSV looks like:

<titleproper>;<unittitle> ;<eadid> ;<unitid> ;BeginDate;EndDate;<unitdate>
Salem Reformed Church (Philadelphia, PA);Salem Reformed Church (Philadelphia, PA) ;Church RG 095 ;Church RG 095 ;1811;2003;1811/2003 
First Zion Lehigh Union Church (Alburtis, PA.);First Zion Lehigh Union Church (Alburtis, PA.);Church RG 001;Church RG 001;1843;2019;1843/2019

If you would use a better format you can add some lines more:

For format like this: <unitdate normal="1811/2003" unitdatetype="inclusive">1811-2003</unitdate>

unitdate = root.find('.//unitdate', ns)
unitdate.attrib['normal'] = row[4]
unitdate.attrib['unitdatetype'] = "inclusive"
unitdate.text = row[4].replace('/','-')

OPTION 2 with PANDAS:

Additional here a solution with pandas. CAUTION .xls is not supported! Use .xlsx instead, please. I have used here the extended format for <unitdate>. You can comment it out, if you don't like it:

import xml.etree.ElementTree as ET
import pandas as pd

def create_XML(row, no):
    # Your Template ead.xml will be modified with row data.
    # Register namespaces
    # ns = dict(node for event, node in ET.iterparse('ead.xml', events = ['start-ns']))
    ns = {'': 'urn:isbn:1-931666-22-9', 'xsi': 'http://www.w3.org/2001/XMLSchema-instance'}
    for prefix, uri in ns.items():
        ET.register_namespace(prefix, uri)
        
    tree = ET.parse('ead.xml')
    root = tree.getroot()
    
    titleproper = root.find('.//titleproper', ns)
    titleproper.text = row[0]
  
    unittitle = root.find('.//unittitle', ns)
    unittitle.text = row[1]
    
    eadid = root.find('.//eadid', ns)
    eadid.text = row[2]
    
    unitid = root.find('.//unitid', ns)
    unitid.text = row[3]
    
    unitdate = root.find('.//unitdate', ns)
    unitdate.attrib['normal'] = row[4]
    unitdate.attrib['unitdatetype'] = "inclusive"
    unitdate.text = row[4].replace('/','-')
     
    new_tree = ET.ElementTree(root)
    ET.indent(root, space='  ')
    new_tree.write(f"{no}_book.xml", xml_declaration=True, encoding='UTF-8')
    print(f"{i+1}_book.xml written!")
    
df = pd.read_excel("Biblio.xlsx")
for i in range(len(df)):
    row_xml = [df.iloc[i,0], df.iloc[i,1], df.iloc[i,2], df.iloc[i,3], df.iloc[i,6]]
    create_XML(row_xml, i+1)

Output here with extended <unitdate>:

<?xml version='1.0' encoding='UTF-8'?>
<ead xmlns="urn:isbn:1-931666-22-9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:isbn:1-931666-22-9 http://www.loc.gov/ead/ead.xsd">
  <eadheader>
    <eadid>Church RG 001</eadid>
    <filedesc>
      <titlestmt>
        <titleproper>First Zion Lehigh Union Church (Alburtis, PA.)</titleproper>
      </titlestmt>
    </filedesc>
    <profiledesc>
      <langusage>
        <language langcode="eng" scriptcode="Latn" />
      </langusage>
    </profiledesc>
  </eadheader>
  <archdesc level="fonds">
    <did>
      <unitid>Church RG 001</unitid>
      <langmaterial>
        <language langcode="eng" scriptcode="Latn" />
      </langmaterial>
      <unittitle>First Zion Lehigh Union Church (Alburtis, PA.)</unittitle>
      <unitdate normal="1843/2019" unitdatetype="inclusive">1843-2019</unitdate>
      <physdesc>
        <extent>1 Cubic Feet</extent>
      </physdesc>
      <langmaterial>
        <language langcode="eng" scriptcode="Latn">English</language>
      </langmaterial>
    </did>
    <dsc />
  </archdesc>
</ead>