Linked Questions

Popular Questions

Speed up XML processing with Python and lxml

Asked by At

I've implemented the following function, that processes xml file and extracts specific data from the RcrdSts nodes:

import logging
import pandas as pd
from lxml import etree

logging.basicConfig(filename="log.log",
                    filemode="w",
                    format="%(asctime)s - %(levelname)s - %(message)s",
                    level=logging.INFO)
                    
def fca_feedback_read_record_status(working_dir, response_file):
    # initialize variables
    ns3 = "urn:iso:std:iso:20022:tech:xsd:auth.031.001.01"
    column_names = ['record_id', 'status', 'error_code', 'error_description']
    record_list = []
    local_start = 0
    local_end = 0

    # open response file and initialize tree
    logging.info("Extracting rejected records from file '%s' ... " % response_file)
    tree = etree.parse(working_dir + response_file)
    root = tree.getroot()

    # count the number of RcrdSts nodes in the responce file
    record_status_nodes_count = len(root.xpath("//ns3:RcrdSts",
                                               namespaces={"ns3": ns3}))

    if record_status_nodes_count > 0:
        logging.info("| --> Number of record status nodes: '%s'" % record_status_nodes_count)

        # iterate over the tree
        for i in range(record_status_nodes_count):
            # get record id
            record_id = root.xpath("//ns3:RcrdSts/ns3:OrgnlRcrdId",
                                   namespaces={"ns3": ns3})[i].text

            # get record status
            record_status = root.xpath("//ns3:RcrdSts/ns3:Sts",
                                       namespaces={"ns3": ns3})[i].text

            # get number of validation errors for the current records
            local_validation_rule_count = len(root.xpath("//ns3:RcrdSts",
                                                         namespaces={"ns3": ns3})[i]) - 2

            
            # LOOP section
            local_end = local_end + local_validation_rule_count

            for k in range(local_start, local_end):
                # get error code
                validation_rule_id = root.xpath("//ns3:RcrdSts/ns3:VldtnRule",
                                                namespaces={"ns3": ns3})[k][0].text
                # get error description
                validation_rule_desc = root.xpath("//ns3:RcrdSts/ns3:VldtnRule",
                                                  namespaces={"ns3": ns3})[k][1].text
                # append values to list
                record_list.append([record_id, record_status, validation_rule_id, validation_rule_desc])

            local_start = local_start + local_validation_rule_count

        logging.info("Creating output file ... ")
        export_df = pd.DataFrame(record_list, columns=column_names)
        logging.info("Exporting file ... ")
        export_filename = response_file.replace(".xml", ".csv")
        export_df.to_csv("D:\\Personal\\" + export_filename, index=None)

        logging.info("Processing completed. ")

        return True

    else:

        logging.info("Rejected records not found in file '%s' ... " % response_file)

        return False
        
        
if __name__ == "__main__":
    workdir = "D:\\Personal\\"
    response_xml = "NCAGB_FDBTRA_I213800H4ECWD5Z7JJ680_004665_23.xml"

    result = process_fca_feedback_file(workdir, response_xml)

The source xml file looks like this:

<?xml version='1.0' encoding='UTF-8'?>
<BizData xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ns3="urn:iso:std:iso:20022:tech:xsd:auth.031.001.01" xmlns:ns2="urn:iso:std:iso:20022:tech:xsd:head.001.001.01" xmlns="urn:iso:std:iso:20022:tech:xsd:head.003.001.01" xsi:schemaLocation="urn:iso:std:iso:20022:tech:xsd:head.003.001.01 head.003.001.01.xsd urn:iso:std:iso:20022:tech:xsd:head.001.001.01 head.001.001.01_ESMAUG_1.0.0.xsd urn:iso:std:iso:20022:tech:xsd:auth.031.001.01 auth.031.001.01_ESMAUG_Reporting_1.1.0.xsd">
  <Hdr>
    <ns2:AppHdr>
      <ns2:Fr>
        <ns2:OrgId>
          <ns2:Id>
            <ns2:OrgId>
              <ns2:Othr>
                <ns2:Id>GB</ns2:Id>
              </ns2:Othr>
            </ns2:OrgId>
          </ns2:Id>
        </ns2:OrgId>
      </ns2:Fr>
      <ns2:To>
        <ns2:OrgId>
          <ns2:Id>
            <ns2:OrgId>
              <ns2:Othr>
                <ns2:Id>253800H4ECWD5Z7JJ890</ns2:Id>
              </ns2:Othr>
            </ns2:OrgId>
          </ns2:Id>
        </ns2:OrgId>
      </ns2:To>
      <ns2:BizMsgIdr>005469-0_23</ns2:BizMsgIdr>
      <ns2:MsgDefIdr>auth.031.001.01</ns2:MsgDefIdr>
      <ns2:CreDt>2023-05-10T16:43:26Z</ns2:CreDt>
      <ns2:Rltd>
        <ns2:Fr>
          <ns2:OrgId>
            <ns2:Id>
              <ns2:OrgId>
                <ns2:Othr>
                  <ns2:Id>213800H4ECWD5Z7JJ680</ns2:Id>
                  <ns2:SchmeNm>
                    <ns2:Prtry>LEI</ns2:Prtry>
                  </ns2:SchmeNm>
                </ns2:Othr>
              </ns2:OrgId>
            </ns2:Id>
          </ns2:OrgId>
        </ns2:Fr>
        <ns2:To>
          <ns2:OrgId>
            <ns2:Id>
              <ns2:OrgId>
                <ns2:Othr>
                  <ns2:Id>GB</ns2:Id>
                  <ns2:SchmeNm>
                    <ns2:Prtry>FCA</ns2:Prtry>
                  </ns2:SchmeNm>
                </ns2:Othr>
              </ns2:OrgId>
            </ns2:Id>
          </ns2:OrgId>
        </ns2:To>
        <ns2:BizMsgIdr>10052023REP09</ns2:BizMsgIdr>
        <ns2:MsgDefIdr>auth.016.001.01</ns2:MsgDefIdr>
        <ns2:CreDt>2023-05-10T12:01:09Z</ns2:CreDt>
      </ns2:Rltd>
    </ns2:AppHdr>
  </Hdr>
  <Pyld>
    <ns3:Document>
      <ns3:FinInstrmRptgStsAdvc>
        <ns3:StsAdvc>
          <ns3:MsgRptIdr>000175-0_23</ns3:MsgRptIdr>
          <ns3:MsgSts>
            <ns3:Sts>RJCT</ns3:Sts>
            <ns3:Sttstcs>
              <ns3:TtlNbOfRcrds>1</ns3:TtlNbOfRcrds>
              <ns3:NbOfRcrdsPerSts>
                <ns3:DtldNbOfRcrds>1</ns3:DtldNbOfRcrds>
                <ns3:DtldSts>RJCT</ns3:DtldSts>
              </ns3:NbOfRcrdsPerSts>
            </ns3:Sttstcs>
          </ns3:MsgSts>
          <ns3:RcrdSts>
            <ns3:OrgnlRcrdId>213800H4ECWD5Z7JJ68009052023D00009</ns3:OrgnlRcrdId>
            <ns3:Sts>RJCT</ns3:Sts>
            <ns3:VldtnRule>
              <ns3:Id>CON-610</ns3:Id>
              <ns3:Desc>Waiver indicator is inconsistent with trading venue</ns3:Desc>
            </ns3:VldtnRule>
            <ns3:VldtnRule>
              <ns3:Id>CON-640</ns3:Id>
              <ns3:Desc>Commodity derivative indicator is missing</ns3:Desc>
            </ns3:VldtnRule>
          </ns3:RcrdSts>
        </ns3:StsAdvc>
      </ns3:FinInstrmRptgStsAdvc>
    </ns3:Document>
  </Pyld>
</BizData>

The code works fine and it does extract the content from multiple RcrdSts nodes. The loop is needed, because sometimes there are multiple validation errors and descriptions per record.

The issue I'm facing is that when the number of nodes increases - like 1 million nodes, the process takes forever. For large files the code processes some 4.5 records per second, which is slow IMO.

I would appreciate ideas how to optimize the code for speed.

Thanks in advance for everyone's time and efforts.

Related Questions