Check for mismatched data between two files (CSV and XML)

83 views Asked by At

I have two kinds of files containing data: CSV and XML. I want to write Python code, probably using Pandas, that reads the data from both files and compares them, so that when the data in one file has changed but the other has not, a message is printed to the user. In short: Python code that detects mismatched data between the two files and prints messages to the user.

I have tried:

def read_csv(file_path):
...
def read_xml(file_path):
...
def compare_files
    mismatches = pd.concat([df_csv_subset, df_xml_subset]).drop_duplicates(keep=False)


# Identify missing data
# Rows of the XML subset with no identical row in the CSV subset: a left merge
# (with no `on=`, pandas joins on the columns common to both frames) tags each
# row in `_merge`; only the "left_only" rows are kept and the tag is dropped.
missing_csv = df_xml_subset.merge(df_csv_subset, how='left', indicator=True).query('_merge == "left_only"').drop('_merge', axis=1)

# Same check in the other direction: rows of the CSV subset that have no
# identical row in the XML subset.
missing_xml = df_csv_subset.merge(df_xml_subset, how='left', indicator=True).query('_merge == "left_only"').drop('_merge', axis=1)
##orderd the code block
2

There are 2 answers

0
Arunbh Yashaswi On

The `compare_files` function is present, but its definition is wrong:

def compare_files(df_csv, df_xml):
    """Return the rows that exist in only one of the two DataFrames.

    Rows are compared on the columns the two frames have in common. A row
    counts as a mismatch when no row with the same values in those columns
    exists in the other frame. Rows repeated inside a single frame are all
    reported (they are not collapsed or dropped).

    Args:
        df_csv: DataFrame loaded from the CSV file.
        df_xml: DataFrame loaded from the XML file.

    Returns:
        A tuple ``(mismatches_csv, mismatches_xml)``:
        ``mismatches_csv`` holds the rows of ``df_csv`` missing from
        ``df_xml``; ``mismatches_xml`` holds the rows of ``df_xml`` missing
        from ``df_csv``. Each result is indexed by the row's position in its
        source frame.
    """
    def rows_only_in(left, right):
        # Compare on the columns both frames share.
        keys = [col for col in left.columns if col in right.columns]
        if not keys:
            # Nothing to join on: no row of `left` can have a counterpart.
            return left.copy()
        # De-duplicate the lookup side so every left row matches at most once
        # and the merge keeps exactly one output row per input row.
        lookup = right[keys].drop_duplicates()
        flagged = left.merge(lookup, on=keys, how='left', indicator=True)
        return flagged.loc[flagged['_merge'] == 'left_only', list(left.columns)]

    mismatches_csv = rows_only_in(df_csv, df_xml)
    mismatches_xml = rows_only_in(df_xml, df_csv)

    return mismatches_csv, mismatches_xml

# Example call; assumes df_csv and df_xml were already loaded into DataFrames
# earlier (the loading code is not shown in this snippet).
mismatches_csv, mismatches_xml = compare_files(df_csv, df_xml)

I think this should give you what you want.

0
TheHungryCub On

This worked for me. Please try:

import pandas as pd
import xml.etree.ElementTree as ET

# Read data from CSV file
def read_csv(file_path, **read_options):
    """Load a CSV file into a DataFrame.

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.
        **read_options: Extra keyword arguments forwarded unchanged to
            ``pandas.read_csv`` (for example ``dtype=str``). When none are
            given this is exactly ``pd.read_csv(file_path)``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file_path, **read_options)


# Read data from XML file
def read_xml(file_path):
    """Load ``<record>`` / ``<field name="...">`` XML into a DataFrame.

    Every ``<record>`` element below the root becomes one row. Every
    ``<field>`` element inside it becomes a column named by the field's
    ``name`` attribute and holding the element text (``None`` when the
    element has no text).

    Args:
        file_path: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        A ``pandas.DataFrame`` with one row per record.

    Raises:
        KeyError: If a ``<field>`` element has no ``name`` attribute.
    """
    root = ET.parse(file_path).getroot()
    rows = [
        {field.attrib['name']: field.text for field in record.findall('.//field')}
        for record in root.findall('.//record')
    ]
    return pd.DataFrame(rows)


def _as_text(frame):
    """Return *frame* with missing values as '' and every cell as a string."""
    return frame.fillna('').astype(str)


def _rows_only_in(left, right, keys):
    """Return the rows of *left* whose *keys* values match no row of *right*."""
    if not keys:
        # Nothing to join on: no row of `left` can have a counterpart.
        return left.copy()
    # De-duplicate the lookup side so every left row matches at most once and
    # the result index stays equal to the row position in `left`.
    lookup = right[keys].drop_duplicates()
    flagged = left.merge(lookup, on=keys, how='left', indicator=True)
    return flagged.loc[flagged['_merge'] == 'left_only', list(left.columns)]


# Compare CSV and XML files
def compare_files(csv_path, xml_path):
    """Compare a CSV file with an XML file and print every difference.

    Both files are compared as text: XML values are always strings, so the
    CSV is read without type inference. Otherwise ``1`` would never equal
    ``"1"`` (pandas refuses to merge a numeric column with a string column)
    and values such as ``007`` would lose their leading zeros. Empty cells
    and empty or missing XML fields are both treated as ``''``.

    Rows are matched on the columns the two files share; columns that exist
    in only one file are reported separately.

    Args:
        csv_path: Path of the CSV file.
        xml_path: Path of the XML file.

    Returns:
        None. Findings are printed to standard output; nothing is printed
        when the files agree.
    """
    df_csv = _as_text(read_csv(csv_path, dtype=str, keep_default_na=False))
    df_xml = _as_text(read_xml(xml_path))

    # Report columns that cannot be compared because one file lacks them.
    csv_only_columns = [col for col in df_csv.columns if col not in df_xml.columns]
    xml_only_columns = [col for col in df_xml.columns if col not in df_csv.columns]
    if csv_only_columns or xml_only_columns:
        print("Columns differ between files:")
        print(f"Only in CSV: {csv_only_columns}")
        print(f"Only in XML: {xml_only_columns}")

    shared_columns = [col for col in df_csv.columns if col in df_xml.columns]

    # Identify missing data in CSV (rows that only the XML file has)
    missing_csv = _rows_only_in(df_xml, df_csv, shared_columns)

    # Identify missing data in XML (rows that only the CSV file has)
    missing_xml = _rows_only_in(df_csv, df_xml, shared_columns)

    # Print messages to the user
    if not (missing_csv.empty and missing_xml.empty):
        # Built from the two one-sided results so that rows repeated inside
        # a single file are still listed.
        print("Mismatched data found:")
        print(pd.concat([missing_xml, missing_csv]))

    if not missing_csv.empty:
        print("Data missing in CSV file:")
        print(missing_csv)

    if not missing_xml.empty:
        print("Data missing in XML file:")
        print(missing_xml)

# Run the comparison on the two example files; the relative paths are
# resolved against the current working directory.
compare_files('data.csv', 'data.xml')