I am using Visual Studio 2022. I have 9 million XML files and need to transfer the data into SQL Server.
I am reading in the XML through XmlDocument
, Xmlnode
list. It's working fine, but taking so much time to complete - around 4 hours to complete the task. With SqlBulkCopy
, it's taking 5 minutes only.
I am reading xml file one by one. How could I read multiple XML files on each thread? I think using thread concept, I read all XML files easily.
My code:
DateTime startTime = DateTime.Now;
_xmlfiles = Directory.GetFiles(source, "*.xml", SearchOption.AllDirectories).ToList();
_xmlfiles.Remove(Path.Combine(Path.GetFullPath(source), "abc.xml"));
_xmlfiles.Remove(Path.Combine(Path.GetFullPath(source), "xyz.xml"));
WriteLog(_logPath, string.Format("{0} |xml files are found in given path.{1}", DateTime.Now,_xmlfiles.Count ));
DataTable _table = /* code for get table structure */;
foreach (string _xmlfile in _xmlfiles)
{
_casFile = Path.GetFileNameWithoutExtension(_xmlfile);
LoadXml(_xmlfile, _table);
}
WriteLog(_logPath, string.Format("{0} |Reading the files is completed,Found the Total Phrases {1}.",DateTime.Now, _table.Rows.Count ));
DataSet ds = new DataSet();
ds.Tables.Add(_table);
_sqlWriter = new SQLWriter();
WriteLog(_logPath, string.Format("{0} |Writing Phrases into data base is started......",DateTime.Now ));
_sqlWriter.AutoSqlBulkCopy(ds);
Load XML method:
XmlDocument _casFile = new XmlDocument();
_casFile.Load(xmlfile);
XmlNodeList _instanceList = _casFile.SelectNodes("Specification/Substance/Property/Instance");
foreach (XmlNode _instance in _instanceList)
{
string _day = _instance.Attributes["DD"].Value;
string _month = _instance.Attributes["MM"].Value;
string _year = _instance.Attributes["YYYY"].Value;
string _active =_instance.Attributes["Active"].Value;
_uniqueID = _instance.Attributes["UniqueID"].Value;
_baseCode = _instance.Attributes["LitSource"].Value;
XmlNodeList _phraseList = _instance.SelectNodes("Fields/Phrases");
foreach (XmlNode _phrase in _phraseList)
{
string _Characteristic = _phrase.Attributes["ID"].Value;
XmlNodeList _childList = _phrase.ChildNodes;
foreach(XmlNode _child in _childList)
{
DataRow row = table.NewRow();
row["Substance"] = _substance;
row["UniqueID"] = _uniqueID;
row["Characteristic"] = _Characteristic;
row["PhraseID"] = _child.Attributes["Key"].Value;
if (_child.Attributes["Code"] != null)
{
row["Code"] = _child.Attributes["Code"].Value;
}
else
{
row["Code"] = string.Empty;
}
row["BaseCode"] = _baseCode;
row["Description"] = _child.InnerText;
row["Active"] = _active;
row["DD"] = _day;
row["MM"] = _month;
row["YYYY"] = _year;
table.Rows.Add(row);
}
}
}
You should use Linq and the .AsParallel() method if you are sure that the way you insert stuff into the database has no order dependency.
I am not sure exactly how you implement this, but Microsoft has a nice example here: https://learn.microsoft.com/en-us/dotnet/standard/parallel-programming/how-to-iterate-file-directories-with-the-parallel-class
You should probably not load the xml file when you test your solution, but do something simple instead. Perhaps change your method to receive an Action to execute when it resolves an xml file path.