Import huge csv file into neo4j

1k views Asked by At

I am aware of the Import tool, but in my case I have to read each row and break it down into nodes and relationships. Using the LOAD CSV query with periodic commit and indexes, importing 2 million rows takes more than 12 hours. Is there a way for me to use the above-mentioned tool without having to preprocess the CSV into nodes and relationships?

The following is the sample query I use:

// Indexes created ahead of the import.
// NOTE(review): the load query below merges on :Person {mrn}, so the
// :Patient(mrno) index does not match that lookup — confirm label and property.
CREATE INDEX ON :Patient(mrno);
CREATE INDEX ON :Location(city);
CREATE INDEX ON :Department(id);

// Single-pass import: for every CSV row this merges a :Person, creates an
// :Admission, merges a :Department and a :Location, then merges the
// Admitted and Located relationships.
USING PERIODIC COMMIT 1000
LOAD CSV WITH HEADERS FROM "file:///home/geralt/Desktop/Temp_Admission.csv" AS line
// NOTE(review): each alias below is a parenthesized pair of a CASE expression
// and a second literal; this looks like the remains of a two-argument function
// call — confirm the Cypher parser accepts it.
WITH line,
(CASE  WHEN line.MRNo='' OR line.MRNo='null'  THEN "BLEH" ELSE line.MRNo END, "NA") AS mrn,
(CASE  WHEN line.ID_Admit='' OR line.ID_Admit='NULL'  THEN -1 ELSE line.ID_Admit END,0) AS ID_Admit,
(CASE  WHEN line.DeptCode_Admit='' OR line.DeptCode_Admit='NULL'  THEN -1 ELSE line.DeptCode_Admit END,0) AS DeptCode_Admit,
(CASE  WHEN line.City='' OR line.City='NULL'  THEN "BLEH" ELSE line.City END,"NA") AS city

// ON MATCH and ON CREATE assign the same three properties here.
MERGE (p:Person { mrn: mrn}) ON MATCH SET p.DOB=line.DateOfBirth,p.gender=line.GenderDescription,p.prefix=line.PrefixDescription ON CREATE SET p.DOB=line.DateOfBirth,p.gender=line.GenderDescription,p.prefix=line.PrefixDescription
CREATE (a:Admission{HospitalName:line.Hospital,id:toInt(ID_Admit),unitId:line.UnitID_Admit,IPDNo:line.IPDNO,DateOfAdmission:line.Date_Admit})
// d.name, l.country and l.state are assigned only ON MATCH, i.e. not by the
// MERGE that first creates the node.
MERGE(d:Department{id:toInt(DeptCode_Admit)}) ON MATCH SET d.name=line.DeptName_Admit
MERGE(l:Location{city:city}) ON MATCH SET l.country=line.Country,l.state=line.State


// The relationship patterns are written without parentheses around the node
// variables; the :Department node d is not connected to anything in this query.
merge  p-[:Admitted]->a 
MERGE a-[:Located]->l
1

There is 1 answer

2
Michael Hunger On

It should be pretty straightforward to just do multiple runs (you can even do those in parallel, with multiple browser or neo4j-shell sessions).

  1. Remove the ON MATCH SET
  2. You misspelt mrno
  3. You are missing indexes for :Person(mrno), :Admission(id)
  4. Your case statements are off
  5. You used ON MATCH SET when you meant ON CREATE SET
  6. You can further optimize the import by applying DISTINCT in your WITH to only the fields you want to import, see Department

Here is your fixed / full / multiple-run import script:

// Indexes carried over from the question.
// NOTE(review): no statement in this script uses the :Patient label —
// confirm this index is still wanted.
CREATE INDEX ON :Patient(mrno);
CREATE INDEX ON :Location(city);
CREATE INDEX ON :Department(id);

// Additional index and constraint backing the :Person(mrno) and
// :Admission(id) lookups performed by the import passes.
CREATE INDEX ON :Person(mrno);
CREATE CONSTRAINT ON (a:Admission) ASSERT a.id IS UNIQUE;

// Pass 1: one :Person node per distinct medical record number (MRNo column).
// A blank or 'null' MRNo is mapped to the placeholder "NA".
// NOTE(review): this pass compares against lowercase 'null' while the other
// columns in this script are compared against 'NULL' — confirm which spelling
// the CSV actually contains.
// DOB, gender and prefix are written only when the node is first created.
USING PERIODIC COMMIT 100000
LOAD CSV WITH HEADERS FROM "file:///home/geralt/Desktop/Temp_Admission.csv" AS line
WITH line,
     CASE WHEN line.MRNo = '' OR line.MRNo = 'null' THEN "NA" ELSE line.MRNo END AS mrno
MERGE (p:Person {mrno: mrno})
  ON CREATE SET p.DOB = line.DateOfBirth,
                p.gender = line.GenderDescription,
                p.prefix = line.PrefixDescription;


// Pass 2: one :Admission node per CSV row.
// A blank or 'NULL' ID_Admit becomes -1; any other value goes through toInt().
// NOTE(review): given the uniqueness constraint on :Admission(id) declared
// earlier in this script, more than one row falling back to -1 would presumably
// be rejected — confirm the CSV has at most one such row.
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///home/geralt/Desktop/Temp_Admission.csv" AS row
WITH row,
     CASE
       WHEN row.ID_Admit = '' OR row.ID_Admit = 'NULL' THEN -1
       ELSE toInt(row.ID_Admit)
     END AS admissionId
CREATE (a:Admission {
  HospitalName: row.Hospital,
  id: admissionId,
  unitId: row.UnitID_Admit,
  IPDNo: row.IPDNO,
  DateOfAdmission: row.Date_Admit
});

// Pass 3: one :Department node per department code.
// DISTINCT reduces the rows to unique (name, code) pairs before the MERGE runs;
// a blank or 'NULL' DeptCode_Admit becomes -1.
// The department name is written only when the node is first created.
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///home/geralt/Desktop/Temp_Admission.csv" AS row
WITH DISTINCT
     row.DeptName_Admit AS deptName,
     CASE
       WHEN row.DeptCode_Admit = '' OR row.DeptCode_Admit = 'NULL' THEN -1
       ELSE toInt(row.DeptCode_Admit)
     END AS deptCode
MERGE (d:Department {id: deptCode})
  ON CREATE SET d.name = deptName;


// Pass 4: one :Location node per city.
// A blank or 'NULL' City is mapped to the placeholder "NA".
// Country and state are written only when the node is first created.
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///home/geralt/Desktop/Temp_Admission.csv" AS row
WITH row,
     CASE
       WHEN row.City = '' OR row.City = 'NULL' THEN "NA"
       ELSE row.City
     END AS cityName
MERGE (l:Location {city: cityName})
  ON CREATE SET l.country = row.Country,
                l.state = row.State;


// Pass 5: connect each :Person to its :Admission.
// The CASE expressions apply the same fallbacks as the node-loading passes
// ("NA" for a blank/'null' MRNo, -1 for a blank/'NULL' ID_Admit) so that both
// MATCH lookups use the keys the nodes were stored under.
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///home/geralt/Desktop/Temp_Admission.csv" AS row
WITH
  CASE
    WHEN row.MRNo = '' OR row.MRNo = 'null' THEN "NA"
    ELSE row.MRNo
  END AS personKey,
  CASE
    WHEN row.ID_Admit = '' OR row.ID_Admit = 'NULL' THEN -1
    ELSE toInt(row.ID_Admit)
  END AS admissionKey
MATCH (p:Person {mrno: personKey})
MATCH (a:Admission {id: admissionKey})
MERGE (p)-[:Admitted]->(a);

// Final pass: connect each :Admission to its :Location.
// The CASE expressions apply the same fallbacks as the node-loading passes
// (-1 for a blank/'NULL' ID_Admit, "NA" for a blank/'NULL' City) so that both
// MATCH lookups use the keys the nodes were stored under.
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///home/geralt/Desktop/Temp_Admission.csv" AS line
WITH
  CASE WHEN line.ID_Admit = '' OR line.ID_Admit = 'NULL' THEN -1 ELSE toInt(line.ID_Admit) END AS ID_Admit,
  CASE WHEN line.City = '' OR line.City = 'NULL' THEN "NA" ELSE line.City END AS city
MATCH (a:Admission {id: ID_Admit})
MATCH (l:Location {city: city})
MERGE (a)-[:Located]->(l);