I am trying to figure out the loading time, traversal time, and queries response time of RDFLib using MeSH dataset in N-Triple format.
I am trying to execute the queries for two days but no luck.
Here is the code:
import time
from rdflib import ConjunctiveGraph
def mem_func():
query1="""
PREFIX rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
PREFIX rdfs: http://www.w3.org/2000/01/rdf-schema#
PREFIX xsd: http://www.w3.org/2001/XMLSchema#
PREFIX owl: http://www.w3.org/2002/07/owl#
PREFIX meshv: http://id.nlm.nih.gov/mesh/vocab#
PREFIX mesh: http://id.nlm.nih.gov/mesh/
PREFIX mesh2015: http://id.nlm.nih.gov/mesh/2015/
PREFIX mesh2016: http://id.nlm.nih.gov/mesh/2016/
PREFIX mesh2017: http://id.nlm.nih.gov/mesh/2017/
SELECT ?d ?dName ?c ?cName
FROM <http://id.nlm.nih.gov/mesh>
WHERE {
?d a meshv:Descriptor .
?d meshv:active 1 .
?d meshv:concept ?c .
?d rdfs:label ?dName .
?c rdfs:label ?cName
FILTER(REGEX(?dName,"infection","i") || REGEX(?cName,"infection","i"))
}
ORDER BY ?d
"""
g = ConjunctiveGraph()
print("RDFlib DS-2 Loading Started:")
start_time = time.time()
g.parse("mesh.nt", format="nt")
print("RDFlib DS-2 Loading Finished:")
print("--- Loading Time approx %s ---" % (time.time() - start_time))
print("RDF DS-2 Traversal Time Started ")
start_time_traversal = time.time()
file1 = open('mesh.nt','r')
Lines = file1.readlines()
for line in Lines:
print(line.strip())
print("RDF DS-2 Traversal Time Finished ")
print("--- Traversal Time approx %s ---" % (time.time() - start_time_traversal))
results = g.query(query1)
for row in results:
print(row)
if name == "main":
mem_func()
When my program hits the line where the query executes it is throwing an exception:
https://id.nlm.nih.gov/mesh/!DOCTYPE html does not look like a valid URI, trying to serialize this will break.
https://id.nlm.nih.gov/mesh/html lang="en" does not look like a valid URI, trying to serialize this
will break.
Traceback (most recent call last):
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\plugins\parsers\ntriples.py", line 154, in parse
self.parseline()
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\plugins\parsers\ntriples.py", line 197, in parseline
subject = self.subject()
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\plugins\parsers\ntriples.py", line 224, in subject
subj = self.uriref() or self.nodeid()
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\plugins\parsers\ntriples.py", line 243, in uriref
uri = self.eat(r_uriref).group(1)
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\plugins\parsers\ntriples.py", line 218, in eat
raise ParseError("Failed to eat %s at %s" % (pattern.pattern, self.line))
rdflib.plugins.parsers.ntriples.ParseError: Failed to eat <([^:]+:[^\s"<>]*)> at
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\plugins\sparql\sparql.py", line 285, in _load
return graph.load(source, format='nt', **kwargs)
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\graph.py", line 1085, in load
self.parse(source, publicID, format)
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\graph.py", line 1549, in parse
context.parse(source, publicID=publicID, format=format, **args)
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\graph.py", line 1078, in parse
parser.parse(source, self, **args)
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\plugins\parsers\nt.py", line 26, in parse
parser.parse(f)
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\plugins\parsers\ntriples.py", line 156, in parse
raise ParseError("Invalid line: %r" % self.line)
rdflib.plugins.parsers.ntriples.ParseError: Invalid line: ''
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "E:\ToP_RDF_research\rdflib_python_eclipse\src\pyRDF_D2.py", line 135, in
mem_func()
File "E:\ToP_RDF_research\rdflib_python_eclipse\src\pyRDF_D2.py", line 107, in mem_func
results = g.query(query1)
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\graph.py", line 1131, in query
return result(processor.query(query_object, initBindings, initNs, **kwargs))
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\plugins\sparql\processor.py", line 80, in query
return evalQuery(self.graph, query, initBindings, base)
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\plugins\sparql\evaluate.py", line 526, in evalQuery
ctx.load(d.default, default=True)
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\plugins\sparql\sparql.py", line 299, in load
_load(self.graph, source)
File "C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-
packages\rdflib\plugins\sparql\sparql.py", line 289, in _load
source))
Exception: Could not load http://id.nlm.nih.gov/mesh as either RDF/XML, N3 or NTriples
So far, I only figure out that it takes approximately 3hrs loading time and 14 min traversal time.
Where am I going wrong, and how would I be able to run this successfully?