XML to data frame with missing nodes

900 views Asked by At

Versions of this question have been asked before, as here and here. However, I still can't get it to work. I'm trying to parse an XML document into a data frame. The problem is that the some of the variables are not present for some of the observations, so I'm getting an error because there are a different number of rows. My data looks like this:

library("xml2")
library("dplyr")

example <- read_xml(
'
<particDesc>
<person role="participant" sameAs="#P484" xml:id="EDcon250_S1">
<age value="3">35-49</age>
<sex value="1">male</sex>
<occupation>waiter</occupation>
<langKnowledge>
<langKnown level="L1" tag="ita"/>
</langKnowledge>
</person>
<person role="participant" sameAs="#P485" xml:id="EDcon250_S7">
<age value="0">unknown</age>
<sex value="2">female</sex>
<occupation>waitress</occupation>
<langKnowledge>
<langKnown level="L1" tag="ger-AT"/>
</langKnowledge>
</person>
<person role="participant" sameAs="#P465" xml:id="EDcon250_S2">
<age value="2">25-34</age>
<sex value="2">female</sex>
<langKnowledge>
<langKnown level="L1" tag="ger-AT"/>
<langKnown level="L1" tag="eng-US"/>
</langKnowledge>
</person>
</particDesc>
')

I'm using Wickham's xml2 package to read the xml. I would prefer using this package, but would be open to using XML if that is the best (or only) way of solving this problem. Anyways, my code is the following:

participants <- xml_find_all(example, './/person[@role = "participant"]')

extract_participants <- function(div){
id <- xml_attr(div, "id")
same_as <- xml_attr(div, "sameAs")
role <- xml_attr(div, "role")
age <- xml_find_all(div, ".//age") %>% xml_text()
sex <- xml_find_all(div, ".//sex") %>% xml_text()
occupation <- xml_find_all(div, ".//occupation") %>% xml_text()

data_frame(id, same_as,role, age, sex, occupation)
}


parts_ls <- lapply(participants, extract_participants)

participants_df <- do.call(rbind, parts_ls)

The problem in this specific concerns the occupation variable (the third person doesn't have one), but in my actual data it could be one of the other variables as well. As I said, I can see that this question has been asked before, but I could get any of the suggestions to work (probably due to my not completing understanding the solution). Ultimately, I would like NAs to be returned whenever a particular node is missing (so the occupation variable for the third person would be NA.

On edit: Here's the alternative XML version

library("XML")
library("magrittr")

example2 <- xmlParse(
'
 <particDesc>
 <person role="participant" sameAs="#P484" xml:id="EDcon250_S1">
 <age value="3">35-49</age>
 <sex value="1">male</sex>
 <occupation>waiter</occupation>
 <langKnowledge>
 <langKnown level="L1" tag="ita"/>
 </langKnowledge>
 </person>
 <person role="participant" sameAs="#P485" xml:id="EDcon250_S7">
 <age value="0">unknown</age>
 <sex value="2">female</sex>
 <occupation>waitress</occupation>
 <langKnowledge>
 <langKnown level="L1" tag="ger-AT"/>
 </langKnowledge>
 </person>
 <person role="participant" sameAs="#P465" xml:id="EDcon250_S2">
 <age value="2">25-34</age>
 <sex value="2">female</sex>
 <langKnowledge>
 <langKnown level="L1" tag="ger-AT"/>
 <langKnown level="L1" tag="eng-US"/>
 </langKnowledge>
 </person>
 </particDesc>
')

example_root <- xmlRoot(example2) 

process <- function(x){
id <- xmlGetAttr(x, "id")
role <- xmlGetAttr(x, "role")
age <- getNodeSet(x, ".//age") %>% xmlSApply(xmlValue)
sex <- getNodeSet(x, ".//sex") %>% xmlSApply(xmlValue)
#occupation <- getNodeSet(x, ".//occupation") %>% xmlSApply(xmlValue)
data.frame(id = id,
           role = role,
           #occupation = occupation,
           age = age,
           sex = sex,
           stringsAsFactors = FALSE)
}


 ls <- xpathApply(example_root, "//person", process)
 df <- do.call(rbind, ls)

Just uncomment occupation to see the problem.

1

There are 1 answers

0
JoeF On BEST ANSWER

I got something to work, but I'm not sure whether it is an ideal solution (I think it is pretty long-winded). Anyways, here is what I have so far. Suggestions for improvement are welcome.

library("XML")
library("magrittr")

example2 <- xmlParse(
'
<particDesc>
<person role="participant" sameAs="#P484" xml:id="EDcon250_S1">
<age value="3">35-49</age>
<sex value="1">male</sex>
<occupation>waiter</occupation>
<langKnowledge>
<langKnown level="L1" tag="ita"/>
</langKnowledge>
</person>
<person role="participant" sameAs="#P485" xml:id="EDcon250_S7">
<age value="0">unknown</age>
<sex value="2">female</sex>
<occupation>waitress</occupation>
<langKnowledge>
<langKnown level="L1" tag="ger-AT"/>
</langKnowledge>
</person>
<person role="participant" sameAs="#P465" xml:id="EDcon250_S2">
<age value="2">25-34</age>
<sex value="2">female</sex>
<langKnowledge>
<langKnown level="L1" tag="ger-AT"/>
<langKnown level="L1" tag="eng-US"/>
</langKnowledge>
</person>
</particDesc>
')

example_root <- xmlRoot(example2) 
person <- getNodeSet(example_root, "//person")

id <- lapply(person, xmlGetAttr, "id") %>% unlist()
role <- lapply(person, xmlGetAttr, "role") %>% unlist()
age <- lapply(person, xpathSApply, ".//age", xmlValue) %>% unlist()
sex <- lapply(person, xpathSApply, ".//sex", xmlValue) %>% unlist()
occupation <- lapply(person, xpathSApply, ".//occupation", xmlValue)
occupation[sapply(occupation, is.list)] <- NA 
occupation <- unlist(occupation)

df <- data.frame(
   id = id,
   role = role,
   age = age,
   sex = sex,
   occupation = occupation)

On Edit: For completion, here is the corresponding xml2 version (abridged)

example <- read_xml(
'
 <particDesc>
 <person role="participant" sameAs="#P484" xml:id="EDcon250_S1">
 <age value="3">35-49</age>
 <sex value="1">male</sex>
 <occupation>waiter</occupation>
 <langKnowledge>
 <langKnown level="L1" tag="ita"/>
 </langKnowledge>
 </person>
 <person role="participant" sameAs="#P485" xml:id="EDcon250_S7">
 <age value="0">unknown</age>
 <sex value="2">female</sex>
 <occupation>waitress</occupation>
 <langKnowledge>
 <langKnown level="L1" tag="ger-AT"/>
 </langKnowledge>
 </person>
 <person role="participant" sameAs="#P465" xml:id="EDcon250_S2">
 <age value="2">25-34</age>
 <sex value="2">female</sex>
 <langKnowledge>
 <langKnown level="L1" tag="ger-AT"/>
 <langKnown level="L1" tag="eng-US"/>
 </langKnowledge>
 </person>
 </particDesc>
')

participants <- xml_find_all(example, './/person[@role = "participant"]')

id <- lapply(participants, xml_attr, "id")
occupation <- lapply(participants, xml_find_all, ".//occupation")
occupation <- lapply(occupation, xml_text)
occupation[!sapply(occupation, function(y) length(y == 0))] <- NA

occupation <- unlist(occupation)
id <- unlist(id)

data_frame(
  id = id, 
  occupation = occupation)