Importing graphml file into R igraph causes PCDATA invalid char value error

829 views Asked by At

For my master thesis I have to do some network analysis. At first I use Java to restructure my tweet data to an edge list with edge and vertex attributes.

After importing the csv file created by Java into R I create the vertices data frame out of the edge data to finally build a graph using the igraph package.

After saving the graph with write.graph I need to import it again at a later date. Using read.graph caused at first an error that the data is not proper UTF-8 (similar like Graphml parse error), so I prevented that by using iconv().

Now I receive the error:

Error in .Call("R_igraph_read_graph_graphml", file, as.numeric(index),  : 
  At foreign-graphml.c:1202 : PCDATA invalid Char value 3
, Parse error

Apparently I have invalid XML characters in the graphml file. I tried to prevent invalid characters in the first place by using replaceAll() in my Java program that was supposed to remove those characters during the restructuring process, but all regular expressions I found did not help. One expression turned the error to char value 16 instead of 3, but I unfortunately can't find it anymore.

Can someone give me an expression that matches all possible invalid characters respectively prevents the error above?

Here is my Java code that restructures the data:

public List<Edge> buildEdges(BufferedReader reader)
            throws RestructurerException, IOException {
        List<Edge> edges = new LinkedList<Edge>();
        String line = null;
        while ((line = reader.readLine()) != null) {
            String[] values = line.split("     ;     "); // split each tweet line by tweet elements
            if (values.length == 14) {
                if (values[7].equals("")) {
                    values[7] = "NULL";
                } else {
                    values[7] = values[7].replaceAll(";", ""); // remove semicolon in that tweet element since it causes problems (seperator)
                }
                long timestamp = Timestamp.valueOf(values[4]).getTime()
                        - TWEET_EPOCH;
                long profileAge = Timestamp.valueOf(values[12]).getTime()
                        - PROFILE_EPOCH;
                String mentiontext = values[3];
                String[] mentions = mentiontext.split(" "); // split the mentioned users seperated by whitespace
                for (String mention : mentions) {
                    Edge edge = new Edge(mention, values[1],
                            values[2].replaceAll(";", ""), timestamp,
                            values[5], values[6], values[7], values[8],
                            values[9], values[10], values[11], profileAge,
                            values[13]);
                    edges.add(edge);
                }

            }
        }
        System.out.println("Anzahl Edges:" + edges.size());
        return edges;
    }

The egde list is written to a file as csv by another method that is likely not so important to show it.

My R code to turn the edge list to a graph (I shorted the paths for demonstration aims):

library(igraph)
edges <- read.csv2("C:/.../Mentions_iwS_Edges.csv", header=TRUE, quote=""); 
amount <- nrow(edges);
amount;
sources <- data.frame(Vertexname = character(amount), Description = character(amount), Follower = numeric(amount), Friends = numeric(amount), Favourites = numeric(amount), Statuses = numeric(amount), ProfileAge = numeric(amount), Listed = numeric(amount), Timestamp = numeric(amount), OutDegree = numeric(amount), InDegree = numeric(amount), WOutDegree = numeric(amount), WInDegree = numeric(amount));
targets <- data.frame(Vertexname = character(amount), Description = character(amount), Follower = numeric(amount), Friends = numeric(amount), Favourites = numeric(amount), Statuses = numeric(amount), ProfileAge = numeric(amount), Listed = numeric(amount), Timestamp = numeric(amount), OutDegree = numeric(amount), InDegree = numeric(amount), WOutDegree = numeric(amount), WInDegree = numeric(amount));

for (i in 1:ncol(edges)) {
    edges[,i] <- iconv(edges[,i], to = "UTF-8", sub = "");
}
print("REPORT: Data converted to UTF-8");

sources[,1] <- edges[,1];
sources[,2:8] <- NA;
sources[,9] <- edges[,4];
sources[,10:13] <- NA;
targets[,1] <- edges[,2];
targets[,2] <- edges[,7];
targets[,3] <- edges[,8];
targets[,4] <- edges[,9];
targets[,5] <- edges[,10];
targets[,6] <- edges[,11];
targets[,7] <- edges[,12];
targets[,8] <- edges[,13];
targets[,9:13] <- NA;

print("REPORT: vertices data frames filled")

sources <- unique(sources);
targets <- unique(targets);
print("REPORT: Duplicated sources and targets removed");

nodes <- within(merge(sources, targets, by="Vertexname", all=TRUE), {
            Description <- ifelse(is.na(Description.x), paste(Description.y), Description.x); Description.x = NULL; Description.y = NULL; 
            Follower <- ifelse(is.na(Follower.x), Follower.y, Follower.x); Follower.x = NULL; Follower.y = NULL; 
            Friends <- ifelse(is.na(Friends.x), Friends.y, Friends.x); Friends.x = NULL; Friends.y = NULL;
            Favourites <- ifelse(is.na(Favourites.x), Favourites.y, Favourites.x); Favourites.x = NULL; Favourites.y = NULL;
            Statuses <- ifelse(is.na(Statuses.x), Statuses.y, Statuses.x); Statuses.x = NULL; Statuses.y = NULL;
            ProfileAge <- ifelse(is.na(ProfileAge.x), ProfileAge.y, ProfileAge.x); ProfileAge.x = NULL; ProfileAge.y = NULL;
            Listed <- ifelse(is.na(Listed.x), Listed.y, Listed.x); Listed.x = NULL; Listed.y = NULL;
            Timestamp <- ifelse(is.na(Timestamp.y), Timestamp.x, Timestamp.y); Timestamp.x = NULL; Timestamp.y = NULL;
            OutDegree <- ifelse(is.na(OutDegree.x), OutDegree.y, OutDegree.x); OutDegree.x = NULL; OutDegree.y = NULL;
            InDegree <- ifelse(is.na(InDegree.x), InDegree.y, InDegree.x); InDegree.x = NULL; InDegree.y = NULL;
            WOutDegree <- ifelse(is.na(WOutDegree.x), WOutDegree.y, WOutDegree.x); WOutDegree.x = NULL; WOutDegree.y = NULL;
            WInDegree <- ifelse(is.na(WInDegree.x), WInDegree.y, WInDegree.x); WInDegree.x = NULL; WInDegree.y = NULL});
print("REPORT: Sources and Targets merged");

nodes <- subset(nodes, !duplicated(nodes$Vertexname));
print("REPORT: Duplicated vertices removed");

nrow(nodes);

edges <- edges[complete.cases(edges[,1:2]),];
nodes <- nodes[complete.cases(nodes[,1]),];
print("REPORT: Invalid edges and nodes removed");

g <- graph.data.frame(edges, directed=TRUE, nodes);
print("REPORT: Graph created");

outdegrees <- degree(g, v=V(g), mode="out");
indegrees <- degree(g, v=V(g), mode="in");
woutdegrees <- graph.strength(g, v=V(g), mode="out");
windegrees <- graph.strength(g, v=V(g), mode="in");
g <- set.vertex.attribute(g, "OutDegree", V(g), outdegrees);
g <- set.vertex.attribute(g, "InDegree", V(g), indegrees);
g <- set.vertex.attribute(g, "WOutDegree", V(g), woutdegrees);
g <- set.vertex.attribute(g, "WInDegree", V(g), windegrees);
print("REPORT: Degree calculated and added as vertex attribute");

write.graph(g, "C:/.../Mentions_iwS_Graph.graphml", format="graphml");
print("REPORT: Graph saved");

The R script that throws the error:

library(igraph);
g <- read.graph("C:/.../Mentions_iwS_Graph.graphml", format="graphml");
length(E(g));
length(V(g));

Thank you for your help already!

0

There are 0 answers