How to plot a dendrogram of large word associations

391 views Asked by At

I would like to plot the word associations for a text file. Part of the problem seems to be the number of words and the resulting processing time, which I've tried to reduce by using lapply to replace the nested loop. However, I'm not sure whether the lapply replacement is correct. In addition, the dendrogram may be too dense to be useful. The questions are: 1) how to speed up the nested for loops, and 2) how to display the dendrogram.

library(RXKCD)
library(tm)
library(wordcloud)
library(RColorBrewer)
require(gdata)

# Locate the "xkcd" directory inside the installed RXKCD package and list
# what it contains.
path <- system.file("xkcd", package = "RXKCD")
datafiles <- list.files(path = path)

# Read the data as CSV.
# NOTE(review): read.csv() takes a single file, so this assumes the directory
# holds exactly one file -- confirm.
csv_path <- file.path(path, datafiles)
xlsdf <- read.csv(csv_path)


# Build a corpus from the "transcript" column, one data-frame row per document.
transcripts <- data.frame(as.character(xlsdf[, "transcript"]))
ap.corpus <- Corpus(DataframeSource(transcripts))

# Clean the text in a fixed order: strip punctuation, lower-case, drop digits.
for (cleaner in list(removePunctuation, tolower, removeNumbers)) {
  ap.corpus <- tm_map(ap.corpus, cleaner)
}

# Remove English stop words (extra arguments are forwarded to removeWords()).
ap.corpus <- tm_map(ap.corpus, removeWords, stopwords("english"))
# additional stopwords can be used as shown below
# ap.corpus <- tm_map(ap.corpus, removeWords, c("ukoer", "oer"))

# Convert the documents back to PlainTextDocument before building the matrix.
ap.corpus <- tm_map(ap.corpus, PlainTextDocument)

# Term-document matrix (terms in rows), plus a look at terms occurring at
# least 40 times.
ap.tdm <- TermDocumentMatrix(ap.corpus)
findFreqTerms(ap.tdm, lowfreq = 40)

# Total count per term, most frequent first, as a word/frequency table.
ap.m <- as.matrix(ap.tdm)
term_totals <- rowSums(ap.m)
ap.v <- sort(term_totals, decreasing = TRUE)
ap.d <- data.frame(word = names(ap.v), freq = ap.v)
print(table(ap.d$freq))

# Eight-colour palette for the (commented-out) word cloud.
pal2 <- brewer.pal(8, "Dark2")

# png("wordcloud_packages.png", width=1280,height=800) 
#print(wordcloud(ap.d$word,ap.d$freq, scale=c(8,.2),min.freq=40, 
#          max.words=Inf, random.order=FALSE, rot.per=.05, colors=pal2))
# dev.off()

# Square term-by-term matrix, initialised to 0, with the terms of ap.tdm
# labelling both the rows and the columns.
term_labels <- rownames(ap.tdm)
f <- matrix(
  0,
  nrow = length(term_labels),
  ncol = length(term_labels),
  dimnames = list(term_labels, term_labels)
)

# This is the nested loop to replace
#for (i in rownames (ap.tdm)) { 
#  ff <- findAssocs (ap.tdm,i,0)
#  for  (j in rownames (ff)) {
#    f[j,i]=ff[j,]
#  }
#}

# Look up the association score(s) stored under the name(s) `j` in the named
# numeric vector `ff`.
fcn2 <- function(j, ff) {
  ff[j]
}

# Association of every term in `ap.tdm` with the single term `i`.
#
# Returns a named numeric vector ordered like rownames(ap.tdm). Terms that
# findAssocs() does not report for `i` keep a score of 0.
fcn1 <- function(i) {
  ff <- findAssocs(ap.tdm, i, 0)
  # Depending on the tm version, findAssocs() hands back a named list (one
  # element per queried term), a one-column matrix, or a plain named vector.
  # Reduce all of them to a named numeric vector.
  if (is.list(ff)) {
    ff <- ff[[1L]]
  }
  if (is.matrix(ff)) {
    ff <- ff[, 1L]
  }

  terms <- rownames(ap.tdm)
  scores <- setNames(numeric(length(terms)), terms)
  scores[names(ff)] <- fcn2(names(ff), ff)
  scores
}

# Fill `f` at top level. Assigning to `f` inside fcn1() would only modify a
# local copy and leave the global matrix all zeros. Each vapply() result
# becomes one column, so f[j, i] is the association of term j with term i,
# exactly as in the nested loop this replaces. `f[] <-` keeps the dimnames.
f[] <- vapply(rownames(ap.tdm), fcn1, numeric(nrow(ap.tdm)))

# `f` holds association scores, where larger means more closely related.
# hclust() needs dissimilarities, so convert with 1 - f; otherwise the least
# associated terms would be merged first.
fd <- as.dist(1 - f)

# "ward.D" is the current name of the criterion formerly called "ward".
plot(hclust(fd, method = "ward.D"))  # plot dendrogram

# Simpler alternative: cluster on the distances between the rows of `f`,
# using the defaults of dist() and hclust(), and plot the resulting tree.
row_distances <- dist(f)
hc <- hclust(row_distances)
plot(hc)
0

There are 0 answers