I'm trying to convert a list of PDF files located in my computer directory, into txt format so that R can read it and begin text mining. Do you know what is wrong with this code?
library(tm) #load text mining library
setwd('D:/Directory') #sets R's working directory to near where my files are
ae.corpus<-Corpus(DirSource("D:/Directory/NewsArticles"),readerControl=list(reader=readPlain))
exe <- "C:\\Program Files\\xpdfbin-win-3.03\\bin32\\pdftotext.exe"
system(paste("\"", exe, "\" \"", ae.corpus, "\"", sep = ""), wait = F)
filetxt <- sub(".pdf", ".txt", dest)
shell.exec(filetxt); shell.exec(filetxt) # strangely the first try always throws an error..
summary(ae.corpus) #check what went in
ae.corpus <- tm_map(ae.corpus, tolower)
ae.corpus <- tm_map(ae.corpus, removePunctuation)
ae.corpus <- tm_map(ae.corpus, removeNumbers)
myStopwords <- c(stopwords('english'), "available", "via")
ae.corpus <- tm_map(ae.corpus, removeWords, myStopwords) # this stopword file is at C:\Users\[username]\Documents\R\win-library\2.13\tm\stopwords
ae.tdm <- DocumentTermMatrix(ae.corpus, control = list(minWordLength = 3))
inspect(ae.tdm)
findFreqTerms(ae.tdm, lowfreq=2)
findAssocs(ae.tdm, "economic",.7)
d<- Dictionary (c("economic", "uncertainty", "policy"))
inspect(DocumentTermMatrix(ae.corpus, list(dictionary = d)))
Try and use this instead:
and then
How that helps!