Add detected topics to input data

161 views Asked by At
library(dplyr)
library(ggplot2)
library(stm)
library(janeaustenr)
library(tidytext)

library(quanteda)
# Tokenise the open-ended responses, dropping punctuation, numbers and
# symbols, then build a document-feature matrix from the tokens.
response_tokens <- tokens(
  gadarian$open.ended.response,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)
testDfm <- dfm(response_tokens)

# Convert the dfm to the "stm" format and pull out its three components.
out <- convert(testDfm, to = "stm")
documents <- out$documents
vocab <- out$vocab
meta <- out$meta

# Fit a topic model with K = 5 topics on the converted documents.
topic_model <- stm(documents = documents, vocab = vocab, K = 5)

Using the lines above, a topic model can be fitted.

How can I use tidytext to find out which topic each row of the input data `gadarian` is linked to, and add that topic to the input data as a new column?

Example of expected output

"MetaID" "treatment" "pid_rep"  "open.ended.response" "topic_number"

Update: code that attempts to produce the expected output:

library(stm)
library(tidyr)
library(quanteda)
# Tokenise the open-ended responses (no punctuation, numbers or symbols) and
# build a document-feature matrix.
testDfm <- gadarian$open.ended.response %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  dfm()

# Convert to the "stm" format and extract its components.
out <- convert(testDfm, to = "stm")
documents <- out$documents
vocab <- out$vocab
meta <- out$meta

fittedModel <- stm(documents = out$documents, vocab = out$vocab, K = 5)

# fittedModel$theta holds one row per modelled document and one column per
# topic (topic proportions). The dominant topic of a document is the column
# with the largest proportion. findThoughts(n = 1) is not suitable here: it
# returns a single exemplar document per topic, so only K rows would ever be
# labelled.
dominantTopic <- max.col(fittedModel$theta, ties.method = "first")

# convert() can drop empty documents, so theta may have fewer rows than
# gadarian. Map the modelled documents back to their original rows by
# document name; rows that were dropped keep NA.
keptRows <- match(names(out$documents), docnames(testDfm))
stopifnot(
  length(keptRows) == length(dominantTopic),
  !anyNA(keptRows)
)

docTopics <- rep(NA_integer_, nrow(gadarian))
docTopics[keptRows] <- dominantTopic
gadarian$topic <- docTopics
1

There is 1 answer

3
Paolo Lorenzini On
# Install reshape2 only when it is missing, instead of re-installing it on
# every run of the script.
if (!requireNamespace("reshape2", quietly = TRUE)) {
  install.packages("reshape2")
}
library(reshape2)

# Tidy the fitted model into a table with topic, term and beta columns.
td_beta <- tidy(fittedModel)
td_beta

# Plot the ten terms with the highest beta within each topic.
td_beta %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  ggplot(aes(term, beta)) +
  geom_col() +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

# Tidy the document-topic matrix (gamma), labelling documents with the row
# names of the input data.
td_gamma <- tidy(fittedModel, matrix = "gamma",
                 document_names = rownames(gadarian))
td_gamma

# Keep, for every document, the single topic with the highest gamma.
doc_topics <- td_gamma %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(document, topic_number = topic)

# Attach the dominant topic to the input data, matching on the row names that
# were used as document names above.
gadarian_topics <- gadarian %>%
  mutate(document = rownames(gadarian)) %>%
  left_join(doc_topics, by = "document")
gadarian_topics