|
楼主 |
发表于 2013-4-29 12:47:10
|
显示全部楼层
2 R语言实现主题模型
通过上面简介,我们了解了主题模型是什么。这样心里就大体知道主题模型都能用来做什么工作。
下面我们用R语言,通过几个小例子来实现主题模型的计算过程。此处仅作一例子示范,让大家能够动手实践此过程,方便学习;更深入的内容还需自行挖掘。
重要的不是结果,而是整个过程。
## Load the JSS abstracts data set shipped in the 'corpus.JSS.papers' package.
## The package is installed from source only when it is not already
## available, so re-running the script does not download and rebuild it.
if (!requireNamespace("corpus.JSS.papers", quietly = TRUE)) {
  install.packages(
    "corpus.JSS.papers",
    repos = "http://datacube.wu.ac.at/",
    type = "source"
  )
}
## NOTE(review): the OAI harvesting code that follows assigns `JSS_papers`
## again, so this packaged copy is overwritten -- confirm which source is
## actually wanted.
data("JSS_papers", package = "corpus.JSS.papers")
## Harvest the record list from the JSS OAI endpoint and keep only records
## with a usable abstract.
## library() is used instead of require(): a missing OAIHarvester package now
## stops the script here instead of failing later at oaih_list_records()
## with a "could not find function" error.
library("OAIHarvester")
x <- oaih_list_records("http://www.jstatsoft.org/oai")
JSS_papers <- oaih_transform(x[, "metadata"])

## Order the records by their "date" field.
JSS_papers <- JSS_papers[order(as.Date(unlist(JSS_papers[, "date"]))), ]

## Keep only records whose description contains an "Abstract:" marker ...
JSS_papers <- JSS_papers[grep("Abstract:", JSS_papers[, "description"]), ]

## ... and remove everything up to and including that marker line, so that
## only the abstract text remains in the "description" column.
JSS_papers[, "description"] <- sub(
  ".*\nAbstract:\n", "",
  unlist(JSS_papers[, "description"])
)
## Seed R's random number generator.
set.seed(1102)

## Attach the packages used for HTML parsing and corpus handling.
## library("topicmodels")
library("XML")
## NOTE(review): Snowball is presumably needed for `stemming = TRUE` in the
## document-term matrix step -- verify that it is still installable.
library("Snowball")
library("tm")
## Strip HTML markup from a string.
##
## `s` is parsed as HTML text (asText = TRUE, so it is not treated as a file
## name), the text value of the root node is extracted, and the result is
## converted with iconv() from the "" source encoding to UTF-8.
##
## Returns the extracted text re-encoded as UTF-8.
remove_HTML_markup <- function(s) {
  parsed <- htmlTreeParse(s, asText = TRUE, trim = FALSE)
  plain_text <- xmlValue(xmlRoot(parsed))
  iconv(plain_text, from = "", to = "UTF-8")
}
## Build a corpus from the abstracts after removing their HTML markup.
corpus <- Corpus(
  VectorSource(
    sapply(X = JSS_papers[, "description"], FUN = remove_HTML_markup)
  )
)

## Turn the corpus into a document-term matrix using the options below.
JSS_dtm <- DocumentTermMatrix(
  corpus,
  control = list(
    stemming = TRUE,
    stopwords = TRUE,
    minWordLength = 3,
    removeNumbers = TRUE
  )
)

## Show the dimensions of the matrix (documents x terms).
dim(JSS_dtm)
library("slam")

## Mean tf-idf score per term.
## Each stored entry of the matrix is divided by the total count of its
## document, these relative frequencies are averaged per term column, and the
## averages are multiplied by log2(#documents / #documents with the term).
term_tfidf <- tapply(
  X = JSS_dtm$v / row_sums(JSS_dtm)[JSS_dtm$i],
  INDEX = JSS_dtm$j,
  FUN = mean
) * log2(nDocs(JSS_dtm) / col_sums(JSS_dtm > 0))
summary(term_tfidf)

## Keep only the terms whose score reaches 0.1 ...
JSS_dtm <- JSS_dtm[, term_tfidf >= 0.1]
## ... then drop the documents left without any term.
JSS_dtm <- JSS_dtm[row_sums(JSS_dtm) > 0, ]
dim(JSS_dtm)
## ------------------------------------------------
## Topic model
## ------------------------------------------------
library("topicmodels")

## Number of topics to fit.
k <- 30

## Fit an LDA model with k topics; the seed is passed through `control`.
jss_TM <- LDA(JSS_dtm, k = k, control = list(seed = 2010))

## One topic per document, followed by the document count for each topic.
Topic <- topics(jss_TM, 1)
table(Topic)

## Five terms per topic; print the columns of the first four topics.
Terms <- terms(jss_TM, 5)
Terms[, 1:4]
## Volume 24: Special Issue ``Statistical Modeling of
## Social Networks with `statnet'''
##
## Find the topic with the highest mean posterior probability among the
## volume 24 papers and print its ten terms.
p_v24 <- grep("/v24/", JSS_papers[, "identifier"])

## Without this guard an empty match ends in which.max(tabulate(integer(0))),
## which is 1, so topic 1 would be reported although no paper matched.
if (length(p_v24) == 0L) {
  stop("no record identifier contains \"/v24/\"", call. = FALSE)
}

doc_topics <- posterior(jss_TM)$topics

## p_v24 indexes rows of JSS_papers, while the model rows are the rows of
## JSS_dtm after documents without terms were dropped. The two only line up
## when the row counts agree, so flag a mismatch instead of silently reading
## the wrong rows.
if (nrow(doc_topics) != nrow(JSS_papers)) {
  warning(
    "topic model has ", nrow(doc_topics), " documents but JSS_papers has ",
    nrow(JSS_papers), " rows; volume 24 row indices may be misaligned",
    call. = FALSE
  )
}

## drop = FALSE keeps a single matching row as a 1 x k matrix, so colMeans()
## also works when only one volume 24 paper is present.
avg <- colMeans(doc_topics[p_v24, , drop = FALSE])
topics_v24 <- which.max(avg)

## topics_v24 is a single topic index, so which.max(tabulate(topics_v24))
## always returned that same index; only the name attribute is dropped here.
most_frequent_v24 <- unname(topics_v24)
terms(jss_TM, 10)[, most_frequent_v24]
对于主题模型的深入研究,大家可以参考下面的链接和附件:
http://www.stat.uni-muenchen.de/ ... fcim-beispiele-09.R
Topic Models in R.pdf
(168.84 KB, 下载次数: 78)
|
|