主题模型
library(NLP)
library(tm)
library(slam)
library(wordcloud)
## Loading required package: RColorBrewer
library(topicmodels)
# R语言环境下的文本可视化及主题分析
setwd("D:/BaiduYunDisk/BaiduYunDisk/galaxystatistics/galaxystatistics/银河统计-Statistics_Method/report/text_analysis")
# getwd()
Data1 <- readLines("./data/meidi_jd_pos_cut.txt",encoding = "UTF-8")
Data2 <- readLines("./data/meidi_jd_neg_cut.txt",encoding = "UTF-8")
stopwords <- unlist (readLines("./data/stoplist.txt",encoding = "UTF-8"))
# 删除stopwords
removeStopWords <- function(x,words) {
ret <- character(0)
index <- 1
it_max <- length(x)
while (index <= it_max) {
if (length(words[words==x[index]]) < 1) ret <- c(ret,x[index])
index <- index +1
}
ret
}
# 删除空格、字母
Data1 <- gsub("([a~z])","", Data1)
Data2 <- gsub("([a~z])","", Data2)
# 删除停用词
sample.words1 <- lapply(Data1, removeStopWords, stopwords)
sample.words2 <- lapply(Data2, removeStopWords, stopwords)
# 构建语料库
corpus1 <- Corpus(VectorSource(sample.words1))
# 建立文档-词条矩阵
sample.dtm1 <- DocumentTermMatrix(corpus1, control = list(wordLengths = c(2, Inf)))
# 主题模型分析
Gibbs <- LDA(sample.dtm1, k = 3, method = "Gibbs",control = list(seed = 2015, burnin = 1000,thin = 100, iter = 1000))
# 最可能的主题文档
Topic1 <- topics(Gibbs, 1)
table(Topic1)
## Topic1
## 1 2 3
## 16939 14456 8029
# 每个Topic前10个Term
Terms1 <- terms(Gibbs, 15)
Terms1
## Topic 1 Topic 2 Topic 3
## [1,] "很 好 " "不错" "安装"
## [2,] "送货" "的 " "了 "
## [3,] "快 " "东西" "师傅"
## [4,] "就是" "还 不错" "的 "
## [5,] "好 " "京东" "美的"
## [6,] "加热" "价格" "自己"
## [7,] "速度" "很 不错" "元 "
## [8,] "很快" "感觉" "没有"
## [9,] "非常" "美的" "售后"
## [10,] "服务" "值得" "还是"
## [11,] "安装" "买 的 " "但是"
## [12,] "外观" "挺 好 的 " "不过"
## [13,] "保温" "好用" "收费"
## [14,] "有点" "吧 " "费用"
## [15,] "质量" "使用" "上门"
#### 负面评价LDA分析
# 构建语料库
corpus2 <- Corpus(VectorSource(sample.words2))
#建立文档-词条矩阵
sample.dtm2 <- DocumentTermMatrix(corpus2, control = list(wordLengths = c(2, Inf)))
#主题模型分析
# library(topicmodels)
Gibbs2 <- LDA(sample.dtm2, k = 4, method = "Gibbs",control = list(seed = 2015, burnin = 1000,thin = 100, iter = 1000))
#最可能的主题文档
Topic2 <- topics(Gibbs2, 1)
table(Topic2)
## Topic2
## 1 2 3 4
## 2052 1329 1210 850
#每个Topic前10个Term
Terms2 <- terms(Gibbs2, 30)
Terms2
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "就是" "了 " "安装" "美的"
## [2,] "加热" "的 " "不错" "没有"
## [3,] "不 知道" "不过" "师傅" "京东"
## [4,] "有点" "自己" "元 " "送货"
## [5,] "还 可以" "东西" "服务" "售后"
## [6,] "使用" "还是" "不好" "非常"
## [7,] "速度" "但是" "上门" "客服"
## [8,] "买 的 " "价格" "很 好 " "人员"
## [9,] "还 没有" "这个" "真心" "问题"
## [10,] "吧 " "可以" "收费" "以后"
## [11,] "热水" "好 " "很快" "结果"
## [12,] "感觉" "啊 " "配件" "而且"
## [13,] "效果" "希望" "费用" "电话"
## [14,] "用 " "不要" "hellip" "产品"
## [15,] "快 " "不是" "的 师傅" "还有"
## [16,] "怎么样 " "呢 " "真的" "小时"
## [17,] "还 行 " "质量" "两个" "现在"
## [18,] "比较" "还 没 安装" "还好" "物流"
## [19,] "还 不错" "便宜" "预约" "收到"
## [20,] "保温" "只是" "钱 " "不能"
## [21,] "应该" "因为" "漏水" "热水器 "
## [22,] "麻烦" "家里" "如果" "发现"
## [23,] "时间" "不会" "态度" "还要"
## [24,] "慢 " "等待" "建议" "联系"
## [25,] "还 没用" "没 问题" "太贵" "发货"
## [26,] "用 着" "还 没用" "装 的 " "包装"
## [27,] "出水" "块 " "贵 " "购买"
## [28,] "还行" "评价" "服务态度" "水管"
## [29,] "好用" "还 没装" "管子" "怎么"
## [30,] "温度" "多 " "的 配件" "电热水器"