主题模型

library(NLP)
library(tm)
library(slam)
library(wordcloud) 
## Loading required package: RColorBrewer
library(topicmodels)
# R语言环境下的文本可视化及主题分析
setwd("D:/BaiduYunDisk/BaiduYunDisk/galaxystatistics/galaxystatistics/银河统计-Statistics_Method/report/text_analysis")
# getwd()
Data1 <- readLines("./data/meidi_jd_pos_cut.txt",encoding = "UTF-8")
Data2 <- readLines("./data/meidi_jd_neg_cut.txt",encoding = "UTF-8")
stopwords <- unlist (readLines("./data/stoplist.txt",encoding = "UTF-8"))
# 删除stopwords
removeStopWords <- function(x,words) {  
  ret <- character(0)
  index <- 1
  it_max <- length(x)
  while (index <= it_max) {
    if (length(words[words==x[index]]) < 1) ret <- c(ret,x[index])
    index <- index +1
  }
  ret
}
# 删除空格、字母
Data1 <- gsub("([a~z])","", Data1)
Data2 <- gsub("([a~z])","", Data2)
# 删除停用词
sample.words1 <- lapply(Data1, removeStopWords, stopwords)
sample.words2 <- lapply(Data2, removeStopWords, stopwords)
# 构建语料库
corpus1 <- Corpus(VectorSource(sample.words1))

# 建立文档-词条矩阵
sample.dtm1 <- DocumentTermMatrix(corpus1, control = list(wordLengths = c(2, Inf)))

# 主题模型分析
Gibbs <- LDA(sample.dtm1, k = 3, method = "Gibbs",control = list(seed = 2015, burnin = 1000,thin = 100, iter = 1000))
# 最可能的主题文档
Topic1 <- topics(Gibbs, 1)
table(Topic1)
## Topic1
##     1     2     3 
## 16939 14456  8029
# 每个Topic前10个Term
Terms1 <- terms(Gibbs, 15)
Terms1
##       Topic 1  Topic 2     Topic 3
##  [1,] "很 好 " "不错"      "安装" 
##  [2,] "送货"   "的 "       "了 "  
##  [3,] "快 "    "东西"      "师傅" 
##  [4,] "就是"   "还 不错"   "的 "  
##  [5,] "好 "    "京东"      "美的" 
##  [6,] "加热"   "价格"      "自己" 
##  [7,] "速度"   "很 不错"   "元 "  
##  [8,] "很快"   "感觉"      "没有" 
##  [9,] "非常"   "美的"      "售后" 
## [10,] "服务"   "值得"      "还是" 
## [11,] "安装"   "买 的 "    "但是" 
## [12,] "外观"   "挺 好 的 " "不过" 
## [13,] "保温"   "好用"      "收费" 
## [14,] "有点"   "吧 "       "费用" 
## [15,] "质量"   "使用"      "上门"
#### 负面评价LDA分析
# 构建语料库
corpus2 <- Corpus(VectorSource(sample.words2))
#建立文档-词条矩阵
sample.dtm2 <- DocumentTermMatrix(corpus2, control = list(wordLengths = c(2, Inf)))
#主题模型分析
# library(topicmodels)
Gibbs2 <- LDA(sample.dtm2, k = 4, method = "Gibbs",control = list(seed = 2015, burnin = 1000,thin = 100, iter = 1000))
#最可能的主题文档
Topic2 <- topics(Gibbs2, 1)
table(Topic2)
## Topic2
##    1    2    3    4 
## 2052 1329 1210  850
#每个Topic前10个Term
Terms2 <- terms(Gibbs2, 30)
Terms2
##       Topic 1   Topic 2      Topic 3    Topic 4   
##  [1,] "就是"    "了 "        "安装"     "美的"    
##  [2,] "加热"    "的 "        "不错"     "没有"    
##  [3,] "不 知道" "不过"       "师傅"     "京东"    
##  [4,] "有点"    "自己"       "元 "      "送货"    
##  [5,] "还 可以" "东西"       "服务"     "售后"    
##  [6,] "使用"    "还是"       "不好"     "非常"    
##  [7,] "速度"    "但是"       "上门"     "客服"    
##  [8,] "买 的 "  "价格"       "很 好 "   "人员"    
##  [9,] "还 没有" "这个"       "真心"     "问题"    
## [10,] "吧 "     "可以"       "收费"     "以后"    
## [11,] "热水"    "好 "        "很快"     "结果"    
## [12,] "感觉"    "啊 "        "配件"     "而且"    
## [13,] "效果"    "希望"       "费用"     "电话"    
## [14,] "用 "     "不要"       "hellip"   "产品"    
## [15,] "快 "     "不是"       "的 师傅"  "还有"    
## [16,] "怎么样 " "呢 "        "真的"     "小时"    
## [17,] "还 行 "  "质量"       "两个"     "现在"    
## [18,] "比较"    "还 没 安装" "还好"     "物流"    
## [19,] "还 不错" "便宜"       "预约"     "收到"    
## [20,] "保温"    "只是"       "钱 "      "不能"    
## [21,] "应该"    "因为"       "漏水"     "热水器 " 
## [22,] "麻烦"    "家里"       "如果"     "发现"    
## [23,] "时间"    "不会"       "态度"     "还要"    
## [24,] "慢 "     "等待"       "建议"     "联系"    
## [25,] "还 没用" "没 问题"    "太贵"     "发货"    
## [26,] "用 着"   "还 没用"    "装 的 "   "包装"    
## [27,] "出水"    "块 "        "贵 "      "购买"    
## [28,] "还行"    "评价"       "服务态度" "水管"    
## [29,] "好用"    "还 没装"    "管子"     "怎么"    
## [30,] "温度"    "多 "        "的 配件"  "电热水器"