# 情感分析 (Sentiment analysis)
# Load the source table and extract the texts to analyse.
# setwd("D:/BaiduYunDisk/BaiduYunDisk/galaxystatistics/galaxystatistics/银河统计-Statistics_Method/report/text_analysis")
# `encoding` only marks strings as UTF-8 when it is spelled exactly "UTF-8";
# the lower-case 'utf-8' used before was not recognised and had no effect.
mdata <- read.csv("./data/huizong.csv", header = TRUE, encoding = "UTF-8")
# NOTE(review): assumes the CSV has exactly nine columns -- confirm.
colnames(mdata) <- c("x1","x2","x3","x4","x5","x6","x7","x8","x9")
# Rows whose fifth column equals "美的".
index <- which(mdata$x5 == "美的")
# Text preprocessing: take the sixth column of those rows, drop duplicates,
# and keep only non-missing strings longer than 10 characters.
jd_data <- as.character(mdata$x6[index])
jd_data <- unique(jd_data)
jd_data <- jd_data[!is.na(jd_data) & nchar(jd_data) > 10]
# which.max(nchar(jd_data))
# max(nchar(jd_data))
# min(nchar(jd_data))
# length(jd_data)
# jd_data <- data.frame(term=jd_data)
# jd_data <- data.frame(apply(jd_data, 2, as.character))
# str(jd_data)
# class(jd_data)
# dim(jd_data)
# # write.table(meidi_jd_data,"./tmp/meidi_jd.txt",row.names=FALSE)
# library(wordcloud2)
# library(stringr)
# library(tm)
# mdata <- read.csv('http://data.galaxystatistics.com/blog_data/text_analysis/sms_spam.csv')
# str(mdata)
# str(mdata$type)
# table(mdata$type)
# mdata$text <- as.character(mdata$text)
# str(mdata)
library(stringr)
library(jiebaR)
library(plyr)
# Import the text
# text <- readLines("1.txt") # read paragraph by paragraph
# text <- meidi_jd_data[27]
# Use the filtered strings in jd_data as the input for the steps below.
text <- jd_data
# Data cleaning ahead of word segmentation.
#
# Removes a fixed set of punctuation marks, all whitespace, digits and ASCII
# letters from every element of a character vector, then drops the elements
# that are NA or shorter than two characters.
#
# Args:
#   x: character vector of raw text.
# Returns:
#   Character vector of cleaned strings. It may be shorter than `x`, down to
#   length zero, because short and missing elements are dropped.
clean_text <- function(x) {
  # Each pattern is deleted in turn, in this order.
  junk_patterns <- c(
    "\\/", "\\,", "\\。", "\\:", "\\;", "\\%", "\\.", "\\*", "\\、",
    "\\s",          # any whitespace character, plain spaces included
    "[[:digit:]]",  # digits
    "[a-zA-Z]",     # ASCII letters
    "\\(", "\\)"
  )
  for (pattern in junk_patterns) {
    x <- gsub(pattern, "", x)
  }
  # Two former steps are intentionally gone: gsub(" ", "", x) was redundant
  # after "\\s", and gsub("", "\\1", x) left every string unchanged.
  # x <- gsub("(责任.*)","",x)
  keep <- !is.na(x) & nchar(x) >= 2
  x[keep]
}
# text <- clean_text(oText)
# text <- list(text)
# Clean every string, then de-duplicate and keep texts longer than 10 chars.
# Iterating over `text` directly replaces `text[1:length(text)]`, which
# indexes c(1, 0) when `text` is empty.
text <- lapply(text, clean_text)
# as.character() keeps an empty result as character(0) instead of NULL, so the
# data frame below always has a `term` column.
text <- unique(as.character(unlist(text, use.names = FALSE)))
text <- text[nchar(text) > 10]
# stringsAsFactors = FALSE keeps `term` a character column on every R version.
text <- data.frame(term = text, stringsAsFactors = FALSE)
# text <- data.frame(apply(text, 2, as.character))
# which.max(nchar(text))
# max(nchar(text))
# min(nchar(text))
# length(text)
# text[1:50]
# class(text)
# Load the sentiment lexicon (different lexicons carry different weights and
# therefore give different results).
#
# Reads one word list of positive terms and one of negative terms and stacks
# them into a single table, positive terms first.
#
# Args:
#   pos_file: path of the positive word list; its first column holds the terms.
#   neg_file: path of the negative word list; its first column holds the terms.
# Returns:
#   Data frame with columns `term` and `weight`, where weight is 1 for terms
#   from `pos_file` and -1 for terms from `neg_file`.
#
# NOTE(review): the ".gb" in the default file names suggests a GB-family
# encoding, but no fileEncoding is passed to read.table -- confirm the files
# match the session's native encoding.
negative_positive <- function(pos_file = "tsinghua.positive.gb.txt",
                              neg_file = "tsinghua.negative.gb.txt") {
  # Read one word list and return its first column.
  read_terms <- function(path) {
    read.table(path, header = FALSE, stringsAsFactors = FALSE)[, 1]
  }
  pos_terms <- read_terms(pos_file)
  neg_terms <- read_terms(neg_file)
  data.frame(
    term = c(pos_terms, neg_terms),
    weight = rep(c(1, -1), times = c(length(pos_terms), length(neg_terms))),
    stringsAsFactors = FALSE
  )
}
# Build the sentiment lexicon table.
pos_neg_word <- negative_positive()
# Add the lexicon terms to the segmenter's user words
# library(jiebaR)
w <- worker()
user <- pos_neg_word[["term"]]
new_user_word(w, user)
# res <- segment(text, w) # word segmentation
# Sentiment index of one text.
#
# Segments `oText` into words, attaches the lexicon weight to each word with a
# join on "term", and sums the weights of the words found in the lexicon.
#
# Args:
#   oText:        text to score; passed unchanged to segment().
#   pos_neg_word: data frame with columns "term" and "weight".
#   jieba_w:      segmentation worker handed to segment().
# Returns:
#   Sum of the weights of all matched words; 0 when nothing matches.
sentiment_index_fun <- function(oText, pos_neg_word, jieba_w) {
  res <- segment(oText, jieba_w) # word segmentation
  # No tokens means nothing to score; this also avoids joining an empty table.
  if (length(res) == 0) {
    return(0)
  }
  # Build the token table in one step rather than growing an empty data frame
  # through 1:length(res) indexing.
  temp <- data.frame(term = res, stringsAsFactors = FALSE)
  # Attach sentiment weights; words missing from the lexicon get NA.
  # library(plyr)
  temp <- join(temp, pos_neg_word, by = "term")
  # Sentiment index: unmatched (NA) weights are ignored.
  sum(temp$weight, na.rm = TRUE)
}
# Score every cleaned text, save the result table and report the elapsed time.
# apply(head(text,90), 1, sentiment_index_fun, pos_neg_word, w)
t1 <- Sys.time()
# sentiment_index <- apply(head(text,100), 1, sentiment_index_fun, pos_neg_word, w)
# vapply over the text column replaces apply() on the data frame: no matrix
# coercion, and the result is always a numeric vector, even for zero rows.
# as.character() guards against `term` being a factor; USE.NAMES = FALSE keeps
# the texts from becoming element names.
sentiment_index <- vapply(
  as.character(text$term),
  sentiment_index_fun,
  numeric(1),
  pos_neg_word = pos_neg_word,
  jieba_w = w,
  USE.NAMES = FALSE
)
sentiment_result <- data.frame(sentiment_index = sentiment_index, text)
write.table(sentiment_result, "meidi_jd_sentiment_result.txt", row.names = FALSE)
t2 <- Sys.time()
print(t2 - t1)
# (38796/100)*14/3600
# 情感分析占比 (Share of each sentiment score)
# Text visualisation and topic analysis in R
# The project directory is machine specific, so switch to it only when it
# exists; elsewhere the result file is read from the current working directory
# instead of aborting on a failed setwd().
project_dir <- "D:/BaiduYunDisk/BaiduYunDisk/galaxystatistics/galaxystatistics/银河统计-Statistics_Method/report/text_analysis"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
# getwd()
# read.table() already returns a data frame, so no extra data.frame() wrap.
mdata <- read.table("meidi_jd_sentiment_result.txt", row.names = NULL, header = TRUE)
# summary(mdata$sentiment_index)
# table(mdata$sentiment_index)
# Bin the scores; intervals are right-closed, so for integer scores each bin
# from (-5,-4] to (3,4] holds a single value. Scores outside (-200, 200]
# become NA and are not counted.
mdata_cut <- cut(mdata$sentiment_index, breaks = c(-200, -5, -4, -3, -2, -1, 0.5, 1, 2, 3, 4, 200))
table(mdata_cut)
## mdata_cut
## (-200,-5] (-5,-4] (-4,-3] (-3,-2] (-2,-1] (-1,0.5] (0.5,1]
## 28 21 63 305 1739 8834 12183
## (1,2] (2,3] (3,4] (4,200]
## 9162 3928 1488 1045
nums <- as.numeric(table(mdata_cut))
# One label per bin, in the same order as the breaks above.
names <- c('<=-5', '-4', '-3', '-2', '-1', '0', '1', '2', '3', '4', '>=5')
# length(nums)
# length(names)
result1 <- data.frame(names, nums)
# hist(result1$nums)
# The y-axis label reads "share of sentiment scores", so plot proportions
# (prop.table) rather than raw counts.
barplot(prop.table(result1$nums), main = "情感分析", xlab = "情感得分值",
        ylab = "情感得分占比", names.arg = names,
        border = "blue", density = c(10,15,20,25,30,35,40,45,50,55,60))
# Coarser split into three classes: below -0.5, between -0.5 and 0.5, above 0.5.
mdata_cut1 <- cut(mdata$sentiment_index, breaks = c(-200, -0.5, 0.5, 200))
table(mdata_cut1)
## mdata_cut1
## (-200,-0.5] (-0.5,0.5] (0.5,200]
## 2156 8834 27806
nums <- as.numeric(table(mdata_cut1))
names <- c('贬义', '中性', '褒义')
# length(nums)
# length(names)
result2 <- data.frame(names, nums)
# hist(result2$nums)
# Proportions again, to match the y-axis label.
barplot(prop.table(result2$nums), main = "情感分析", xlab = "情感得分值",
        ylab = "情感得分占比", names.arg = c('贬义', '中性', '褒义'),
        border = "blue", density = c(10,20,30))
# prop.table(mdata_cut)
# str(mdata_cut)