# 情感分析 (Sentiment analysis)

# setwd("D:/BaiduYunDisk/BaiduYunDisk/galaxystatistics/galaxystatistics/银河统计-Statistics_Method/report/text_analysis")
# Load the raw review table. The column names read from the file header are
# replaced by x1..x9.
# NOTE(review): the R documentation lists "unknown", "latin1" and "UTF-8" as
# the values of `encoding`; the lowercase 'utf-8' used here may be ignored --
# confirm before changing it, since that can alter how strings compare.
mdata <- read.csv("./data/huizong.csv", header = TRUE, encoding = 'utf-8')
colnames(mdata) <- paste0("x", 1:9)
# Rows whose fifth column equals "美的" (presumably the brand column -- confirm).
index <- which(mdata$x5 == "美的")


# Text preprocessing: take column 6 of the selected rows as character strings,
# drop duplicates and keep only entries longer than 10 characters.
jd_data <- as.character(mdata[index, 6])
jd_data <- unique(jd_data)
jd_data <- jd_data[nchar(jd_data) > 10]
# which.max(nchar(jd_data))
# max(nchar(jd_data))
# min(nchar(jd_data))
# length(jd_data)
# jd_data <- data.frame(term=jd_data)
# jd_data <- data.frame(apply(jd_data, 2, as.character))
# str(jd_data)
# class(jd_data)
# dim(jd_data)
# # write.table(meidi_jd_data,"./tmp/meidi_jd.txt",row.names=FALSE)


# library(wordcloud2)
# library(stringr)
# library(tm)

# mdata <- read.csv('http://data.galaxystatistics.com/blog_data/text_analysis/sms_spam.csv')
# str(mdata)
# str(mdata$type)
# table(mdata$type)
# mdata$text <- as.character(mdata$text)
# str(mdata)


library(stringr)
library(jiebaR)
library(plyr)

# Import the text
# text <- readLines("1.txt")   # read paragraph by paragraph
# text <- meidi_jd_data[27]
# The strings in jd_data become the working corpus.
text <- jd_data

# Clean raw text before segmentation.
#
# Removes the punctuation characters / , 。 : ; % . * 、 ( ), all whitespace,
# digits ([[:digit:]]) and ASCII letters, then drops elements that are NA or
# shorter than two characters.
#
# Args:
#   x: character vector (processed element-wise; may contain NA).
# Returns:
#   Character vector with the cleaned strings that passed the length filter;
#   it can be shorter than `x`.
clean_text <- function(x) {
  # One bracket expression replaces the former chain of single-character
  # gsub() calls; inside [] the characters . * ( ) are literals.
  x <- gsub("[/,。:;%.*、()]", "", x)
  x <- gsub("\\s", "", x)             # any whitespace, plain spaces included
  x <- gsub("[[:digit:]]", "", x)     # digits
  x <- gsub("[a-zA-Z]", "", x)        # ASCII letters
  # The former gsub("", "\\1", x) step was dropped: an empty pattern with a
  # back-reference to a group that does not exist leaves the strings unchanged.
  # x <- gsub("(责任.*)","",x)
  keep <- !is.na(x) & nchar(x) >= 2
  # Return visibly (the old trailing assignment returned the value invisibly).
  x[keep]
}
# text <- clean_text(oText)
# text <- list(text)
# Clean the whole corpus in one vectorised call: clean_text() already works
# element-wise on a character vector, so a per-element lapply() is not needed,
# and an empty `text` no longer goes through 1:length(text) (which is c(1, 0)).
text <- clean_text(text)
text <- unique(text)
# Keep only cleaned strings longer than 10 characters.
text <- text[nchar(text) > 10]
# One row per remaining string, in a column named "term".
text <- data.frame(term = text, stringsAsFactors = FALSE)
# text <- data.frame(apply(text, 2, as.character))
# which.max(nchar(text))
# max(nchar(text))
# min(nchar(text))
# length(text)
# text[1:50]
# class(text)


# Load the sentiment dictionary (a different dictionary or different weights
# give different results).
#
# Reads one list of positive and one list of negative terms, gives every
# positive term the weight 1 and every negative term the weight -1, and stacks
# the two lists.
#
# Args:
#   pos_file: path of the positive-term list; the first column holds the term.
#   neg_file: path of the negative-term list; the first column holds the term.
#   file_encoding: passed to read.table(fileEncoding = ); "" keeps the
#     platform default. NOTE(review): the ".gb" in the default file names
#     suggests a GB-family encoding -- confirm before setting this.
# Returns:
#   data.frame with the columns `term` and `weight` (1 or -1). Exact duplicate
#   rows are removed so that a term listed twice in one file is not counted
#   twice when the dictionary is joined to the tokens of a text.
negative_positive <- function(pos_file = "tsinghua.positive.gb.txt",
                              neg_file = "tsinghua.negative.gb.txt",
                              file_encoding = "") {
  # Read one term list and attach the same weight to each of its rows.
  read_terms <- function(path, weight) {
    words <- read.table(path, header = FALSE, stringsAsFactors = FALSE,
                        fileEncoding = file_encoding)
    data.frame(term = words[[1]], weight = rep(weight, nrow(words)),
               stringsAsFactors = FALSE)
  }

  posneg <- rbind(read_terms(pos_file, 1), read_terms(neg_file, -1))
  posneg <- unique(posneg)
  rownames(posneg) <- NULL
  posneg
}
# Build the weighted sentiment dictionary (columns term and weight).
pos_neg_word <- negative_positive()


# Create a jiebaR worker and register every dictionary term as a user word
# (presumably so the segmenter keeps these terms as single tokens -- confirm
# against the jiebaR documentation).
# library(jiebaR)
w <- worker()
user <- pos_neg_word[["term"]]
new_user_word(w, user)
# res <- segment(text, w)     # segmentation


# Sentiment index of one text.
#
# Segments `oText` with segment() and sums the dictionary weights of all
# tokens that occur in `pos_neg_word`; tokens that are not in the dictionary
# contribute nothing.
#
# Args:
#   oText: the text to score (one string).
#   pos_neg_word: data.frame with the columns `term` and `weight`.
#   jieba_w: segmentation worker handed to segment().
# Returns:
#   The sum of the matched weights; 0 when no token matches (or when the
#   segmentation yields no tokens).
sentiment_index_fun <- function(oText, pos_neg_word, jieba_w) {

  tokens <- segment(oText, jieba_w)     # word segmentation

  # How often each distinct token occurs in this text.
  distinct_tokens <- unique(tokens)
  token_counts <- tabulate(match(tokens, distinct_tokens),
                           nbins = length(distinct_tokens))

  # For every dictionary row: position of its term among the distinct tokens,
  # NA when the term does not occur. Every dictionary row takes part, so the
  # total equals what joining the tokens to the dictionary and summing the
  # weights produced, without building a data frame per text.
  dict_pos <- match(pos_neg_word$term, distinct_tokens)

  # Rows that do not occur (NA count) and rows with an NA weight are skipped.
  sum(pos_neg_word$weight * token_counts[dict_pos], na.rm = TRUE)

}
# apply(head(text,90), 1, sentiment_index_fun, pos_neg_word, w)

# Score every row of `text`, write the scores next to the text and print the
# elapsed time.
t1 <- Sys.time()
# sentiment_index <- apply(head(text,100), 1, sentiment_index_fun, pos_neg_word, w)
# vapply() over the term column replaces apply() over the data frame: no
# matrix coercion, exactly one number per row is enforced, and a zero-row
# `text` gives numeric(0).
sentiment_index <- vapply(
  as.character(text$term), sentiment_index_fun, numeric(1),
  pos_neg_word, w,
  USE.NAMES = FALSE
)
sentiment_result <- data.frame(sentiment_index = sentiment_index, text)
write.table(sentiment_result, "meidi_jd_sentiment_result.txt", row.names = FALSE)
t2 <- Sys.time()
print(t2 - t1)

# (38796/100)*14/3600

# 情感分析占比 (Share of each sentiment score)

# Text visualisation and topic analysis in R
# NOTE(review): hard-coded, machine-specific working directory; the lines
# below only run where this path exists.
setwd("D:/BaiduYunDisk/BaiduYunDisk/galaxystatistics/galaxystatistics/银河统计-Statistics_Method/report/text_analysis")
# getwd()

# Read the result table; read.table() already returns a data.frame, so it is
# not wrapped in data.frame() again.
mdata <- read.table("meidi_jd_sentiment_result.txt", row.names = NULL, header = TRUE)

# summary(mdata$sentiment_index)
# table(mdata$sentiment_index)

# Bin the sentiment index into eleven intervals. Assuming whole-number scores,
# each inner interval holds exactly one score value (see the labels in `names`).
mdata_cut <- cut(
  mdata$sentiment_index,
  breaks = c(-200, -5:-1, 0.5, 1:4, 200)
)
table(mdata_cut)
# Output recorded from an earlier run:
## mdata_cut
## (-200,-5]   (-5,-4]   (-4,-3]   (-3,-2]   (-2,-1]  (-1,0.5]   (0.5,1] 
##        28        21        63       305      1739      8834     12183 
##     (1,2]     (2,3]     (3,4]   (4,200] 
##      9162      3928      1488      1045
# Bar labels, one per interval, and the matching counts.
names <- c("<=-5", "-4", "-3", "-2", "-1", "0", "1", "2", "3", "4", ">=5")
nums <- as.numeric(table(mdata_cut))
# length(nums)
# length(names)
result1 <- data.frame(names, nums)
# hist(result1$nums)
# NOTE(review): the y-axis label reads "情感得分占比" (share), but the bar
# heights are the raw counts in `nums` -- confirm which one is intended.
barplot(
  result1$nums,
  names.arg = names,
  main = "情感分析", xlab = "情感得分值", ylab = "情感得分占比",
  border = "blue", density = seq(10, 60, by = 5)
)

# Three-way split of the sentiment index: (-200,-0.5], (-0.5,0.5], (0.5,200].
mdata_cut1 <- cut(
  mdata$sentiment_index,
  breaks = c(-200, -0.5, 0.5, 200)
)
table(mdata_cut1)
# Output recorded from an earlier run:
## mdata_cut1
## (-200,-0.5]  (-0.5,0.5]   (0.5,200] 
##        2156        8834       27806
# Bar labels for the three intervals, in interval order, and their counts.
names <- c("贬义", "中性", "褒义")
nums <- as.numeric(table(mdata_cut1))
# length(nums)
# length(names)
result2 <- data.frame(names, nums)
# hist(result2$nums)
# NOTE(review): the y-axis label reads "情感得分占比" (share), but the bar
# heights are the raw counts in `nums` -- confirm which one is intended.
barplot(
  result2$nums,
  names.arg = names,
  main = "情感分析", xlab = "情感得分值", ylab = "情感得分占比",
  border = "blue", density = seq(10, 30, by = 10)
)

# prop.table(mdata_cut)
# str(mdata_cut)