Preface
bank data example
# Both the klaR and e1071 packages provide naive Bayes classifiers
# Load the data
bank <- read.csv('http://data.galaxystatistics.com/blog_data/common_use/bank.csv',header=T)
n <- nrow(bank)
train_data <- bank[1:4000,]        # first 4000 rows as the training set (split by row position, not random)
test_data <- bank[4001:n,1:16]     # remaining rows, predictor columns only
test_data1 <- bank[4001:n,]        # remaining rows including the label column
# klaR::NaiveBayes
library(klaR)
formula <- as.formula(paste(names(train_data)[ncol(train_data)], " ~ ", paste(names(train_data)[-ncol(train_data)], collapse= "+")))
bfit <- NaiveBayes(formula, train_data, na.action=na.pass)
# Equivalent to writing the formula out in full:
# bfit <- NaiveBayes(y~age+job+marital+education+default+balance+housing+loan+contact
#                    +day+month+duration+campaign+pdays+previous+poutcome, train_data, na.action=na.pass)
result <- predict(bfit, test_data)
pre <- result$class   # predicted classes; predict() also returns the posterior probabilities
bayes_table <- table(pre, test_data1[,ncol(test_data1)])
accuracy_test_bayes <- sum(diag(bayes_table))/sum(bayes_table)
list('test set confusion matrix'=bayes_table, 'test set accuracy'=accuracy_test_bayes)
## $`test set confusion matrix`
##
## pre    no yes
##   no  418  32
##   yes  40  31
##
## $`test set accuracy`
## [1] 0.8618042
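# klaR's NaiveBayes also accepts a Laplace correction factor fL for sparse
# categorical levels. A minimal sketch, with fL = 1 as an illustrative,
# assumed value (the fit above used the default fL = 0, i.e. no correction):
bfit_laplace <- NaiveBayes(formula, train_data, fL = 1)
pre_laplace <- predict(bfit_laplace, test_data)$class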
# e1071::naiveBayes
library(e1071)
formula <- as.formula(paste(names(train_data)[ncol(train_data)], " ~ ", paste(names(train_data)[-ncol(train_data)], collapse= "+")))
efit <- naiveBayes(formula, train_data)
# Equivalent to writing the formula out in full:
# efit <- naiveBayes(y~age+job+marital+education+default+balance+housing+loan+contact
#                    +day+month+duration+campaign+pdays+previous+poutcome, train_data)
pre <- predict(efit, test_data)
bayes_table <- table(pre, test_data1[,ncol(test_data1)])
accuracy_test_bayes <- sum(diag(bayes_table))/sum(bayes_table)
list('test set confusion matrix'=bayes_table, 'test set accuracy'=accuracy_test_bayes)
## $`test set confusion matrix`
##
## pre    no yes
##   no  418  32
##   yes  40  31
##
## $`test set accuracy`
## [1] 0.8618042
# classAgreement() reports the diagonal agreement rate, Cohen's kappa, and the (corrected) Rand index
classAgreement(bayes_table)
## $diag
## [1] 0.8618042
##
## $kappa
## [1] 0.3837156
##
## $rand
## [1] 0.7613465
##
## $crand
## [1] 0.3149805
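# Accuracy alone can be misleading on an imbalanced outcome like this one.
# A minimal sketch of per-class metrics read off the confusion matrix above
# (rows are predictions, columns are the true labels):
recall_yes    <- bayes_table['yes','yes'] / sum(bayes_table[,'yes'])  # true positive rate for 'yes'
precision_yes <- bayes_table['yes','yes'] / sum(bayes_table['yes',])  # positive predictive value
c(recall = recall_yes, precision = precision_yes)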
Mushroom data example
# Download (if necessary) and load the required packages
if(!suppressWarnings(require(caret))){
  install.packages('caret')
  require(caret)
}
if(!suppressWarnings(require(klaR))){
  install.packages('klaR')
  require(klaR)
}
if(!suppressWarnings(require(pROC))){
  install.packages('pROC')
  require(pROC)
}
# Read the mushroom dataset -- http://archive.ics.uci.edu/ml/datasets/Mushroom
mdata <- read.csv('http://data.galaxystatistics.com/blog_data/common_use/agaricus_lepiota.csv',header=T)
# Take a quick look at the data
str(mdata)
## 'data.frame': 8124 obs. of 23 variables:
## $ x1 : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
## $ x2 : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
## $ x3 : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
## $ x4 : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
## $ x5 : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
## $ x6 : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
## $ x7 : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
## $ x8 : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
## $ x9 : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
## $ x10: Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
## $ x11: Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
## $ x12: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ x13: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ x14: Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ x15: Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ x16: Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
## $ x17: Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ x18: Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
## $ x19: Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
## $ x20: Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
## $ x21: Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
## $ x22: Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
## $ y : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
head(mdata)
## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 x15 x16 x17 x18 x19 x20
## 1 x s n t p f c n k e e s s w w p w o p k
## 2 x s y t a f c b k e c s s w w p w o p n
## 3 b s w t l f c b n e c s s w w p w o p n
## 4 x y w t p f c n n e e s s w w p w o p k
## 5 x s g f n f w b k t e s s w w p w o e n
## 6 x y y t a f c b n e c s s w w p w o p k
## x21 x22 y
## 1 s u p
## 2 n g e
## 3 n m e
## 4 s u p
## 5 a g e
## 6 n g e
# The dataset contains 8124 samples and 22 predictor variables (mushroom color, shape, surface, etc.), plus the class label y (edible vs. poisonous).
# Draw a random sample to split the data into training and test sets
set.seed(12)
index <- sample(1:nrow(mdata), size=0.75*nrow(mdata))
train_data <- mdata[index,]
test_data <- mdata[-index,]
# Check that the sample class proportions roughly match the population
prop.table(table(mdata$y))
##
## e p
## 0.5179714 0.4820286
prop.table(table(train_data$y))
##
## e p
## 0.5178073 0.4821927
prop.table(table(test_data$y))
##
## e p
## 0.5184638 0.4815362
# The proportions of poisonous and edible mushrooms are fairly close in the original data, and both the training and test sets show roughly the same class balance as the population, so the sample can be considered representative and we can proceed to modeling and testing.
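# If matching proportions should be guaranteed rather than checked after the
# fact, caret's createDataPartition draws a stratified sample. A hedged
# alternative to the sample() call above (not what this walkthrough uses;
# train_strat/test_strat are illustrative names):
set.seed(12)
idx <- createDataPartition(mdata$y, p = 0.75, list = FALSE)
train_strat <- mdata[idx, ]   # class proportions match the population by construction
test_strat  <- mdata[-idx, ]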
# With 22 candidate predictors of whether a mushroom is poisonous, it is worth running feature selection first. Here we use a random-forest-based wrapper (caret's rfe) to pick the important variables:
# Set up the control parameters for rfe (random forest ranking functions with 10-fold cross-validation; note that repeats only takes effect when method = 'repeatedcv')
rfeControls_rf <- rfeControl(functions = rfFuncs, method = 'cv', repeats = 5)
# Run recursive feature elimination with rfe
fs_nb <- rfe(x = train_data[,-ncol(train_data)], y = train_data[,ncol(train_data)], sizes = seq(4,21,2), rfeControl = rfeControls_rf)
fs_nb
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 4 0.9954 0.9908 0.0033558 0.006725
## 6 0.9998 0.9997 0.0005193 0.001040
## 8 1.0000 1.0000 0.0000000 0.000000 *
## 10 1.0000 1.0000 0.0000000 0.000000
## 12 1.0000 1.0000 0.0000000 0.000000
## 14 1.0000 1.0000 0.0000000 0.000000
## 16 1.0000 1.0000 0.0000000 0.000000
## 18 1.0000 1.0000 0.0000000 0.000000
## 20 1.0000 1.0000 0.0000000 0.000000
## 22 1.0000 1.0000 0.0000000 0.000000
##
## The top 5 variables (out of 8):
## x5, x20, x8, x9, x21
# The results show that only 8 of the 22 variables are needed: accuracy reaches 1.0 from the 8-variable subset onward, as the plot below also illustrates. The selected variables are:
plot(fs_nb, type = c('g', 'o'))
fs_nb$optVariables
## [1] "x5" "x20" "x8" "x9" "x21" "x11" "x22" "x3"
# Next, build and evaluate a naive Bayes model using just these important variables:
# Fit the model with klaR's NaiveBayes function
vars <- c('y', fs_nb$optVariables)
fit <- NaiveBayes(y ~ ., data = train_data[,vars])
# Predict on the test set, passing only the selected predictor columns
pred <- predict(fit, newdata = test_data[, fs_nb$optVariables])
# Build the confusion matrix
freq <- table(pred$class, test_data[,ncol(test_data)])
freq
##
##        e    p
##   e 1038   44
##   p   15  934
# Accuracy of the model on the test set
accuracy <- sum(diag(freq))/sum(freq)
accuracy
## [1] 0.9709503
# AUC of the model on the test set
modelroc <- roc(as.integer(test_data[,ncol(test_data)]), as.integer(pred$class))
# Plot the ROC curve
plot(modelroc, print.auc = TRUE, auc.polygon = TRUE,
     grid = c(0.1,0.2), grid.col = c('green', 'red'),
     max.auc.polygon = TRUE, auc.polygon.col = 'steelblue')
# On the test set, the naive Bayes model reaches an accuracy of about 97%, and the AUC is also very high; as a rule of thumb, an AUC above 0.8 already indicates a fairly good model.
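# As a numeric check, the AUC can be read directly off the roc object built
# above with pROC's auc() accessor, rather than off the plot:
auc(modelroc)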
iris data example
# We again use the iris dataset to illustrate naive Bayes classification.
# First, with Species as the class variable, we build the Bayes classification rule from train_data:
library(klaR)
library(MASS)
set.seed(1234)
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7,0.3))
train_data <- iris[ind==1,]
test_data <- iris[ind==2,]
fit_Bayes1 <- NaiveBayes(Species~., train_data)
names(fit_Bayes1)
## [1] "apriori" "tables" "levels" "call" "x" "usekernel"
## [7] "varnames"
# In this output, the apriori component plays the same role as the prior component returned by lda(): it records the prior class probabilities used in this run:
fit_Bayes1$apriori
## grouping
## setosa versicolor virginica
## 0.3571429 0.3392857 0.3035714
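# These priors are, by default, just the class frequencies of the training
# sample, which we can verify directly:
prop.table(table(train_data$Species))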
# The tables component stores the class-conditional distribution of every predictor used in the rule; these distributions are the core ingredient of the naive Bayes computation:
fit_Bayes1$tables
## $Sepal.Length
## [,1] [,2]
## setosa 5.010000 0.3455208
## versicolor 5.976316 0.4784176
## virginica 6.605882 0.6866436
##
## $Sepal.Width
## [,1] [,2]
## setosa 3.440000 0.3733425
## versicolor 2.776316 0.2944846
## virginica 2.991176 0.3213302
##
## $Petal.Length
## [,1] [,2]
## setosa 1.480000 0.1757037
## versicolor 4.297368 0.4641102
## virginica 5.611765 0.5819324
##
## $Petal.Width
## [,1] [,2]
## setosa 0.255000 0.1108244
## versicolor 1.336842 0.2019112
## virginica 2.035294 0.2717869
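# To make the mechanics concrete: column 1 of each table is the class-conditional
# mean and column 2 the standard deviation of a Gaussian density (the default
# when usekernel = FALSE). A minimal sketch that recombines apriori and tables
# by hand for the first test observation, on the log scale for numerical
# stability; it should reproduce predict()'s posterior for that observation:
x_new <- test_data[1, fit_Bayes1$varnames]
log_post <- sapply(fit_Bayes1$levels, function(cl) {
  log(as.numeric(fit_Bayes1$apriori[cl])) +
    sum(mapply(function(v, tab) dnorm(x_new[[v]], tab[cl, 1], tab[cl, 2], log = TRUE),
               fit_Bayes1$varnames, fit_Bayes1$tables))
})
post <- exp(log_post - max(log_post))
post / sum(post)   # normalized posterior over the three species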
# The remaining components -- the class levels (levels), the fitted call (call), whether kernel density estimation was used (usekernel), and the names of the predictors in the rule (varnames) -- print as follows:
fit_Bayes1$levels
## [1] "setosa" "versicolor" "virginica"
fit_Bayes1$call
## NaiveBayes.default(x = X, grouping = Y)
fit_Bayes1$usekernel
## [1] FALSE
# fit_Bayes1$x
fit_Bayes1$varnames
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
# Next, plot the class-conditional density curves implied by the fitted rule
plot(fit_Bayes1)
# plot(fit_Bayes1, var=names(train_data)[1])
# plot(fit_Bayes1, var="Sepal.Length")
# ?plot.NaiveBayes
# Looking at Sepal.Length (sepal length), setosa is concentrated around 4.6cm-5.3cm, versicolor around 5.5cm-6.5cm, and virginica around 6.0cm-7.3cm; versicolor and virginica overlap quite noticeably in sepal length.
# Looking at Sepal.Width (sepal width), versicolor and virginica differ very little, while setosa is clearly separated from the other two, so sepal width alone is already enough to pick out setosa.
# The density curves for Petal.Length (petal length) and Petal.Width (petal width) look very similar: setosa petals are highly distinctive, with length under 2cm and width mostly no more than 0.5cm, and versicolor and virginica also differ fairly clearly. In overall petal length and width, setosa < versicolor < virginica.
# Next, use the fitted rule to predict on the test set and evaluate the results
pre_Bayes1 <- predict(fit_Bayes1, test_data)
pre_Bayes1
## $class
## 5 14 16 26 28 29
## setosa setosa setosa setosa setosa setosa
## 36 39 40 50 53 58
## setosa setosa setosa setosa versicolor versicolor
## 60 61 66 72 74 81
## versicolor versicolor versicolor versicolor versicolor versicolor
## 86 90 92 100 111 113
## versicolor versicolor versicolor versicolor virginica virginica
## 116 117 120 121 122 123
## virginica virginica versicolor virginica virginica virginica
## 124 131 135 137 140 142
## virginica virginica versicolor virginica virginica virginica
## 147 149
## virginica virginica
## Levels: setosa versicolor virginica
##
## $posterior
## setosa versicolor virginica
## 5 1.000000e+00 2.790555e-19 4.804157e-25
## 14 1.000000e+00 4.681860e-20 8.584231e-26
## 16 1.000000e+00 4.965007e-18 1.617037e-22
## 26 1.000000e+00 2.919231e-16 6.817218e-23
## 28 1.000000e+00 5.355506e-18 5.209729e-24
## 29 1.000000e+00 3.397560e-18 2.601146e-24
## 36 1.000000e+00 1.103180e-18 6.604019e-25
## 39 1.000000e+00 1.239266e-18 9.601292e-25
## 40 1.000000e+00 7.167103e-18 5.268064e-24
## 50 1.000000e+00 2.808498e-18 1.783320e-24
## 53 5.323015e-115 5.701487e-01 4.298513e-01
## 58 9.659583e-32 9.999991e-01 8.658274e-07
## 60 1.655466e-64 9.998612e-01 1.387868e-04
## 61 4.541736e-38 9.999992e-01 8.394677e-07
## 66 1.000900e-87 9.856329e-01 1.436707e-02
## 72 1.205796e-66 9.998444e-01 1.555779e-04
## 74 3.191784e-91 9.989301e-01 1.069908e-03
## 81 1.804099e-51 9.999953e-01 4.709269e-06
## 86 1.451633e-96 9.003209e-01 9.967912e-02
## 90 2.868701e-65 9.999491e-01 5.088571e-05
## 92 7.614003e-94 9.934054e-01 6.594628e-03
## 100 5.874664e-69 9.998619e-01 1.381040e-04
## 111 2.203389e-149 1.088649e-03 9.989114e-01
## 113 1.856574e-179 1.510435e-05 9.999849e-01
## 116 2.569809e-179 1.886245e-06 9.999981e-01
## 117 1.497545e-159 4.554217e-03 9.954458e-01
## 120 2.939555e-117 9.735871e-01 2.641290e-02
## 121 5.876038e-205 4.261153e-08 1.000000e+00
## 122 4.934524e-136 3.048338e-02 9.695166e-01
## 123 2.031354e-257 7.578507e-10 1.000000e+00
## 124 2.725711e-127 2.649040e-01 7.350960e-01
## 131 6.078848e-208 1.428685e-06 9.999986e-01
## 135 3.522568e-144 6.518680e-01 3.481320e-01
## 137 1.691260e-202 2.829088e-08 1.000000e+00
## 140 2.364394e-174 1.501887e-05 9.999850e-01
## 142 7.278610e-172 1.764858e-06 9.999982e-01
## 147 4.272198e-138 6.431249e-02 9.356875e-01
## 149 1.879617e-183 8.795346e-07 9.999991e-01
# Accuracy check: confusion matrix and error rate
table(test_data$Species, pre_Bayes1$class)
##
##              setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         12         0
##   virginica       0          2        14
error_Bayes1 <- mean(pre_Bayes1$class != test_data$Species)
error_Bayes1
## [1] 0.05263158
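# klaR's NaiveBayes can also replace the Gaussian assumption with a kernel
# density estimate per class via usekernel = TRUE. A minimal sketch on the
# same split (the error rate may differ slightly from the Gaussian fit above):
fit_Bayes2 <- NaiveBayes(Species ~ ., train_data, usekernel = TRUE)
pre_Bayes2 <- predict(fit_Bayes2, test_data)
mean(pre_Bayes2$class != test_data$Species)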
Titanic data example
library(e1071)
data(Titanic)
m <- naiveBayes(Survived ~ ., data = Titanic)
m
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.formula(formula = Survived ~ ., data = Titanic)
##
## A-priori probabilities:
## Survived
## No Yes
## 0.676965 0.323035
##
## Conditional probabilities:
## Class
## Survived 1st 2nd 3rd Crew
## No 0.08187919 0.11208054 0.35436242 0.45167785
## Yes 0.28551336 0.16596343 0.25035162 0.29817159
##
## Sex
## Survived Male Female
## No 0.91543624 0.08456376
## Yes 0.51617440 0.48382560
##
## Age
## Survived Child Adult
## No 0.03489933 0.96510067
## Yes 0.08016878 0.91983122
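# A minimal usage sketch: as.data.frame(Titanic) expands the contingency table
# into one row per Class/Sex/Age/Survived combination, and the fitted model can
# predict a survival class for each combination (extra columns such as Freq are
# ignored by predict); this mirrors the example in e1071's documentation:
predict(m, as.data.frame(Titanic))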
Bayesian network in R