data(iris)
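# Note: no random seed is set here, so the exact split (and the
# observation counts shown below) will vary from run to run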
ind <- sample(1:2, nrow(iris), replace=TRUE, prob=c(0.7, 0.3))
train_data <- iris[ind==1,]
test_data <- iris[ind==2,]
str(train_data)
## 'data.frame': 101 obs. of 5 variables:
## $ Sepal.Length: num 4.9 4.6 5 5.4 4.6 5 4.9 4.8 4.8 5.8 ...
## $ Sepal.Width : num 3 3.1 3.6 3.9 3.4 3.4 3.1 3.4 3 4 ...
## $ Petal.Length: num 1.4 1.5 1.4 1.7 1.4 1.5 1.5 1.6 1.4 1.2 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.4 0.3 0.2 0.1 0.2 0.1 0.2 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
str(test_data)
## 'data.frame': 49 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.7 4.4 5.4 4.3 5.7 5.1 5.1 5.1 4.8 ...
## $ Sepal.Width : num 3.5 3.2 2.9 3.7 3 4.4 3.5 3.7 3.3 3.4 ...
## $ Petal.Length: num 1.4 1.3 1.4 1.5 1.1 1.5 1.4 1.5 1.7 1.9 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.1 0.4 0.3 0.4 0.5 0.2 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
library(e1071)
data("iris")
attach(iris)
library(lattice)
xyplot(Petal.Length ~ Petal.Width,data = iris,groups = Species,auto.key = list(corner = c(1,0)))
# model1
subdata = iris[iris$Species != "virginica",]
subdata$Species = factor(subdata$Species)
model1 = svm(Species ~ Petal.Length + Petal.Width,data = subdata)
summary(model1)
##
## Call:
## svm(formula = Species ~ Petal.Length + Petal.Width, data = subdata)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.5
##
## Number of Support Vectors: 7
##
## ( 3 4 )
##
##
## Number of Classes: 2
##
## Levels:
## setosa versicolor
plot(model1, subdata, Petal.Length ~ Petal.Width)
# model2: use all the features
model2 = svm(Species ~ .,data = iris)
summary(model2)
##
## Call:
## svm(formula = Species ~ ., data = iris)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.25
##
## Number of Support Vectors: 51
##
## ( 8 22 21 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
plot(model2, iris, Petal.Length ~ Petal.Width)
# model3
x = iris[,-5] # all columns except the fifth are the features
y = iris[,5] # the fifth column is the response
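# (the gamma expression below simply reproduces svm()'s documented
# default: 1 for vector input, 1/ncol(x) otherwise)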
model3 = svm(x,y,kernel = "radial", gamma = if(is.vector(x)) 1 else 1/ncol(x))
str(model3)
## List of 29
## $ call : language svm.default(x = x, y = y, kernel = "radial", gamma = if (is.vector(x)) 1 else 1/ncol(x))
## $ type : num 0
## $ kernel : num 2
## $ cost : num 1
## $ degree : num 3
## $ gamma : num 0.25
## $ coef0 : num 0
## $ nu : num 0.5
## $ epsilon : num 0.1
## $ sparse : logi FALSE
## $ scaled : logi [1:4] TRUE TRUE TRUE TRUE
## $ x.scale :List of 2
## ..$ scaled:center: Named num [1:4] 5.84 3.06 3.76 1.2
## .. ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## ..$ scaled:scale : Named num [1:4] 0.828 0.436 1.765 0.762
## .. ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ y.scale : NULL
## $ nclasses : int 3
## $ levels : chr [1:3] "setosa" "versicolor" "virginica"
## $ tot.nSV : int 51
## $ nSV : int [1:3] 8 22 21
## $ labels : int [1:3] 1 2 3
## $ SV : num [1:51, 1:4] -1.743 -1.864 -0.173 -0.535 -1.501 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:51] "9" "14" "16" "21" ...
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ index : int [1:51] 9 14 16 21 23 24 26 42 51 53 ...
## $ rho : num [1:3] -0.0203 0.1312 -0.0629
## $ compprob : logi FALSE
## $ probA : NULL
## $ probB : NULL
## $ sigma : NULL
## $ coefs : num [1:51, 1:2] 0.0891 0 0.8652 0 0 ...
## $ na.action : NULL
## $ fitted : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
## $ decision.values: num [1:150, 1:3] 1.2 1.06 1.18 1.11 1.19 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:150] "1" "2" "3" "4" ...
## .. ..$ : chr [1:3] "setosa/versicolor" "setosa/virginica" "versicolor/virginica"
## - attr(*, "class")= chr "svm"
pred = predict(model3,x)
table(pred,y)
## y
## pred setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
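# A quick overall training accuracy from the table above:
# (50 + 48 + 48) / 150 = 0.9733
conf <- table(pred, y)
sum(diag(conf))/sum(conf)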
SVM can also be used to predict a continuous variable, that is, to perform regression analysis. In the following example we use the eps-regression type to show how an SVM carries out regression.
# Train a support vector machine on the Quartet data set:
library(car)
data(Quartet)
model.regression <- svm(y1 ~ x, data = Quartet, type = "eps-regression")
# Obtain the predictions with predict()
predict.y <- predict(model.regression, Quartet)
predict.y
## 1 2 3 4 5 6 7 8
## 8.196894 7.152946 8.807471 7.713099 8.533578 8.774046 6.186349 5.763689
## 9 10 11
## 8.726925 6.621373 5.882946
# Plot the results: training data as filled circles, predictions as red squares:
plot(Quartet$x, Quartet$y1, pch = 19)
points(Quartet$x, predict.y, pch = 15, col="red")
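# A hedged sketch: eps-regression fits the data within an epsilon-insensitive
# tube; epsilon and cost control the tube width and the penalty for points
# outside it (the values below are illustrative, not tuned)
model.eps <- svm(y1 ~ x, data = Quartet, type = "eps-regression",
                 epsilon = 0.5, cost = 10)
points(Quartet$x, predict(model.eps, Quartet), pch = 17, col = "blue")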
# Step 1: import the data; we use churn, the telecom customer-churn data set bundled with the C50 package
library(C50)
data(churn)
str(churnTrain)
## 'data.frame': 3333 obs. of 20 variables:
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ account_length : int 128 107 137 84 75 118 121 147 117 141 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num 265 162 243 299 167 ...
## $ total_day_calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num 245 254 163 197 187 ...
## $ total_night_calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
# Note: the data can be one of R's built-in data sets or imported from an external source; in real projects it usually comes from several different sources.
head(churnTrain)
## state account_length area_code international_plan voice_mail_plan
## 1 KS 128 area_code_415 no yes
## 2 OH 107 area_code_415 no yes
## 3 NJ 137 area_code_415 no no
## 4 OH 84 area_code_408 yes no
## 5 OK 75 area_code_415 yes no
## 6 AL 118 area_code_510 yes no
## number_vmail_messages total_day_minutes total_day_calls total_day_charge
## 1 25 265.1 110 45.07
## 2 26 161.6 123 27.47
## 3 0 243.4 114 41.38
## 4 0 299.4 71 50.90
## 5 0 166.7 113 28.34
## 6 0 223.4 98 37.98
## total_eve_minutes total_eve_calls total_eve_charge total_night_minutes
## 1 197.4 99 16.78 244.7
## 2 195.5 103 16.62 254.4
## 3 121.2 110 10.30 162.6
## 4 61.9 88 5.26 196.9
## 5 148.3 122 12.61 186.9
## 6 220.6 101 18.75 203.9
## total_night_calls total_night_charge total_intl_minutes total_intl_calls
## 1 91 11.01 10.0 3
## 2 103 11.45 13.7 3
## 3 104 7.32 12.2 5
## 4 89 8.86 6.6 7
## 5 121 8.41 10.1 3
## 6 118 9.18 6.3 6
## total_intl_charge number_customer_service_calls churn
## 1 2.70 1 no
## 2 3.70 1 no
## 3 3.29 0 no
## 4 1.78 2 no
## 5 2.73 3 no
## 6 1.70 0 no
# Step 2: split the data set
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset <- churnTrain[ind == 1,]
testset <- churnTrain[ind == 2,]
# Note: sample() draws a random split of the data into a training set and a test set
# Step 3: inspect the resulting subsets
dim(trainset)
## [1] 2315 20
str(trainset)
## 'data.frame': 2315 obs. of 20 variables:
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 32 36 20 19 50 16 40 27 13 ...
## $ account_length : int 128 137 84 121 117 141 65 74 95 62 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 1 3 1 2 2 2 3 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 2 1 1 2 1 1 1 1 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 1 1 2 1 2 1 1 1 1 ...
## $ number_vmail_messages : int 25 0 0 24 0 37 0 0 0 0 ...
## $ total_day_minutes : num 265 243 299 218 184 ...
## $ total_day_calls : int 110 114 71 88 97 84 137 127 88 70 ...
## $ total_day_charge : num 45.1 41.4 50.9 37.1 31.4 ...
## $ total_eve_minutes : num 197.4 121.2 61.9 348.5 351.6 ...
## $ total_eve_calls : int 99 110 88 108 80 111 83 148 75 76 ...
## $ total_eve_charge : num 16.78 10.3 5.26 29.62 29.89 ...
## $ total_night_minutes : num 245 163 197 213 216 ...
## $ total_night_calls : int 91 104 89 118 90 97 111 94 115 99 ...
## $ total_night_charge : num 11.01 7.32 8.86 9.57 9.71 ...
## $ total_intl_minutes : num 10 12.2 6.6 7.5 8.7 11.2 12.7 9.1 12.3 13.1 ...
## $ total_intl_calls : int 3 5 7 7 4 5 6 5 5 6 ...
## $ total_intl_charge : num 2.7 3.29 1.78 2.03 2.35 3.02 3.43 2.46 3.32 3.54 ...
## $ number_customer_service_calls: int 1 0 2 3 1 0 4 0 3 4 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 1 2 2 2 ...
dim(testset)
## [1] 1018 20
str(testset)
## 'data.frame': 1018 obs. of 20 variables:
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 36 37 2 25 13 35 14 4 25 19 ...
## $ account_length : int 107 75 118 147 168 161 85 130 20 172 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 3 2 1 2 1 2 2 1 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 2 2 2 1 1 1 1 1 1 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 2 1 1 1 ...
## $ number_vmail_messages : int 26 0 0 0 0 0 27 0 0 0 ...
## $ total_day_minutes : num 162 167 223 157 129 ...
## $ total_day_calls : int 123 113 98 79 96 67 139 112 109 121 ...
## $ total_day_charge : num 27.5 28.3 38 26.7 21.9 ...
## $ total_eve_minutes : num 196 148 221 103 105 ...
## $ total_eve_calls : int 103 122 101 94 71 97 90 99 84 115 ...
## $ total_eve_charge : num 16.62 12.61 18.75 8.76 8.92 ...
## $ total_night_minutes : num 254 187 204 212 141 ...
## $ total_night_calls : int 103 121 118 96 128 128 75 78 102 78 ...
## $ total_night_charge : num 11.45 8.41 9.18 9.53 6.35 ...
## $ total_intl_minutes : num 13.7 10.1 6.3 7.1 11.2 5.4 13.8 9.5 6.3 12.6 ...
## $ total_intl_calls : int 3 3 6 6 2 9 4 19 6 10 ...
## $ total_intl_charge : num 3.7 2.73 1.7 1.92 3.02 1.46 3.73 2.57 1.7 3.4 ...
## $ number_customer_service_calls: int 1 3 0 0 1 4 1 0 0 3 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 1 2 2 2 2 ...
# Besides choosing different feature sets and kernel functions, SVM performance can be tuned through gamma and the cost (penalty) parameter. You could implement the search with a for loop; the e1071 package provides tune.svm() to simplify the process.
# With the training set trainset ready, tune the support vector machine with tune.svm()
tuned <- tune.svm(churn ~ .,data = trainset,gamma = 10^(-6:-1),cost = 10^(1:2))
summary(tuned)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.01 100
##
## - best performance: 0.08338558
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 1e-06 10 0.14774780 0.02237867
## 2 1e-05 10 0.14774780 0.02237867
## 3 1e-04 10 0.14774780 0.02237867
## 4 1e-03 10 0.14774780 0.02237867
## 5 1e-02 10 0.09936558 0.02083915
## 6 1e-01 10 0.09462420 0.02002077
## 7 1e-06 100 0.14774780 0.02237867
## 8 1e-05 100 0.14774780 0.02237867
## 9 1e-04 100 0.14774780 0.02237867
## 10 1e-03 100 0.11233766 0.02603795
## 11 1e-02 100 0.08338558 0.01951954
## 12 1e-01 100 0.09592290 0.02147357
# Train an SVM configured with the best parameters found by tuning
model.tuned <- svm(churn ~ ., data=trainset, gamma=tuned$best.parameters$gamma, cost=tuned$best.parameters$cost)
summary(model.tuned)
##
## Call:
## svm(formula = churn ~ ., data = trainset, gamma = tuned$best.parameters$gamma,
## cost = tuned$best.parameters$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 100
## gamma: 0.01
##
## Number of Support Vectors: 645
##
## ( 401 244 )
##
##
## Number of Classes: 2
##
## Levels:
## yes no
# Call predict() to predict class labels with the freshly tuned SVM model:
svm.tuned.pred <- predict(model.tuned, testset[,!names(testset) %in% c("churn")])
svm.tuned.table <- table(svm.tuned.pred, testset$churn)
svm.tuned.table
##
## svm.tuned.pred yes no
## yes 82 36
## no 59 841
accuracy.test.svm <- sum(diag(svm.tuned.table))/sum(svm.tuned.table)
list(test_confusion_matrix = svm.tuned.table, test_accuracy = accuracy.test.svm)
## $test_confusion_matrix
##
## svm.tuned.pred yes no
## yes 82 36
## no 59 841
##
## $test_accuracy
## [1] 0.9066798
# Call classAgreement() to compute agreement coefficients for performance evaluation:
classAgreement(svm.tuned.table)
## $diag
## [1] 0.9066798
##
## $kappa
## [1] 0.5802262
##
## $rand
## [1] 0.8306105
##
## $crand
## [1] 0.5104647
# Finally, evaluate the model with caret's confusionMatrix()
library(caret)
## Loading required package: ggplot2
confusionMatrix(svm.tuned.table)
## Confusion Matrix and Statistics
##
##
## svm.tuned.pred yes no
## yes 82 36
## no 59 841
##
## Accuracy : 0.9067
## 95% CI : (0.8871, 0.9238)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : 7.269e-06
##
## Kappa : 0.5802
## Mcnemar's Test P-Value : 0.024
##
## Sensitivity : 0.58156
## Specificity : 0.95895
## Pos Pred Value : 0.69492
## Neg Pred Value : 0.93444
## Prevalence : 0.13851
## Detection Rate : 0.08055
## Detection Prevalence : 0.11591
## Balanced Accuracy : 0.77026
##
## 'Positive' Class : yes
##
How the tuning parameters work
Tuning a support vector machine can proceed by trial and error: the user trains SVMs under different combinations of gamma and the cost parameter and keeps the best one. Here tune.svm() evaluated 12 parameter combinations, estimated the error of each with 10-fold cross-validation, and selected the combination with the lowest error. The summary output shows that the optimum is reached at gamma = 0.01 and cost = 100.
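As a sketch, the manual search loop that tune.svm() replaces might look like the following, using svm()'s built-in cross argument for 10-fold cross-validation (illustrative only; not part of the output above):
grid <- expand.grid(gamma = 10^(-6:-1), cost = 10^(1:2))
grid$accuracy <- apply(grid, 1, function(p) {
  fit <- svm(churn ~ ., data = trainset, gamma = p["gamma"],
             cost = p["cost"], cross = 10)
  fit$tot.accuracy  # cross-validated accuracy reported by svm()
})
grid[which.max(grid$accuracy), ]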
Once the best parameter values are found, they can be used to train a new support vector machine, and a classification table and confusion matrix can be built from the model's predictions and the actual classes of the test samples. The confusion-matrix output shows the difference in accuracy between the old and the new model.
This section called predict() to obtain predictions for the test data set and table() to build the test-set classification table; the remaining performance evaluation follows the same procedure as for the other classification methods in earlier sections.
It also introduced a new function, classAgreement(), which computes several coefficients of agreement between the rows and columns of a two-dimensional contingency table (two of them are reproduced by hand in the sketch after this list):
diag is the fraction of data points on the main diagonal of the classification table
kappa is diag corrected for chance agreement
rand is the Rand index, which measures the similarity between two clusterings
crand is the Rand index corrected for the chance assignment of elements to classes
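As a check, diag and kappa can be reproduced by hand from the confusion table above (they should come out to about 0.9067 and 0.5802, matching the classAgreement output):
n <- sum(svm.tuned.table)
p0 <- sum(diag(svm.tuned.table))/n  # observed agreement (= diag)
pe <- sum(rowSums(svm.tuned.table) * colSums(svm.tuned.table))/n^2  # chance agreement
c(diag = p0, kappa = (p0 - pe)/(1 - pe))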
To visualize a fitted model, the user can first call plot() to draw a scatter plot of the input data together with the corresponding SVM model. In the plot the support vectors and the classes are highlighted so that they stand out from the other samples by symbol and color; in addition, contour lines of the class boundaries can be drawn, which makes misclassified samples easier to spot.
# We again use the iris data set (the telecom churn data set was used above).
data("iris")
model.iris <- svm(Species ~ .,iris)
plot(model.iris, iris, Petal.Width ~ Petal.Length, slice=list(Sepal.Width=3, Sepal.Length=4))
# ??plot.svm
# simData: a function for generating the simulated data:
# Generate a two-class "double moon" data set: two interleaving half-rings
# with the given radius and width, offset vertically by distance
simData <- function(radius, width, distance, sample_size) {
  aa1 = runif(sample_size/2)
  aa2 = runif(sample_size/2)
  rad = (radius - width/2) + width*aa1  # radii spread across the ring width
  theta = pi*aa2                        # angles in [0, pi]: the upper moon
  x = rad*cos(theta)
  y = rad*sin(theta)
  label = rep(1, length(x))             # class +1
  x1 = rad*cos(-theta) + rad            # lower moon, shifted right ...
  y1 = rad*sin(-theta) - distance       # ... and down by 'distance'
  label1 = rep(-1, length(x1))          # class -1
  n_row = length(x) + length(x1)
  data = matrix(0, nrow = n_row, ncol = 3)
  data[,1] = c(x, x1)
  data[,2] = c(y, y1)
  data[,3] = c(label, label1)
  data
}
dataSim <- simData(radius=10, width=6, distance=-6, sample_size=3000)
colnames(dataSim) <- c("x","y","label")
dataSim <- as.data.frame(dataSim)
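# A quick sanity check: plot the two interleaving half-moons by class
plot(dataSim$x, dataSim$y, col = ifelse(dataSim$label > 0, "red", "blue"),
     pch = 20, xlab = "x", ylab = "y")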
# Classification with a sigmoid kernel:
m1 <- svm(label~x+y, data=dataSim, cross=10, type="C-classification", kernel="sigmoid")
m1
##
## Call:
## svm(formula = label ~ x + y, data = dataSim, cross = 10, type = "C-classification",
## kernel = "sigmoid")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: sigmoid
## cost: 1
## gamma: 0.5
## coef.0: 0
##
## Number of Support Vectors: 1080
summary(m1)
##
## Call:
## svm(formula = label ~ x + y, data = dataSim, cross = 10, type = "C-classification",
## kernel = "sigmoid")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: sigmoid
## cost: 1
## gamma: 0.5
## coef.0: 0
##
## Number of Support Vectors: 1080
##
## ( 540 540 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
##
## 10-fold cross-validation on training data:
##
## Total Accuracy: 64.6
## Single Accuracies:
## 64.33333 65 69.66667 66.66667 62.33333 65.66667 58.66667 66.66667 61.33333 65.66667
pred1 <- fitted(m1)
table(pred1, dataSim[,3])
##
## pred1 -1 1
## -1 967 532
## 1 533 968
# Comparing the four kernel functions (plots for the kernel subsection):
linear.svm.fit <- svm(label ~ x + y, data = dataSim, kernel = 'linear')
with(dataSim, mean(label == ifelse(predict(linear.svm.fit) > 0, 1, -1)))
## [1] 0.836
polynomial.svm.fit <- svm(label ~ x + y, data = dataSim, kernel = 'polynomial')
with(dataSim, mean(label == ifelse(predict(polynomial.svm.fit) > 0, 1, -1)))
## [1] 0.8836667
radial.svm.fit <- svm(label ~ x + y, data = dataSim, kernel = 'radial')
with(dataSim, mean(label == ifelse(predict(radial.svm.fit) > 0, 1, -1)))
## [1] 0.9923333
sigmoid.svm.fit <- svm(label ~ x + y, data = dataSim, kernel = 'sigmoid')
with(dataSim, mean(label == ifelse(predict(sigmoid.svm.fit) > 0, 1, -1)))
## [1] 0.4603333
df <- cbind(dataSim,
            data.frame(LinearSVM = ifelse(predict(linear.svm.fit) > 0, 1, -1),
                       PolynomialSVM = ifelse(predict(polynomial.svm.fit) > 0, 1, -1),
                       RadialSVM = ifelse(predict(radial.svm.fit) > 0, 1, -1),
                       SigmoidSVM = ifelse(predict(sigmoid.svm.fit) > 0, 1, -1)))
library(reshape)
predictions <- melt(df, id.vars = c('x', 'y'))
library(ggplot2)
ggplot(predictions, aes(x = x, y = y, color = factor(value))) +
geom_point() +
facet_grid(variable ~ .)
Notes on plotting
This section showed how to visualize an SVM model with plot(): we trained a support vector machine on the iris data set and plotted the fitted model.
Among plot()'s arguments, the first is the fitted model, the second is the data set on which the plot is drawn (it must be the data used to build the model), and the third is a formula selecting the axes of the classification plot; by default plot() produces a two-dimensional (x vs. y) scatter plot.
Points marked X and O appear in the plot: X marks the support vectors and O the ordinary samples, and the markers can be changed through the svSymbol and dataSymbol arguments. All support vectors and the true classes are highlighted, each class in a different color. The last argument, slice, is only needed when there are more than two predictor variables; in this example it holds the remaining two variables, Sepal.Width and Sepal.Length, fixed at the values 3 and 4 respectively.
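A minimal sketch of changing these symbols (the pch codes and palette below are illustrative choices, not from the original example):
plot(model.iris, iris, Petal.Width ~ Petal.Length,
     svSymbol = 17, dataSymbol = 1, symbolPalette = rainbow(3),
     slice = list(Sepal.Width = 3, Sepal.Length = 4))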
The kernlab function ksvm() implements the SVM algorithms via the .Call interface, using the optimizers from the bsvm and libsvm libraries. For classification it offers the C-SVM and ν-SVM algorithms, along with a bound-constrained version of the C classifier. For regression it provides the ε-SVM and ν-SVM algorithms. For multi-class classification it supports both the one-against-one scheme and native multi-class formulations, introduced below.
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
data("iris")
irismodel <- ksvm(Species ~ ., data=iris, type="C-bsvc", kernel="rbfdot", kpar=list(sigma=0.1), C=10, prob.model=TRUE)
irismodel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-bsvc (classification)
## parameter : cost C = 10
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.1
##
## Number of Support Vectors : 32
##
## Objective Function Value : -5.8442 -3.0652 -136.9786
## Training error : 0.02
## Probability model included.
predict(irismodel, iris[c(3, 10, 56, 68, 107, 120), -5], type="probabilities")
## setosa versicolor virginica
## [1,] 0.985366528 0.007922261 0.006711211
## [2,] 0.982051844 0.010862789 0.007085366
## [3,] 0.005248523 0.948266785 0.046484692
## [4,] 0.009468921 0.987287708 0.003243371
## [5,] 0.011712349 0.104614637 0.883673014
## [6,] 0.010487079 0.193888658 0.795624262
# ksvm supports user-defined kernel functions
k <- function(x, y) { (sum(x * y) + 1) * exp(0.001 * sum((x - y)^2)) }
class(k) <- "kernel"
data("promotergene")
gene <- ksvm(Class ~ ., data=promotergene, kernel=k, C=10, cross=5)
gene
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 10
##
##
## Number of Support Vectors : 44
##
## Objective Function Value : -23.3052
## Training error : 0.084906
## Cross validation error : 0.122944
predict(gene, promotergene[c(3, 10, 56, 68, 23, 12), -1])
## [1] + + - - - -
## Levels: + -
# For binary classification the result can be visualized with plot()
x <- rbind(matrix(rnorm(120),60,2), matrix(rnorm(120, mean = 3),60,2))
y <- matrix(c(rep(1, 60), rep(-1, 60)))
svp <- ksvm(x, y, type="C-svc", kernel="rbfdot", kpar=list(sigma=2))
plot(svp)
The type argument chooses between classification, regression, and novelty detection; the default depends on whether y is a factor (C-svc if it is, eps-svr otherwise). Possible values (see the sketch after this list):
• C-svc C classification
• nu-svc nu classification
• C-bsvc bound-constraint svm classification
• spoc-svc Crammer, Singer native multi-class
• kbb-svc Weston, Watkins native multi-class
• one-svc novelty detection
• eps-svr epsilon regression
• nu-svr nu regression
• eps-bsvr bound-constraint svm regression
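A minimal sketch of two of these types, with illustrative (untuned) parameter values:
# eps-svr: epsilon regression on a single predictor
reg <- ksvm(Petal.Width ~ Petal.Length, data = iris, type = "eps-svr",
            kernel = "rbfdot", epsilon = 0.1, C = 1)
# one-svc: novelty detection, trained on x only (no labels)
nov <- ksvm(as.matrix(iris[, 1:4]), type = "one-svc",
            kernel = "rbfdot", nu = 0.1)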
The kernel argument sets the kernel function (see the sketch after this list). Available kernels:
• rbfdot Radial Basis kernel "Gaussian"
• polydot Polynomial kernel
• vanilladot Linear kernel
• tanhdot Hyperbolic tangent kernel
• laplacedot Laplacian kernel
• besseldot Bessel kernel
• anovadot ANOVA RBF kernel
• splinedot Spline kernel
• stringdot String kernel
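Kernels are also first-class objects in kernlab: a kernel built by one of these generating functions can be evaluated directly or expanded into a kernel matrix, as this small sketch shows:
rbf <- rbfdot(sigma = 0.1)
rbf(c(1, 2), c(3, 4)) # kernel value for a pair of vectors
K <- kernelMatrix(rbf, as.matrix(iris[1:5, 1:4]))
dim(K) # 5 x 5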
# Prepare the base data: what follows hand-codes, for illustration, a
# least-squares SVM (LS-SVM) style regression with an RBF kernel on iris
vdata <- iris[,1:4]
colnames(vdata) <- c("x1","x2","x3","y")
# Standardize x1 to x3
vdata[,1:3] <- scale(vdata[,1:3])
n <- nrow(vdata)
x <- as.matrix(vdata[,1:3])
y <- as.matrix(vdata[,4])
y <- rbind(0,y) # prepend 0 for the bias row of the linear system
I <- t(t(rep(1,n))) # column vector of ones
# Set the RBF kernel width sigma
sigma <- 1
omiga <- matrix(rep(0,n*n), ncol = n) # kernel (Omega) matrix, filled row by row below
for(i in 1:n)
{
xi <- x[i,]
deltaX <- (x-matrix(rep(xi,n),byrow = T,ncol = 3))^2
omiga[i,] <- exp(-rowSums(deltaX)/(sigma^2))
}
# Set the regularization (balance) parameter gama
gama <- 10
# Build the system matrix A = [0, 1'; 1, Omega + I/gama]
A <- (omiga+(1/gama)*diag(n))
A <- cbind(I,A)
A <- rbind(c(0,t(I)),A)
# Solve for the bias b and the alpha coefficients
b_alpha <- solve(A)%*%y
b <- b_alpha[1]
alpha <- b_alpha[-1]
# Predict on vdata
ypred <- NULL
for(i in 1:n)
{
xi <- x[i,]
deltaX <- (x-matrix(rep(xi,n),byrow = T,ncol = 3))^2
ypred <- c(ypred,drop(exp(-rowSums(deltaX)/(sigma^2))%*%t(t(alpha)))+b)
}
# Look at the first few rows of actual vs. predicted values
head(data.frame(y = vdata$y, ypred))
## y ypred
## 1 0.2 0.2485166
## 2 0.2 0.2029999
## 3 0.2 0.1689355
## 4 0.2 0.1588311
## 5 0.2 0.2458173
## 6 0.4 0.3122185
# Sum of squared errors
sum((vdata[,4]-ypred)^2)
## [1] 2.867035
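# Visual check (not shown above): observed vs. fitted values
# should lie close to the diagonal
plot(vdata$y, ypred, xlab = "observed y", ylab = "fitted y")
abline(0, 1, col = "red")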