数据集

数据集概况

数据文件名称:car_test_frame.csv

数据集简介:数据集的行名为各种车型的名称,且共有8个变量,分别为“价格(Price)”、“产地(Country)”、“可靠性(Reliability)”、“英里数(Mileage)”、“类型(Type)”、“车重(Weight)”、“发动机功率(Disp.)”以及“净马力(HP)”。

程序包加载

https://cran.r-project.org/web/packages/rpart/
https://cran.r-project.org/web/packages/C50/
https://cran.r-project.org/web/packages/RWeka/

install.packages('rpart')
install.packages('rpart.plot')
install.packages('maptree')
install.packages('RWeka')
install.packages('C50')

library(rpart)
library(rpart.plot)
library(maptree)
library(RWeka)
library(C50)

数据集加载及基本特点探索

# Location of the raw CSV file.
oPathFile <- "http://data.galaxystatistics.com/blog_data/decision_tree/car_test_frame.csv"
# Read the file with an explicit TRUE (the shorthand `T` is an ordinary,
# reassignable variable). stringsAsFactors = TRUE is spelled out because
# read.csv() has kept strings as character by default since R 4.0.0, while the
# str() output recorded in this walkthrough reports the text columns as factors.
car.test.frame <- read.csv(oPathFile, header = TRUE, stringsAsFactors = TRUE)
dim(car.test.frame)
## [1] 60  9
head(car.test.frame)
##                  X Price   Country Reliability Mileage  Type Weight Disp.
## 1   Eagle Summit 4  8895       USA           4      33 Small   2560    97
## 2  Ford Escort   4  7402       USA           2      33 Small   2345   114
## 3   Ford Festiva 4  6319     Korea           4      37 Small   1845    81
## 4    Honda Civic 4  6635 Japan/USA           5      32 Small   2260    91
## 5  Mazda Protege 4  6599     Japan           5      32 Small   2440   113
## 6 Mercury Tracer 4  8672    Mexico           4      26 Small   2285    97
##    HP
## 1 113
## 2  90
## 3  63
## 4  92
## 5 103
## 6  82
car.test.frame <- car.test.frame[,-1]
dim(car.test.frame)
## [1] 60  8
head(car.test.frame)
##   Price   Country Reliability Mileage  Type Weight Disp.  HP
## 1  8895       USA           4      33 Small   2560    97 113
## 2  7402       USA           2      33 Small   2345   114  90
## 3  6319     Korea           4      37 Small   1845    81  63
## 4  6635 Japan/USA           5      32 Small   2260    91  92
## 5  6599     Japan           5      32 Small   2440   113 103
## 6  8672    Mexico           4      26 Small   2285    97  82

为了在决策树分析等过程中便于理解,此处将该数据集的变量名改设为中文,且将其中的“英里数”换算为我们所熟悉的“油耗”指标。为了进一步了解各变量的信息,我们可以分别使用str( )和summary( )函数,探寻数据集内部结构和概括信息。通过使用str( )函数,我们可知,数据集的维度为60×8,其中“产地”及“类型”变量是分别含有8个和6个水平的因子型变量,其他6个变量都为数值型变量(“油耗”经换算后为num型,其余5个为整型)。通过使用summary( )函数,在输出结果中,对于因子型变量,给出了各个水平分别对应的样本个数,而数值型数据则给出了最值及中位数等基本的描述性统计指标。

# Turn the "Mileage" column into a fuel-consumption figure.
# NOTE(review): presumably miles per gallon -> litres per 100 km, with 4.546
# litres per gallon and 1.6 km per mile -- confirm the intended units.
car.test.frame$Mileage <- 100 * 4.546 / (1.6 * car.test.frame$Mileage)
# Replace the English column names with Chinese labels (same column order).
names(car.test.frame) <- c(
  "价格", "产地", "可靠性", "油耗",
  "类型", "车重", "发动机功率", "净马力"
)
head(car.test.frame)
##   价格      产地 可靠性      油耗  类型 车重 发动机功率 净马力
## 1 8895       USA      4  8.609848 Small 2560         97    113
## 2 7402       USA      2  8.609848 Small 2345        114     90
## 3 6319     Korea      4  7.679054 Small 1845         81     63
## 4 6635 Japan/USA      5  8.878906 Small 2260         91     92
## 5 6599     Japan      5  8.878906 Small 2440        113    103
## 6 8672    Mexico      4 10.927885 Small 2285         97     82
str(car.test.frame)
## 'data.frame':    60 obs. of  8 variables:
##  $ 价格      : int  8895 7402 6319 6635 6599 8672 7399 7254 9599 5866 ...
##  $ 产地      : Factor w/ 8 levels "France","Germany",..: 8 8 5 4 3 6 4 5 3 3 ...
##  $ 可靠性    : int  4 2 4 5 5 4 5 1 5 NA ...
##  $ 油耗      : num  8.61 8.61 7.68 8.88 8.88 ...
##  $ 类型      : Factor w/ 6 levels "Compact","Large",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ 车重      : int  2560 2345 1845 2260 2440 2285 2275 2350 2295 1900 ...
##  $ 发动机功率: int  97 114 81 91 113 97 97 98 109 73 ...
##  $ 净马力    : int  113 90 63 92 103 82 90 74 90 73 ...
summary(car.test.frame)
##       价格              产地        可靠性           油耗       
##  Min.   : 5866   USA      :26   Min.   :1.000   Min.   : 7.679  
##  1st Qu.: 9932   Japan    :19   1st Qu.:2.000   1st Qu.:10.523  
##  Median :12216   Japan/USA: 7   Median :3.000   Median :12.353  
##  Mean   :12616   Korea    : 3   Mean   :3.388   Mean   :11.962  
##  3rd Qu.:14933   Germany  : 2   3rd Qu.:5.000   3rd Qu.:13.530  
##  Max.   :24760   France   : 1   Max.   :5.000   Max.   :15.785  
##                  (Other)  : 2   NA's   :11                      
##       类型         车重        发动机功率        净马力     
##  Compact:15   Min.   :1845   Min.   : 73.0   Min.   : 63.0  
##  Large  : 3   1st Qu.:2571   1st Qu.:113.8   1st Qu.:101.5  
##  Medium :13   Median :2885   Median :144.5   Median :111.5  
##  Small  :13   Mean   :2901   Mean   :152.1   Mean   :122.3  
##  Sporty : 9   3rd Qu.:3231   3rd Qu.:180.0   3rd Qu.:142.8  
##  Van    : 7   Max.   :3855   Max.   :305.0   Max.   :225.0  
## 

数据预处理

我们需要着重来看“油耗”指标,因为在以下建模过程中,将以该变量作为目标变量,为了使用这一数据集分别构建以离散型变量为目标变量的分类树和以连续型变量为目标变量的回归树,考虑添加一列变量—“分组油耗”,即将“油耗”变量划分为三个组别,成为含有3个水平A、B、C的因子变量。

其中:油耗 ≥ 11.6 的样本归为 A 组;9 < 油耗 < 11.6 的样本归为 B 组;油耗 ≤ 9 的样本归为 C 组。

数据集的进一步处理

# Derive the three-level group label from the continuous "油耗" column:
#   "A" when 油耗 >= 11.6, "C" when 油耗 <= 9, "B" for everything in between.
# The label is built as a plain character vector with one entry per row, so it
# no longer depends on a hard-coded row count of 60, on a 0 sentinel that gets
# silently coerced to "0", or on storing a 60 x 1 matrix as a data-frame column.
# A missing 油耗 value now yields NA instead of being labelled "B".
Group_Mileage <- ifelse(
  car.test.frame$"油耗" >= 11.6, "A",
  ifelse(car.test.frame$"油耗" <= 9, "C", "B")
)
car.test.frame$"分组油耗" <- Group_Mileage
car.test.frame[1:10,c(4,9)]
##         油耗 分组油耗
## 1   8.609848        C
## 2   8.609848        C
## 3   7.679054        C
## 4   8.878906        C
## 5   8.878906        C
## 6  10.927885        B
## 7   8.609848        C
## 8  10.147321        B
## 9  11.365000        B
## 10  8.356618        C
head(car.test.frame)
##   价格      产地 可靠性      油耗  类型 车重 发动机功率 净马力 分组油耗
## 1 8895       USA      4  8.609848 Small 2560         97    113        C
## 2 7402       USA      2  8.609848 Small 2345        114     90        C
## 3 6319     Korea      4  7.679054 Small 1845         81     63        C
## 4 6635 Japan/USA      5  8.878906 Small 2260         91     92        C
## 5 6599     Japan      5  8.878906 Small 2440        113    103        C
## 6 8672    Mexico      4 10.927885 Small 2285         97     82        B

为了评价比较各决策树算法,及体现构建决策树的目的所在,我们通过抽样将数据集分为训练集(Train_Car)和测试集(Test_Car),两者间比例为3:1,即通过3/4的样本建立起决策树模型,来预测另1/4样本的油耗/分组油耗的取值。并且为保持数据集分布,使用sampling软件包中的strata()函数来进行分层抽样,即在A、B、C组的油耗样本中分别抽取1/4共同构成测试集。

训练集和测试集的生成(抽样)

#install.packages('sampling')
library(sampling)
# Test-set size per fuel-consumption group: one quarter of the group's rows,
# rounded. The counters are named n_test_* instead of a/b/c so that base::c()
# is not masked by a numeric variable.
grp <- as.character(car.test.frame$"分组油耗")
n_test_A <- round(sum(grp == "A") / 4)
n_test_B <- round(sum(grp == "B") / 4)
n_test_C <- round(sum(grp == "C") / 4)
n_test_A; n_test_B; n_test_C
## [1] 9
## [1] 4
## [1] 2
# strata() reads `size` in the order in which the strata first appear in the
# data, so the sizes are lined up with that order programmatically instead of
# hard-coding it (for this data set the order of appearance is C, B, A).
size_by_group <- c(A = n_test_A, B = n_test_B, C = n_test_C)
strata_order <- unique(grp)
# Fix the RNG seed so the random split is reproducible from run to run.
# The outputs recorded in this document come from unseeded runs.
set.seed(1234)
sub <- strata(car.test.frame, stratanames = "分组油耗",
              size = unname(size_by_group[strata_order]), method = "srswor")
sub
##    分组油耗 ID_unit      Prob Stratum
## 2         C       2 0.2222222       1
## 5         C       5 0.2222222       1
## 6         B       6 0.2500000       2
## 15        B      15 0.2500000       2
## 21        B      21 0.2500000       2
## 23        B      23 0.2500000       2
## 16        A      16 0.2571429       3
## 27        A      27 0.2571429       3
## 37        A      37 0.2571429       3
## 39        A      39 0.2571429       3
## 40        A      40 0.2571429       3
## 42        A      42 0.2571429       3
## 44        A      44 0.2571429       3
## 50        A      50 0.2571429       3
## 52        A      52 0.2571429       3
# Split the data by the sampled row numbers: sampled rows form the test set,
# all remaining rows (in their original order) form the training set.
# setdiff() is used instead of negative indexing because x[-integer(0), ]
# selects zero rows, which would silently empty the training set if the
# sample were ever empty.
test_rows <- sub$ID_unit
Train_Car <- car.test.frame[setdiff(seq_len(nrow(car.test.frame)), test_rows), ]
Test_Car <- car.test.frame[test_rows, ]
dim(Train_Car)
## [1] 45  9
head(Train_Car)
##   价格      产地 可靠性      油耗  类型 车重 发动机功率 净马力 分组油耗
## 1 8895       USA      4  8.609848 Small 2560         97    113        C
## 3 6319     Korea      4  7.679054 Small 1845         81     63        C
## 4 6635 Japan/USA      5  8.878906 Small 2260         91     92        C
## 7 7399 Japan/USA      5  8.609848 Small 2275         97     90        C
## 8 7254     Korea      1 10.147321 Small 2350         98     74        B
## 9 9599     Japan      5 11.365000 Small 2295        109     90        B
dim(Test_Car)
## [1] 15  9
head(Test_Car)
##     价格    产地 可靠性      油耗    类型 车重 发动机功率 净马力 分组油耗
## 2   7402     USA      2  8.609848   Small 2345        114     90        C
## 5   6599   Japan      5  8.878906   Small 2440        113    103        C
## 6   8672  Mexico      4 10.927885   Small 2285         97     82        B
## 15  9745     USA      1 10.523148  Sporty 2885        153    100        B
## 21 10855     USA     NA 10.927885  Sporty 2840        107     92        B
## 23 18900 Germany     NA 10.523148 Compact 2670        121    108        B

应用案例

CART应用

CART 算法(分类回归树)是由Leo Breiman ,Jerome Friedman 等专家提出的一种数据勘测和预测算法。CART 树是一种二叉树,它采用一种二分递归分割的技术将当前样本集分割成为两个子样本集,使得生成的决策树的每个非叶子节点都有两个分支。CART 树的一大优点是它将模型的验证和最优通用树的发现嵌在了算法中。CART 树是这样实现这一目标的,它首先生成一棵非常复杂的树,再根据交叉验证和测试集验证的结果对树进行剪枝,从而得到最优通用树,这棵树是根据剪枝后不同版本的树在测试集数据上的性能得到的。复杂的树很少能在备用数据上表现出好的性能,因为对训练数据来说它是过适应的,使用交叉验证,能够克服过适应性,得到最适应未来数据的树。

对“油耗”变量建立回归树—数字结果

library(rpart)
library(rpart.plot)
library(maptree)
# Model formula for the regression tree, kept in formula_Car_Reg.
formula_Car_Reg <- "油耗~价格+产地+可靠性+类型+车重+发动机功率+净马力"
# Grow the regression tree rp_Car_Reg from formula_Car_Reg on the training
# set Train_Car.
rp_Car_Reg <- rpart(formula_Car_Reg, data = Train_Car, method = "anova")
# Print the basic information of the fitted tree.
print(rp_Car_Reg)
## n= 45 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 45 211.84930 11.951090  
##   2) 发动机功率< 137 19  32.55713  9.940236 *
##   3) 发动机功率>=137 26  46.32158 13.420570  
##     6) 车重< 3165 12  10.69595 12.518360 *
##     7) 车重>=3165 14  17.48563 14.193890 *
#导出回归树的cp表格
printcp(rp_Car_Reg)
## 
## Regression tree:
## rpart(formula = formula_Car_Reg, data = Train_Car, method = "anova")
## 
## Variables actually used in tree construction:
## [1] 车重       发动机功率
## 
## Root node error: 211.85/45 = 4.7078
## 
## n= 45 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.627666      0   1.00000 1.03382 0.168052
## 2 0.085627      1   0.37233 0.41239 0.066937
## 3 0.010000      2   0.28671 0.44581 0.087820
#获取决策树rp_Car_Reg详细信息
summary(rp_Car_Reg)
## Call:
## rpart(formula = formula_Car_Reg, data = Train_Car, method = "anova")
##   n= 45 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.62766590      0 1.0000000 1.0338154 0.16805195
## 2 0.08562688      1 0.3723341 0.4123883 0.06693738
## 3 0.01000000      2 0.2867072 0.4458136 0.08782003
## 
## Variable importance
## 发动机功率       车重     净马力       价格       类型       产地 
##         25         24         15         14         14          8 
## 
## Node number 1: 45 observations,    complexity param=0.6276659
##   mean=11.95109, MSE=4.707762 
##   left son=2 (19 obs) right son=3 (26 obs)
##   Primary splits:
##       发动机功率 < 137     to the left,  improve=0.6276659, (0 missing)
##       车重       < 2747.5  to the left,  improve=0.6132483, (0 missing)
##       价格       < 11507.5 to the left,  improve=0.5368155, (0 missing)
##       类型       splits as  RRRLRR,      improve=0.4829313, (0 missing)
##       净马力     < 105     to the left,  improve=0.4490789, (0 missing)
##   Surrogate splits:
##       车重   < 2747.5  to the left,  agree=0.956, adj=0.895, (0 split)
##       净马力 < 105     to the left,  agree=0.822, adj=0.579, (0 split)
##       价格   < 11507.5 to the left,  agree=0.800, adj=0.526, (0 split)
##       类型   splits as  RRRLRR,      agree=0.800, adj=0.526, (0 split)
##       产地   splits as  LLRLL-RR,    agree=0.711, adj=0.316, (0 split)
## 
## Node number 2: 19 observations
##   mean=9.940236, MSE=1.713533 
## 
## Node number 3: 26 observations,    complexity param=0.08562688
##   mean=13.42057, MSE=1.781599 
##   left son=6 (12 obs) right son=7 (14 obs)
##   Primary splits:
##       车重       < 3165    to the left,  improve=0.3916100, (0 missing)
##       净马力     < 148.5   to the left,  improve=0.2221799, (0 missing)
##       价格       < 13599   to the left,  improve=0.1904112, (0 missing)
##       类型       splits as  LRR-LR,      improve=0.1620824, (0 missing)
##       发动机功率 < 181.5   to the left,  improve=0.1041802, (0 missing)
##   Surrogate splits:
##       价格       < 13172.5 to the left,  agree=0.769, adj=0.500, (0 split)
##       发动机功率 < 158     to the left,  agree=0.769, adj=0.500, (0 split)
##       类型       splits as  LRR-RR,      agree=0.731, adj=0.417, (0 split)
##       净马力     < 144.5   to the left,  agree=0.731, adj=0.417, (0 split)
##       产地       splits as  --R-L-LR,    agree=0.615, adj=0.167, (0 split)
## 
## Node number 6: 12 observations
##   mean=12.51836, MSE=0.8913294 
## 
## Node number 7: 14 observations
##   mean=14.19389, MSE=1.248974

[注:由于训练数据存在随机抽样的情况,以下描述可能会有稍微的出入!!!]

在如上输出结果中,我们看到各节点信息按照“node), split, n, deviance, yval”的格式给出,且按照节点层次以不同缩进量列出,如节点1缩进量最小,其次为节点2和节点3,并在每条节点信息后以星号*标示出是否为叶节点。具体的,我们可以看出,1)为根节点,共含有45个样本,即全部训练样本;2) 和3) 以“发动机功率”变量为节点,且以“137”为分割值划分为两支,分别包含19个和26个样本;6) 和7) 则是在节点3) 之下以“车重”变量、以“3165”为分割值继续划分得到的,分别包含12个和14个样本。

由printcp()的输出可以看到,在建树过程中用到的变量有“车重”和“发动机功率”这两种,且各步的CP值、分裂次数nsplit、相对误差rel error、交叉验证误差xerror等也被列出,其中CP值对于选择控制树的复杂程度十分重要。

若想获得每个节点更详细的信息,可以对已有决策树模型rp_Car_Reg使用 summary()函数。所得输出结果除了与上面printcp()给出值相同的部分外,另有变量重要程度(Variable importance)、每一个分支变量对生成树的提升程度(improve)等信息。

对“油耗”变量建立回归树—数字结果(修改若干参数值)

下面我们尝试改变rpart()函数的若干参数值,来深入探究该函数的使用及数据信息。

# Lower minsplit (minimum number of observations in a node for a split to be
# attempted) from its default of 20 to 10; the new tree is rp_Car_Reg1.
rp_Car_Reg1 <- rpart(
  formula_Car_Reg,
  data = Train_Car,
  method = "anova",
  minsplit = 10
)
print(rp_Car_Reg1)
## n= 45 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 45 211.849300 11.951090  
##    2) 发动机功率< 137 19  32.557130  9.940236  
##      4) 价格< 9247 8   3.606646  8.626163 *
##      5) 价格>=9247 11   5.089398 10.895930  
##       10) 类型=Sporty 4   1.150584 10.267300 *
##       11) 类型=Compact,Small 7   1.454858 11.255140 *
##    3) 发动机功率>=137 26  46.321580 13.420570  
##      6) 类型=Compact,Medium,Sporty 20  20.768890 12.910710  
##       12) 价格< 11245 4   1.813531 11.519990 *
##       13) 价格>=11245 16   9.284856 13.258390  
##         26) 净马力< 154 11   3.295533 12.938470 *
##         27) 净马力>=154 5   2.386790 13.962200 *
##      7) 类型=Large,Van 6   3.023018 15.120100 *
printcp(rp_Car_Reg1)
## 
## Regression tree:
## rpart(formula = formula_Car_Reg, data = Train_Car, method = "anova", 
##     minsplit = 10)
## 
## Variables actually used in tree construction:
## [1] 发动机功率 价格       净马力     类型      
## 
## Root node error: 211.85/45 = 4.7078
## 
## n= 45 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.627666      0  1.000000 1.03957 0.171923
## 2 0.112632      1  0.372334 0.45493 0.071841
## 3 0.106348      2  0.259702 0.51005 0.109048
## 4 0.045648      3  0.153354 0.39101 0.104937
## 5 0.017005      4  0.107706 0.36528 0.104065
## 6 0.011725      5  0.090701 0.37083 0.104090
## 7 0.010000      6  0.078976 0.38170 0.106049
# Raise the complexity parameter cp from its default of 0.01 to 0.1; the new
# tree is rp_Car_Reg2.
rp_Car_Reg2 <- rpart(
  formula_Car_Reg,
  data = Train_Car,
  method = "anova",
  cp = 0.1
)
print(rp_Car_Reg2)
## n= 45 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 45 211.84930 11.951090  
##   2) 发动机功率< 137 19  32.55713  9.940236 *
##   3) 发动机功率>=137 26  46.32158 13.420570 *
printcp(rp_Car_Reg2)
## 
## Regression tree:
## rpart(formula = formula_Car_Reg, data = Train_Car, method = "anova", 
##     cp = 0.1)
## 
## Variables actually used in tree construction:
## [1] 发动机功率
## 
## Root node error: 211.85/45 = 4.7078
## 
## n= 45 
## 
##        CP nsplit rel error  xerror    xstd
## 1 0.62767      0   1.00000 1.07593 0.17636
## 2 0.10000      1   0.37233 0.44465 0.07069
# Prune the already-fitted tree rp_Car_Reg at cp = 0.1 instead of refitting;
# the pruned tree is rp_Car_Reg3.
rp_Car_Reg3 <- prune.rpart(rp_Car_Reg, cp = 0.1)
print(rp_Car_Reg3)
## n= 45 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 45 211.84930 11.951090  
##   2) 发动机功率< 137 19  32.55713  9.940236 *
##   3) 发动机功率>=137 26  46.32158 13.420570 *
printcp(rp_Car_Reg3)
## 
## Regression tree:
## rpart(formula = formula_Car_Reg, data = Train_Car, method = "anova")
## 
## Variables actually used in tree construction:
## [1] 发动机功率
## 
## Root node error: 211.85/45 = 4.7078
## 
## n= 45 
## 
##        CP nsplit rel error  xerror     xstd
## 1 0.62767      0   1.00000 1.03382 0.168052
## 2 0.10000      1   0.37233 0.41239 0.066937
# Limit the tree depth with maxdepth = 1; the new tree is rp_Car_Reg4.
rp_Car_Reg4 <- rpart(
  formula_Car_Reg,
  data = Train_Car,
  method = "anova",
  maxdepth = 1
)
print(rp_Car_Reg4)
## n= 45 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 45 211.84930 11.951090  
##   2) 发动机功率< 137 19  32.55713  9.940236 *
##   3) 发动机功率>=137 26  46.32158 13.420570 *
printcp(rp_Car_Reg4)
## 
## Regression tree:
## rpart(formula = formula_Car_Reg, data = Train_Car, method = "anova", 
##     maxdepth = 1)
## 
## Variables actually used in tree construction:
## [1] 发动机功率
## 
## Root node error: 211.85/45 = 4.7078
## 
## n= 45 
## 
##        CP nsplit rel error  xerror     xstd
## 1 0.62767      0   1.00000 1.05355 0.174146
## 2 0.01000      1   0.37233 0.43641 0.067109

对“油耗”变量建立回归树—树形结构

# Fit the tree used for the plots below with minsplit = 10; it is stored as
# rp_Car_Plot.
rp_Car_Plot <- rpart(
  formula_Car_Reg,
  data = Train_Car,
  method = "anova",
  minsplit = 10
)
print(rp_Car_Plot)
## n= 45 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 45 211.849300 11.951090  
##    2) 发动机功率< 137 19  32.557130  9.940236  
##      4) 价格< 9247 8   3.606646  8.626163 *
##      5) 价格>=9247 11   5.089398 10.895930  
##       10) 类型=Sporty 4   1.150584 10.267300 *
##       11) 类型=Compact,Small 7   1.454858 11.255140 *
##    3) 发动机功率>=137 26  46.321580 13.420570  
##      6) 类型=Compact,Medium,Sporty 20  20.768890 12.910710  
##       12) 价格< 11245 4   1.813531 11.519990 *
##       13) 价格>=11245 16   9.284856 13.258390  
##         26) 净马力< 154 11   3.295533 12.938470 *
##         27) 净马力>=154 5   2.386790 13.962200 *
##      7) 类型=Large,Van 6   3.023018 15.120100 *
# Draw the tree with the default rpart.plot() layout.
rpart.plot(rp_Car_Plot)

# The default figure does not show the full split condition on every branch,
# so the numeric printout is still needed to read it. The `type` argument
# changes the layout to address this: type 4 is used here, and types 1, 2
# and 3 can be tried to compare the alternatives.
rpart.plot(rp_Car_Plot, type = 4)

# branch = 1 gives vertical (right-angled) branches, which reduces the space
# the figure takes up.
rpart.plot(rp_Car_Plot, type = 4, branch = 1)

# fallen.leaves = TRUE places all leaf nodes on one line at the bottom of the
# tree so they are easier to read.
rpart.plot(rp_Car_Plot, type = 4, fallen.leaves = TRUE)

# Draw the same tree with draw.tree().
draw.tree(rp_Car_Plot, col = rep(1, 7), nodeinfo = TRUE)

# Draw the same tree with post().
post(rp_Car_Plot, file = "", title. = "post: Regression Tree")

# Draw the tree directly with plot(), then add the text labels onto that
# figure with text().
plot(rp_Car_Plot, uniform = TRUE, main = "plot: Regression Tree")
text(rp_Car_Plot, use.n = TRUE, all = TRUE)

对“分组油耗”变量建立分类树

library(rpart)
library(rpart.plot)
library(maptree)
# Model formula for the classification tree.
formula_Car_Cla <- "分组油耗~价格+产地+可靠性+类型+车重+发动机功率+净马力"
# Grow the classification tree rp_Car_Cla from formula_Car_Cla on the
# training set Train_Car, with minsplit = 5.
rp_Car_Cla <- rpart(
  formula_Car_Cla,
  data = Train_Car,
  method = "class",
  minsplit = 5
)
print(rp_Car_Cla)
## n= 45 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 45 19 A (0.57777778 0.26666667 0.15555556)  
##    2) 发动机功率>=137 26  2 A (0.92307692 0.07692308 0.00000000) *
##    3) 发动机功率< 137 19  9 B (0.10526316 0.52631579 0.36842105)  
##      6) 价格>=9247 11  2 B (0.18181818 0.81818182 0.00000000)  
##       12) 产地=France,Japan/USA 2  0 A (1.00000000 0.00000000 0.00000000) *
##       13) 产地=Germany,Japan,USA 9  0 B (0.00000000 1.00000000 0.00000000) *
##      7) 价格< 9247 8  1 C (0.00000000 0.12500000 0.87500000) *
#绘制分类树
rpart.plot(rp_Car_Cla,type=4,fallen.leaves=TRUE)

对测试集Test_Car预测目标变量

#对测试集Test_Car中观测样本的"分组油耗"指标进行预测
pre_Car_Cla <- predict(rp_Car_Cla,Test_Car,type="class")
#显示预测结果
pre_Car_Cla
##  4 18  6 11 28 36 24 33 35 49 50 51 53 56 60 
##  C  B  C  C  A  A  A  A  A  A  A  A  A  A  A 
## Levels: A B C
#获取混淆矩阵
table(Test_Car$分组油耗,pre_Car_Cla)
##    pre_Car_Cla
##     A B C
##   A 9 0 0
##   B 2 0 2
##   C 0 1 1
# Misclassification rate on the test set: the share of predictions that differ
# from the observed group. The outer parentheses print p after assigning it.
(p <- mean(pre_Car_Cla != Test_Car$分组油耗))
## [1] 0.3333333

C4.5应用

最早的决策树算法是由Hunt等人于1966年提出的CLS。当前最有影响的决策树算法是Quinlan于1986年提出的ID3和1993年提出的C4.5。ID3只能处理离散型描述属性,它选择信息增益最大的属性划分训练样本,其目的是进行分枝时系统的熵最小,从而提高算法的运算速度和精确度。ID3算法的主要缺陷是,用信息增益作为选择分枝属性的标准时,偏向于取值较多的属性,而在某些情况下,这类属性可能不会提供太多有价值的信息。C4.5是ID3算法的改进算法,不仅可以处理离散型描述属性,还能处理连续型描述属性。C4.5采用了信息增益比作为选择分枝属性的标准,弥补了ID3算法的不足。

决策树算法的优点如下:(1)分类精度高;(2)生成的模式简单;(3)对噪声数据有很好的健壮性。因而是目前应用最为广泛的归纳推理算法之一,在数据挖掘中受到研究者的广泛关注。

C4.5算法示例

library(RWeka)
# Rename the training-set columns to English identifiers.
names(Train_Car) <- c(
  "Price", "Country", "Reliability", "Mileage",
  "Type", "Weight", "Disp.", "HP", "Oil_Consumption"
)
# Convert the grouped fuel-consumption column Oil_Consumption to a factor so
# that J48() can handle it.
Train_Car$Oil_Consumption <- as.factor(Train_Car$Oil_Consumption)
# Model formula.
formula <- Oil_Consumption ~ Price + Country + Reliability + Type + Weight + Disp. + HP
# Build the classification tree C45_0 with the default parameter values and
# print it.
C45_0 <- J48(formula, data = Train_Car)
C45_0
## J48 pruned tree
## ------------------
## 
## Disp. <= 133
## |   Disp. <= 97: C (7.0/1.0)
## |   Disp. > 97: B (10.0/1.0)
## Disp. > 133: A (19.0/1.0)
## 
## Number of Leaves  :  3
## 
## Size of the tree :   5
summary(C45_0)
## 
## === Summary ===
## 
## Correctly Classified Instances          33               91.6667 %
## Incorrectly Classified Instances         3                8.3333 %
## Kappa statistic                          0.8621
## Mean absolute error                      0.1002
## Root mean squared error                  0.2238
## Relative absolute error                 24.8179 %
## Root relative squared error             50.0125 %
## Total Number of Instances               36     
## 
## === Confusion Matrix ===
## 
##   a  b  c   <-- classified as
##  18  1  0 |  a = A
##   1  9  1 |  b = B
##   0  0  6 |  c = C
# Build the classification tree C45_1 with the control option M set to 3, and
# print it.
C45_1 <- J48(formula, data = Train_Car, control = Weka_control(M = 3))
C45_1
## J48 pruned tree
## ------------------
## 
## Disp. <= 133
## |   Disp. <= 97: C (7.0/1.0)
## |   Disp. > 97: B (10.0/1.0)
## Disp. > 133: A (19.0/1.0)
## 
## Number of Leaves  :  3
## 
## Size of the tree :   5
#绘制分类树
plot(C45_1)

C5.0应用

C4.5算法与 ID3 算法极为相似,只是在特征选择上有所不同,算是一种对 ID3 算法的改进了。C4.5算法在决策树生成过程中,用信息增益比来选择特征。信息增益比的定义如前文所述。而 C5.0 算法作为 C4.5的改进版,其算法原理都是一致的,都是采用信息增益比来进行特征选择,但C5.0适用于处理大数据集,采用 Boosting 方式来提高模型准确率,计算速度较高,对计算内存占用也较少。关于 C4.5和C5.0的实际计算可参考R语言实现方式。

C5.0是C4.5应用于大数据集上的分类算法,主要在执行效率和内存使用方面进行了改进。C4.5算法是ID3算法的修订版,采用GainRatio来进行改进,选取有最大GainRatio的分割变量作为准则,避免ID3算法过度配适的问题。C5.0算法则是C4.5算法的修订版,适用于处理大数据集,采用Boosting方式提高模型准确率,又称为BoostingTrees,在软件上计算速度比较快,占用的内存资源较少。

C5.0函数

?C5.0
?C5.0Control

C5.0(x, y, trials = 1, rules= FALSE, 
     weights = NULL, 
     control = C5.0Control(), 
     costs = NULL, ...)

trials:Boosting 的迭代次数(取 1 表示只建立单个模型),适当增加迭代次数通常可以提升效果 
costs:可选参数,与各类错误分类相对应的代价矩阵

C5.0Control(subset = TRUE, 
            bands = 0, 
            winnow = FALSE, 
            noGlobalPruning = FALSE, 
            CF = 0.25, 
            minCases = 2, 
            fuzzyThreshold = FALSE, 
            sample = 0, 
            seed = sample.int(4096, size = 1) - 1L,  
            earlyStopping = TRUE,
            label = "outcome")

predict(model,test,type="class")  
  type取class分类结果或者prob分类概率

c5.0tree=C5.0(churn~.,data=tree.train,trials=5,control=C5.0Control(winnow = TRUE,CF=0.25))

  trials:模型的迭代次数         
  winnow :在建模之前是否对变量进行特征选择       
  CF:剪枝时的置信度
  summary(c5.0tree):查看模型结果

C5.0算法示例

library(C50)
# Rename the columns of both the training and the test set to English
# identifiers.
data_name <- c(
  "Price", "Country", "Reliability", "Mileage", "Type",
  "Weight", "Disp.", "HP", "Oil_Consumption"
)
names(Train_Car) <- data_name
names(Test_Car) <- data_name
# Convert the grouped fuel-consumption column Oil_Consumption to a factor so
# that C5.0() can use it as the outcome.
Train_Car$Oil_Consumption <- as.factor(Train_Car$Oil_Consumption)
# Model formula.
formula <- Oil_Consumption ~ Price + Country + Reliability + Type + Weight + Disp. + HP
# Two fits that differ only in the noGlobalPruning control flag: TRUE for
# tr5.0, FALSE for tr5.01. TRUE/FALSE are written out because the shorthands
# T/F are ordinary variables that can be reassigned.
tr5.0 <- C5.0(formula, data = Train_Car, control = C5.0Control(noGlobalPruning = TRUE))
tr5.01 <- C5.0(formula, data = Train_Car, control = C5.0Control(noGlobalPruning = FALSE))

class(tr5.0)
## [1] "C5.0"
length(tr5.0)
## [1] 18
summary(tr5.0)
## 
## Call:
## C5.0.formula(formula = formula, data = Train_Car, control
##  = C5.0Control(noGlobalPruning = T))
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Tue Aug 29 18:56:26 2017
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 45 cases (8 attributes) from undefined.data
## 
## Decision tree:
## 
## Price <= 9410:
## :...Weight <= 2295: C (6)
## :   Weight > 2295: B (3/1)
## Price > 9410:
## :...Disp. <= 133: B (11/2)
##     Disp. > 133: A (25/1)
## 
## 
## Evaluation on training data (45 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       4    4( 8.9%)   <<
## 
## 
##     (a)   (b)   (c)    <-classified as
##    ----  ----  ----
##      24     2          (a): class A
##       1    11          (b): class B
##             1     6    (c): class C
## 
## 
##  Attribute usage:
## 
##  100.00% Price
##   80.00% Disp.
##   20.00% Weight
## 
## 
## Time: 0.0 secs
# List every component of the fitted C5.0 object. seq_along() is used instead
# of 1:length(), which would produce c(1, 0) for a zero-length object.
tr5.0[seq_along(tr5.0)]
## $names
## [1] "| Generated using R version 3.3.3 (2017-03-06)\n| on 周二 8月 29 18:56:26 2017\n| function call: makeNamesFile(x = x, y = y, w = weights, label = control$label, \n|    comments = TRUE)\noutcome.\n\noutcome: A,B,C.\nPrice:continuous.\nCountry:France,Germany,Japan,Japan\\/USA,Korea,Mexico,Sweden,USA.\nReliability:continuous.\nType:Compact,Large,Medium,Small,Sporty,Van.\nWeight:continuous.\nDisp\\.:continuous.\nHP:continuous.\n"
## 
## $cost
## [1] ""
## 
## $costMatrix
## NULL
## 
## $caseWeights
## [1] FALSE
## 
## $control
## $control$subset
## [1] TRUE
## 
## $control$bands
## [1] 0
## 
## $control$winnow
## [1] FALSE
## 
## $control$noGlobalPruning
## [1] TRUE
## 
## $control$CF
## [1] 0.25
## 
## $control$minCases
## [1] 2
## 
## $control$fuzzyThreshold
## [1] FALSE
## 
## $control$sample
## [1] 0
## 
## $control$earlyStopping
## [1] TRUE
## 
## $control$label
## [1] "outcome"
## 
## $control$seed
## [1] 3655
## 
## 
## $trials
## Requested    Actual 
##         1         1 
## 
## $rbm
## [1] FALSE
## 
## $boostResults
## NULL
## 
## $size
## [1] 4
## 
## $dims
## [1] 45  7
## 
## $call
## C5.0.formula(formula = formula, data = Train_Car, control = C5.0Control(noGlobalPruning = T))
## 
## $levels
## [1] "A" "B" "C"
## 
## $output
## [1] "\nC5.0 [Release 2.07 GPL Edition]  \tTue Aug 29 18:56:26 2017\n-------------------------------\n\nClass specified by attribute `outcome'\n\nRead 45 cases (8 attributes) from undefined.data\n\nDecision tree:\n\nPrice <= 9410:\n:...Weight <= 2295: C (6)\n:   Weight > 2295: B (3/1)\nPrice > 9410:\n:...Disp. <= 133: B (11/2)\n    Disp. > 133: A (25/1)\n\n\nEvaluation on training data (45 cases):\n\n\t    Decision Tree   \n\t  ----------------  \n\t  Size      Errors  \n\n\t     4    4( 8.9%)   <<\n\n\n\t   (a)   (b)   (c)    <-classified as\n\t  ----  ----  ----\n\t    24     2          (a): class A\n\t     1    11          (b): class B\n\t           1     6    (c): class C\n\n\n\tAttribute usage:\n\n\t100.00%\tPrice\n\t 80.00%\tDisp.\n\t 20.00%\tWeight\n\n\nTime: 0.0 secs\n"
## 
## $tree
## [1] "id=\"See5/C5.0 2.07 GPL Edition 2017-08-29\"\nentries=\"1\"\ntype=\"2\" class=\"A\" freq=\"26,12,7\" att=\"Price\" forks=\"3\" cut=\"9410\"\ntype=\"0\" class=\"A\"\ntype=\"2\" class=\"C\" freq=\"0,2,7\" att=\"Weight\" forks=\"3\" cut=\"2295\"\ntype=\"0\" class=\"C\"\ntype=\"0\" class=\"C\" freq=\"0,0,6\"\ntype=\"0\" class=\"B\" freq=\"0,2,1\"\ntype=\"2\" class=\"A\" freq=\"26,10,0\" att=\"Disp.\" forks=\"3\" cut=\"133\"\ntype=\"0\" class=\"A\"\ntype=\"0\" class=\"B\" freq=\"2,9,0\"\ntype=\"0\" class=\"A\" freq=\"24,1,0\"\n"
## 
## $predictors
## [1] "Price"       "Country"     "Reliability" "Type"        "Weight"     
## [6] "Disp."       "HP"         
## 
## $rules
## [1] ""
## 
## $Terms
## Oil_Consumption ~ Price + Country + Reliability + Type + Weight + 
##     Disp. + HP
## attr(,"variables")
## list(Oil_Consumption, Price, Country, Reliability, Type, Weight, 
##     Disp., HP)
## attr(,"factors")
##                 Price Country Reliability Type Weight Disp. HP
## Oil_Consumption     0       0           0    0      0     0  0
## Price               1       0           0    0      0     0  0
## Country             0       1           0    0      0     0  0
## Reliability         0       0           1    0      0     0  0
## Type                0       0           0    1      0     0  0
## Weight              0       0           0    0      1     0  0
## Disp.               0       0           0    0      0     1  0
## HP                  0       0           0    0      0     0  1
## attr(,"term.labels")
## [1] "Price"       "Country"     "Reliability" "Type"        "Weight"     
## [6] "Disp."       "HP"         
## attr(,"order")
## [1] 1 1 1 1 1 1 1
## attr(,"intercept")
## [1] 1
## attr(,"response")
## [1] 1
## attr(,".Environment")
## <environment: R_GlobalEnv>
## attr(,"predvars")
## list(Oil_Consumption, Price, Country, Reliability, Type, Weight, 
##     Disp., HP)
## attr(,"dataClasses")
## Oil_Consumption           Price         Country     Reliability 
##        "factor"       "numeric"        "factor"       "numeric" 
##            Type          Weight           Disp.              HP 
##        "factor"       "numeric"       "numeric"       "numeric" 
## 
## $xlevels
## $xlevels$Country
## [1] "France"    "Germany"   "Japan"     "Japan/USA" "Korea"     "Mexico"   
## [7] "Sweden"    "USA"      
## 
## $xlevels$Type
## [1] "Compact" "Large"   "Medium"  "Small"   "Sporty"  "Van"
#--种类画图不需要text()--#
# par(xpd = TRUE)
# plot(tr5.0)
# plot(tr5.01)
# C5imp(tr5.0)
# Predict the grouped fuel-consumption class of the test set with both trees.
pre <- predict(tr5.0, Test_Car, type = "class")
pre1 <- predict(tr5.01, Test_Car, type = "class")
# Cross-tabulate observed against predicted classes. The observed labels are
# converted to a factor carrying the same levels as the predictions, so each
# table is square with rows and columns in the same order. Without this, a
# class absent from the test set would drop a row and diag() would pick up
# off-diagonal cells.
m <- table(
  type = factor(Test_Car$Oil_Consumption, levels = levels(pre)),
  predict = pre
)
m1 <- table(
  type = factor(Test_Car$Oil_Consumption, levels = levels(pre1)),
  predict = pre1
)
# Accuracy of tr5.0 on the test set.
sum(diag(m)) / sum(m)
## [1] 0.7333333
# Accuracy of tr5.01 on the test set.
sum(diag(m1)) / sum(m1)
## [1] 0.8666667