








# Download the car test data set (60 rows: a car-name column plus 8 variables).
# stringsAsFactors = TRUE keeps the text columns (Country, Type) as factors on
# R >= 4.0, where read.csv() no longer converts strings by default; the str()
# and summary() output shown further down reports them as factors.
oPathFile <- "http://data.galaxystatistics.com/blog_data/decision_tree/car_test_frame.csv"
car.test.frame <- read.csv(oPathFile, header = TRUE, stringsAsFactors = TRUE)
## [1] 60  9
##                  X Price   Country Reliability Mileage  Type Weight Disp.
## 1   Eagle Summit 4  8895       USA           4      33 Small   2560    97
## 2  Ford Escort   4  7402       USA           2      33 Small   2345   114
## 3   Ford Festiva 4  6319     Korea           4      37 Small   1845    81
## 4    Honda Civic 4  6635 Japan/USA           5      32 Small   2260    91
## 5  Mazda Protege 4  6599     Japan           5      32 Small   2440   113
## 6 Mercury Tracer 4  8672    Mexico           4      26 Small   2285    97
##    HP
## 1 113
## 2  90
## 3  63
## 4  92
## 5 103
## 6  82
# Discard the leading column (the car name, printed as "X" above); the
# remaining 8 columns are the variables used for modelling.
car.test.frame <- car.test.frame[, -1L]
## [1] 60  8
##   Price   Country Reliability Mileage  Type Weight Disp.  HP
## 1  8895       USA           4      33 Small   2560    97 113
## 2  7402       USA           2      33 Small   2345   114  90
## 3  6319     Korea           4      37 Small   1845    81  63
## 4  6635 Japan/USA           5      32 Small   2260    91  92
## 5  6599     Japan           5      32 Small   2440   113 103
## 6  8672    Mexico           4      26 Small   2285    97  82

为了在决策树分析等过程中便于理解,此处将该数据集的变量名改设为中文,且将其中的“英里数”换算为我们所熟悉的“油耗”指标。为了进一步了解各变量的信息,我们可以分别使用str( )和summary( )函数,探寻数据集内部结构和概括信息。通过使用str( )函数,我们可知,数据集的维度为60×8,其中“产地”及“类型”变量是分别含有8个和6个水平的因子型变量,其他6个变量都为数值型变量(“油耗”经换算后为num型,其余5个为整型)。通过使用summary( )函数,在输出结果中,对于因子型变量,给出了各个水平分别对应的样本个数,而数值型数据则给出了最值及中位数等基本的描述性统计指标。

# Convert "Mileage" into a fuel-consumption figure (larger = thirstier).
# NOTE(review): the constants look like litres per imperial gallon (4.546) and
# km per mile (1.6), i.e. mpg -> L/100 km; confirm the source units.
car.test.frame$Mileage <- 100 * 4.546 / (1.6 * car.test.frame$Mileage)
# Rename the 8 columns to the Chinese labels used by every later step
# (e.g. car.test.frame$"油耗"); without this those lookups return NULL and the
# grouping below silently puts every car into the same class.
names(car.test.frame) <- c("价格", "产地", "可靠性", "油耗", "类型",
                           "车重", "发动机功率", "净马力")
##   价格      产地 可靠性      油耗  类型 车重 发动机功率 净马力
## 1 8895       USA      4  8.609848 Small 2560         97    113
## 2 7402       USA      2  8.609848 Small 2345        114     90
## 3 6319     Korea      4  7.679054 Small 1845         81     63
## 4 6635 Japan/USA      5  8.878906 Small 2260         91     92
## 5 6599     Japan      5  8.878906 Small 2440        113    103
## 6 8672    Mexico      4 10.927885 Small 2285         97     82
## 'data.frame':    60 obs. of  8 variables:
##  $ 价格      : int  8895 7402 6319 6635 6599 8672 7399 7254 9599 5866 ...
##  $ 产地      : Factor w/ 8 levels "France","Germany",..: 8 8 5 4 3 6 4 5 3 3 ...
##  $ 可靠性    : int  4 2 4 5 5 4 5 1 5 NA ...
##  $ 油耗      : num  8.61 8.61 7.68 8.88 8.88 ...
##  $ 类型      : Factor w/ 6 levels "Compact","Large",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ 车重      : int  2560 2345 1845 2260 2440 2285 2275 2350 2295 1900 ...
##  $ 发动机功率: int  97 114 81 91 113 97 97 98 109 73 ...
##  $ 净马力    : int  113 90 63 92 103 82 90 74 90 73 ...
##       价格              产地        可靠性           油耗       
##  Min.   : 5866   USA      :26   Min.   :1.000   Min.   : 7.679  
##  1st Qu.: 9932   Japan    :19   1st Qu.:2.000   1st Qu.:10.523  
##  Median :12216   Japan/USA: 7   Median :3.000   Median :12.353  
##  Mean   :12616   Korea    : 3   Mean   :3.388   Mean   :11.962  
##  3rd Qu.:14933   Germany  : 2   3rd Qu.:5.000   3rd Qu.:13.530  
##  Max.   :24760   France   : 1   Max.   :5.000   Max.   :15.785  
##                  (Other)  : 2   NA's   :11                      
##       类型         车重        发动机功率        净马力     
##  Compact:15   Min.   :1845   Min.   : 73.0   Min.   : 63.0  
##  Large  : 3   1st Qu.:2571   1st Qu.:113.8   1st Qu.:101.5  
##  Medium :13   Median :2885   Median :144.5   Median :111.5  
##  Small  :13   Mean   :2901   Mean   :152.1   Mean   :122.3  
##  Sporty : 9   3rd Qu.:3231   3rd Qu.:180.0   3rd Qu.:142.8  
##  Van    : 7   Max.   :3855   Max.   :305.0   Max.   :225.0  





# Bucket fuel consumption into three classes:
#   "A" for values >= 11.6, "C" for values <= 9, "B" for everything else.
# The vector is sized from the data instead of a hard-coded 60, and is a plain
# character vector so the new column is an ordinary column rather than a
# one-column matrix embedded in the data frame.
Group_Mileage <- rep("B", nrow(car.test.frame))
Group_Mileage[which(car.test.frame$"油耗" >= 11.6)] <- "A"
Group_Mileage[which(car.test.frame$"油耗" <= 9)] <- "C"
car.test.frame$"分组油耗" <- Group_Mileage
##         油耗 分组油耗
## 1   8.609848        C
## 2   8.609848        C
## 3   7.679054        C
## 4   8.878906        C
## 5   8.878906        C
## 6  10.927885        B
## 7   8.609848        C
## 8  10.147321        B
## 9  11.365000        B
## 10  8.356618        C
##   价格      产地 可靠性      油耗  类型 车重 发动机功率 净马力 分组油耗
## 1 8895       USA      4  8.609848 Small 2560         97    113        C
## 2 7402       USA      2  8.609848 Small 2345        114     90        C
## 3 6319     Korea      4  7.679054 Small 1845         81     63        C
## 4 6635 Japan/USA      5  8.878906 Small 2260         91     92        C
## 5 6599     Japan      5  8.878906 Small 2440        113    103        C
## 6 8672    Mexico      4 10.927885 Small 2285         97     82        B



# Test-set size per fuel-consumption class: one quarter of the class count,
# rounded. The names a/b/c are consumed by the strata() call that follows, so
# they are kept (the variable `c` does not stop calls to c() from working).
a <- round(sum(car.test.frame[["分组油耗"]] == "A") / 4)
b <- round(sum(car.test.frame[["分组油耗"]] == "B") / 4)
c <- round(sum(car.test.frame[["分组油耗"]] == "C") / 4)
## [1] 9
## [1] 4
## [1] 2
# Stratified sampling without replacement ("srswor"), one stratum per
# fuel-consumption class. strata() expects `size` in the order in which the
# strata first appear in the data, so the per-class sizes are looked up by
# label instead of hard-coding the order C, B, A.
library(sampling)
strata_order <- unique(as.vector(car.test.frame$"分组油耗"))
sub <- strata(car.test.frame, stratanames = "分组油耗",
              size = unname(c(A = a, B = b, C = c)[strata_order]),
              method = "srswor")
##    分组油耗 ID_unit      Prob Stratum
## 2         C       2 0.2222222       1
## 5         C       5 0.2222222       1
## 6         B       6 0.2500000       2
## 15        B      15 0.2500000       2
## 21        B      21 0.2500000       2
## 23        B      23 0.2500000       2
## 16        A      16 0.2571429       3
## 27        A      27 0.2571429       3
## 37        A      37 0.2571429       3
## 39        A      39 0.2571429       3
## 40        A      40 0.2571429       3
## 42        A      42 0.2571429       3
## 44        A      44 0.2571429       3
## 50        A      50 0.2571429       3
## 52        A      52 0.2571429       3
# Rows selected by strata() form the test set; all remaining rows are used
# for training.
Test_Car <- car.test.frame[sub[["ID_unit"]], ]
Train_Car <- car.test.frame[-sub[["ID_unit"]], ]
## [1] 45  9
##   价格      产地 可靠性      油耗  类型 车重 发动机功率 净马力 分组油耗
## 1 8895       USA      4  8.609848 Small 2560         97    113        C
## 3 6319     Korea      4  7.679054 Small 1845         81     63        C
## 4 6635 Japan/USA      5  8.878906 Small 2260         91     92        C
## 7 7399 Japan/USA      5  8.609848 Small 2275         97     90        C
## 8 7254     Korea      1 10.147321 Small 2350         98     74        B
## 9 9599     Japan      5 11.365000 Small 2295        109     90        B
## [1] 15  9
##     价格    产地 可靠性      油耗    类型 车重 发动机功率 净马力 分组油耗
## 2   7402     USA      2  8.609848   Small 2345        114     90        C
## 5   6599   Japan      5  8.878906   Small 2440        113    103        C
## 6   8672  Mexico      4 10.927885   Small 2285         97     82        B
## 15  9745     USA      1 10.523148  Sporty 2885        153    100        B
## 21 10855     USA     NA 10.927885  Sporty 2840        107     92        B
## 23 18900 Germany     NA 10.523148 Compact 2670        121    108        B



CART 算法(分类回归树)是由 Leo Breiman、Jerome Friedman 等专家提出的一种数据探查和预测算法。CART 树是一种二叉树,它采用一种二分递归分割的技术将当前样本集分割成为两个子样本集,使得生成的决策树的每个非叶子节点都有两个分支。CART 树的一大优点是它将模型的验证和最优通用树的发现嵌在了算法中。CART 树是这样实现这一目标的:它首先生成一棵非常复杂的树,再根据交叉验证和测试集验证的结果对树进行剪枝,从而得到最优通用树,这棵树是根据剪枝后不同版本的树在测试集数据上的性能得到的。复杂的树很少能在备用数据上表现出好的性能,因为对训练数据来说它是过拟合的;使用交叉验证,能够克服过拟合,得到最适应未来数据的树。


# rpart() is used from here on; attach the package so the script runs on its own.
library(rpart)
# Model formula for the regression tree: fuel consumption (油耗) explained by
# the seven predictors. as.formula() hands rpart() a real formula object
# rather than a bare character string.
formula_Car_Reg <- as.formula("油耗~价格+产地+可靠性+类型+车重+发动机功率+净马力")
# Fit the regression tree (method = "anova") on the training set with the
# default control settings.
rp_Car_Reg <- rpart(formula_Car_Reg, data = Train_Car, method = "anova")
## n= 45 
## node), split, n, deviance, yval
##       * denotes terminal node
## 1) root 45 211.84930 11.951090  
##   2) 发动机功率< 137 19  32.55713  9.940236 *
##   3) 发动机功率>=137 26  46.32158 13.420570  
##     6) 车重< 3165 12  10.69595 12.518360 *
##     7) 车重>=3165 14  17.48563 14.193890 *
## Regression tree:
## rpart(formula = formula_Car_Reg, data = Train_Car, method = "anova")
## Variables actually used in tree construction:
## [1] 车重       发动机功率
## Root node error: 211.85/45 = 4.7078
## n= 45 
##         CP nsplit rel error  xerror     xstd
## 1 0.627666      0   1.00000 1.03382 0.168052
## 2 0.085627      1   0.37233 0.41239 0.066937
## 3 0.010000      2   0.28671 0.44581 0.087820
## Call:
## rpart(formula = formula_Car_Reg, data = Train_Car, method = "anova")
##   n= 45 
##           CP nsplit rel error    xerror       xstd
## 1 0.62766590      0 1.0000000 1.0338154 0.16805195
## 2 0.08562688      1 0.3723341 0.4123883 0.06693738
## 3 0.01000000      2 0.2867072 0.4458136 0.08782003
## Variable importance
## 发动机功率       车重     净马力       价格       类型       产地 
##         25         24         15         14         14          8 
## Node number 1: 45 observations,    complexity param=0.6276659
##   mean=11.95109, MSE=4.707762 
##   left son=2 (19 obs) right son=3 (26 obs)
##   Primary splits:
##       发动机功率 < 137     to the left,  improve=0.6276659, (0 missing)
##       车重       < 2747.5  to the left,  improve=0.6132483, (0 missing)
##       价格       < 11507.5 to the left,  improve=0.5368155, (0 missing)
##       类型       splits as  RRRLRR,      improve=0.4829313, (0 missing)
##       净马力     < 105     to the left,  improve=0.4490789, (0 missing)
##   Surrogate splits:
##       车重   < 2747.5  to the left,  agree=0.956, adj=0.895, (0 split)
##       净马力 < 105     to the left,  agree=0.822, adj=0.579, (0 split)
##       价格   < 11507.5 to the left,  agree=0.800, adj=0.526, (0 split)
##       类型   splits as  RRRLRR,      agree=0.800, adj=0.526, (0 split)
##       产地   splits as  LLRLL-RR,    agree=0.711, adj=0.316, (0 split)
## Node number 2: 19 observations
##   mean=9.940236, MSE=1.713533 
## Node number 3: 26 observations,    complexity param=0.08562688
##   mean=13.42057, MSE=1.781599 
##   left son=6 (12 obs) right son=7 (14 obs)
##   Primary splits:
##       车重       < 3165    to the left,  improve=0.3916100, (0 missing)
##       净马力     < 148.5   to the left,  improve=0.2221799, (0 missing)
##       价格       < 13599   to the left,  improve=0.1904112, (0 missing)
##       类型       splits as  LRR-LR,      improve=0.1620824, (0 missing)
##       发动机功率 < 181.5   to the left,  improve=0.1041802, (0 missing)
##   Surrogate splits:
##       价格       < 13172.5 to the left,  agree=0.769, adj=0.500, (0 split)
##       发动机功率 < 158     to the left,  agree=0.769, adj=0.500, (0 split)
##       类型       splits as  LRR-RR,      agree=0.731, adj=0.417, (0 split)
##       净马力     < 144.5   to the left,  agree=0.731, adj=0.417, (0 split)
##       产地       splits as  --R-L-LR,    agree=0.615, adj=0.167, (0 split)
## Node number 6: 12 observations
##   mean=12.51836, MSE=0.8913294 
## Node number 7: 14 observations
##   mean=14.19389, MSE=1.248974


在如上输出结果中,我们看到各节点信息按照“node), split, n, deviance, yval”的格式给出,且按照节点层次以不同缩进量列出,如节点1缩进量最小,其次为节点2和节点3,并在每条节点信息后以星号*标示出是否为叶节点。具体的,我们可以看出,1)为根节点,共含有45个样本,即全部训练样本;2) 和3) 以“发动机功率”变量为节点,且以“137”为分割值划分为两支,分别包含19个和26个样本;6) 和7) 则是节点3) 再以“车重”变量、以“3165”为分割值划分出的两支,以此类推。

由此可以看到,在建树过程中用到的变量有“车重”和“发动机功率”这两种,且各节点的CP值、分割次数nsplit、相对误差rel error、交叉验证误差xerror等也被列出,其中CP值对于选择控制树的复杂程度十分重要。

若想获得每个节点更详细的信息,可以对已有决策树模型rp_Car_Reg使用 summary()函数。所得输出结果除了与上面printcp()给出值相同的部分外,另有变量重要程度(Variable importance)、每一个分支变量对生成树的提升程度(improve)等信息。



# Same regression tree with minsplit = 10; the printout that follows shows a
# deeper tree than the default fit.
rp_Car_Reg1 <- rpart(formula = formula_Car_Reg, data = Train_Car,
                     method = "anova", minsplit = 10)
## n= 45 
## node), split, n, deviance, yval
##       * denotes terminal node
##  1) root 45 211.849300 11.951090  
##    2) 发动机功率< 137 19  32.557130  9.940236  
##      4) 价格< 9247 8   3.606646  8.626163 *
##      5) 价格>=9247 11   5.089398 10.895930  
##       10) 类型=Sporty 4   1.150584 10.267300 *
##       11) 类型=Compact,Small 7   1.454858 11.255140 *
##    3) 发动机功率>=137 26  46.321580 13.420570  
##      6) 类型=Compact,Medium,Sporty 20  20.768890 12.910710  
##       12) 价格< 11245 4   1.813531 11.519990 *
##       13) 价格>=11245 16   9.284856 13.258390  
##         26) 净马力< 154 11   3.295533 12.938470 *
##         27) 净马力>=154 5   2.386790 13.962200 *
##      7) 类型=Large,Van 6   3.023018 15.120100 *
## Regression tree:
## rpart(formula = formula_Car_Reg, data = Train_Car, method = "anova", 
##     minsplit = 10)
## Variables actually used in tree construction:
## [1] 发动机功率 价格       净马力     类型      
## Root node error: 211.85/45 = 4.7078
## n= 45 
##         CP nsplit rel error  xerror     xstd
## 1 0.627666      0  1.000000 1.03957 0.171923
## 2 0.112632      1  0.372334 0.45493 0.071841
## 3 0.106348      2  0.259702 0.51005 0.109048
## 4 0.045648      3  0.153354 0.39101 0.104937
## 5 0.017005      4  0.107706 0.36528 0.104065
## 6 0.011725      5  0.090701 0.37083 0.104090
## 7 0.010000      6  0.078976 0.38170 0.106049
# Same regression tree with the complexity parameter raised to cp = 0.1; the
# printout that follows keeps only the first split.
rp_Car_Reg2 <- rpart(formula = formula_Car_Reg, data = Train_Car,
                     method = "anova", cp = 0.1)
## n= 45 
## node), split, n, deviance, yval
##       * denotes terminal node
## 1) root 45 211.84930 11.951090  
##   2) 发动机功率< 137 19  32.55713  9.940236 *
##   3) 发动机功率>=137 26  46.32158 13.420570 *
## Regression tree:
## rpart(formula = formula_Car_Reg, data = Train_Car, method = "anova", 
##     cp = 0.1)
## Variables actually used in tree construction:
## [1] 发动机功率
## Root node error: 211.85/45 = 4.7078
## n= 45 
##        CP nsplit rel error  xerror    xstd
## 1 0.62767      0   1.00000 1.07593 0.17636
## 2 0.10000      1   0.37233 0.44465 0.07069
# Prune the already-fitted default tree back to cp = 0.1 instead of refitting;
# prune() dispatches to the rpart method for an rpart object.
rp_Car_Reg3 <- prune(rp_Car_Reg, cp = 0.1)
## n= 45 
## node), split, n, deviance, yval
##       * denotes terminal node
## 1) root 45 211.84930 11.951090  
##   2) 发动机功率< 137 19  32.55713  9.940236 *
##   3) 发动机功率>=137 26  46.32158 13.420570 *
## Regression tree:
## rpart(formula = formula_Car_Reg, data = Train_Car, method = "anova")
## Variables actually used in tree construction:
## [1] 发动机功率
## Root node error: 211.85/45 = 4.7078
## n= 45 
##        CP nsplit rel error  xerror     xstd
## 1 0.62767      0   1.00000 1.03382 0.168052
## 2 0.10000      1   0.37233 0.41239 0.066937
# Same regression tree limited to a single level of splits (maxdepth = 1).
rp_Car_Reg4 <- rpart(formula = formula_Car_Reg, data = Train_Car,
                     method = "anova", maxdepth = 1)
## n= 45 
## node), split, n, deviance, yval
##       * denotes terminal node
## 1) root 45 211.84930 11.951090  
##   2) 发动机功率< 137 19  32.55713  9.940236 *
##   3) 发动机功率>=137 26  46.32158 13.420570 *
## Regression tree:
## rpart(formula = formula_Car_Reg, data = Train_Car, method = "anova", 
##     maxdepth = 1)
## Variables actually used in tree construction:
## [1] 发动机功率
## Root node error: 211.85/45 = 4.7078
## n= 45 
##        CP nsplit rel error  xerror     xstd
## 1 0.62767      0   1.00000 1.05355 0.174146
## 2 0.01000      1   0.37233 0.43641 0.067109


# Tree used for the plotting examples: regression tree fitted with minsplit = 10.
rp_Car_Plot <- rpart(formula = formula_Car_Reg, data = Train_Car,
                     method = "anova", minsplit = 10)
## n= 45 
## node), split, n, deviance, yval
##       * denotes terminal node
##  1) root 45 211.849300 11.951090  
##    2) 发动机功率< 137 19  32.557130  9.940236  
##      4) 价格< 9247 8   3.606646  8.626163 *
##      5) 价格>=9247 11   5.089398 10.895930  
##       10) 类型=Sporty 4   1.150584 10.267300 *
##       11) 类型=Compact,Small 7   1.454858 11.255140 *
##    3) 发动机功率>=137 26  46.321580 13.420570  
##      6) 类型=Compact,Medium,Sporty 20  20.768890 12.910710  
##       12) 价格< 11245 4   1.813531 11.519990 *
##       13) 价格>=11245 16   9.284856 13.258390  
##         26) 净马力< 154 11   3.295533 12.938470 *
##         27) 净马力>=154 5   2.386790 13.962200 *
##      7) 类型=Large,Van 6   3.023018 15.120100 *





# Draw the tree with post(). The argument is spelled out as `filename`
# (the original relied on partial matching of `file`).
# NOTE(review): an empty filename presumably sends the drawing to the current
# graphics device instead of a PostScript file — confirm in ?post.rpart.
post(rp_Car_Plot, filename = "", title. = "post: Regression Tree")

# Draw the same tree with plot(), with uniform = TRUE.
plot(rp_Car_Plot, uniform = TRUE, main = "plot: Regression Tree")


# Model formula for the classification tree: the fuel-consumption class
# (分组油耗) explained by the seven predictors. as.formula() hands rpart() a
# real formula object rather than a bare character string.
formula_Car_Cla <- as.formula("分组油耗~价格+产地+可靠性+类型+车重+发动机功率+净马力")
# Fit the classification tree (method = "class") with minsplit = 5.
rp_Car_Cla <- rpart(formula_Car_Cla, data = Train_Car, method = "class",
                    minsplit = 5)
## n= 45 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
##  1) root 45 19 A (0.57777778 0.26666667 0.15555556)  
##    2) 发动机功率>=137 26  2 A (0.92307692 0.07692308 0.00000000) *
##    3) 发动机功率< 137 19  9 B (0.10526316 0.52631579 0.36842105)  
##      6) 价格>=9247 11  2 B (0.18181818 0.81818182 0.00000000)  
##       12) 产地=France,Japan/USA 2  0 A (1.00000000 0.00000000 0.00000000) *
##       13) 产地=Germany,Japan,USA 9  0 B (0.00000000 1.00000000 0.00000000) *
##      7) 价格< 9247 8  1 C (0.00000000 0.12500000 0.87500000) *


# Predict the fuel-consumption class of each held-out car with the
# classification tree.
pre_Car_Cla <- predict(object = rp_Car_Cla, newdata = Test_Car, type = "class")
##  4 18  6 11 28 36 24 33 35 49 50 51 53 56 60 
##  C  B  C  C  A  A  A  A  A  A  A  A  A  A  A 
## Levels: A B C
##    pre_Car_Cla
##     A B C
##   A 9 0 0
##   B 2 0 2
##   C 0 1 1
## [1] 0.3333333





# J48() and Weka_control() come from RWeka; attach it so the script runs on
# its own.
library(RWeka)
# Switch the training set to English column names. The original line was cut
# off after "Mileage", leaving an unclosed c( that breaks parsing of the rest
# of the file; the full 9-name vector matches the one used for the C5.0
# example further down.
names(Train_Car) <- c("Price", "Country", "Reliability", "Mileage", "Type",
                      "Weight", "Disp.", "HP", "Oil_Consumption")
# The class label must be a factor for the classifier.
Train_Car$Oil_Consumption <- as.factor(Train_Car$Oil_Consumption)
# Class label explained by the seven predictors (Mileage itself is left out).
formula <- Oil_Consumption~Price+Country+Reliability+Type+Weight+Disp.+HP
# Fit a J48 tree with default options.
C45_0 <- J48(formula, Train_Car)
## J48 pruned tree
## ------------------
## Disp. <= 133
## |   Disp. <= 97: C (7.0/1.0)
## |   Disp. > 97: B (10.0/1.0)
## Disp. > 133: A (19.0/1.0)
## Number of Leaves  :  3
## Size of the tree :   5
## === Summary ===
## Correctly Classified Instances          33               91.6667 %
## Incorrectly Classified Instances         3                8.3333 %
## Kappa statistic                          0.8621
## Mean absolute error                      0.1002
## Root mean squared error                  0.2238
## Relative absolute error                 24.8179 %
## Root relative squared error             50.0125 %
## Total Number of Instances               36     
## === Confusion Matrix ===
##   a  b  c   <-- classified as
##  18  1  0 |  a = A
##   1  9  1 |  b = B
##   0  0  6 |  c = C
# Refit the J48 tree with the Weka option M = 3 passed through Weka_control().
C45_1 <- J48(formula = formula, data = Train_Car,
             control = Weka_control(M = 3))
## J48 pruned tree
## ------------------
## Disp. <= 133
## |   Disp. <= 97: C (7.0/1.0)
## |   Disp. > 97: B (10.0/1.0)
## Disp. > 133: A (19.0/1.0)
## Number of Leaves  :  3
## Size of the tree :   5


C4.5算法与 ID3 算法极为相似,只是在特征选择上有所不同,算是一种对 ID3 算法的改进了。C4.5算法在决策树生成过程中,用信息增益比来选择特征。信息增益比的定义如前文所述。而 C5.0 算法作为 C4.5的改进版,其算法原理都是一致的,都是采用信息增益比来进行特征选择,但C5.0适用于处理大数据集,采用 Boosting 方式来提高模型准确率,计算速度较高,对计算内存占用也较少。关于 C4.5和C5.0的实际计算可参考R语言实现方式。




# Usage signature of C5.0(), reproduced for reference only: x, y and `...`
# are placeholders, so this call is not meant to be run as written.
C5.0(x, y, trials = 1, rules= FALSE, 
     weights = NULL, 
     control = C5.0Control(), 
     costs = NULL, ...)


# Usage signature of C5.0Control(), listing each tuning option with the value
# shown in the signature; reproduced for reference only (the result is not
# assigned to anything).
C5.0Control(subset = TRUE, 
            bands = 0, 
            winnow = FALSE, 
            noGlobalPruning = FALSE, 
            CF = 0.25, 
            minCases = 2, 
            fuzzyThreshold = FALSE, 
            sample = 0, 
            seed = sample.int(4096, size = 1) - 1L,  
            earlyStopping = TRUE,
            label = "outcome")


# Illustrative call only: fits C5.0 with trials = 5, winnow = TRUE and
# CF = 0.25.
# NOTE(review): `churn` and `tree.train` are not defined anywhere in this
# script — presumably they belong to a different data set; confirm or remove.
c5.0tree <- C5.0(churn ~ ., data = tree.train, trials = 5,
                 control = C5.0Control(winnow = TRUE, CF = 0.25))

  winnow:在建模之前是否先对变量进行特征选择。


# C5.0() and C5.0Control() come from C50; attach it so the script runs on its
# own.
library(C50)
# Give the training and test sets the same English column names.
data_name <- c("Price", "Country", "Reliability", "Mileage", "Type",
               "Weight", "Disp.", "HP", "Oil_Consumption")
names(Train_Car) <- data_name
names(Test_Car) <- data_name
# The class label must be a factor for C5.0.
Train_Car$Oil_Consumption <- as.factor(Train_Car$Oil_Consumption)
# Class label explained by the seven predictors (Mileage itself is left out).
formula <- Oil_Consumption~Price+Country+Reliability+Type+Weight+Disp.+HP
# Fit two trees that differ only in noGlobalPruning. TRUE/FALSE are spelled
# out because T and F are ordinary variables that can be reassigned.
tr5.0 <- C5.0(formula, data = Train_Car,
              control = C5.0Control(noGlobalPruning = TRUE))
tr5.01 <- C5.0(formula, data = Train_Car,
               control = C5.0Control(noGlobalPruning = FALSE))

## [1] "C5.0"
## [1] 18
## Call:
## C5.0.formula(formula = formula, data = Train_Car, control
##  = C5.0Control(noGlobalPruning = T))
## C5.0 [Release 2.07 GPL Edition]      Tue Aug 29 18:56:26 2017
## -------------------------------
## Class specified by attribute `outcome'
## Read 45 cases (8 attributes) from undefined.data
## Decision tree:
## Price <= 9410:
## :...Weight <= 2295: C (6)
## :   Weight > 2295: B (3/1)
## Price > 9410:
## :...Disp. <= 133: B (11/2)
##     Disp. > 133: A (25/1)
## Evaluation on training data (45 cases):
##      Decision Tree   
##    ----------------  
##    Size      Errors  
##       4    4( 8.9%)   <<
##     (a)   (b)   (c)    <-classified as
##    ----  ----  ----
##      24     2          (a): class A
##       1    11          (b): class B
##             1     6    (c): class C
##  Attribute usage:
##  100.00% Price
##   80.00% Disp.
##   20.00% Weight
## Time: 0.0 secs
## $names
## [1] "| Generated using R version 3.3.3 (2017-03-06)\n| on 周二 8月 29 18:56:26 2017\n| function call: makeNamesFile(x = x, y = y, w = weights, label = control$label, \n|    comments = TRUE)\noutcome.\n\noutcome: A,B,C.\nPrice:continuous.\nCountry:France,Germany,Japan,Japan\\/USA,Korea,Mexico,Sweden,USA.\nReliability:continuous.\nType:Compact,Large,Medium,Small,Sporty,Van.\nWeight:continuous.\nDisp\\.:continuous.\nHP:continuous.\n"
## $cost
## [1] ""
## $costMatrix
## $caseWeights
## [1] FALSE
## $control
## $control$subset
## [1] TRUE
## $control$bands
## [1] 0
## $control$winnow
## [1] FALSE
## $control$noGlobalPruning
## [1] TRUE
## $control$CF
## [1] 0.25
## $control$minCases
## [1] 2
## $control$fuzzyThreshold
## [1] FALSE
## $control$sample
## [1] 0
## $control$earlyStopping
## [1] TRUE
## $control$label
## [1] "outcome"
## $control$seed
## [1] 3655
## $trials
## Requested    Actual 
##         1         1 
## $rbm
## [1] FALSE
## $boostResults
## $size
## [1] 4
## $dims
## [1] 45  7
## $call
## C5.0.formula(formula = formula, data = Train_Car, control = C5.0Control(noGlobalPruning = T))
## $levels
## [1] "A" "B" "C"
## $output
## [1] "\nC5.0 [Release 2.07 GPL Edition]  \tTue Aug 29 18:56:26 2017\n-------------------------------\n\nClass specified by attribute `outcome'\n\nRead 45 cases (8 attributes) from undefined.data\n\nDecision tree:\n\nPrice <= 9410:\n:...Weight <= 2295: C (6)\n:   Weight > 2295: B (3/1)\nPrice > 9410:\n:...Disp. <= 133: B (11/2)\n    Disp. > 133: A (25/1)\n\n\nEvaluation on training data (45 cases):\n\n\t    Decision Tree   \n\t  ----------------  \n\t  Size      Errors  \n\n\t     4    4( 8.9%)   <<\n\n\n\t   (a)   (b)   (c)    <-classified as\n\t  ----  ----  ----\n\t    24     2          (a): class A\n\t     1    11          (b): class B\n\t           1     6    (c): class C\n\n\n\tAttribute usage:\n\n\t100.00%\tPrice\n\t 80.00%\tDisp.\n\t 20.00%\tWeight\n\n\nTime: 0.0 secs\n"
## $tree
## [1] "id=\"See5/C5.0 2.07 GPL Edition 2017-08-29\"\nentries=\"1\"\ntype=\"2\" class=\"A\" freq=\"26,12,7\" att=\"Price\" forks=\"3\" cut=\"9410\"\ntype=\"0\" class=\"A\"\ntype=\"2\" class=\"C\" freq=\"0,2,7\" att=\"Weight\" forks=\"3\" cut=\"2295\"\ntype=\"0\" class=\"C\"\ntype=\"0\" class=\"C\" freq=\"0,0,6\"\ntype=\"0\" class=\"B\" freq=\"0,2,1\"\ntype=\"2\" class=\"A\" freq=\"26,10,0\" att=\"Disp.\" forks=\"3\" cut=\"133\"\ntype=\"0\" class=\"A\"\ntype=\"0\" class=\"B\" freq=\"2,9,0\"\ntype=\"0\" class=\"A\" freq=\"24,1,0\"\n"
## $predictors
## [1] "Price"       "Country"     "Reliability" "Type"        "Weight"     
## [6] "Disp."       "HP"         
## $rules
## [1] ""
## $Terms
## Oil_Consumption ~ Price + Country + Reliability + Type + Weight + 
##     Disp. + HP
## attr(,"variables")
## list(Oil_Consumption, Price, Country, Reliability, Type, Weight, 
##     Disp., HP)
## attr(,"factors")
##                 Price Country Reliability Type Weight Disp. HP
## Oil_Consumption     0       0           0    0      0     0  0
## Price               1       0           0    0      0     0  0
## Country             0       1           0    0      0     0  0
## Reliability         0       0           1    0      0     0  0
## Type                0       0           0    1      0     0  0
## Weight              0       0           0    0      1     0  0
## Disp.               0       0           0    0      0     1  0
## HP                  0       0           0    0      0     0  1
## attr(,"term.labels")
## [1] "Price"       "Country"     "Reliability" "Type"        "Weight"     
## [6] "Disp."       "HP"         
## attr(,"order")
## [1] 1 1 1 1 1 1 1
## attr(,"intercept")
## [1] 1
## attr(,"response")
## [1] 1
## attr(,".Environment")
## <environment: R_GlobalEnv>
## attr(,"predvars")
## list(Oil_Consumption, Price, Country, Reliability, Type, Weight, 
##     Disp., HP)
## attr(,"dataClasses")
## Oil_Consumption           Price         Country     Reliability 
##        "factor"       "numeric"        "factor"       "numeric" 
##            Type          Weight           Disp.              HP 
##        "factor"       "numeric"       "numeric"       "numeric" 
## $xlevels
## $xlevels$Country
## [1] "France"    "Germany"   "Japan"     "Japan/USA" "Korea"     "Mexico"   
## [7] "Sweden"    "USA"      
## $xlevels$Type
## [1] "Compact" "Large"   "Medium"  "Small"   "Sporty"  "Van"
# par(xpd = TRUE)
# plot(tr5.0)
# plot(tr5.01)
# C5imp(tr5.0)
# Predict the class of every held-out car with each of the two C5.0 trees.
pre <- predict(tr5.0, newdata = Test_Car, type = "class")
pre1 <- predict(tr5.01, newdata = Test_Car, type = "class")
# Cross-tabulate the actual class (rows, "type") against the predicted class
# (columns, "predict") for each tree.
m <- table(type = Test_Car$Oil_Consumption, predict = pre)
m1 <- table(type = Test_Car$Oil_Consumption, predict = pre1)
## [1] 0.7333333
## [1] 0.8666667