前言

titanic.raw数据案例

# 加载相关包
library(arules)     # 加载关联算法的程序包
# library(grid)
library(arulesViz)  # 加载可视化包

mdata <- read.csv('http://data.galaxystatistics.com/blog_data/apriori/titanic.csv',header=T) 
head(mdata)
##   Class   Age  Sex Survived
## 1   1st adult male      yes
## 2   1st adult male      yes
## 3   1st adult male      yes
## 4   1st adult male      yes
## 5   1st adult male      yes
## 6   1st adult male      yes
titanic.raw <- mdata
str(titanic.raw)
## 'data.frame':    2201 obs. of  4 variables:
##  $ Class   : Factor w/ 4 levels "1st","2nd","3rd",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age     : Factor w/ 2 levels "adult","child": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Sex     : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Survived: Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
# 船舱等级,性别,年龄, 年龄(是个分类变量包含成人和儿童), 是否幸存
# VARIABLE DESCRIPTIONS:
# Column
#    1   Class (0 = crew, 1 = first, 2 = second, 3 = third)
#   10   Age   (1 = adult, 0 = child)
#   19   Sex   (1 = male, 0 = female)
#   28   Survived (1 = yes, 0 = no)

# 以函数apriori中的缺省设置来查找数据集titanic.raw中的关联规则
rules <- apriori(titanic.raw)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 220 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10 item(s), 2201 transaction(s)] done [0.00s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [27 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# 检查所返回的关联规则
inspect(rules)
##      lhs                                   rhs           support  
## [1]  {}                                 => {Age=adult}   0.9504771
## [2]  {Class=2nd}                        => {Age=adult}   0.1185825
## [3]  {Class=1st}                        => {Age=adult}   0.1449341
## [4]  {Sex=female}                       => {Age=adult}   0.1930940
## [5]  {Class=3rd}                        => {Age=adult}   0.2848705
## [6]  {Survived=yes}                     => {Age=adult}   0.2971377
## [7]  {Class=crew}                       => {Sex=male}    0.3916402
## [8]  {Class=crew}                       => {Age=adult}   0.4020900
## [9]  {Survived=no}                      => {Sex=male}    0.6197183
## [10] {Survived=no}                      => {Age=adult}   0.6533394
## [11] {Sex=male}                         => {Age=adult}   0.7573830
## [12] {Sex=female,Survived=yes}          => {Age=adult}   0.1435711
## [13] {Class=3rd,Sex=male}               => {Survived=no} 0.1917310
## [14] {Class=3rd,Survived=no}            => {Age=adult}   0.2162653
## [15] {Class=3rd,Sex=male}               => {Age=adult}   0.2099046
## [16] {Sex=male,Survived=yes}            => {Age=adult}   0.1535666
## [17] {Class=crew,Survived=no}           => {Sex=male}    0.3044071
## [18] {Class=crew,Survived=no}           => {Age=adult}   0.3057701
## [19] {Class=crew,Sex=male}              => {Age=adult}   0.3916402
## [20] {Class=crew,Age=adult}             => {Sex=male}    0.3916402
## [21] {Sex=male,Survived=no}             => {Age=adult}   0.6038164
## [22] {Age=adult,Survived=no}            => {Sex=male}    0.6038164
## [23] {Class=3rd,Sex=male,Survived=no}   => {Age=adult}   0.1758292
## [24] {Class=3rd,Age=adult,Survived=no}  => {Sex=male}    0.1758292
## [25] {Class=3rd,Age=adult,Sex=male}     => {Survived=no} 0.1758292
## [26] {Class=crew,Sex=male,Survived=no}  => {Age=adult}   0.3044071
## [27] {Class=crew,Age=adult,Survived=no} => {Sex=male}    0.3044071
##      confidence lift     
## [1]  0.9504771  1.0000000
## [2]  0.9157895  0.9635051
## [3]  0.9815385  1.0326798
## [4]  0.9042553  0.9513700
## [5]  0.8881020  0.9343750
## [6]  0.9198312  0.9677574
## [7]  0.9740113  1.2384742
## [8]  1.0000000  1.0521033
## [9]  0.9154362  1.1639949
## [10] 0.9651007  1.0153856
## [11] 0.9630272  1.0132040
## [12] 0.9186047  0.9664669
## [13] 0.8274510  1.2222950
## [14] 0.9015152  0.9484870
## [15] 0.9058824  0.9530818
## [16] 0.9209809  0.9689670
## [17] 0.9955423  1.2658514
## [18] 1.0000000  1.0521033
## [19] 1.0000000  1.0521033
## [20] 0.9740113  1.2384742
## [21] 0.9743402  1.0251065
## [22] 0.9242003  1.1751385
## [23] 0.9170616  0.9648435
## [24] 0.8130252  1.0337773
## [25] 0.8376623  1.2373791
## [26] 1.0000000  1.0521033
## [27] 0.9955423  1.2658514
# 现在生成的关联规则结果只包含("Survived=No", "Survived=Yes")
# lhs=left hand side; rhs=right hand side.
# 如果只想检查其它变量和乘客是否幸存的关系,那么需要提前设置变量rhs=c("Survived=No", "Survived=Yes")
rules <- apriori(titanic.raw,
  parameter = list(minlen=2, supp=0.005, conf=0.8),
  appearance = list(rhs=c("Survived=no", "Survived=yes"), default="lhs"),
  control = list(verbose=F))

# 根据关联结果中的提升度(life)进行降序排序。
# rules.sorted <- sort(rules, decreasing=TRUE, na.last=TRUE, by="lift")
rules.sorted <- sort(rules, decreasing=TRUE, by="lift")

# class(rules.sorted)
# as(rules.sorted, 'data.frame')

# 检查排序后的变量。
inspect(rules.sorted)
##      lhs                                  rhs            support    
## [1]  {Class=2nd,Age=child}             => {Survived=yes} 0.010904134
## [2]  {Class=2nd,Age=child,Sex=female}  => {Survived=yes} 0.005906406
## [3]  {Class=1st,Sex=female}            => {Survived=yes} 0.064061790
## [4]  {Class=1st,Age=adult,Sex=female}  => {Survived=yes} 0.063607451
## [5]  {Class=2nd,Sex=female}            => {Survived=yes} 0.042253521
## [6]  {Class=crew,Sex=female}           => {Survived=yes} 0.009086779
## [7]  {Class=crew,Age=adult,Sex=female} => {Survived=yes} 0.009086779
## [8]  {Class=2nd,Age=adult,Sex=female}  => {Survived=yes} 0.036347115
## [9]  {Class=2nd,Age=adult,Sex=male}    => {Survived=no}  0.069968196
## [10] {Class=2nd,Sex=male}              => {Survived=no}  0.069968196
## [11] {Class=3rd,Age=adult,Sex=male}    => {Survived=no}  0.175829169
## [12] {Class=3rd,Sex=male}              => {Survived=no}  0.191731031
##      confidence lift    
## [1]  1.0000000  3.095640
## [2]  1.0000000  3.095640
## [3]  0.9724138  3.010243
## [4]  0.9722222  3.009650
## [5]  0.8773585  2.715986
## [6]  0.8695652  2.691861
## [7]  0.8695652  2.691861
## [8]  0.8602151  2.662916
## [9]  0.9166667  1.354083
## [10] 0.8603352  1.270871
## [11] 0.8376623  1.237379
## [12] 0.8274510  1.222295
# 去掉结果集出现的冗余的规则
# 在上面的结果集中可以看到规则1实际上已经包含了规则2,因为规则1实际上告诉了我们,所有2等舱的儿童都幸存了,也就是说实际上规则2只是规则1的超集。一般来说它的提升度(lift)实际上与规则1的提升度(lift)相等,甚至更低一下。由此可以认定规则2是冗余的。在下面会演示如何去掉冗余的规则。

# find redundant rules
# 插播—函数介绍:
# is.subset和is.superset函数用于在关联和项集矩阵对象中发现子集或父集!
### Is.subset(x,y=NULL, proper=FALSE, sparse=FALSE,...) 
# 返回一个与给定矩阵(在上三角或下三角中TRUE)相同大小的逻辑矩阵
### Lower.tri(x,diag=FALSE) 
# 生成一个关联规则的子集矩阵
subset.matrix <- is.subset(rules.sorted, rules.sorted)
# subset.matrix

# 将矩阵对角线以下的元素置为空
# subset.matrix[lower.tri(subset.matrix, diag=T)] <- NA
subset.matrix[lower.tri(subset.matrix, diag=T)] <- FALSE
subset.matrix
## 12 x 12 sparse Matrix of class "ngCMatrix"
##                                                                       
## {Class=2nd,Age=child,Survived=yes}             . | . . . . . . . . . .
## {Class=2nd,Age=child,Sex=female,Survived=yes}  . . . . . . . . . . . .
## {Class=1st,Sex=female,Survived=yes}            . . . | . . . . . . . .
## {Class=1st,Age=adult,Sex=female,Survived=yes}  . . . . . . . . . . . .
## {Class=2nd,Sex=female,Survived=yes}            . . . . . . . | . . . .
## {Class=crew,Sex=female,Survived=yes}           . . . . . . | . . . . .
## {Class=crew,Age=adult,Sex=female,Survived=yes} . . . . . . . . . . . .
## {Class=2nd,Age=adult,Sex=female,Survived=yes}  . . . . . . . . . . . .
## {Class=2nd,Age=adult,Sex=male,Survived=no}     . . . . . . . . . . . .
## {Class=2nd,Sex=male,Survived=no}               . . . . . . . . . . . .
## {Class=3rd,Age=adult,Sex=male,Survived=no}     . . . . . . . . . . . .
## {Class=3rd,Sex=male,Survived=no}               . . . . . . . . . . . .
# 将子集矩阵中每列元素和大于等于1的列找出来
redundant <- colSums(subset.matrix, na.rm=T) >= 1
which(redundant)
##  {Class=2nd,Age=child,Sex=female,Survived=yes} 
##                                              2 
##  {Class=1st,Age=adult,Sex=female,Survived=yes} 
##                                              4 
## {Class=crew,Age=adult,Sex=female,Survived=yes} 
##                                              7 
##  {Class=2nd,Age=adult,Sex=female,Survived=yes} 
##                                              8
# 从规则矩阵中去掉这些列
rules.pruned <- rules.sorted[!redundant]

# 检查最终生成的结果集
inspect(rules.pruned)
##     lhs                               rhs            support    
## [1] {Class=2nd,Age=child}          => {Survived=yes} 0.010904134
## [2] {Class=1st,Sex=female}         => {Survived=yes} 0.064061790
## [3] {Class=2nd,Sex=female}         => {Survived=yes} 0.042253521
## [4] {Class=crew,Sex=female}        => {Survived=yes} 0.009086779
## [5] {Class=2nd,Age=adult,Sex=male} => {Survived=no}  0.069968196
## [6] {Class=2nd,Sex=male}           => {Survived=no}  0.069968196
## [7] {Class=3rd,Age=adult,Sex=male} => {Survived=no}  0.175829169
## [8] {Class=3rd,Sex=male}           => {Survived=no}  0.191731031
##     confidence lift    
## [1] 1.0000000  3.095640
## [2] 0.9724138  3.010243
## [3] 0.8773585  2.715986
## [4] 0.8695652  2.691861
## [5] 0.9166667  1.354083
## [6] 0.8603352  1.270871
## [7] 0.8376623  1.237379
## [8] 0.8274510  1.222295
# 加载包arulesViz, 画出关联规则的图形表示方法
# library(arulesViz)
plot(rules)

plot(rules, method="graph", control=list(type="items"))
## Available control parameters (with default values):
## main  =  Graph for 12 rules
## nodeColors    =  c("#66CC6680", "#9999CC80")
## nodeCol   =  c("#EE0000FF", "#EE0303FF", "#EE0606FF", "#EE0909FF", "#EE0C0CFF", "#EE0F0FFF", "#EE1212FF", "#EE1515FF", "#EE1818FF", "#EE1B1BFF", "#EE1E1EFF", "#EE2222FF", "#EE2525FF", "#EE2828FF", "#EE2B2BFF", "#EE2E2EFF", "#EE3131FF", "#EE3434FF", "#EE3737FF", "#EE3A3AFF", "#EE3D3DFF", "#EE4040FF", "#EE4444FF", "#EE4747FF", "#EE4A4AFF", "#EE4D4DFF", "#EE5050FF", "#EE5353FF", "#EE5656FF", "#EE5959FF", "#EE5C5CFF", "#EE5F5FFF", "#EE6262FF", "#EE6666FF", "#EE6969FF", "#EE6C6CFF", "#EE6F6FFF", "#EE7272FF", "#EE7575FF",  "#EE7878FF", "#EE7B7BFF", "#EE7E7EFF", "#EE8181FF", "#EE8484FF", "#EE8888FF", "#EE8B8BFF", "#EE8E8EFF", "#EE9191FF", "#EE9494FF", "#EE9797FF", "#EE9999FF", "#EE9B9BFF", "#EE9D9DFF", "#EE9F9FFF", "#EEA0A0FF", "#EEA2A2FF", "#EEA4A4FF", "#EEA5A5FF", "#EEA7A7FF", "#EEA9A9FF", "#EEABABFF", "#EEACACFF", "#EEAEAEFF", "#EEB0B0FF", "#EEB1B1FF", "#EEB3B3FF", "#EEB5B5FF", "#EEB7B7FF", "#EEB8B8FF", "#EEBABAFF", "#EEBCBCFF", "#EEBDBDFF", "#EEBFBFFF", "#EEC1C1FF", "#EEC3C3FF", "#EEC4C4FF", "#EEC6C6FF", "#EEC8C8FF",  "#EEC9C9FF", "#EECBCBFF", "#EECDCDFF", "#EECFCFFF", "#EED0D0FF", "#EED2D2FF", "#EED4D4FF", "#EED5D5FF", "#EED7D7FF", "#EED9D9FF", "#EEDBDBFF", "#EEDCDCFF", "#EEDEDEFF", "#EEE0E0FF", "#EEE1E1FF", "#EEE3E3FF", "#EEE5E5FF", "#EEE7E7FF", "#EEE8E8FF", "#EEEAEAFF", "#EEECECFF", "#EEEEEEFF")
## edgeCol   =  c("#474747FF", "#494949FF", "#4B4B4BFF", "#4D4D4DFF", "#4F4F4FFF", "#515151FF", "#535353FF", "#555555FF", "#575757FF", "#595959FF", "#5B5B5BFF", "#5E5E5EFF", "#606060FF", "#626262FF", "#646464FF", "#666666FF", "#686868FF", "#6A6A6AFF", "#6C6C6CFF", "#6E6E6EFF", "#707070FF", "#727272FF", "#747474FF", "#767676FF", "#787878FF", "#7A7A7AFF", "#7C7C7CFF", "#7E7E7EFF", "#808080FF", "#828282FF", "#848484FF", "#868686FF", "#888888FF", "#8A8A8AFF", "#8C8C8CFF", "#8D8D8DFF", "#8F8F8FFF", "#919191FF", "#939393FF",  "#959595FF", "#979797FF", "#999999FF", "#9A9A9AFF", "#9C9C9CFF", "#9E9E9EFF", "#A0A0A0FF", "#A2A2A2FF", "#A3A3A3FF", "#A5A5A5FF", "#A7A7A7FF", "#A9A9A9FF", "#AAAAAAFF", "#ACACACFF", "#AEAEAEFF", "#AFAFAFFF", "#B1B1B1FF", "#B3B3B3FF", "#B4B4B4FF", "#B6B6B6FF", "#B7B7B7FF", "#B9B9B9FF", "#BBBBBBFF", "#BCBCBCFF", "#BEBEBEFF", "#BFBFBFFF", "#C1C1C1FF", "#C2C2C2FF", "#C3C3C4FF", "#C5C5C5FF", "#C6C6C6FF", "#C8C8C8FF", "#C9C9C9FF", "#CACACAFF", "#CCCCCCFF", "#CDCDCDFF", "#CECECEFF", "#CFCFCFFF", "#D1D1D1FF",  "#D2D2D2FF", "#D3D3D3FF", "#D4D4D4FF", "#D5D5D5FF", "#D6D6D6FF", "#D7D7D7FF", "#D8D8D8FF", "#D9D9D9FF", "#DADADAFF", "#DBDBDBFF", "#DCDCDCFF", "#DDDDDDFF", "#DEDEDEFF", "#DEDEDEFF", "#DFDFDFFF", "#E0E0E0FF", "#E0E0E0FF", "#E1E1E1FF", "#E1E1E1FF", "#E2E2E2FF", "#E2E2E2FF", "#E2E2E2FF")
## alpha     =  0.5
## cex   =  1
## itemLabels    =  TRUE
## labelCol  =  #000000B3
## measureLabels     =  FALSE
## precision     =  3
## layout    =  NULL
## layoutParams  =  list()
## arrowSize     =  0.5
## engine    =  igraph
## plot  =  TRUE
## plot_options  =  list()
## max   =  100
## verbose   =  FALSE

plot(rules, method="paracoord", control=list(reorder=TRUE))

SunBai数据案例

# 加载相关包
library(arules)     # 加载关联算法的程序包
# library(grid)
library(arulesViz)  # 加载可视化包

# mdata <- read.table('http://data.galaxystatistics.com/blog_data/apriori/SunBai.txt',row.names=NULL,header=T)
# head(mdata)

# 一个小例子数据库作为类的一个对象提供的加权关联规则挖掘
data(SunBai)

str(SunBai)
## Formal class 'transactions' [package "arules"] with 3 slots
##   ..@ data       :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
##   .. .. ..@ i       : int [1:18] 0 1 2 3 4 2 5 6 0 1 ...
##   .. .. ..@ p       : int [1:7] 0 5 8 10 11 15 18
##   .. .. ..@ Dim     : int [1:2] 8 6
##   .. .. ..@ Dimnames:List of 2
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. ..@ factors : list()
##   ..@ itemInfo   :'data.frame':  8 obs. of  1 variable:
##   .. ..$ labels: chr [1:8] "A" "B" "C" "D" ...
##   ..@ itemsetInfo:'data.frame':  6 obs. of  2 variables:
##   .. ..$ transactionID: num [1:6] 100 200 300 400 500 600
##   .. ..$ weight       : num [1:6] 0.518 0.436 0.232 0.148 0.544 ...
summary(SunBai)
## transactions as itemMatrix in sparse format with
##  6 rows (elements/itemsets/transactions) and
##  8 columns (items) and a density of 0.375 
## 
## most frequent items:
##       A       C       G       B       F (Other) 
##       4       3       3       2       2       4 
## 
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 
## 1 1 2 1 1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.25    3.00    3.00    3.75    5.00 
## 
## includes extended item information - examples:
##   labels
## 1      A
## 2      B
## 3      C
## 
## includes extended transaction information - examples:
##   transactionID    weight
## 1           100 0.5176528
## 2           200 0.4362571
## 3           300 0.2321374
#探索和准备数据:
# 事务型数据每一行指定一个单一的实例,每条记录包括用逗号隔开的任意数量的产品清单
# 通过inspect()函数可以看到超市的交易记录,每次交易的商品名称;
# 通过summary()函数可以查看该数据集的一些基本信息。
# 总共有6条交易记录transaction,8个商品item。density=0.375表示在稀疏矩阵中1的百分比。最频繁出现的商品item,以及其出现的次数。可以计算出最大支持度。
# 每笔交易包含的商品数目,以及其对应的5个分位数和均值的统计信息。如:一条交易包含一件商品;一条包含两件;两条包含三件;一条包含四件;一条包含五件。其下统计信息表明:最低一次交易只含一件商品,第一分位数是2.25,意味着25%的交易包含不超过2.25个item。中位数是3表面50%的交易购买的商品不超过3件。

# 通过inspect函数查看SunBai数据集的前5次交易记录
inspect(SunBai[1:5]) 
##     items       transactionID weight   
## [1] {A,B,C,D,E} 100           0.5176528
## [2] {C,F,G}     200           0.4362571
## [3] {A,B}       300           0.2321374
## [4] {A}         400           0.1476262
## [5] {C,F,G,H}   500           0.5440458
# itemFrequency()函数可以查看商品的交易比例
itemFrequency(SunBai[,1:8]) 
##         A         B         C         D         E         F         G 
## 0.6666667 0.3333333 0.5000000 0.1666667 0.1666667 0.3333333 0.5000000 
##         H 
## 0.3333333
# support = 0.1 表示支持度至少为0.1
itemFrequencyPlot(SunBai, support=0.1) 

# topN = 20 表示支持度排在前20的商品
itemFrequencyPlot(SunBai, topN=20) 

# 利用transactionInfo函数查看前六数据
head(transactionInfo(SunBai))   
##   transactionID    weight
## 1           100 0.5176528
## 2           200 0.4362571
## 3           300 0.2321374
## 4           400 0.1476262
## 5           500 0.5440458
## 6           600 0.4123691
# 训练模型
# 设置支持度0.2,置信度0.5对数据进行关联规则处理
rules <- apriori(SunBai, parameter=list(support=0.2,confidence=0.5))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5     0.2      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 1 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[8 item(s), 6 transaction(s)] done [0.00s].
## sorting and recoding items ... [6 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [16 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# minlen=2表示规则中至少包含两种商品,这可以防止仅仅是由于某种商品被频繁购买而创建的无用规则
# rules=apriori(SunBai,parameter = list(support=0.2,confidence=0.5,minlen = 2)) 
summary(rules)
## set of 16 rules
## 
## rule length distribution (lhs + rhs):sizes
##  1  2  3 
##  3 10  3 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       2       2       2       2       3 
## 
## summary of quality measures:
##     support         confidence          lift      
##  Min.   :0.3333   Min.   :0.5000   Min.   :1.000  
##  1st Qu.:0.3333   1st Qu.:0.6667   1st Qu.:1.333  
##  Median :0.3333   Median :0.6667   Median :2.000  
##  Mean   :0.3750   Mean   :0.7812   Mean   :1.729  
##  3rd Qu.:0.3333   3rd Qu.:1.0000   3rd Qu.:2.000  
##  Max.   :0.6667   Max.   :1.0000   Max.   :3.000  
## 
## mining info:
##    data ntransactions support confidence
##  SunBai             6     0.2        0.5
# 提高模型的性能
# 根据购物篮分析的目标,最有用的规则或许是那些具有高支持度、信度和提升度的规则。arules包中包含一个sort()函数,通过指定参数by为"support","confidence"或者"lift"对规则列表进行重新排序。 在默认的情况下,排序是降序排列,可以指定参数decreasing=FALSE反转排序方式。
# lift(提升度),表示用来度量一类商品相对于它的一般购买率,此时被购买的可能性有多大 (Lift)是避免了一些不平衡数据标签的偏差性,Lift越大,则数据质量较好;Lift越小,则数据越不平衡。在此处设置lift值为3.
inspect(head(sort(rules, by = "lift"), 3))
##     lhs      rhs support   confidence lift
## [1] {C,G} => {F} 0.3333333 1.0000000  3   
## [2] {F}   => {G} 0.3333333 1.0000000  2   
## [3] {G}   => {F} 0.3333333 0.6666667  2
# library(arulesViz) # 加载可视化包
plot(rules, method = "grouped")

# 散点图判断大量规则的支持度与置信度分布情况
plot(rules, method='scatterplot') 

# 可以使用interactive=TRUE来实现散点图的互动功能,可以选中一些点查看其具体的规则
# plot(rules,interactive=TRUE) 

# 还有类似“气泡图”的展现形式:提升度lift是圈的颜色深浅,圈的大小表示支持度support的大小。LHS的个数和分组中最重要(频繁)项集显示在列的标签里。lift从左上角到右下角逐渐减少。
# 关联图看相互关系
# measure 定义圆圈大小,shading 控制颜色深浅
plot(rules, method='graph', shading = "lift",  control = list(type='items'))  
## Available control parameters (with default values):
## main  =  Graph for 16 rules
## nodeColors    =  c("#66CC6680", "#9999CC80")
## nodeCol   =  c("#EE0000FF", "#EE0303FF", "#EE0606FF", "#EE0909FF", "#EE0C0CFF", "#EE0F0FFF", "#EE1212FF", "#EE1515FF", "#EE1818FF", "#EE1B1BFF", "#EE1E1EFF", "#EE2222FF", "#EE2525FF", "#EE2828FF", "#EE2B2BFF", "#EE2E2EFF", "#EE3131FF", "#EE3434FF", "#EE3737FF", "#EE3A3AFF", "#EE3D3DFF", "#EE4040FF", "#EE4444FF", "#EE4747FF", "#EE4A4AFF", "#EE4D4DFF", "#EE5050FF", "#EE5353FF", "#EE5656FF", "#EE5959FF", "#EE5C5CFF", "#EE5F5FFF", "#EE6262FF", "#EE6666FF", "#EE6969FF", "#EE6C6CFF", "#EE6F6FFF", "#EE7272FF", "#EE7575FF",  "#EE7878FF", "#EE7B7BFF", "#EE7E7EFF", "#EE8181FF", "#EE8484FF", "#EE8888FF", "#EE8B8BFF", "#EE8E8EFF", "#EE9191FF", "#EE9494FF", "#EE9797FF", "#EE9999FF", "#EE9B9BFF", "#EE9D9DFF", "#EE9F9FFF", "#EEA0A0FF", "#EEA2A2FF", "#EEA4A4FF", "#EEA5A5FF", "#EEA7A7FF", "#EEA9A9FF", "#EEABABFF", "#EEACACFF", "#EEAEAEFF", "#EEB0B0FF", "#EEB1B1FF", "#EEB3B3FF", "#EEB5B5FF", "#EEB7B7FF", "#EEB8B8FF", "#EEBABAFF", "#EEBCBCFF", "#EEBDBDFF", "#EEBFBFFF", "#EEC1C1FF", "#EEC3C3FF", "#EEC4C4FF", "#EEC6C6FF", "#EEC8C8FF",  "#EEC9C9FF", "#EECBCBFF", "#EECDCDFF", "#EECFCFFF", "#EED0D0FF", "#EED2D2FF", "#EED4D4FF", "#EED5D5FF", "#EED7D7FF", "#EED9D9FF", "#EEDBDBFF", "#EEDCDCFF", "#EEDEDEFF", "#EEE0E0FF", "#EEE1E1FF", "#EEE3E3FF", "#EEE5E5FF", "#EEE7E7FF", "#EEE8E8FF", "#EEEAEAFF", "#EEECECFF", "#EEEEEEFF")
## edgeCol   =  c("#474747FF", "#494949FF", "#4B4B4BFF", "#4D4D4DFF", "#4F4F4FFF", "#515151FF", "#535353FF", "#555555FF", "#575757FF", "#595959FF", "#5B5B5BFF", "#5E5E5EFF", "#606060FF", "#626262FF", "#646464FF", "#666666FF", "#686868FF", "#6A6A6AFF", "#6C6C6CFF", "#6E6E6EFF", "#707070FF", "#727272FF", "#747474FF", "#767676FF", "#787878FF", "#7A7A7AFF", "#7C7C7CFF", "#7E7E7EFF", "#808080FF", "#828282FF", "#848484FF", "#868686FF", "#888888FF", "#8A8A8AFF", "#8C8C8CFF", "#8D8D8DFF", "#8F8F8FFF", "#919191FF", "#939393FF",  "#959595FF", "#979797FF", "#999999FF", "#9A9A9AFF", "#9C9C9CFF", "#9E9E9EFF", "#A0A0A0FF", "#A2A2A2FF", "#A3A3A3FF", "#A5A5A5FF", "#A7A7A7FF", "#A9A9A9FF", "#AAAAAAFF", "#ACACACFF", "#AEAEAEFF", "#AFAFAFFF", "#B1B1B1FF", "#B3B3B3FF", "#B4B4B4FF", "#B6B6B6FF", "#B7B7B7FF", "#B9B9B9FF", "#BBBBBBFF", "#BCBCBCFF", "#BEBEBEFF", "#BFBFBFFF", "#C1C1C1FF", "#C2C2C2FF", "#C3C3C4FF", "#C5C5C5FF", "#C6C6C6FF", "#C8C8C8FF", "#C9C9C9FF", "#CACACAFF", "#CCCCCCFF", "#CDCDCDFF", "#CECECEFF", "#CFCFCFFF", "#D1D1D1FF",  "#D2D2D2FF", "#D3D3D3FF", "#D4D4D4FF", "#D5D5D5FF", "#D6D6D6FF", "#D7D7D7FF", "#D8D8D8FF", "#D9D9D9FF", "#DADADAFF", "#DBDBDBFF", "#DCDCDCFF", "#DDDDDDFF", "#DEDEDEFF", "#DEDEDEFF", "#DFDFDFFF", "#E0E0E0FF", "#E0E0E0FF", "#E1E1E1FF", "#E1E1E1FF", "#E2E2E2FF", "#E2E2E2FF", "#E2E2E2FF")
## alpha     =  0.5
## cex   =  1
## itemLabels    =  TRUE
## labelCol  =  #000000B3
## measureLabels     =  FALSE
## precision     =  3
## layout    =  NULL
## layoutParams  =  list()
## arrowSize     =  0.5
## engine    =  igraph
## plot  =  TRUE
## plot_options  =  list()
## max   =  100
## verbose   =  FALSE

# 通过箭头和圆圈来表示关联规则,利用顶点代表项集,边表示规则中关系。圆圈越大表示支持度support越大,颜色越深表示提升度lift越大。但是如果规则较多的话会显得很混乱,难以发现其中的规律,因此,通常只对较少的规则使用这样的图;

# 提取关联规则的子集:可以通过subset()函数提取我们感兴趣的规则
sub_rules <- subset(rules, items %in% "C")
sub_rules
## set of 8 rules
inspect(sub_rules[1:5])
##     lhs    rhs support   confidence lift    
## [1] {}  => {C} 0.5000000 0.5000000  1.000000
## [2] {F} => {C} 0.3333333 1.0000000  2.000000
## [3] {C} => {F} 0.3333333 0.6666667  2.000000
## [4] {G} => {C} 0.3333333 0.6666667  1.333333
## [5] {C} => {G} 0.3333333 0.6666667  1.333333
# 如果lift=1,说明两个事项没有任何关联;如果lift<1,说明A事件的发生与B事件是相斥的。一般在数据挖掘中当提升度大于3时,我们才承认挖掘出的关联规则是有价值的。

Epub数据案例

# 加载相关包
library(arules)     # 加载关联算法的程序包
# library(grid)
library(arulesViz)  # 加载可视化包

# mdata <- read.table('http://data.galaxystatistics.com/blog_data/apriori/Epub.txt',row.names=NULL,header=T)
# head(mdata)
# # 数据格式而非事务集的格式,因此首先要对数据进行格式转换。要对数据进行分组,一个transactionID的所有items应该在一个组里,因此,我们可以使用split函数,指定它的分组变量和目标变量:
# Epub <- split(mdata$items, mdata$transactionID)
# # 分组之后,将Epub数据转换成事务集形式
# Epub <- as(Epub, "transactions")

data(Epub)

# 查看Epub数据的前十行
inspect(Epub[1:10])
##      items                    transactionID TimeStamp          
## [1]  {doc_154}                session_4795  2003-01-02 09:59:00
## [2]  {doc_3d6}                session_4797  2003-01-02 20:46:01
## [3]  {doc_16f}                session_479a  2003-01-02 23:50:38
## [4]  {doc_11d,doc_1a7,doc_f4} session_47b7  2003-01-03 07:55:50
## [5]  {doc_83}                 session_47bb  2003-01-03 10:27:44
## [6]  {doc_11d}                session_47c2  2003-01-03 23:18:04
## [7]  {doc_368}                session_47cb  2003-01-04 03:40:57
## [8]  {doc_11d,doc_192}        session_47d8  2003-01-04 08:00:01
## [9]  {doc_364}                session_47e2  2003-01-05 01:48:36
## [10] {doc_ec}                 session_47e7  2003-01-05 04:58:48
# 查看数据集统计汇总信息
summary(Epub)
## transactions as itemMatrix in sparse format with
##  15729 rows (elements/itemsets/transactions) and
##  936 columns (items) and a density of 0.001758755 
## 
## most frequent items:
## doc_11d doc_813 doc_4c6 doc_955 doc_698 (Other) 
##     356     329     288     282     245   24393 
## 
## element (itemset/transaction) length distribution:
## sizes
##     1     2     3     4     5     6     7     8     9    10    11    12 
## 11615  2189   854   409   198   121    93    50    42    34    26    12 
##    13    14    15    16    17    18    19    20    21    22    23    24 
##    10    10     6     8     6     5     8     2     2     3     2     3 
##    25    26    27    28    30    34    36    38    41    43    52    58 
##     4     5     1     1     1     2     1     2     1     1     1     1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.646   2.000  58.000 
## 
## includes extended item information - examples:
##    labels
## 1 doc_11d
## 2 doc_13d
## 3 doc_14c
## 
## includes extended transaction information - examples:
##       transactionID           TimeStamp
## 10792  session_4795 2003-01-02 09:59:00
## 10793  session_4797 2003-01-02 20:46:01
## 10794  session_479a 2003-01-02 23:50:38
# 可以使用dim函数提取项集数和item数
dim(Epub)
## [1] 15729   936
# 第一步,查看数据集的统计汇总信息。summary()含义的具体解释如下:  
# 1)共有15729个项集和4343个item,稀疏矩阵中1的百分比为0.0002302556。  
# 2)most frequent items描述了最频繁出现的5个item以及其分别出现的次数。
# 3)sizes描述了项集的项的个数以及n项集共有几个,例如单项集有10个,二项集有11个,58项集有1个。sizes之后描述了sizes对应的5个分位数和均值的统计信息。

# 统计每个item的支持度
itemFreq <- itemFrequency(Epub)
# 每个项集transaction包含item的个数
Size <- size(Epub)
# 每个item出现的次数
itemCount <- (itemFreq/sum(itemFreq)*sum(Size))

# 除此之外,还可以更直观地作图观测itemFrequency。
#查看支持度排行前15的图
itemFrequencyPlot(Epub,topN=15,col="lightblue")

# 当对数据的基本统计信息心中有数之后,就开始最重要的建模步骤。
# 最小支持度0.001,最小置信度0.6,最小项数2
rules <- apriori(Epub,parameter=list(support=0.001,confidence=0.6,minlen=2))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5   0.001      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 15 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[936 item(s), 15729 transaction(s)] done [0.00s].
## sorting and recoding items ... [481 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [9 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# 这里解释一下最小项数。minlen和maxlen表示LHS+RHS并集的元素的最小个数和最大个数。有意义的规则起码是LHS和RHS各至少包含1个元素,所以minlen为2。生成模型之后,观测模型结果。
# 展示生成的规则
inspect(sort(rules, by="lift"))
##     lhs                  rhs       support     confidence lift    
## [1] {doc_6e7,doc_6e8} => {doc_6e9} 0.001080806 0.8095238  454.7500
## [2] {doc_6e7,doc_6e9} => {doc_6e8} 0.001080806 0.8500000  417.8016
## [3] {doc_6e8,doc_6e9} => {doc_6e7} 0.001080806 0.8947368  402.0947
## [4] {doc_6e9}         => {doc_6e8} 0.001207960 0.6785714  333.5391
## [5] {doc_6e9}         => {doc_6e7} 0.001271537 0.7142857  321.0000
## [6] {doc_506}         => {doc_507} 0.001207960 0.6551724  303.0943
## [7] {doc_6e8}         => {doc_6e7} 0.001335113 0.6562500  294.9187
## [8] {doc_6e7}         => {doc_6e8} 0.001335113 0.6000000  294.9187
## [9] {doc_87c}         => {doc_882} 0.001335113 0.6000000  171.5891
summary(rules)
## set of 9 rules
## 
## rule length distribution (lhs + rhs):sizes
## 2 3 
## 6 3 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   2.000   2.333   3.000   3.000 
## 
## summary of quality measures:
##     support           confidence          lift      
##  Min.   :0.001081   Min.   :0.6000   Min.   :171.6  
##  1st Qu.:0.001081   1st Qu.:0.6552   1st Qu.:294.9  
##  Median :0.001208   Median :0.6786   Median :321.0  
##  Mean   :0.001215   Mean   :0.7176   Mean   :332.6  
##  3rd Qu.:0.001335   3rd Qu.:0.8095   3rd Qu.:402.1  
##  Max.   :0.001335   Max.   :0.8947   Max.   :454.8  
## 
## mining info:
##  data ntransactions support confidence
##  Epub         15729   0.001        0.6
# 使用inspect函数,将会具体的展示生成的每一条强规则,其对应的支持度support,置信度confidence ,提升度为lift。by设置为lift,表示展示的规则将会按照提升度lift的大小进行排序。  

# 除了一次性查询所有规则,还可以指定搜索条件,查看规则的子集,这样就得到了包含doc_882的规则。
subrules <- subset(rules, items %in% c("doc_882") & lift>=3)
inspect(sort(subrules,by="lift"))
##     lhs          rhs       support     confidence lift    
## [1] {doc_87c} => {doc_882} 0.001335113 0.6        171.5891
# subrules <- rules

# 对于规则,我们也可单独查询它的三度,例如现在查询自规则subrules的三度:
subrulesquality <- quality(subrules)
subrulesquality
##       support confidence     lift
## 6 0.001335113        0.6 171.5891
# 可视化  
# 在关联规则中,归纳关联规则规律比较重要的图有三个,分别是散点图,分组矩阵图和graph图。

# 散点图   
# 散点图表示了求出的强关联规则里支持度support和置信度confidence的分布,每个点颜色的深浅表示了提升度lift的大小.

# 载入arulesViz画图包
# library(arulesViz)
# 颜色包
library(RColorBrewer)
# 如果想看交互图,可以设置interactive=TRUE,默认为FALSE
plot(rules,control=list(jitter=2,col=rev(brewer.pal(9,"BrBG"))),shading="lift")

# 从图中可以看出,关联规则的支持度support普遍偏低,confidence分布较为均匀,提升度的位置基本上都在百级,总体都远远大于3,表示出来的强规则均是有意义的。

# 分组矩阵图
# 分组矩阵图将有共同点比较相近的规则聚成类,然后展示聚类规则的大体分布。默认是20类,这里将聚类个数设置为5。
plot(rules,method="grouped",control=list(k=5,col=rev(brewer.pal(9,"YlOrRd")[3:8])))

# 相似的关联规则分成一组,从而能够更深入地提取出关联规则的总体规律和重要规则共性。其中,横坐标代表被聚成的8类,且均是先导,纵坐标代表先导最后可能产生的10类后继,圆圈颜色的深浅表示提升度的大小,颜色越深,提升度越大;圆圈的大小表示支持度的大小,圆圈越大,支持度越大。

# graph图  
plot(rules, method = "graph",measure = "confidence", control = list(type="items"),shading = "lift")
## Available control parameters (with default values):
## main  =  Graph for 9 rules
## nodeColors    =  c("#66CC6680", "#9999CC80")
## nodeCol   =  c("#EE0000FF", "#EE0303FF", "#EE0606FF", "#EE0909FF", "#EE0C0CFF", "#EE0F0FFF", "#EE1212FF", "#EE1515FF", "#EE1818FF", "#EE1B1BFF", "#EE1E1EFF", "#EE2222FF", "#EE2525FF", "#EE2828FF", "#EE2B2BFF", "#EE2E2EFF", "#EE3131FF", "#EE3434FF", "#EE3737FF", "#EE3A3AFF", "#EE3D3DFF", "#EE4040FF", "#EE4444FF", "#EE4747FF", "#EE4A4AFF", "#EE4D4DFF", "#EE5050FF", "#EE5353FF", "#EE5656FF", "#EE5959FF", "#EE5C5CFF", "#EE5F5FFF", "#EE6262FF", "#EE6666FF", "#EE6969FF", "#EE6C6CFF", "#EE6F6FFF", "#EE7272FF", "#EE7575FF",  "#EE7878FF", "#EE7B7BFF", "#EE7E7EFF", "#EE8181FF", "#EE8484FF", "#EE8888FF", "#EE8B8BFF", "#EE8E8EFF", "#EE9191FF", "#EE9494FF", "#EE9797FF", "#EE9999FF", "#EE9B9BFF", "#EE9D9DFF", "#EE9F9FFF", "#EEA0A0FF", "#EEA2A2FF", "#EEA4A4FF", "#EEA5A5FF", "#EEA7A7FF", "#EEA9A9FF", "#EEABABFF", "#EEACACFF", "#EEAEAEFF", "#EEB0B0FF", "#EEB1B1FF", "#EEB3B3FF", "#EEB5B5FF", "#EEB7B7FF", "#EEB8B8FF", "#EEBABAFF", "#EEBCBCFF", "#EEBDBDFF", "#EEBFBFFF", "#EEC1C1FF", "#EEC3C3FF", "#EEC4C4FF", "#EEC6C6FF", "#EEC8C8FF",  "#EEC9C9FF", "#EECBCBFF", "#EECDCDFF", "#EECFCFFF", "#EED0D0FF", "#EED2D2FF", "#EED4D4FF", "#EED5D5FF", "#EED7D7FF", "#EED9D9FF", "#EEDBDBFF", "#EEDCDCFF", "#EEDEDEFF", "#EEE0E0FF", "#EEE1E1FF", "#EEE3E3FF", "#EEE5E5FF", "#EEE7E7FF", "#EEE8E8FF", "#EEEAEAFF", "#EEECECFF", "#EEEEEEFF")
## edgeCol   =  c("#474747FF", "#494949FF", "#4B4B4BFF", "#4D4D4DFF", "#4F4F4FFF", "#515151FF", "#535353FF", "#555555FF", "#575757FF", "#595959FF", "#5B5B5BFF", "#5E5E5EFF", "#606060FF", "#626262FF", "#646464FF", "#666666FF", "#686868FF", "#6A6A6AFF", "#6C6C6CFF", "#6E6E6EFF", "#707070FF", "#727272FF", "#747474FF", "#767676FF", "#787878FF", "#7A7A7AFF", "#7C7C7CFF", "#7E7E7EFF", "#808080FF", "#828282FF", "#848484FF", "#868686FF", "#888888FF", "#8A8A8AFF", "#8C8C8CFF", "#8D8D8DFF", "#8F8F8FFF", "#919191FF", "#939393FF",  "#959595FF", "#979797FF", "#999999FF", "#9A9A9AFF", "#9C9C9CFF", "#9E9E9EFF", "#A0A0A0FF", "#A2A2A2FF", "#A3A3A3FF", "#A5A5A5FF", "#A7A7A7FF", "#A9A9A9FF", "#AAAAAAFF", "#ACACACFF", "#AEAEAEFF", "#AFAFAFFF", "#B1B1B1FF", "#B3B3B3FF", "#B4B4B4FF", "#B6B6B6FF", "#B7B7B7FF", "#B9B9B9FF", "#BBBBBBFF", "#BCBCBCFF", "#BEBEBEFF", "#BFBFBFFF", "#C1C1C1FF", "#C2C2C2FF", "#C3C3C4FF", "#C5C5C5FF", "#C6C6C6FF", "#C8C8C8FF", "#C9C9C9FF", "#CACACAFF", "#CCCCCCFF", "#CDCDCDFF", "#CECECEFF", "#CFCFCFFF", "#D1D1D1FF",  "#D2D2D2FF", "#D3D3D3FF", "#D4D4D4FF", "#D5D5D5FF", "#D6D6D6FF", "#D7D7D7FF", "#D8D8D8FF", "#D9D9D9FF", "#DADADAFF", "#DBDBDBFF", "#DCDCDCFF", "#DDDDDDFF", "#DEDEDEFF", "#DEDEDEFF", "#DFDFDFFF", "#E0E0E0FF", "#E0E0E0FF", "#E1E1E1FF", "#E1E1E1FF", "#E2E2E2FF", "#E2E2E2FF", "#E2E2E2FF")
## alpha     =  0.5
## cex   =  1
## itemLabels    =  TRUE
## labelCol  =  #000000B3
## measureLabels     =  FALSE
## precision     =  3
## layout    =  NULL
## layoutParams  =  list()
## arrowSize     =  0.5
## engine    =  igraph
## plot  =  TRUE
## plot_options  =  list()
## max   =  100
## verbose   =  FALSE

# graph图中,源头表示先导,箭头表示关系指向的方向,中间的圆圈表示此规则置信度的大小,圆圈越大,置信度越大;圆圈颜色的深度表示提升度的大小,圆圈颜色越深,该规则提升度越大;箭头指向的尽头表示该规则的后继。观察关联规则因果图,可以直观地对重要的关联规则有一个初步的认识。

Groceries数据案例1

# 加载相关包
library(arules)     # 加载关联算法的程序包
# library(grid)
library(arulesViz)  # 加载可视化包

# 1.数据源:利用arules包中自带的Groceries数据集,该数据集是来自一个现实世界中的超市经营一个月的购物数据,包含了9835次交易。我们按照超市一天12个小时的工作时间计算,大约每小时的交易次数为9835/30/12=27.3,表明该超市规模属于中等。

data(Groceries)
Groceries
## transactions in sparse format with
##  9835 transactions (rows) and
##  169 items (columns)
class(Groceries)
## [1] "transactions"
## attr(,"package")
## [1] "arules"
str(Groceries)
## Formal class 'transactions' [package "arules"] with 3 slots
##   ..@ data       :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
##   .. .. ..@ i       : int [1:43367] 13 60 69 78 14 29 98 24 15 29 ...
##   .. .. ..@ p       : int [1:9836] 0 4 7 8 12 16 21 22 27 28 ...
##   .. .. ..@ Dim     : int [1:2] 169 9835
##   .. .. ..@ Dimnames:List of 2
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. ..@ factors : list()
##   ..@ itemInfo   :'data.frame':  169 obs. of  3 variables:
##   .. ..$ labels: chr [1:169] "frankfurter" "sausage" "liver loaf" "ham" ...
##   .. ..$ level2: Factor w/ 55 levels "baby food","bags",..: 44 44 44 44 44 44 44 42 42 41 ...
##   .. ..$ level1: Factor w/ 10 levels "canned food",..: 6 6 6 6 6 6 6 6 6 6 ...
##   ..@ itemsetInfo:'data.frame':  0 obs. of  0 variables
# 2.探索和准备数据:
# 事务型数据每一行指定一个单一的实例,每条记录包括用逗号隔开的任意数量的产品清单,通过inspect()函数可以看到超市的交易记录,每次交易的商品名称;通过summary()函数可以查看该数据集的一些基本信息。
# 通过inspect函数查看Groceries数据集的前5次交易记录
inspect(Groceries[1:5])  
##     items                     
## [1] {citrus fruit,            
##      semi-finished bread,     
##      margarine,               
##      ready soups}             
## [2] {tropical fruit,          
##      yogurt,                  
##      coffee}                  
## [3] {whole milk}              
## [4] {pip fruit,               
##      yogurt,                  
##      cream cheese ,           
##      meat spreads}            
## [5] {other vegetables,        
##      whole milk,              
##      condensed milk,          
##      long life bakery product}
# itemFrequency()函数可以查看商品的交易比例
# frankfurter  sausage  liver loaf - 0.058973055 0.093950178 0.005083884
itemFrequency(Groceries[,1:3])   
## frankfurter     sausage  liver loaf 
## 0.058973055 0.093950178 0.005083884
# 分析:
# ①密度值0.02609146(2.6%)指的是非零矩阵单元格的比例。该数据集一共有9835行(交易记录),169列(所有交易的商品种类),因此,矩阵中共有9835*169=1662115个位置,我们可以得出,在30天内共有1662115*0.02609146=43367件商品被购买。进一步可以得出在每次交易中包含了43367/9835=4.409件商品被购买,在均值那一列可以看出(Mean=4.409)我们的计算是正确的;
# ②most frequent items:列出了事务型数据中最常购买的商品。whole milk 在9835次交易中被购买了2513次,因此,我们可以得出结论:whole milk有2513/9835=25.6%的概率出现在所有的交易中;
# ③element (itemset/transaction) length distribution:呈现了一组关于交易规模的统计,总共有2159次交易中包含一件商品,有1次交易中包含了32件商品.从分位数分布情况可以看出,25%的交易中包含了两件或者更少的商品,大约一半的交易中商品数量为3件;


# 可视化商品的支持度——商品的频率图
# 为了直观地呈现统计数据,可以使用itemFrequenctyPlot()函数生成一个用于描绘所包含的特定商品的交易比例的柱状图。因为包含很多种商品,不可能同时展现出来,因此可以通过support或者topN参数进行排除一部分商品进行展示

itemFrequencyPlot(Groceries, support=0.1)  # support = 0.1 表示支持度至少为0.1

itemFrequencyPlot(Groceries, topN=20)      # topN = 20 表示支持度排在前20的商品

# 可视化交易数据——绘制稀疏矩阵
# 通过使用image()函数可以可视化整个稀疏矩阵。
# 生成一个5行169列的矩阵,矩阵中填充有黑色的单元表示在此次交易(行)中,该商品(列)被购买了
image(Groceries[1:5]) 

# 从上图可以看出,第一行记录(交易)包含了四种商品(黑色的方块),这种可视化的图是用于数据探索的一种很有用的工具。它可能有助于识别潜在的数据问题,比如:由于列表示的是商品名称,如果列从上往下一直被填充表明这个商品在每一次交易中都被购买了;另一方面,图中的模式可能有助于揭示交易或者商品的有趣部分,特别是当数据以有趣的方式排序后,比如,如果交易按照日期进行排序,那么黑色方块图案可能会揭示人们购买商品的数量或者类型受季节性的影响。这种可视化对于超大型的交易数据集是没有意义的,因为单元太小会很难发现有趣的模式。

# 3.训练模型
# grocery_rules <- apriori(data=Groceries,parameter=list(support =,confidence =,minlen =))
# 运行apriori()函数很简单,但是找到支持度和置信度参数来产生合理数量的关联规则时,可能需要进行大量的试验与误差评估。
# 如果参数设置过高,那么结果可能是没有规则或者规则过于普通而不是非常有用的规则;另一方面如果阈值太低,可能会导致规则数量很多,甚至需要运行很长的时间或者在学习阶段耗尽内存。
# aprior()函数默认设置 support = 0.1 和 confidence = 0.8,然而使用默认的设置,不能得到任何规则

# 因为support = 0.1,则意味着该商品必须至少出现在 0.1 * 9835 = 983.5次交易中,在前面的分析中,我们发现只有8种商品的 support >= 0.1,因此使用默认的设置没有产生任何规则也不足为奇
apriori(Groceries)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 983 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [8 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## set of 0 rules
# 解决支持度设定问题的一种方法是考虑一个有趣的模式之前,事先想好需要的最小交易数量,例如:我们可以认为如果一种商品一天被购买了2次,一个月也就是60次交易记录,这或许是我们所感兴趣的,据此,可以计算所需要的支持度support=60/9835=0.006;
# 
# 关于置信度:设置太低,可能会被大量不可靠的规则淹没,设置过高,可能会出现很多显而易见的规则致使我们不能发现有趣的模式;一个合适的置信度水平的选取,取决于我们的分析目标,我们可以尝试以一个保守的值开始,如果发现没有具有可行性的规则,可以降低置信度以拓宽规则的搜索范围。
# 
# 在此例中,我们将从置信度0.25开始,这意味着为了将规则包含在结果中,此时规则的正确率至少为25%,这将排除最不可靠的规则
# 
# minlen = 2 表示规则中至少包含两种商品,这可以防止仅仅是由于某种商品被频繁购买而创建的无用规则,比如在上面的分析中,我们发现whole milk出现的概率(支持度)为25.6%,很可能出现如下规则:{}=>whole milk,这种规则是没有意义的。
# 
# 最终,根据上面的分析我们确定如下参数设置:

# 最终,根据上面的分析我们确定如下参数设置:

grocery_rules <- apriori(data = Groceries,parameter = list(support = 0.006,confidence = 0.25,minlen = 2))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.25    0.1    1 none FALSE            TRUE       5   0.006      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 59 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [109 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [463 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
grocery_rules
## set of 463 rules
# 4.评估模型的性能
summary(grocery_rules)
## set of 463 rules
## 
## rule length distribution (lhs + rhs):sizes
##   2   3   4 
## 150 297  16 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   3.000   2.711   3.000   4.000 
## 
## summary of quality measures:
##     support           confidence          lift       
##  Min.   :0.006101   Min.   :0.2500   Min.   :0.9932  
##  1st Qu.:0.007117   1st Qu.:0.2971   1st Qu.:1.6229  
##  Median :0.008744   Median :0.3554   Median :1.9332  
##  Mean   :0.011539   Mean   :0.3786   Mean   :2.0351  
##  3rd Qu.:0.012303   3rd Qu.:0.4495   3rd Qu.:2.3565  
##  Max.   :0.074835   Max.   :0.6600   Max.   :3.9565  
## 
## mining info:
##       data ntransactions support confidence
##  Groceries          9835   0.006       0.25
# 这里需要解释一下lift(提升度),表示用来度量一类商品相对于它的一般购买率,此时被购买的可能性有多大。通俗的讲就是:比如第一条规则{pot plants} => {whole milk},lift = 1.565,表明(购买pot plants 之后再购买 whole milk商品的可能性) 是 (没有购买pot plants 但是购买了whole milk 的可能性) 的 1.565倍;
# 
# 第一条规则解读:如果一个顾客购买了pot plants,那么他还会购买whole milk,支持度support为0.0070,置信度confidence为0.4000,我们可以确定该规则涵盖了大约0.7%的交易,而且在购买了pot plants后,他购买whole milk的概率为40%,提升度lift值为1.565,表明他相对于一般没有购买pot plant商品的顾客购买whole milk商品的概率提升了1.565倍,我们在上面的分析中知道,有25.6%的顾客购买了whole milk,因此计算提升度为0.40/0.256=1.56,这与显示的结果是一致的,注意:标有support的列表示规则的支持度,而不是前件(lhs)或者后件(rhs)的支持度。
# 
# 提升度 lift(X → Y) = P (Y| X) / P (Y) , lift(X → Y) 与  lift(Y → X) 是相同的。
# 
# 如果lift值>1,说明这两类商品在一起购买比只有一类商品被购买更常见。一个大的提升度值是一个重要的指标,它表明一个规则时很重要的,并反映了商品之间的真实联系。


# 5.提高模型的性能

# (1)对关联规则集合排序,根据购物篮分析的目标,最有用的规则或许是那些具有高支持度、信度和提升度的规则。arules包中包含一个sort()函数,通过指定参数by为"support","confidence"或者"lift"对规则列表进行重新排序。 在默认的情况下,排序是降序排列,可以指定参数decreasing=FALSE反转排序方式。
inspect(sort(grocery_rules,by="lift")[1:10])
##      lhs                   rhs                      support confidence     lift
## [1]  {herbs}            => {root vegetables}    0.007015760  0.4312500 3.956477
## [2]  {berries}          => {whipped/sour cream} 0.009049314  0.2721713 3.796886
## [3]  {tropical fruit,                                                          
##       other vegetables,                                                        
##       whole milk}       => {root vegetables}    0.007015760  0.4107143 3.768074
## [4]  {beef,                                                                    
##       other vegetables} => {root vegetables}    0.007930859  0.4020619 3.688692
## [5]  {tropical fruit,                                                          
##       other vegetables} => {pip fruit}          0.009456024  0.2634561 3.482649
## [6]  {beef,                                                                    
##       whole milk}       => {root vegetables}    0.008032537  0.3779904 3.467851
## [7]  {pip fruit,                                                               
##       other vegetables} => {tropical fruit}     0.009456024  0.3618677 3.448613
## [8]  {pip fruit,                                                               
##       yogurt}           => {tropical fruit}     0.006405694  0.3559322 3.392048
## [9]  {citrus fruit,                                                            
##       other vegetables} => {root vegetables}    0.010371124  0.3591549 3.295045
## [10] {other vegetables,                                                        
##       whole milk,                                                              
##       yogurt}           => {tropical fruit}     0.007625826  0.3424658 3.263712
# (2)提取关联规则的子集:可以通过subset()函数提取我们感兴趣的规则
# items 表明与出现在规则的任何位置的项进行匹配,为了将子集限制到匹配只发生在左侧或者右侧位置上,可以使用lhs或者rhs代替
fruit_rules <- subset(grocery_rules,items %in% "pip fruit")  
fruit_rules
## set of 21 rules
inspect(fruit_rules[1:5])
##     lhs                           rhs                support    
## [1] {pip fruit}                => {tropical fruit}   0.020437214
## [2] {pip fruit}                => {other vegetables} 0.026131164
## [3] {pip fruit}                => {whole milk}       0.030096594
## [4] {tropical fruit,pip fruit} => {yogurt}           0.006405694
## [5] {pip fruit,yogurt}         => {tropical fruit}   0.006405694
##     confidence lift    
## [1] 0.2701613  2.574648
## [2] 0.3454301  1.785237
## [3] 0.3978495  1.557043
## [4] 0.3134328  2.246802
## [5] 0.3559322  3.392048

Groceries数据案例2

library(arules)
library(arulesViz)

data(Groceries)
summary(Groceries)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55 
##   16   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   46   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##        labels  level2           level1
## 1 frankfurter sausage meat and sausage
## 2     sausage sausage meat and sausage
## 3  liver loaf sausage meat and sausage
rules <- apriori(Groceries, parameter=list(support = 0.01,confidence = 0.2))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.2    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 98 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [88 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [232 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
inspect(sort(rules, by="support")[1:6])     # 按支持度查看前6条规则
##     lhs                   rhs                support    confidence
## [1] {}                 => {whole milk}       0.25551601 0.2555160 
## [2] {other vegetables} => {whole milk}       0.07483477 0.3867578 
## [3] {whole milk}       => {other vegetables} 0.07483477 0.2928770 
## [4] {rolls/buns}       => {whole milk}       0.05663447 0.3079049 
## [5] {whole milk}       => {rolls/buns}       0.05663447 0.2216474 
## [6] {yogurt}           => {whole milk}       0.05602440 0.4016035 
##     lift    
## [1] 1.000000
## [2] 1.513634
## [3] 1.513634
## [4] 1.205032
## [5] 1.205032
## [6] 1.571735
inspect(sort(rules, by="confidence")[1:6])  # 按置信度查看前6条规则
##     lhs                   rhs                   support confidence     lift
## [1] {citrus fruit,                                                         
##      root vegetables}  => {other vegetables} 0.01037112  0.5862069 3.029608
## [2] {tropical fruit,                                                       
##      root vegetables}  => {other vegetables} 0.01230300  0.5845411 3.020999
## [3] {curd,                                                                 
##      yogurt}           => {whole milk}       0.01006609  0.5823529 2.279125
## [4] {other vegetables,                                                     
##      butter}           => {whole milk}       0.01148958  0.5736041 2.244885
## [5] {tropical fruit,                                                       
##      root vegetables}  => {whole milk}       0.01199797  0.5700483 2.230969
## [6] {root vegetables,                                                      
##      yogurt}           => {whole milk}       0.01453991  0.5629921 2.203354
sub.rules <- subset(rules, subset = rhs %in% "whole milk" & lift > 1.2)  # 也可以用subset做规则的筛选,取"右手边"含有whole milk且lift大于1.2的规则

itemFrequencyPlot(Groceries,support = 0.05,cex.names =0.8)   # 数据画频繁项的图

plot(rules, shading="order", control=list(main = "Two-key plot")) 

plot(rules, method="grouped") 

plot(rules, method="graph")  

# eclat算法案例:简易应用之筛选购全脂牛奶 的关联分析情况

fsets <- eclat(Groceries, parameter = list(support = 0.05),control = list(verbose=FALSE))  #提取频繁项集

itemFrequencyPlot(Groceries,support = 0.05,cex.names =0.8)  #数据画频繁项的图

itemsetList <- LIST(items(fsets), decode = FALSE)

singleItems <- fsets[size(items(fsets)) == 1]  #筛选单独项集

singleSupport <- quality(singleItems)$support  #单独项集的支持度

names(singleSupport) <- unlist(LIST(items(singleItems), decode = FALSE)) #转换为列表格式的数据

allConfidence <- quality(fsets)$support / sapply(itemsetList, function(x)max(singleSupport[as.character(x)]))

quality(fsets) <- cbind(quality(fsets), allConfidence)

fsetsmilk <- subset(fsets, subset = items %pin% "whole milk")

inspect(sort(fsetsmilk[size(fsetsmilk)>1], by = "allConfidence")[1:3])
##     items                         support    allConfidence
## [1] {other vegetables,whole milk} 0.07483477 0.2928770    
## [2] {whole milk,rolls/buns}       0.05663447 0.2216474    
## [3] {whole milk,yogurt}           0.05602440 0.2192598

titanic.raw数据案例–封装函数运行举例

# 主流程中的子函数
# 
# 上述的主流程里包含3个子函数:
# 
#   数据处理:as.transaction;
#   Rhs的提取函数:Rhs_Selecet;
#   将规则转换成数据框格式输出:inspect.frame。

##数据转换,先转换成List格式,再转换成transaction格式。
as.transaction <- function(data, f){
  dataList <- split(data, f)
  dataList <- lapply(dataList, function(x){
    rst <- unlist(x)
    names(rst) <- NULL
    rst <- unique(rst)
    rst <- rst[-which(rst=="")]
    rst
  })
  transaction <- as(dataList, "transactions") 
  transaction
}

##右提取规则
Rhs_Selecet <- function(rules.pruned, char){
  rhs <- rules.pruned@rhs@itemInfo[(rules.pruned@rhs@data@i)+1,]
  loc <- which(rhs == char)
  rules.pruned[loc]
}

##转换成data.frame格式,先提取Lhs,并连接成一个字符串
##再提取Rhs,quality,组成一个数据框
inspect.frame <- function(rules.pruned, itemSep = ","){
  ##Lhs处理
  #提取Lhs长度
  lhsNum <- diff(rules.pruned@lhs@data@p)
  #产生标签
  lhsRuleItemsLOC <- NULL
  for(i in 1:length(lhsNum)){
    lhsRuleItemsLOC <- c(lhsRuleItemsLOC, rep(i, lhsNum[i]))
  }
  #提取Rhs,组合成字符串, 链接符号默认“,”
  lhsRuleItems <- rules.pruned@lhs@itemInfo[rules.pruned@lhs@data@i+1,]
  lhsRuleItemsList <- split(lhsRuleItems, lhsRuleItemsLOC)
  lhs <- sapply(lhsRuleItemsList, function(x){
    lhs <- x[1]
    if(length(x)>1){
      for(i in 2:length(x)){
        lhs <- paste(lhs, x[i], sep=itemSep)
      }
    }
    lhs
  })
  
  ##lhs处理
  rhs <- rules.pruned@rhs@itemInfo[(rules.pruned@rhs@data@i)+1,]
  
  ##整理结果成数据框
  csq <- data.frame(lhs, rhs, rules.pruned@quality)
  csq
}

# 关联分析主流程

##读取数据,转换成transaction格式
# data <- read.csv("xxx.csv", stringsAsFactors = F)
mdata <- read.csv('http://data.galaxystatistics.com/blog_data/apriori/titanic.csv',header=T)
# transaction <- as.transaction(data[,-1],data[,1]) 
# transaction <- as.transactions(mdata) 
# transactions
transaction <- as(mdata, "transactions")

##关联分析,设置support,confidence,对结果按照lift排序
rules <- apriori(transaction, parameter = list(support = 0.005, confidence = 0.8))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5   0.005      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 11 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10 item(s), 2201 transaction(s)] done [0.00s].
## sorting and recoding items ... [10 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [72 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
quality(rules) <- round(quality(rules), 3)
rules.sorted <- sort(rules, by="lift")

##消除冗余项,具体看自己如何定义冗余,我的定义如下:
#类型一:同时存在 1) {A,B}=>{D}   2){A,B,C}=>{D},则1)是冗余项
#类型二:同时存在 1) {A,B}=>{C}   2){A,B,C}=>{D},则1)是冗余项
supersetLhs <- is.superset(rules.sorted@lhs, rules.sorted@lhs)   
supersetAll <- is.superset(rules.sorted, rules.sorted)
superset <- supersetLhs==supersetAll&supersetLhs==T&supersetAll==T
redundant <- colSums(superset, na.rm = T) == 1
rules.pruned <- rules.sorted[redundant]
inspect(rules.pruned) #查看结果
##      lhs                                     rhs            support
## [1]  {Class=2nd,Age=adult,Survived=yes}   => {Sex=female}   0.036  
## [2]  {Age=child,Sex=female,Survived=no}   => {Class=3rd}    0.008  
## [3]  {Age=child,Sex=male,Survived=no}     => {Class=3rd}    0.016  
## [4]  {Class=2nd,Age=child,Sex=female}     => {Survived=yes} 0.006  
## [5]  {Class=1st,Age=adult,Sex=female}     => {Survived=yes} 0.064  
## [6]  {Class=crew,Age=adult,Sex=female}    => {Survived=yes} 0.009  
## [7]  {Class=2nd,Age=adult,Sex=female}     => {Survived=yes} 0.036  
## [8]  {Age=adult,Sex=female,Survived=no}   => {Class=3rd}    0.040  
## [9]  {Class=2nd,Age=adult,Sex=male}       => {Survived=no}  0.070  
## [10] {Class=crew,Age=adult,Survived=no}   => {Sex=male}     0.304  
## [11] {Class=3rd,Age=adult,Sex=male}       => {Survived=no}  0.176  
## [12] {Class=1st,Age=adult,Survived=no}    => {Sex=male}     0.054  
## [13] {Class=2nd,Age=adult,Survived=no}    => {Sex=male}     0.070  
## [14] {Class=crew,Age=adult,Survived=yes}  => {Sex=male}     0.087  
## [15] {Class=2nd,Sex=female,Survived=no}   => {Age=adult}    0.006  
## [16] {Class=2nd,Sex=male,Survived=no}     => {Age=adult}    0.070  
## [17] {Class=1st,Sex=male,Survived=no}     => {Age=adult}    0.054  
## [18] {Class=crew,Sex=female,Survived=yes} => {Age=adult}    0.009  
## [19] {Class=crew,Sex=male,Survived=yes}   => {Age=adult}    0.087  
## [20] {Class=crew,Sex=male,Survived=no}    => {Age=adult}    0.304  
## [21] {Class=1st,Sex=female,Survived=yes}  => {Age=adult}    0.064  
## [22] {Class=3rd,Age=adult,Survived=no}    => {Sex=male}     0.176  
## [23] {Class=1st,Sex=male,Survived=yes}    => {Age=adult}    0.026  
## [24] {Class=3rd,Sex=male,Survived=no}     => {Age=adult}    0.176  
## [25] {Class=2nd,Sex=female,Survived=yes}  => {Age=adult}    0.036  
## [26] {Class=3rd,Sex=male,Survived=yes}    => {Age=adult}    0.034  
## [27] {Class=3rd,Sex=female,Survived=yes}  => {Age=adult}    0.035  
## [28] {Class=3rd,Sex=female,Survived=no}   => {Age=adult}    0.040  
##      confidence lift 
## [1]  0.851      3.986
## [2]  1.000      3.118
## [3]  1.000      3.118
## [4]  1.000      3.096
## [5]  0.972      3.010
## [6]  0.870      2.692
## [7]  0.860      2.663
## [8]  0.817      2.546
## [9]  0.917      1.354
## [10] 0.996      1.266
## [11] 0.838      1.237
## [12] 0.967      1.230
## [13] 0.922      1.173
## [14] 0.906      1.152
## [15] 1.000      1.052
## [16] 1.000      1.052
## [17] 1.000      1.052
## [18] 1.000      1.052
## [19] 1.000      1.052
## [20] 1.000      1.052
## [21] 0.993      1.045
## [22] 0.813      1.034
## [23] 0.919      0.967
## [24] 0.917      0.965
## [25] 0.860      0.905
## [26] 0.852      0.897
## [27] 0.844      0.888
## [28] 0.840      0.883
###转换成data.frame输出规则 
RuleFrame <- inspect.frame(rules.pruned, itemSep=",") 
head(RuleFrame)
##                                                                  lhs
## 1 c("Class=2nd", "Age=adult", "Survived=yes"),c(2, 1, 4),c(2, 4, 10)
## 2  c("Age=child", "Sex=female", "Survived=no"),c(1, 3, 4),c(5, 7, 9)
## 3    c("Age=child", "Sex=male", "Survived=no"),c(1, 3, 4),c(5, 8, 9)
## 4    c("Class=2nd", "Age=child", "Sex=female"),c(2, 1, 3),c(2, 5, 7)
## 5    c("Class=1st", "Age=adult", "Sex=female"),c(2, 1, 3),c(1, 4, 7)
## 6   c("Class=crew", "Age=adult", "Sex=female"),c(2, 1, 3),c(6, 4, 7)
##         labels variables levels support confidence  lift
## 1   Sex=female       Sex female   0.036      0.851 3.986
## 2    Class=3rd     Class    3rd   0.008      1.000 3.118
## 3    Class=3rd     Class    3rd   0.016      1.000 3.118
## 4 Survived=yes  Survived    yes   0.006      1.000 3.096
## 5 Survived=yes  Survived    yes   0.064      0.972 3.010
## 6 Survived=yes  Survived    yes   0.009      0.870 2.692
write.csv(RuleFrame, "rstRuleFrame.csv")

##提取想要分析的RHS项,如rhs=“相对安全”,然后对结果进行分析
rstRule <- Rhs_Selecet(rules.pruned, "Survived=no")
inspect(rstRule )
##     lhs                               rhs           support confidence
## [1] {Class=2nd,Age=adult,Sex=male} => {Survived=no} 0.070   0.917     
## [2] {Class=3rd,Age=adult,Sex=male} => {Survived=no} 0.176   0.838     
##     lift 
## [1] 1.354
## [2] 1.237
# 可视化
# 全部规则的可视化,以及预分析Rhs项的可视化

###全部规则,气泡图,大小表示support,颜色表示Lift
library(arulesViz)
plot(rules.pruned, method = "grouped")

###部分规则,项集有向图,大小表示support,颜色表示Lift
plot(rstRule, method = "graph", control = list(edgeCol="black",  main="rhs=`Survived=no`"))