前言

一般来说,协同过滤推荐分为三种类型。第一种是基于用户(user-based)的协同过滤,第二种是基于项目(item-based)的协同过滤,第三种是基于模型(model based)的协同过滤。

基于用户(user-based)的协同过滤

案例1:

# install.packages("recommenderlab")
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
## Loading required package: proxy
## 
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
## Loading required package: registry
library(ggplot2)
library(plyr)

# 
data(MovieLense)
dim(MovieLense)
## [1]  943 1664
## [1]  943 1664
MovieLense
## 943 x 1664 rating matrix of class 'realRatingMatrix' with 99392 ratings.
# class(MovieLense)
# class(MovieLenseMeta)

## 943 x 1664 rating matrix of class 'realRatingMatrix' with 99392 ratings.
image(sample(MovieLense,500),main="Raw ratings")

qplot(getRatings(MovieLense),binwidth=1,main="histogram of normalized ratings",xlab = "Ratings")

summary(getRatings(MovieLense))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.00    4.00    3.53    4.00    5.00
qplot(getRatings(normalize(MovieLense,method="Z-score")),main="hist of normalized ratings",xlab="rating")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(getRatings(normalize(MovieLense,method="Z-score")))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -4.8520 -0.6466  0.1084  0.0000  0.7506  4.1280
qplot(rowCounts(MovieLense), binwidth=10, main="Movies rated on Average", xlab="# of users", ylab="# of movies rated")

qplot(colMeans(MovieLense), binwidth=0.1, main="Mean ratings of Movies", xlab="Rating", ylab="# of movies")

recommenderRegistry$get_entries(dataType="realRatingMatrix")
## $ALS_realRatingMatrix
## Recommender method: ALS for realRatingMatrix
## Description: Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yunhong Zhou, Dennis Wilkinson, Robert Schreiber, Rong Pan (2008). Large-Scale Parallel Collaborative Filtering for the Netflix Prize, 4th Int'l Conf. Algorithmic Aspects in Information and Management, LNCS 5034.
## Parameters:
##   normalize lambda n_factors n_iterations min_item_nr seed
## 1      NULL    0.1        10           10           1 NULL
## 
## $ALS_implicit_realRatingMatrix
## Recommender method: ALS_implicit for realRatingMatrix
## Description: Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yifan Hu, Yehuda Koren, Chris Volinsky (2008). Collaborative Filtering for Implicit Feedback Datasets, ICDM '08 Proceedings of the 2008 Eighth IEEE International Conference on Data Mining, pages 263-272.
## Parameters:
##   lambda alpha n_factors n_iterations min_item_nr seed
## 1    0.1    10        10           10           1 NULL
## 
## $IBCF_realRatingMatrix
## Recommender method: IBCF for realRatingMatrix
## Description: Recommender based on item-based collaborative filtering.
## Reference: NA
## Parameters:
##    k   method normalize normalize_sim_matrix alpha na_as_zero
## 1 30 "Cosine"  "center"                FALSE   0.5      FALSE
## 
## $POPULAR_realRatingMatrix
## Recommender method: POPULAR for realRatingMatrix
## Description: Recommender based on item popularity.
## Reference: NA
## Parameters:
##   normalize    aggregationRatings aggregationPopularity
## 1  "center" new("standardGeneric" new("standardGeneric"
## 
## $RANDOM_realRatingMatrix
## Recommender method: RANDOM for realRatingMatrix
## Description: Produce random recommendations (real ratings).
## Reference: NA
## Parameters: None
## 
## $RERECOMMEND_realRatingMatrix
## Recommender method: RERECOMMEND for realRatingMatrix
## Description: Re-recommends highly rated items (real ratings).
## Reference: NA
## Parameters:
##   randomize minRating
## 1         1        NA
## 
## $SVD_realRatingMatrix
## Recommender method: SVD for realRatingMatrix
## Description: Recommender based on SVD approximation with column-mean imputation.
## Reference: NA
## Parameters:
##    k maxiter normalize
## 1 10     100  "center"
## 
## $SVDF_realRatingMatrix
## Recommender method: SVDF for realRatingMatrix
## Description: Recommender based on Funk SVD with gradient descend.
## Reference: NA
## Parameters:
##    k gamma lambda min_epochs max_epochs min_improvement normalize verbose
## 1 10 0.015  0.001         50        200           1e-06  "center"   FALSE
## 
## $UBCF_realRatingMatrix
## Recommender method: UBCF for realRatingMatrix
## Description: Recommender based on user-based collaborative filtering.
## Reference: NA
## Parameters:
##     method nn sample normalize
## 1 "cosine" 25  FALSE  "center"
scheme <- evaluationScheme(MovieLense, method="split", train=0.9, k=1, given=10, goodRating=4)
scheme
## Evaluation scheme with 10 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.900
## Good ratings: >=4.000000
## Data set: 943 x 1664 rating matrix of class 'realRatingMatrix' with 99392 ratings.
algorithms <- list(
  "random items" = list(name="RANDOM", param=list(normalize = "Z-score")),
  "popular items" = list(name="POPULAR", param=list(normalize = "Z-score")),
  "user-based CF" = list(name="UBCF", param=list(normalize = "Z-score", method="Cosine", nn=50, minRating=3)),
  "item-based CF" = list(name="IBCF", param=list(normalize = "Z-score", method="Cosine"))
)

# run algorithms, predict next n movies
results <- evaluate(scheme, algorithms, n=c(1, 3, 5, 10, 15, 20))
## RANDOM run fold/sample [model time/prediction time]
##   1  [0sec/0.34sec] 
## POPULAR run fold/sample [model time/prediction time]
##   1  [0.03sec/0.25sec] 
## UBCF run fold/sample [model time/prediction time]
##   1
## Warning: Unknown parameters: minRating
## Available parameter (with default values):
## method    =  cosine
## nn    =  25
## sample    =  FALSE
## normalize     =  center
## verbose   =  FALSE
## [0.03sec/1.04sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [34.94sec/0.06sec]
plot(results, annotate = 1:4, legend="topleft")

# See precision / recall
plot(results, "prec/rec", annotate=3)

summary(results)
##               Length Class             Mode
## random items  1      evaluationResults S4  
## popular items 1      evaluationResults S4  
## user-based CF 1      evaluationResults S4  
## item-based CF 1      evaluationResults S4
print(results)
## List of evaluation results for 4 recommenders:
## Evaluation results for 1 folds/samples using method 'RANDOM'.
## Evaluation results for 1 folds/samples using method 'POPULAR'.
## Evaluation results for 1 folds/samples using method 'UBCF'.
## Evaluation results for 1 folds/samples using method 'IBCF'.
result1 <- ldply(avg(results))
head(result1)
##            .id        TP         FP       FN       TN  precision
## 1 random items 0.1368421  0.8631579 50.68421 1602.316 0.13684211
## 2 random items 0.3578947  2.6421053 50.46316 1600.537 0.11929825
## 3 random items 0.4736842  4.5263158 50.34737 1598.653 0.09473684
## 4 random items 0.8210526  9.1789474 50.00000 1594.000 0.08210526
## 5 random items 1.2000000 13.8000000 49.62105 1589.379 0.08000000
## 6 random items 1.6000000 18.4000000 49.22105 1584.779 0.08000000
##        recall         TPR          FPR
## 1 0.002314131 0.002314131 0.0005351756
## 2 0.007203976 0.007203976 0.0016419898
## 3 0.008970023 0.008970023 0.0028166748
## 4 0.016294109 0.016294109 0.0057157308
## 5 0.023149457 0.023149457 0.0085937135
## 6 0.032354085 0.032354085 0.0114554966
result1[,1] <- paste(result1[,1],c(1, 3, 5, 10, 15, 20))
temp_result1 <- result1[,c(1,6,7)]

f<-function(p,r){
  return(2*p*r)/(p+r)
}

result1_f <- cbind(result1,f=f(temp_result1[,2],temp_result1[,3]))
head(result1_f)
##               .id        TP         FP       FN       TN  precision
## 1  random items 1 0.1368421  0.8631579 50.68421 1602.316 0.13684211
## 2  random items 3 0.3578947  2.6421053 50.46316 1600.537 0.11929825
## 3  random items 5 0.4736842  4.5263158 50.34737 1598.653 0.09473684
## 4 random items 10 0.8210526  9.1789474 50.00000 1594.000 0.08210526
## 5 random items 15 1.2000000 13.8000000 49.62105 1589.379 0.08000000
## 6 random items 20 1.6000000 18.4000000 49.22105 1584.779 0.08000000
##        recall         TPR          FPR            f
## 1 0.002314131 0.002314131 0.0005351756 0.0006333412
## 2 0.007203976 0.007203976 0.0016419898 0.0017188434
## 3 0.008970023 0.008970023 0.0028166748 0.0016995834
## 4 0.016294109 0.016294109 0.0057157308 0.0026756642
## 5 0.023149457 0.023149457 0.0085937135 0.0037039131
## 6 0.032354085 0.032354085 0.0114554966 0.0051766535
head(result1_f[order(-result1_f$f),])
##                 .id       TP        FP       FN       TN precision
## 18 user-based CF 20 6.431579 13.568421 44.38947 1589.611 0.3215789
## 17 user-based CF 15 5.210526  9.789474 45.61053 1593.389 0.3473684
## 16 user-based CF 10 3.842105  6.157895 46.97895 1597.021 0.3842105
## 15  user-based CF 5 2.200000  2.800000 48.62105 1600.379 0.4400000
## 12 popular items 20 5.663158 14.336842 45.15789 1588.842 0.2831579
## 11 popular items 15 4.589474 10.410526 46.23158 1592.768 0.3059649
##        recall        TPR         FPR          f
## 18 0.18780517 0.18780517 0.008388672 0.12078837
## 17 0.15994608 0.15994608 0.006050759 0.11112044
## 16 0.12814236 0.12814236 0.003805340 0.09846728
## 15 0.08364741 0.08364741 0.001730485 0.07360972
## 12 0.12898428 0.12898428 0.008860610 0.07304584
## 11 0.10978560 0.10978560 0.006430933 0.06718108
#带入模型
moive_re <- Recommender(MovieLense,method="UBCF")
moives_pr <- predict(moive_re,MovieLense,n=20)
print(as(moives_pr,"list")[1:5])
## $`1`
##  [1] "Glory (1989)"                                                               
##  [2] "Schindler's List (1993)"                                                    
##  [3] "Casablanca (1942)"                                                          
##  [4] "Close Shave, A (1995)"                                                      
##  [5] "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)"
##  [6] "Leaving Las Vegas (1995)"                                                   
##  [7] "One Flew Over the Cuckoo's Nest (1975)"                                     
##  [8] "Rear Window (1954)"                                                         
##  [9] "Heathers (1989)"                                                            
## [10] "L.A. Confidential (1997)"                                                   
## [11] "City of Lost Children, The (1995)"                                          
## [12] "Butch Cassidy and the Sundance Kid (1969)"                                  
## [13] "Titanic (1997)"                                                             
## [14] "Lawrence of Arabia (1962)"                                                  
## [15] "Shine (1996)"                                                               
## [16] "Stand by Me (1986)"                                                         
## [17] "Gandhi (1982)"                                                              
## [18] "To Kill a Mockingbird (1962)"                                               
## [19] "In the Name of the Father (1993)"                                           
## [20] "Harold and Maude (1971)"                                                    
## 
## $`2`
##  [1] "Boot, Das (1981)"                         
##  [2] "Dead Man Walking (1995)"                  
##  [3] "Lone Star (1996)"                         
##  [4] "Return of the Jedi (1983)"                
##  [5] "Celluloid Closet, The (1995)"             
##  [6] "Casablanca (1942)"                        
##  [7] "Citizen Kane (1941)"                      
##  [8] "Godfather: Part II, The (1974)"           
##  [9] "2001: A Space Odyssey (1968)"             
## [10] "When We Were Kings (1996)"                
## [11] "Diva (1981)"                              
## [12] "Close Shave, A (1995)"                    
## [13] "Tango Lesson, The (1997)"                 
## [14] "Beautiful Thing (1996)"                   
## [15] "Empire Strikes Back, The (1980)"          
## [16] "Mrs. Dalloway (1997)"                     
## [17] "Butch Cassidy and the Sundance Kid (1969)"
## [18] "My Fair Lady (1964)"                      
## [19] "Bonnie and Clyde (1967)"                  
## [20] "Annie Hall (1977)"                        
## 
## $`3`
##  [1] "Mrs. Brown (Her Majesty, Mrs. Brown) (1997)" 
##  [2] "Star Wars (1977)"                            
##  [3] "Pulp Fiction (1994)"                         
##  [4] "English Patient, The (1996)"                 
##  [5] "Full Monty, The (1997)"                      
##  [6] "Lone Star (1996)"                            
##  [7] "Titanic (1997)"                              
##  [8] "Sweet Hereafter, The (1997)"                 
##  [9] "In the Company of Men (1997)"                
## [10] "Willy Wonka and the Chocolate Factory (1971)"
## [11] "In & Out (1997)"                             
## [12] "Vertigo (1958)"                              
## [13] "As Good As It Gets (1997)"                   
## [14] "Apt Pupil (1998)"                            
## [15] "Dazed and Confused (1993)"                   
## [16] "Ice Storm, The (1997)"                       
## [17] "This Is Spinal Tap (1984)"                   
## [18] "Trainspotting (1996)"                        
## [19] "Heat (1995)"                                 
## [20] "Fargo (1996)"                                
## 
## $`4`
##  [1] "Titanic (1997)"               "English Patient, The (1996)" 
##  [3] "L.A. Confidential (1997)"     "Game, The (1997)"            
##  [5] "Good Will Hunting (1997)"     "Kiss the Girls (1997)"       
##  [7] "Full Monty, The (1997)"       "Usual Suspects, The (1995)"  
##  [9] "Rosewood (1997)"              "Boogie Nights (1997)"        
## [11] "Raise the Red Lantern (1991)" "Pulp Fiction (1994)"         
## [13] "Toy Story (1995)"             "Love Jones (1997)"           
## [15] "Eve's Bayou (1997)"           "Edge, The (1997)"            
## [17] "Sting, The (1973)"            "Some Like It Hot (1959)"     
## [19] "Strictly Ballroom (1992)"     "Soul Food (1997)"            
## 
## $`5`
##  [1] "Terminator 2: Judgment Day (1991)"                                          
##  [2] "Terminator, The (1984)"                                                     
##  [3] "Usual Suspects, The (1995)"                                                 
##  [4] "Contact (1997)"                                                             
##  [5] "Braveheart (1995)"                                                          
##  [6] "Casablanca (1942)"                                                          
##  [7] "Twelve Monkeys (1995)"                                                      
##  [8] "Godfather, The (1972)"                                                      
##  [9] "Shawshank Redemption, The (1994)"                                           
## [10] "Raising Arizona (1987)"                                                     
## [11] "Amadeus (1984)"                                                             
## [12] "Nikita (La Femme Nikita) (1990)"                                            
## [13] "Reservoir Dogs (1992)"                                                      
## [14] "Citizen Kane (1941)"                                                        
## [15] "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)"
## [16] "Schindler's List (1993)"                                                    
## [17] "Titanic (1997)"                                                             
## [18] "Leaving Las Vegas (1995)"                                                   
## [19] "North by Northwest (1959)"                                                  
## [20] "Army of Darkness (1993)"

案例2:

library(recommenderlab)

# 数据构成

set.seed(1234)
m <- matrix(sample(c(as.numeric(0:5), NA), 50, 
                       replace = TRUE,  prob = c(rep(.4/6, 6), .6 )),  ncol = 10, dimnames = list(user = paste("u", 1:5,sep = ''), item=paste("i", 1:10, sep = '' )))
r <- as(m, "realRatingMatrix") 
m 
##     item
## user i1 i2 i3 i4 i5 i6 i7 i8 i9 i10
##   u1 NA  2  3  5 NA  5 NA  4 NA  NA
##   u2  2 NA NA NA NA NA NA NA  2   3
##   u3  2 NA NA NA NA  1 NA NA NA  NA
##   u4  2  2  1 NA NA  5 NA  0  2  NA
##   u5  5 NA NA NA NA NA NA  5 NA   4
r
## 5 x 10 rating matrix of class 'realRatingMatrix' with 19 ratings.
# 运用recommenderlab包实现协同过滤推荐,其数据类型采用S4类构造,使用抽象的raringMatrix为评分数据提供接口。 先自行构建一个包含缺失值的矩阵,行为u1-u5个用户,列为i1-i10的物品。 需通过as()函数转为raringMatrix类型。 raringMatrix采用了很多类似矩阵对象的操作,如dim(),dimnames(),rowCounts(),colMeans(),rowMeans(),colSums(),rowMeans()、、也增加了一些特别的操作方法,如sample(),用于从用户(即,行)中抽样,image()可以生成像素图


# 数据转换

head(as(r , "data.frame"))
##    user item rating
## 5    u1   i2      2
## 7    u1   i3      3
## 9    u1   i4      5
## 10   u1   i6      5
## 13   u1   i8      4
## 1    u2   i1      2
n <- normalize(r)
image(r, main = "Raw rating")

image(n, main = "normalized rating")

#二元分类转换
r_b <- binarize(r, minRating = 4)
b <- as(r_b , "matrix") 
b
##       i1    i2    i3    i4    i5    i6    i7    i8    i9   i10
## u1 FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE
## u2 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## u3 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## u4 FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## u5  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE
# 数据也可转换为数据框类型,或通过normalize()函数进行标准化处理,标准化的目的是为了去除用户评分的偏差、通过binarize()函数转为binaryRatingMatrix类型,适用于是0-1的评分矩阵(赞/踩)


# 数据可视化

# 为更好说明推荐系统的算法,我们采用非常出名的MovieLense数据集,其收集了网站MovieLens(movielens.umn.edu)从1997年9月19日到1998年4月22日的数据,包括943名用户对1664部电影的评分。 
# getRatings()函数可获取评价数据。 
# image()可查看数据分布

data(MovieLense)
r <- sample(MovieLense, 943, replace = F)

image(MovieLense)

hist(getRatings(normalize(MovieLense)), breaks = 100)

hist(rowCounts(r), breaks = 50)

# 创建推荐
# 查看推荐方法

# 我们已经获得了评分数据,现在来看看recommenderlab包可以实现哪些推荐算法呢!
# 可以运用recommenderRegistry$get_entry_names()函数实现。 
# 参数realRatingMatrix表示数据类型是推荐评分型(如1-5颗星评价),可具体查看算法说明。 
# 对于realRatingMatrix有六种方法:IBCF(基于物品的推荐)、UBCF(基于用户的推荐)、PCA(主成分分析)、 RANDOM(随机推荐)、SVD(矩阵因子化)、POPULAR(基于流行度的推荐)
recommenderRegistry$get_entry_names()
##  [1] "ALS_realRatingMatrix"            "ALS_implicit_realRatingMatrix"  
##  [3] "ALS_implicit_binaryRatingMatrix" "AR_binaryRatingMatrix"          
##  [5] "IBCF_binaryRatingMatrix"         "IBCF_realRatingMatrix"          
##  [7] "POPULAR_binaryRatingMatrix"      "POPULAR_realRatingMatrix"       
##  [9] "RANDOM_realRatingMatrix"         "RANDOM_binaryRatingMatrix"      
## [11] "RERECOMMEND_realRatingMatrix"    "SVD_realRatingMatrix"           
## [13] "SVDF_realRatingMatrix"           "UBCF_binaryRatingMatrix"        
## [15] "UBCF_realRatingMatrix"
# 建立模型

# 建立协同过滤推荐算法模型,主要运用 Recommender(data=ratingMatrix,method,parameter=NULL)函数,getModel()可查看模型参数
r_recom <- Recommender(r , method = "IBCF")
r_popul <- Recommender(r , method = "POPULAR")
# 查看模型方法
names(getModel(r_recom))
## [1] "description"          "sim"                  "k"                   
## [4] "method"               "normalize"            "normalize_sim_matrix"
## [7] "alpha"                "na_as_zero"           "verbose"
names(getModel(r_popul))
## [1] "topN"                  "ratings"               "normalize"            
## [4] "aggregationRatings"    "aggregationPopularity" "verbose"
# 模型预测
# TOP-N预测

# 对模型预测可运用predict()函数,在此分别以TOP-N预测及评分预测为例,预测第940-943位观影者的评分情况。 n表示最终为TOP-N的列表推荐,参数type = "ratings"表示运用评分预测观影者对电影评分,模型结果均需转为list或矩阵表示
pred <- predict(r_popul ,r[940:943], n = 5) 
as(pred, "list")
## $`877`
## [1] "Star Wars (1977)"               "Godfather, The (1972)"         
## [3] "Fargo (1996)"                   "Raiders of the Lost Ark (1981)"
## [5] "Titanic (1997)"                
## 
## $`476`
## [1] "Star Wars (1977)"                 "Godfather, The (1972)"           
## [3] "Fargo (1996)"                     "Raiders of the Lost Ark (1981)"  
## [5] "Silence of the Lambs, The (1991)"
## 
## $`929`
## [1] "Titanic (1997)"                   "Shawshank Redemption, The (1994)"
## [3] "Return of the Jedi (1983)"        "L.A. Confidential (1997)"        
## [5] "Princess Bride, The (1987)"      
## 
## $`366`
## [1] "Star Wars (1977)"               "Godfather, The (1972)"         
## [3] "Fargo (1996)"                   "Raiders of the Lost Ark (1981)"
## [5] "Titanic (1997)"
#top-N为有序列表,抽取最优推荐子集
pred3 <- bestN(pred, n=3)
as(pred3, "list")
## $`877`
## [1] "Star Wars (1977)"      "Godfather, The (1972)" "Fargo (1996)"         
## 
## $`476`
## [1] "Star Wars (1977)"      "Godfather, The (1972)" "Fargo (1996)"         
## 
## $`929`
## [1] "Titanic (1997)"                   "Shawshank Redemption, The (1994)"
## [3] "Return of the Jedi (1983)"       
## 
## $`366`
## [1] "Star Wars (1977)"      "Godfather, The (1972)" "Fargo (1996)"
#评分预测
rate <- predict(r_popul, r[940:943], type = "ratings")
as(rate, "matrix")[, 1:5]
##     Toy Story (1995) GoldenEye (1995) Four Rooms (1995) Get Shorty (1995)
## 877         4.126962         3.574065          3.421685          3.797723
## 476         3.665655         3.112759          2.960378                NA
## 929               NA         3.440782          3.288402          3.664440
## 366         4.693741         4.140844          3.988464          4.364502
##     Copycat (1995)
## 877       3.621840
## 476       3.160534
## 929       3.488557
## 366       4.188619
# 预测模型评价

# 评分预测模型评价

# 建立推荐系统模型后,非常关心的是对预测模型的评价。 
# 可通过evaluationScheme()将数据按一定规则分为训练集和测试集(参数method = "split",),
# 或进行k-fold交叉验证(如method = "cross",k=4),
# given参数表示用来进行模型评价的items的数量。 
# 分别运用UBCF及IBCF算法,进行预测评价。 
# 
# getData(e,"train")表示获取训练集数据,
# predict(r1,getData(e,"known"),type = "ratings")表示对“已知”训练集数据进行预测。 
# 计算预测模型的准确度可通过calcPredictionAccuracy()函数实现,参数“unknown”表示对“未知”test集进行比较。 
# 结果发现两种方法的均方根误差(RMSE)基本一致

e <- evaluationScheme(r[1:800], method = "split", train = 0.9, given = 15, goodRating = 5)
e
## Evaluation scheme with 15 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.900
## Good ratings: >=5.000000
## Data set: 800 x 1664 rating matrix of class 'realRatingMatrix' with 83043 ratings.
r1 <- Recommender(getData(e , "train") , "UBCF") 
p1 <- predict(r1 , getData(e , "known") , type = "ratings") 
r2 <- Recommender(getData(e , "train") , "IBCF") 
p2 <- predict(r2 , getData(e , "known") , type = "ratings")
##计算预测模型的准确度
c1 <- calcPredictionAccuracy(p1, getData(e , "unknown"))
c2 <- calcPredictionAccuracy(p2, getData(e , "unknown"))
error <- rbind(c1, c2) 
rownames(error) <- c("UBCF", "IBCF")
error
##          RMSE      MSE       MAE
## UBCF 1.053345 1.109537 0.8338755
## IBCF 1.172571 1.374924 0.8580037
# TOP-N预测模型评价

# 让我们来评价TOP-1,TOP-3,TOP-5,TOP-10推荐准确性。 
# 通过4-fold交叉验证方法分割数据集,运用evaluate()进行TOP-N预测模型评价。 
# 评价结果可通过ROC曲线及准确率-召回率曲线展示

# 4-fold交叉验证
tops <- evaluationScheme(r[1:800], method = "cross", k = 4, given = 3,  goodRating = 5)
tops
## Evaluation scheme with 3 items given
## Method: 'cross-validation' with 4 run(s).
## Good ratings: >=5.000000
## Data set: 800 x 1664 rating matrix of class 'realRatingMatrix' with 83043 ratings.
results <- evaluate(tops, method = "POPULAR", type = "topNList",  n=c(1, 3, 5, 10))
## POPULAR run fold/sample [model time/prediction time]
##   1  [0sec/0.67sec] 
##   2  [0sec/0.61sec] 
##   3  [0.01sec/0.66sec] 
##   4  [0.02sec/0.63sec]
# 获得混淆矩阵
getConfusionMatrix(results)[[1]]
##      TP   FP    FN      TN precision     recall        TPR          FPR
## 1  0.39 0.61 19.80 1640.20 0.3900000 0.02826095 0.02826095 0.0003701392
## 3  0.83 2.17 19.36 1638.64 0.2766667 0.05467628 0.05467628 0.0013176701
## 5  1.24 3.76 18.95 1637.05 0.2480000 0.08633353 0.08633353 0.0022851454
## 10 2.12 7.88 18.07 1632.93 0.2120000 0.12988107 0.12988107 0.0047891164
avg(results)
##         TP      FP       FN       TN precision     recall        TPR
## 1  0.34000 0.66000 20.75000 1639.250 0.3400000 0.02278813 0.02278813
## 3  0.75250 2.24750 20.33750 1637.663 0.2508333 0.05391753 0.05391753
## 5  1.15625 3.84375 19.93375 1636.066 0.2312500 0.07884788 0.07884788
## 10 2.05125 7.94875 19.03875 1631.961 0.2051250 0.12853235 0.12853235
##             FPR
## 1  0.0004008696
## 3  0.0013665909
## 5  0.0023378733
## 10 0.0048350401
# ROC曲线
plot(results, annotate = TRUE)

# 准确率-召回率曲线
plot(results, "prec/rec", annotate = TRUE)

# 推荐算法的比较

# 除了对预测模型进行评价,还可以对不同推荐算法进行比较。 可首先构建一个推荐算法列表,通过ROC曲线、、准确率-召回率曲线或RMSE直方图进行比较

# TOP-N算法比较

set.seed(2017)
scheme <- evaluationScheme(r, method = "split", train = 0.9, k = 1, given = 10, goodRating = 5)
# 构建推荐算法列表
algorithms <- list(
      "random items"=list(name = "RANDOM", param = NULL), 
      "popular items" = list(name = "POPULAR",  param = list(normalize="Z-score")), 
      "user-based CF" = list(name = "UBCF", param = list(normalize="Z-score", method = "Cosine",  nn = 25,  minRating = 3)),   
      "item-based CF" = list(name = "IBCF", param = list(k = 50)), 
      "SVD approximation" = list(name = "SVD", param = list(approxRank = 50)))
# 构建不同算法模型
results <- evaluate(scheme, algorithms, n = c(1, 3, 5, 10, 15, 20))
## RANDOM run fold/sample [model time/prediction time]
##   1  [0sec/0.21sec] 
## POPULAR run fold/sample [model time/prediction time]
##   1  [0.03sec/0.25sec] 
## UBCF run fold/sample [model time/prediction time]
##   1
## Warning: Unknown parameters: minRating
## Available parameter (with default values):
## method    =  cosine
## nn    =  25
## sample    =  FALSE
## normalize     =  center
## verbose   =  FALSE
## [0.03sec/0.91sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [22.62sec/0.07sec] 
## SVD run fold/sample [model time/prediction time]
##   1
## Warning: Unknown parameter: approxRank
## Available parameter (with default values):
## k     =  10
## maxiter   =  100
## normalize     =  center
## verbose   =  FALSE
## [0.44sec/0.2sec]
# 模型比较 #ROC曲线
plot(results, annotate = c(1, 3) , legend = "bottomright")

# 准确率-召回率曲线
plot(results, "prec/rec", annotate = c(2, 3, 4), legend = "topleft")

# 预测评分算法比较
results2 <- evaluate(scheme, algorithms, type = "ratings")
## RANDOM run fold/sample [model time/prediction time]
##   1  [0sec/0.07sec] 
## POPULAR run fold/sample [model time/prediction time]
##   1  [0.04sec/0.03sec] 
## UBCF run fold/sample [model time/prediction time]
##   1
## Warning: Unknown parameters: minRating
## Available parameter (with default values):
## method    =  cosine
## nn    =  25
## sample    =  FALSE
## normalize     =  center
## verbose   =  FALSE
## [0.03sec/0.69sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [23.61sec/0.04sec] 
## SVD run fold/sample [model time/prediction time]
##   1
## Warning: Unknown parameter: approxRank
## Available parameter (with default values):
## k     =  10
## maxiter   =  100
## normalize     =  center
## verbose   =  FALSE
## [0.52sec/0.04sec]
plot(results2, ylim = c(0, 20))

# 应用新的推荐算法

# 当然在recommenderlab包说明文档中也有自定义新的推荐算法的举例,通过编辑推荐算法的函数实现符合实际的推荐算法。 但要说明的是运用R进行协同过滤推荐并不是推荐系统应用的主流,主要原因在于算法调整不灵活,并且R主要依靠内存的单线程计算,并不太适用于过大规模的推荐应用

案例3:

# 1). 建立数据模型
library(reshape2)
file <- 'C:/Users/abdata/Desktop/testCF.csv'

FileDataModel <- function(file){
  data <- read.csv(file,header=TRUE)
  names(data) <- c("uid","iid","pref")
  M <- dcast(uid ~ iid,data=data)
  M <- as.matrix(M[-1])
  rownames(M) <- unique(data$uid)
  colnames(M) <- unique(sort(data$iid))
  M 
}
M <- FileDataModel(file)
## Using pref as value column: use value.var to override.
M
##   101 102 103 104 105 106 107
## 1 5.0 3.0 2.5  NA  NA  NA  NA
## 2 2.0 2.5 5.0 2.0  NA  NA  NA
## 3 2.5  NA  NA 4.0 4.5  NA   5
## 4 5.0  NA 3.0 4.5  NA   4  NA
## 5 4.0 3.0 2.0 4.0 3.5   4  NA
# 2). 欧氏距离相似度算法
EuclideanDistanceSimilarity <- function(M){
  
  simple <- function(x,y){
    num <- intersect(which(M[x,]!=0), which(M[y,]!=0)) 
    s <- sum((M[x,] - M[y,])^2, na.rm = T)
    s <- length(num)/(1 + sqrt(s))
    if(s > 1) s <- 1
    if(s < -1) s <- -1
    return(s)
  }
  outer(1:nrow(M),1:nrow(M),Vectorize(simple)) - diag(nrow(M))
}
S <- EuclideanDistanceSimilarity(M)
S
##           [,1]      [,2]      [,3]      [,4]      [,5]
## [1,] 0.0000000 0.6076560 0.2857143 1.0000000 1.0000000
## [2,] 0.6076560 0.0000000 0.6532633 0.5568464 0.7761999
## [3,] 0.2857143 0.6532633 0.0000000 0.5634581 1.0000000
## [4,] 1.0000000 0.5568464 0.5634581 0.0000000 1.0000000
## [5,] 1.0000000 0.7761999 1.0000000 1.0000000 0.0000000
# 3). 最紧邻算法
NearestNUserNeighborhood <- function(S,n){
  sapply(1:n,function(i) {
    m <- apply(S,2,which.max)
    ij <- cbind(m,seq_along(m))
    S[ij] <<- 0
    m
  })
}
NEIGHBORHOOD_NUM <- 2
N <- NearestNUserNeighborhood(S,NEIGHBORHOOD_NUM)
N
##      [,1] [,2]
## [1,]    4    5
## [2,]    5    3
## [3,]    5    2
## [4,]    1    5
## [5,]    1    3
# 4). 推荐算法
RECOMMENDER_NUM <- 3
UserBasedRecommender <- function(uid,n,M,S,N){
  part <- colnames(M)[is.na(M[uid,])]
  m <- S[uid, N[uid,]]
  md <- M[N[uid,],part]
  if(length(dim(md)) >= 1) {
    a <- colnames(md)[colSums(is.na(md)) == 0]
    score <- colSums(md*m,na.rm = T)/apply(!is.na(md),2,function(x) sum(m[x])) 
    res <- score[order(score,decreasing=T)][1:n]
    res <- res[intersect(names(res),a)]
    return(res)
  }else {
    res <- NA
  }
}

# 5). 运行程序
R1 <- UserBasedRecommender(1,RECOMMENDER_NUM,M,S,N)
R1
##  104  106 
## 4.25 4.00
R2 <- UserBasedRecommender(2,RECOMMENDER_NUM,M,S,N)
R2
##      105 
## 3.956999
R3 <- UserBasedRecommender(3,RECOMMENDER_NUM,M,S,N)
R3
##      103      102 
## 3.185407 2.802432
R4 <- UserBasedRecommender(4,RECOMMENDER_NUM,M,S,N)
R4
## 102 
##   3
R5 <- UserBasedRecommender(5,RECOMMENDER_NUM,M,S,N)
R5
## [1] NA

基于项目(item-based)的协同过滤

基于模型(model based)的协同过滤

网商时代关联规则背弃长尾效应

在实际案例运用过程中关联规则与协同过滤的区别在于,关联规则推荐的是本来就很热门的产品,因为代表同时发生频率越高,关联性越强。在网商时代会背弃长尾效应,让差异扩大,2/8定律会一定程度上扩充至1/9,助长马太效应。一般要推荐冷门产品会使用协同过滤。