一般来说,协同过滤推荐分为三种类型。第一种是基于用户(user-based)的协同过滤,第二种是基于项目(item-based)的协同过滤,第三种是基于模型(model based)的协同过滤。
# install.packages("recommenderlab")
library(recommenderlab)
## Loading required package: Matrix
## Loading required package: arules
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loading required package: registry
library(ggplot2)
library(plyr)
#
data(MovieLense)
dim(MovieLense)
## [1] 943 1664
## [1] 943 1664
MovieLense
## 943 x 1664 rating matrix of class 'realRatingMatrix' with 99392 ratings.
# class(MovieLense)
# class(MovieLenseMeta)
## 943 x 1664 rating matrix of class 'realRatingMatrix' with 99392 ratings.
image(sample(MovieLense,500),main="Raw ratings")
qplot(getRatings(MovieLense),binwidth=1,main="histogram of normalized ratings",xlab = "Ratings")
summary(getRatings(MovieLense))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.00 4.00 3.53 4.00 5.00
qplot(getRatings(normalize(MovieLense,method="Z-score")),main="hist of normalized ratings",xlab="rating")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(getRatings(normalize(MovieLense,method="Z-score")))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.8520 -0.6466 0.1084 0.0000 0.7506 4.1280
qplot(rowCounts(MovieLense), binwidth=10, main="Movies rated on Average", xlab="# of users", ylab="# of movies rated")
qplot(colMeans(MovieLense), binwidth=0.1, main="Mean ratings of Movies", xlab="Rating", ylab="# of movies")
recommenderRegistry$get_entries(dataType="realRatingMatrix")
## $ALS_realRatingMatrix
## Recommender method: ALS for realRatingMatrix
## Description: Recommender for explicit ratings based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yunhong Zhou, Dennis Wilkinson, Robert Schreiber, Rong Pan (2008). Large-Scale Parallel Collaborative Filtering for the Netflix Prize, 4th Int'l Conf. Algorithmic Aspects in Information and Management, LNCS 5034.
## Parameters:
## normalize lambda n_factors n_iterations min_item_nr seed
## 1 NULL 0.1 10 10 1 NULL
##
## $ALS_implicit_realRatingMatrix
## Recommender method: ALS_implicit for realRatingMatrix
## Description: Recommender for implicit data based on latent factors, calculated by alternating least squares algorithm.
## Reference: Yifan Hu, Yehuda Koren, Chris Volinsky (2008). Collaborative Filtering for Implicit Feedback Datasets, ICDM '08 Proceedings of the 2008 Eighth IEEE International Conference on Data Mining, pages 263-272.
## Parameters:
## lambda alpha n_factors n_iterations min_item_nr seed
## 1 0.1 10 10 10 1 NULL
##
## $IBCF_realRatingMatrix
## Recommender method: IBCF for realRatingMatrix
## Description: Recommender based on item-based collaborative filtering.
## Reference: NA
## Parameters:
## k method normalize normalize_sim_matrix alpha na_as_zero
## 1 30 "Cosine" "center" FALSE 0.5 FALSE
##
## $POPULAR_realRatingMatrix
## Recommender method: POPULAR for realRatingMatrix
## Description: Recommender based on item popularity.
## Reference: NA
## Parameters:
## normalize aggregationRatings aggregationPopularity
## 1 "center" new("standardGeneric" new("standardGeneric"
##
## $RANDOM_realRatingMatrix
## Recommender method: RANDOM for realRatingMatrix
## Description: Produce random recommendations (real ratings).
## Reference: NA
## Parameters: None
##
## $RERECOMMEND_realRatingMatrix
## Recommender method: RERECOMMEND for realRatingMatrix
## Description: Re-recommends highly rated items (real ratings).
## Reference: NA
## Parameters:
## randomize minRating
## 1 1 NA
##
## $SVD_realRatingMatrix
## Recommender method: SVD for realRatingMatrix
## Description: Recommender based on SVD approximation with column-mean imputation.
## Reference: NA
## Parameters:
## k maxiter normalize
## 1 10 100 "center"
##
## $SVDF_realRatingMatrix
## Recommender method: SVDF for realRatingMatrix
## Description: Recommender based on Funk SVD with gradient descend.
## Reference: NA
## Parameters:
## k gamma lambda min_epochs max_epochs min_improvement normalize verbose
## 1 10 0.015 0.001 50 200 1e-06 "center" FALSE
##
## $UBCF_realRatingMatrix
## Recommender method: UBCF for realRatingMatrix
## Description: Recommender based on user-based collaborative filtering.
## Reference: NA
## Parameters:
## method nn sample normalize
## 1 "cosine" 25 FALSE "center"
scheme <- evaluationScheme(MovieLense, method="split", train=0.9, k=1, given=10, goodRating=4)
scheme
## Evaluation scheme with 10 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.900
## Good ratings: >=4.000000
## Data set: 943 x 1664 rating matrix of class 'realRatingMatrix' with 99392 ratings.
algorithms <- list(
"random items" = list(name="RANDOM", param=list(normalize = "Z-score")),
"popular items" = list(name="POPULAR", param=list(normalize = "Z-score")),
"user-based CF" = list(name="UBCF", param=list(normalize = "Z-score", method="Cosine", nn=50, minRating=3)),
"item-based CF" = list(name="IBCF", param=list(normalize = "Z-score", method="Cosine"))
)
# run algorithms, predict next n movies
results <- evaluate(scheme, algorithms, n=c(1, 3, 5, 10, 15, 20))
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/0.34sec]
## POPULAR run fold/sample [model time/prediction time]
## 1 [0.03sec/0.25sec]
## UBCF run fold/sample [model time/prediction time]
## 1
## Warning: Unknown parameters: minRating
## Available parameter (with default values):
## method = cosine
## nn = 25
## sample = FALSE
## normalize = center
## verbose = FALSE
## [0.03sec/1.04sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [34.94sec/0.06sec]
plot(results, annotate = 1:4, legend="topleft")
# See precision / recall
plot(results, "prec/rec", annotate=3)
summary(results)
## Length Class Mode
## random items 1 evaluationResults S4
## popular items 1 evaluationResults S4
## user-based CF 1 evaluationResults S4
## item-based CF 1 evaluationResults S4
print(results)
## List of evaluation results for 4 recommenders:
## Evaluation results for 1 folds/samples using method 'RANDOM'.
## Evaluation results for 1 folds/samples using method 'POPULAR'.
## Evaluation results for 1 folds/samples using method 'UBCF'.
## Evaluation results for 1 folds/samples using method 'IBCF'.
result1 <- ldply(avg(results))
head(result1)
## .id TP FP FN TN precision
## 1 random items 0.1368421 0.8631579 50.68421 1602.316 0.13684211
## 2 random items 0.3578947 2.6421053 50.46316 1600.537 0.11929825
## 3 random items 0.4736842 4.5263158 50.34737 1598.653 0.09473684
## 4 random items 0.8210526 9.1789474 50.00000 1594.000 0.08210526
## 5 random items 1.2000000 13.8000000 49.62105 1589.379 0.08000000
## 6 random items 1.6000000 18.4000000 49.22105 1584.779 0.08000000
## recall TPR FPR
## 1 0.002314131 0.002314131 0.0005351756
## 2 0.007203976 0.007203976 0.0016419898
## 3 0.008970023 0.008970023 0.0028166748
## 4 0.016294109 0.016294109 0.0057157308
## 5 0.023149457 0.023149457 0.0085937135
## 6 0.032354085 0.032354085 0.0114554966
result1[,1] <- paste(result1[,1],c(1, 3, 5, 10, 15, 20))
temp_result1 <- result1[,c(1,6,7)]
f<-function(p,r){
return(2*p*r)/(p+r)
}
result1_f <- cbind(result1,f=f(temp_result1[,2],temp_result1[,3]))
head(result1_f)
## .id TP FP FN TN precision
## 1 random items 1 0.1368421 0.8631579 50.68421 1602.316 0.13684211
## 2 random items 3 0.3578947 2.6421053 50.46316 1600.537 0.11929825
## 3 random items 5 0.4736842 4.5263158 50.34737 1598.653 0.09473684
## 4 random items 10 0.8210526 9.1789474 50.00000 1594.000 0.08210526
## 5 random items 15 1.2000000 13.8000000 49.62105 1589.379 0.08000000
## 6 random items 20 1.6000000 18.4000000 49.22105 1584.779 0.08000000
## recall TPR FPR f
## 1 0.002314131 0.002314131 0.0005351756 0.0006333412
## 2 0.007203976 0.007203976 0.0016419898 0.0017188434
## 3 0.008970023 0.008970023 0.0028166748 0.0016995834
## 4 0.016294109 0.016294109 0.0057157308 0.0026756642
## 5 0.023149457 0.023149457 0.0085937135 0.0037039131
## 6 0.032354085 0.032354085 0.0114554966 0.0051766535
head(result1_f[order(-result1_f$f),])
## .id TP FP FN TN precision
## 18 user-based CF 20 6.431579 13.568421 44.38947 1589.611 0.3215789
## 17 user-based CF 15 5.210526 9.789474 45.61053 1593.389 0.3473684
## 16 user-based CF 10 3.842105 6.157895 46.97895 1597.021 0.3842105
## 15 user-based CF 5 2.200000 2.800000 48.62105 1600.379 0.4400000
## 12 popular items 20 5.663158 14.336842 45.15789 1588.842 0.2831579
## 11 popular items 15 4.589474 10.410526 46.23158 1592.768 0.3059649
## recall TPR FPR f
## 18 0.18780517 0.18780517 0.008388672 0.12078837
## 17 0.15994608 0.15994608 0.006050759 0.11112044
## 16 0.12814236 0.12814236 0.003805340 0.09846728
## 15 0.08364741 0.08364741 0.001730485 0.07360972
## 12 0.12898428 0.12898428 0.008860610 0.07304584
## 11 0.10978560 0.10978560 0.006430933 0.06718108
#带入模型
moive_re <- Recommender(MovieLense,method="UBCF")
moives_pr <- predict(moive_re,MovieLense,n=20)
print(as(moives_pr,"list")[1:5])
## $`1`
## [1] "Glory (1989)"
## [2] "Schindler's List (1993)"
## [3] "Casablanca (1942)"
## [4] "Close Shave, A (1995)"
## [5] "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)"
## [6] "Leaving Las Vegas (1995)"
## [7] "One Flew Over the Cuckoo's Nest (1975)"
## [8] "Rear Window (1954)"
## [9] "Heathers (1989)"
## [10] "L.A. Confidential (1997)"
## [11] "City of Lost Children, The (1995)"
## [12] "Butch Cassidy and the Sundance Kid (1969)"
## [13] "Titanic (1997)"
## [14] "Lawrence of Arabia (1962)"
## [15] "Shine (1996)"
## [16] "Stand by Me (1986)"
## [17] "Gandhi (1982)"
## [18] "To Kill a Mockingbird (1962)"
## [19] "In the Name of the Father (1993)"
## [20] "Harold and Maude (1971)"
##
## $`2`
## [1] "Boot, Das (1981)"
## [2] "Dead Man Walking (1995)"
## [3] "Lone Star (1996)"
## [4] "Return of the Jedi (1983)"
## [5] "Celluloid Closet, The (1995)"
## [6] "Casablanca (1942)"
## [7] "Citizen Kane (1941)"
## [8] "Godfather: Part II, The (1974)"
## [9] "2001: A Space Odyssey (1968)"
## [10] "When We Were Kings (1996)"
## [11] "Diva (1981)"
## [12] "Close Shave, A (1995)"
## [13] "Tango Lesson, The (1997)"
## [14] "Beautiful Thing (1996)"
## [15] "Empire Strikes Back, The (1980)"
## [16] "Mrs. Dalloway (1997)"
## [17] "Butch Cassidy and the Sundance Kid (1969)"
## [18] "My Fair Lady (1964)"
## [19] "Bonnie and Clyde (1967)"
## [20] "Annie Hall (1977)"
##
## $`3`
## [1] "Mrs. Brown (Her Majesty, Mrs. Brown) (1997)"
## [2] "Star Wars (1977)"
## [3] "Pulp Fiction (1994)"
## [4] "English Patient, The (1996)"
## [5] "Full Monty, The (1997)"
## [6] "Lone Star (1996)"
## [7] "Titanic (1997)"
## [8] "Sweet Hereafter, The (1997)"
## [9] "In the Company of Men (1997)"
## [10] "Willy Wonka and the Chocolate Factory (1971)"
## [11] "In & Out (1997)"
## [12] "Vertigo (1958)"
## [13] "As Good As It Gets (1997)"
## [14] "Apt Pupil (1998)"
## [15] "Dazed and Confused (1993)"
## [16] "Ice Storm, The (1997)"
## [17] "This Is Spinal Tap (1984)"
## [18] "Trainspotting (1996)"
## [19] "Heat (1995)"
## [20] "Fargo (1996)"
##
## $`4`
## [1] "Titanic (1997)" "English Patient, The (1996)"
## [3] "L.A. Confidential (1997)" "Game, The (1997)"
## [5] "Good Will Hunting (1997)" "Kiss the Girls (1997)"
## [7] "Full Monty, The (1997)" "Usual Suspects, The (1995)"
## [9] "Rosewood (1997)" "Boogie Nights (1997)"
## [11] "Raise the Red Lantern (1991)" "Pulp Fiction (1994)"
## [13] "Toy Story (1995)" "Love Jones (1997)"
## [15] "Eve's Bayou (1997)" "Edge, The (1997)"
## [17] "Sting, The (1973)" "Some Like It Hot (1959)"
## [19] "Strictly Ballroom (1992)" "Soul Food (1997)"
##
## $`5`
## [1] "Terminator 2: Judgment Day (1991)"
## [2] "Terminator, The (1984)"
## [3] "Usual Suspects, The (1995)"
## [4] "Contact (1997)"
## [5] "Braveheart (1995)"
## [6] "Casablanca (1942)"
## [7] "Twelve Monkeys (1995)"
## [8] "Godfather, The (1972)"
## [9] "Shawshank Redemption, The (1994)"
## [10] "Raising Arizona (1987)"
## [11] "Amadeus (1984)"
## [12] "Nikita (La Femme Nikita) (1990)"
## [13] "Reservoir Dogs (1992)"
## [14] "Citizen Kane (1941)"
## [15] "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)"
## [16] "Schindler's List (1993)"
## [17] "Titanic (1997)"
## [18] "Leaving Las Vegas (1995)"
## [19] "North by Northwest (1959)"
## [20] "Army of Darkness (1993)"
library(recommenderlab)
# 数据构成
set.seed(1234)
m <- matrix(sample(c(as.numeric(0:5), NA), 50,
replace = TRUE, prob = c(rep(.4/6, 6), .6 )), ncol = 10, dimnames = list(user = paste("u", 1:5,sep = ''), item=paste("i", 1:10, sep = '' )))
r <- as(m, "realRatingMatrix")
m
## item
## user i1 i2 i3 i4 i5 i6 i7 i8 i9 i10
## u1 NA 2 3 5 NA 5 NA 4 NA NA
## u2 2 NA NA NA NA NA NA NA 2 3
## u3 2 NA NA NA NA 1 NA NA NA NA
## u4 2 2 1 NA NA 5 NA 0 2 NA
## u5 5 NA NA NA NA NA NA 5 NA 4
r
## 5 x 10 rating matrix of class 'realRatingMatrix' with 19 ratings.
# 运用recommenderlab包实现协同过滤推荐,其数据类型采用S4类构造,使用抽象的raringMatrix为评分数据提供接口。 先自行构建一个包含缺失值的矩阵,行为u1-u5个用户,列为i1-i10的物品。 需通过as()函数转为raringMatrix类型。 raringMatrix采用了很多类似矩阵对象的操作,如dim(),dimnames(),rowCounts(),colMeans(),rowMeans(),colSums(),rowMeans()、、也增加了一些特别的操作方法,如sample(),用于从用户(即,行)中抽样,image()可以生成像素图
# 数据转换
head(as(r , "data.frame"))
## user item rating
## 5 u1 i2 2
## 7 u1 i3 3
## 9 u1 i4 5
## 10 u1 i6 5
## 13 u1 i8 4
## 1 u2 i1 2
n <- normalize(r)
image(r, main = "Raw rating")
image(n, main = "normalized rating")
#二元分类转换
r_b <- binarize(r, minRating = 4)
b <- as(r_b , "matrix")
b
## i1 i2 i3 i4 i5 i6 i7 i8 i9 i10
## u1 FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE
## u2 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## u3 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## u4 FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## u5 TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
# 数据也可转换为数据框类型,或通过normalize()函数进行标准化处理,标准化的目的是为了去除用户评分的偏差、通过binarize()函数转为binaryRatingMatrix类型,适用于是0-1的评分矩阵(赞/踩)
# 数据可视化
# 为更好说明推荐系统的算法,我们采用非常出名的MovieLense数据集,其收集了网站MovieLens(movielens.umn.edu)从1997年9月19日到1998年4月22日的数据,包括943名用户对1664部电影的评分。
# getRatings()函数可获取评价数据。
# image()可查看数据分布
data(MovieLense)
r <- sample(MovieLense, 943, replace = F)
image(MovieLense)
hist(getRatings(normalize(MovieLense)), breaks = 100)
hist(rowCounts(r), breaks = 50)
# 创建推荐
# 查看推荐方法
# 我们已经获得了评分数据,现在来看看recommenderlab包可以实现哪些推荐算法呢!
# 可以运用recommenderRegistry$get_entry_names()函数实现。
# 参数realRatingMatrix表示数据类型是推荐评分型(如1-5颗星评价),可具体查看算法说明。
# 对于realRatingMatrix有六种方法:IBCF(基于物品的推荐)、UBCF(基于用户的推荐)、PCA(主成分分析)、 RANDOM(随机推荐)、SVD(矩阵因子化)、POPULAR(基于流行度的推荐)
recommenderRegistry$get_entry_names()
## [1] "ALS_realRatingMatrix" "ALS_implicit_realRatingMatrix"
## [3] "ALS_implicit_binaryRatingMatrix" "AR_binaryRatingMatrix"
## [5] "IBCF_binaryRatingMatrix" "IBCF_realRatingMatrix"
## [7] "POPULAR_binaryRatingMatrix" "POPULAR_realRatingMatrix"
## [9] "RANDOM_realRatingMatrix" "RANDOM_binaryRatingMatrix"
## [11] "RERECOMMEND_realRatingMatrix" "SVD_realRatingMatrix"
## [13] "SVDF_realRatingMatrix" "UBCF_binaryRatingMatrix"
## [15] "UBCF_realRatingMatrix"
# 建立模型
# 建立协同过滤推荐算法模型,主要运用 Recommender(data=ratingMatrix,method,parameter=NULL)函数,getModel()可查看模型参数
r_recom <- Recommender(r , method = "IBCF")
r_popul <- Recommender(r , method = "POPULAR")
# 查看模型方法
names(getModel(r_recom))
## [1] "description" "sim" "k"
## [4] "method" "normalize" "normalize_sim_matrix"
## [7] "alpha" "na_as_zero" "verbose"
names(getModel(r_popul))
## [1] "topN" "ratings" "normalize"
## [4] "aggregationRatings" "aggregationPopularity" "verbose"
# 模型预测
# TOP-N预测
# 对模型预测可运用predict()函数,在此分别以TOP-N预测及评分预测为例,预测第940-943位观影者的评分情况。 n表示最终为TOP-N的列表推荐,参数type = "ratings"表示运用评分预测观影者对电影评分,模型结果均需转为list或矩阵表示
pred <- predict(r_popul ,r[940:943], n = 5)
as(pred, "list")
## $`877`
## [1] "Star Wars (1977)" "Godfather, The (1972)"
## [3] "Fargo (1996)" "Raiders of the Lost Ark (1981)"
## [5] "Titanic (1997)"
##
## $`476`
## [1] "Star Wars (1977)" "Godfather, The (1972)"
## [3] "Fargo (1996)" "Raiders of the Lost Ark (1981)"
## [5] "Silence of the Lambs, The (1991)"
##
## $`929`
## [1] "Titanic (1997)" "Shawshank Redemption, The (1994)"
## [3] "Return of the Jedi (1983)" "L.A. Confidential (1997)"
## [5] "Princess Bride, The (1987)"
##
## $`366`
## [1] "Star Wars (1977)" "Godfather, The (1972)"
## [3] "Fargo (1996)" "Raiders of the Lost Ark (1981)"
## [5] "Titanic (1997)"
#top-N为有序列表,抽取最优推荐子集
pred3 <- bestN(pred, n=3)
as(pred3, "list")
## $`877`
## [1] "Star Wars (1977)" "Godfather, The (1972)" "Fargo (1996)"
##
## $`476`
## [1] "Star Wars (1977)" "Godfather, The (1972)" "Fargo (1996)"
##
## $`929`
## [1] "Titanic (1997)" "Shawshank Redemption, The (1994)"
## [3] "Return of the Jedi (1983)"
##
## $`366`
## [1] "Star Wars (1977)" "Godfather, The (1972)" "Fargo (1996)"
#评分预测
rate <- predict(r_popul, r[940:943], type = "ratings")
as(rate, "matrix")[, 1:5]
## Toy Story (1995) GoldenEye (1995) Four Rooms (1995) Get Shorty (1995)
## 877 4.126962 3.574065 3.421685 3.797723
## 476 3.665655 3.112759 2.960378 NA
## 929 NA 3.440782 3.288402 3.664440
## 366 4.693741 4.140844 3.988464 4.364502
## Copycat (1995)
## 877 3.621840
## 476 3.160534
## 929 3.488557
## 366 4.188619
# 预测模型评价
# 评分预测模型评价
# 建立推荐系统模型后,非常关心的是对预测模型的评价。
# 可通过evaluationScheme()将数据按一定规则分为训练集和测试集(参数method = "split",),
# 或进行k-fold交叉验证(如method = "cross",k=4),
# given参数表示用来进行模型评价的items的数量。
# 分别运用UBCF及IBCF算法,进行预测评价。
#
# getData(e,"train")表示获取训练集数据,
# predict(r1,getData(e,"known"),type = "ratings")表示对“已知”训练集数据进行预测。
# 计算预测模型的准确度可通过calcPredictionAccuracy()函数实现,参数“unknown”表示对“未知”test集进行比较。
# 结果发现两种方法的均方根误差(RMSE)基本一致
e <- evaluationScheme(r[1:800], method = "split", train = 0.9, given = 15, goodRating = 5)
e
## Evaluation scheme with 15 items given
## Method: 'split' with 1 run(s).
## Training set proportion: 0.900
## Good ratings: >=5.000000
## Data set: 800 x 1664 rating matrix of class 'realRatingMatrix' with 83043 ratings.
r1 <- Recommender(getData(e , "train") , "UBCF")
p1 <- predict(r1 , getData(e , "known") , type = "ratings")
r2 <- Recommender(getData(e , "train") , "IBCF")
p2 <- predict(r2 , getData(e , "known") , type = "ratings")
##计算预测模型的准确度
c1 <- calcPredictionAccuracy(p1, getData(e , "unknown"))
c2 <- calcPredictionAccuracy(p2, getData(e , "unknown"))
error <- rbind(c1, c2)
rownames(error) <- c("UBCF", "IBCF")
error
## RMSE MSE MAE
## UBCF 1.053345 1.109537 0.8338755
## IBCF 1.172571 1.374924 0.8580037
# TOP-N预测模型评价
# 让我们来评价TOP-1,TOP-3,TOP-5,TOP-10推荐准确性。
# 通过4-fold交叉验证方法分割数据集,运用evaluate()进行TOP-N预测模型评价。
# 评价结果可通过ROC曲线及准确率-召回率曲线展示
# 4-fold交叉验证
tops <- evaluationScheme(r[1:800], method = "cross", k = 4, given = 3, goodRating = 5)
tops
## Evaluation scheme with 3 items given
## Method: 'cross-validation' with 4 run(s).
## Good ratings: >=5.000000
## Data set: 800 x 1664 rating matrix of class 'realRatingMatrix' with 83043 ratings.
results <- evaluate(tops, method = "POPULAR", type = "topNList", n=c(1, 3, 5, 10))
## POPULAR run fold/sample [model time/prediction time]
## 1 [0sec/0.67sec]
## 2 [0sec/0.61sec]
## 3 [0.01sec/0.66sec]
## 4 [0.02sec/0.63sec]
# 获得混淆矩阵
getConfusionMatrix(results)[[1]]
## TP FP FN TN precision recall TPR FPR
## 1 0.39 0.61 19.80 1640.20 0.3900000 0.02826095 0.02826095 0.0003701392
## 3 0.83 2.17 19.36 1638.64 0.2766667 0.05467628 0.05467628 0.0013176701
## 5 1.24 3.76 18.95 1637.05 0.2480000 0.08633353 0.08633353 0.0022851454
## 10 2.12 7.88 18.07 1632.93 0.2120000 0.12988107 0.12988107 0.0047891164
avg(results)
## TP FP FN TN precision recall TPR
## 1 0.34000 0.66000 20.75000 1639.250 0.3400000 0.02278813 0.02278813
## 3 0.75250 2.24750 20.33750 1637.663 0.2508333 0.05391753 0.05391753
## 5 1.15625 3.84375 19.93375 1636.066 0.2312500 0.07884788 0.07884788
## 10 2.05125 7.94875 19.03875 1631.961 0.2051250 0.12853235 0.12853235
## FPR
## 1 0.0004008696
## 3 0.0013665909
## 5 0.0023378733
## 10 0.0048350401
# ROC曲线
plot(results, annotate = TRUE)
# 准确率-召回率曲线
plot(results, "prec/rec", annotate = TRUE)
# 推荐算法的比较
# 除了对预测模型进行评价,还可以对不同推荐算法进行比较。 可首先构建一个推荐算法列表,通过ROC曲线、、准确率-召回率曲线或RMSE直方图进行比较
# TOP-N算法比较
set.seed(2017)
scheme <- evaluationScheme(r, method = "split", train = 0.9, k = 1, given = 10, goodRating = 5)
# 构建推荐算法列表
algorithms <- list(
"random items"=list(name = "RANDOM", param = NULL),
"popular items" = list(name = "POPULAR", param = list(normalize="Z-score")),
"user-based CF" = list(name = "UBCF", param = list(normalize="Z-score", method = "Cosine", nn = 25, minRating = 3)),
"item-based CF" = list(name = "IBCF", param = list(k = 50)),
"SVD approximation" = list(name = "SVD", param = list(approxRank = 50)))
# 构建不同算法模型
results <- evaluate(scheme, algorithms, n = c(1, 3, 5, 10, 15, 20))
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/0.21sec]
## POPULAR run fold/sample [model time/prediction time]
## 1 [0.03sec/0.25sec]
## UBCF run fold/sample [model time/prediction time]
## 1
## Warning: Unknown parameters: minRating
## Available parameter (with default values):
## method = cosine
## nn = 25
## sample = FALSE
## normalize = center
## verbose = FALSE
## [0.03sec/0.91sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [22.62sec/0.07sec]
## SVD run fold/sample [model time/prediction time]
## 1
## Warning: Unknown parameter: approxRank
## Available parameter (with default values):
## k = 10
## maxiter = 100
## normalize = center
## verbose = FALSE
## [0.44sec/0.2sec]
# 模型比较 #ROC曲线
plot(results, annotate = c(1, 3) , legend = "bottomright")
# 准确率-召回率曲线
plot(results, "prec/rec", annotate = c(2, 3, 4), legend = "topleft")
# 预测评分算法比较
results2 <- evaluate(scheme, algorithms, type = "ratings")
## RANDOM run fold/sample [model time/prediction time]
## 1 [0sec/0.07sec]
## POPULAR run fold/sample [model time/prediction time]
## 1 [0.04sec/0.03sec]
## UBCF run fold/sample [model time/prediction time]
## 1
## Warning: Unknown parameters: minRating
## Available parameter (with default values):
## method = cosine
## nn = 25
## sample = FALSE
## normalize = center
## verbose = FALSE
## [0.03sec/0.69sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [23.61sec/0.04sec]
## SVD run fold/sample [model time/prediction time]
## 1
## Warning: Unknown parameter: approxRank
## Available parameter (with default values):
## k = 10
## maxiter = 100
## normalize = center
## verbose = FALSE
## [0.52sec/0.04sec]
plot(results2, ylim = c(0, 20))
# 应用新的推荐算法
# 当然在recommenderlab包说明文档中也有自定义新的推荐算法的举例,通过编辑推荐算法的函数实现符合实际的推荐算法。 但要说明的是运用R进行协同过滤推荐并不是推荐系统应用的主流,主要原因在于算法调整不灵活,并且R主要依靠内存的单线程计算,并不太适用于过大规模的推荐应用
# 1). 建立数据模型
library(reshape2)
file <- 'C:/Users/abdata/Desktop/testCF.csv'
FileDataModel <- function(file){
data <- read.csv(file,header=TRUE)
names(data) <- c("uid","iid","pref")
M <- dcast(uid ~ iid,data=data)
M <- as.matrix(M[-1])
rownames(M) <- unique(data$uid)
colnames(M) <- unique(sort(data$iid))
M
}
M <- FileDataModel(file)
## Using pref as value column: use value.var to override.
M
## 101 102 103 104 105 106 107
## 1 5.0 3.0 2.5 NA NA NA NA
## 2 2.0 2.5 5.0 2.0 NA NA NA
## 3 2.5 NA NA 4.0 4.5 NA 5
## 4 5.0 NA 3.0 4.5 NA 4 NA
## 5 4.0 3.0 2.0 4.0 3.5 4 NA
# 2). 欧氏距离相似度算法
EuclideanDistanceSimilarity <- function(M){
simple <- function(x,y){
num <- intersect(which(M[x,]!=0), which(M[y,]!=0))
s <- sum((M[x,] - M[y,])^2, na.rm = T)
s <- length(num)/(1 + sqrt(s))
if(s > 1) s <- 1
if(s < -1) s <- -1
return(s)
}
outer(1:nrow(M),1:nrow(M),Vectorize(simple)) - diag(nrow(M))
}
S <- EuclideanDistanceSimilarity(M)
S
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.0000000 0.6076560 0.2857143 1.0000000 1.0000000
## [2,] 0.6076560 0.0000000 0.6532633 0.5568464 0.7761999
## [3,] 0.2857143 0.6532633 0.0000000 0.5634581 1.0000000
## [4,] 1.0000000 0.5568464 0.5634581 0.0000000 1.0000000
## [5,] 1.0000000 0.7761999 1.0000000 1.0000000 0.0000000
# 3). 最紧邻算法
NearestNUserNeighborhood <- function(S,n){
sapply(1:n,function(i) {
m <- apply(S,2,which.max)
ij <- cbind(m,seq_along(m))
S[ij] <<- 0
m
})
}
NEIGHBORHOOD_NUM <- 2
N <- NearestNUserNeighborhood(S,NEIGHBORHOOD_NUM)
N
## [,1] [,2]
## [1,] 4 5
## [2,] 5 3
## [3,] 5 2
## [4,] 1 5
## [5,] 1 3
# 4). 推荐算法
RECOMMENDER_NUM <- 3
UserBasedRecommender <- function(uid,n,M,S,N){
part <- colnames(M)[is.na(M[uid,])]
m <- S[uid, N[uid,]]
md <- M[N[uid,],part]
if(length(dim(md)) >= 1) {
a <- colnames(md)[colSums(is.na(md)) == 0]
score <- colSums(md*m,na.rm = T)/apply(!is.na(md),2,function(x) sum(m[x]))
res <- score[order(score,decreasing=T)][1:n]
res <- res[intersect(names(res),a)]
return(res)
}else {
res <- NA
}
}
# 5). 运行程序
R1 <- UserBasedRecommender(1,RECOMMENDER_NUM,M,S,N)
R1
## 104 106
## 4.25 4.00
R2 <- UserBasedRecommender(2,RECOMMENDER_NUM,M,S,N)
R2
## 105
## 3.956999
R3 <- UserBasedRecommender(3,RECOMMENDER_NUM,M,S,N)
R3
## 103 102
## 3.185407 2.802432
R4 <- UserBasedRecommender(4,RECOMMENDER_NUM,M,S,N)
R4
## 102
## 3
R5 <- UserBasedRecommender(5,RECOMMENDER_NUM,M,S,N)
R5
## [1] NA
在实际案例运用过程中关联规则与协同过滤的区别在于,关联规则推荐的是本来就很热门的产品,因为代表同时发生频率越高,关联性越强。在网商时代会背弃长尾效应,让差异扩大,2/8定律会一定程度上扩充至1/9,助长马太效应。一般要推荐冷门产品会使用协同过滤。