前言

现实世界中数据一般都是复杂和高维的,比如描述一个人,有姓名、年龄、性别、受教育程度、收入、地址、电话等等几十种属性,如此多的属性对于数据分析是一个严重的挑战,除了极大增加建模的成本和模型的复杂度,往往也会导致过拟合问题,因此在实际处理过程中,一些降维的方法是必不可少的,其中用的比较多的有主成分分析(PCA)[Principal Component Analysis]、奇异值分解(SVD)[Singular Value Decomposition]、特征选择(Feature Select)等等。

主成分分析(PCA)-R的计算过程

# Principal component analysis of student body measurements
# X1: height  X2: weight  X3: chest circumference  X4: sitting height
measurements <- c(
  # X1 (height)
  148, 139, 160, 149, 159, 142, 153, 150, 151, 139,
  140, 161, 158, 140, 137, 152, 149, 145, 160, 156,
  151, 147, 157, 147, 157, 151, 144, 141, 139, 148,
  # X2 (weight)
  41, 34, 49, 36, 45, 31, 43, 43, 42, 31,
  29, 47, 49, 33, 31, 35, 47, 35, 47, 44,
  42, 38, 39, 30, 48, 36, 36, 30, 32, 38,
  # X3 (chest circumference)
  72, 71, 77, 67, 80, 66, 76, 77, 77, 68,
  64, 78, 78, 67, 66, 73, 82, 70, 74, 78,
  73, 73, 68, 65, 80, 74, 68, 67, 68, 70,
  # X4 (sitting height)
  78, 76, 86, 79, 86, 76, 83, 79, 80, 74,
  74, 84, 83, 77, 73, 79, 79, 77, 87, 85,
  82, 78, 80, 75, 88, 80, 76, 76, 73, 78
)
# 30 observations x 4 variables, filled column-major
student <- data.frame(
  matrix(measurements, ncol = 4,
         dimnames = list(NULL, c("X1", "X2", "X3", "X4")))
)

# Manual PCA computation, step by step
# Step 1: covariance matrix of the four measurements
print(student.cov <- cov(student))
##          X1       X2       X3       X4
## X1 53.51724 40.79310 27.58621 28.75862
## X2 40.79310 41.73448 29.83103 24.35517
## X3 27.58621 29.83103 26.52989 17.22184
## X4 28.75862 24.35517 17.22184 18.24023
# Step 2: eigendecomposition of the covariance matrix
student.eigen <- eigen(student.cov)
# eigenvalues
student.eigen$values
## [1] 124.814224  10.848728   2.495491   1.863396
# eigenvectors
student.eigen$vectors
##            [,1]       [,2]        [,3]        [,4]
## [1,] -0.6240226  0.6455638  0.22363789 -0.37924835
## [2,] -0.5591912 -0.3456392 -0.74576837 -0.10801960
## [3,] -0.4083340 -0.6604700  0.62447014 -0.08414179
## [4,] -0.3621661  0.1660133  0.06206998  0.91510798
# Step 3: proportion of variance explained, and its cumulative sum
print(student.pov <- student.eigen$values / sum(student.eigen$values))
## [1] 0.89139112 0.07747883 0.01782215 0.01330789
print(student.cp <- cumsum(student.pov))
## [1] 0.8913911 0.9688700 0.9866921 1.0000000
# Step 4: standard deviation of each principal component
print(student.z <- sqrt(student.eigen$values))
## [1] 11.172029  3.293741  1.579712  1.365062
# PCA via the princomp() function

library(stats)

# cor = TRUE: run the analysis on the sample correlation matrix R
test.pr <- princomp(student, cor=TRUE)  

# cor = FALSE would use the sample covariance matrix S instead
# loadings = TRUE: include the loadings in the printed summary
summary(test.pr,loadings=TRUE)  
## Importance of components:
##                           Comp.1     Comp.2     Comp.3     Comp.4
## Standard deviation     1.8817805 0.55980636 0.28179594 0.25711844
## Proportion of Variance 0.8852745 0.07834579 0.01985224 0.01652747
## Cumulative Proportion  0.8852745 0.96362029 0.98347253 1.00000000
## 
## Loadings:
##    Comp.1 Comp.2 Comp.3 Comp.4
## X1 -0.497  0.543 -0.450  0.506
## X2 -0.515 -0.210 -0.462 -0.691
## X3 -0.481 -0.725  0.175  0.461
## X4 -0.507  0.368  0.744 -0.232
# Meaning of the summary rows:
#----Standard deviation: its square is the variance (= the eigenvalue)
#----Proportion of Variance: share of total variance explained
#----Cumulative Proportion: cumulative share of variance explained

# The first two components already account for ~96% of the variance,
# so the other two can be dropped — this is the dimension reduction.

# The Comp.1 / Comp.2 loadings columns give the component expressions:
# Z1 = -0.497X'1-0.515X'2-0.481X'3-0.507X'4
# Z2 =  0.543X'1-0.210X'2-0.725X'3+0.368X'4
# (the X4 coefficient of Z2 is +0.368, matching the loadings above)

# loadings() returns the loadings — the coefficients of the original
# (standardized) variables in each principal component, i.e. the Q matrix
loadings(test.pr)
## 
## Loadings:
##    Comp.1 Comp.2 Comp.3 Comp.4
## X1 -0.497  0.543 -0.450  0.506
## X2 -0.515 -0.210 -0.462 -0.691
## X3 -0.481 -0.725  0.175  0.461
## X4 -0.507  0.368  0.744 -0.232
## 
##                Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings      1.00   1.00   1.00   1.00
## Proportion Var   0.25   0.25   0.25   0.25
## Cumulative Var   0.25   0.50   0.75   1.00
head(test.pr$scores)
##          Comp.1      Comp.2      Comp.3      Comp.4
## [1,]  0.0699095 -0.23813701 -0.35509248 -0.26612014
## [2,]  1.5952634 -0.71847399  0.32813232 -0.11805665
## [3,] -2.8479315  0.38956679 -0.09731731 -0.27948249
## [4,]  0.7599699  0.80604335 -0.04945722 -0.16294930
## [5,] -2.7396678  0.01718087  0.36012615  0.35865304
## [6,]  2.1058317  0.32284393  0.18600422 -0.03645608
# Scree plot of the components, then the per-observation scores

screeplot(test.pr, type="lines")

p <- predict(test.pr)
p
##            Comp.1      Comp.2      Comp.3       Comp.4
##  [1,]  0.06990950 -0.23813701 -0.35509248 -0.266120139
##  [2,]  1.59526340 -0.71847399  0.32813232 -0.118056646
##  [3,] -2.84793151  0.38956679 -0.09731731 -0.279482487
##  [4,]  0.75996988  0.80604335 -0.04945722 -0.162949298
##  [5,] -2.73966777  0.01718087  0.36012615  0.358653044
##  [6,]  2.10583168  0.32284393  0.18600422 -0.036456084
##  [7,] -1.42105591 -0.06053165  0.21093321 -0.044223092
##  [8,] -0.82583977 -0.78102576 -0.27557798  0.057288572
##  [9,] -0.93464402 -0.58469242 -0.08814136  0.181037746
## [10,]  2.36463820 -0.36532199  0.08840476  0.045520127
## [11,]  2.83741916  0.34875841  0.03310423 -0.031146930
## [12,] -2.60851224  0.21278728 -0.33398037  0.210157574
## [13,] -2.44253342 -0.16769496 -0.46918095 -0.162987830
## [14,]  1.86630669  0.05021384  0.37720280 -0.358821916
## [15,]  2.81347421 -0.31790107 -0.03291329 -0.222035112
## [16,]  0.06392983  0.20718448  0.04334340  0.703533624
## [17,] -1.55561022 -1.70439674 -0.33126406  0.007551879
## [18,]  1.07392251 -0.06763418  0.02283648  0.048606680
## [19,] -2.52174212  0.97274301  0.12164633 -0.390667991
## [20,] -2.14072377  0.02217881  0.37410972  0.129548960
## [21,] -0.79624422  0.16307887  0.12781270 -0.294140762
## [22,]  0.28708321 -0.35744666 -0.03962116  0.080991989
## [23,] -0.25151075  1.25555188 -0.55617325  0.109068939
## [24,]  2.05706032  0.78894494 -0.26552109  0.388088643
## [25,] -3.08596855 -0.05775318  0.62110421 -0.218939612
## [26,] -0.16367555  0.04317932  0.24481850  0.560248997
## [27,]  1.37265053  0.02220972 -0.23378320 -0.257399715
## [28,]  2.16097778  0.13733233  0.35589739  0.093123683
## [29,]  2.40434827 -0.48613137 -0.16154441 -0.007914021
## [30,]  0.50287468  0.14734317 -0.20590831 -0.122078819

主成分分析(PCA)-R包使用

在R语言中PCA对应函数是princomp,来自stats包。以美国的各州犯罪数据为对象进行分析,数据集USArrests在datasets包中(R启动时默认加载)。

library(stats)  
head(USArrests)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7
summary(pc.cr <- princomp(USArrests, cor = TRUE))
## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4
## Standard deviation     1.5748783 0.9948694 0.5971291 0.41644938
## Proportion of Variance 0.6200604 0.2474413 0.0891408 0.04335752
## Cumulative Proportion  0.6200604 0.8675017 0.9566425 1.00000000
# Share of total variance per component: Comp.1 + Comp.2 together exceed
# 85%, so the first two components represent the whole data set well,
# reducing it from 4 dimensions to 2


# How much each original variable contributes to each component
loadings(pc.cr)
## 
## Loadings:
##          Comp.1 Comp.2 Comp.3 Comp.4
## Murder   -0.536  0.418 -0.341  0.649
## Assault  -0.583  0.188 -0.268 -0.743
## UrbanPop -0.278 -0.873 -0.378  0.134
## Rape     -0.543 -0.167  0.818       
## 
##                Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings      1.00   1.00   1.00   1.00
## Proportion Var   0.25   0.25   0.25   0.25
## Cumulative Var   0.25   0.50   0.75   1.00
# The loadings translate directly into linear combinations:
# Comp1 = -0.536 * Murder + (-0.583) * Assault + (-0.278)*UrbanPop + (-0.543)* Rape
# Comp2 = 0.418 * Murder + 0.188 * Assault + (-0.873)*UrbanPop + (-0.167)* Rape
# Comp1/Comp2 give a two-dimensional representation of the states,
# suitable for showing their cluster structure in a scatter plot.

# scores holds every state's score on the four components
head(pc.cr$scores) 
##                Comp.1     Comp.2      Comp.3       Comp.4
## Alabama    -0.9855659  1.1333924 -0.44426879  0.156267145
## Alaska     -1.9501378  1.0732133  2.04000333 -0.438583440
## Arizona    -1.7631635 -0.7459568  0.05478082 -0.834652924
## Arkansas    0.1414203  1.1197968  0.11457369 -0.182810896
## California -2.5239801 -1.5429340  0.59855680 -0.341996478
## Colorado   -1.5145629 -0.9875551  1.09500699  0.001464887
# Keep only the first two components, as a data.frame for plotting later
stats.arrests <- data.frame(pc.cr$scores[, 1:2])
head(stats.arrests)
##                Comp.1     Comp.2
## Alabama    -0.9855659  1.1333924
## Alaska     -1.9501378  1.0732133
## Arizona    -1.7631635 -0.7459568
## Arkansas    0.1414203  1.1197968
## California -2.5239801 -1.5429340
## Colorado   -1.5145629 -0.9875551
# Plot the states' distribution: outliers and possible clusters show up
library(ggplot2)
ggplot(stats.arrests, aes(x = Comp.1, y = Comp.2)) +
  geom_text(label = rownames(stats.arrests), size = 4, alpha = 0.75) +
  xlab("First Component") +
  ylab("Second Component")

奇异值分解(SVD)-R包使用

我们通过一张图片的处理来展示奇异值分解的魅力所在,对于图片的处理会用到R语言中raster和jpeg两个包。

## Load the image and display it
library(raster)
library(jpeg)

# NOTE(review): the file read below is a .png, while the jpeg package is
# loaded but never called — confirm which format/package is intended
raster.photo <- raster("C:/Users/abdata/Pictures/img/1082114.png")
photo.flip <- flip(raster.photo, direction = "y")

## Convert the raster object to a plain numeric matrix
photo.raster <- t(as.matrix(photo.flip))
dim(photo.raster)
## [1] 256 256
image(photo.raster, col = grey(seq(0, 1, length = 256)))  ## greyscale display

# Singular value decomposition: photo.raster = u %*% d %*% t(v)
photo.svd <- svd(photo.raster)
d <- diag(photo.svd$d)
v <- as.matrix(photo.svd$v)
u <- photo.svd$u
# Rank-1 approximation from the leading singular value only (see figure)
photo1 <- photo.svd$d[1] * (u[, 1, drop = FALSE] %*% t(v[, 1, drop = FALSE]))
image(photo1, col = grey(seq(0, 1, length = 256)))

# Rank-50 approximation: essentially restores the original image (see figure)
k <- 50
photo2 <- u[, seq_len(k), drop = FALSE] %*%
  d[seq_len(k), seq_len(k)] %*%
  t(v[, seq_len(k), drop = FALSE])
image(photo2, col = grey(seq(0, 1, length = 256)))