Real-world data is usually complex and high-dimensional. To describe a person, for instance, there are dozens of attributes: name, age, gender, education, income, address, phone number, and so on. So many attributes pose a serious challenge for data analysis: besides greatly increasing the cost of modeling and the complexity of the model, they often lead to overfitting. In practice, dimensionality reduction methods are therefore indispensable; among the most commonly used are principal component analysis (PCA), singular value decomposition (SVD), and feature selection.
# PCA of student body measurements
# X1: height  X2: weight  X3: chest circumference  X4: sitting height
student <- data.frame(
X1 = c(148, 139, 160, 149, 159, 142, 153, 150, 151, 139,
140, 161, 158, 140, 137, 152, 149, 145, 160, 156,
151, 147, 157, 147, 157, 151, 144, 141, 139, 148),
X2 = c(41, 34, 49, 36, 45, 31, 43, 43, 42, 31,
29, 47, 49, 33, 31, 35, 47, 35, 47, 44,
42, 38, 39, 30, 48, 36, 36, 30, 32, 38),
X3 = c(72, 71, 77, 67, 80, 66, 76, 77, 77, 68,
64, 78, 78, 67, 66, 73, 82, 70, 74, 78,
73, 73, 68, 65, 80, 74, 68, 67, 68, 70),
X4 = c(78, 76, 86, 79, 86, 76, 83, 79, 80, 74,
74, 84, 83, 77, 73, 79, 79, 77, 87, 85,
82, 78, 80, 75, 88, 80, 76, 76, 73, 78)
)
# Manual calculation
# 1. Compute the covariance matrix
(student.cov <- cov(student))
## X1 X2 X3 X4
## X1 53.51724 40.79310 27.58621 28.75862
## X2 40.79310 41.73448 29.83103 24.35517
## X3 27.58621 29.83103 26.52989 17.22184
## X4 28.75862 24.35517 17.22184 18.24023
# 2. Compute the eigenvectors and eigenvalues of the covariance matrix
student.eigen <- eigen(student.cov)
# Eigenvalues
student.eigen$values
## [1] 124.814224 10.848728 2.495491 1.863396
# Eigenvectors
student.eigen$vectors
## [,1] [,2] [,3] [,4]
## [1,] -0.6240226 0.6455638 0.22363789 -0.37924835
## [2,] -0.5591912 -0.3456392 -0.74576837 -0.10801960
## [3,] -0.4083340 -0.6604700 0.62447014 -0.08414179
## [4,] -0.3621661 0.1660133 0.06206998 0.91510798
# 3. Compute the proportion of variance explained and the cumulative proportion
(student.pov <- (student.eigen$values/sum(student.eigen$values)))
## [1] 0.89139112 0.07747883 0.01782215 0.01330789
(student.cp <- cumsum(student.pov))
## [1] 0.8913911 0.9688700 0.9866921 1.0000000
# 4. Standard deviations of the principal components
(student.z <- sqrt(student.eigen$values))
## [1] 11.172029 3.293741 1.579712 1.365062
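# As a small sketch (not part of the original), the manual analysis can be completed
# by projecting the centered data onto the eigenvectors to obtain the component scores.
# Note this is based on the covariance matrix, so the scores differ from those of the
# cor = TRUE princomp() analysis below.
student.centered <- scale(student, center = TRUE, scale = FALSE)
student.scores <- student.centered %*% student.eigen$vectors
head(student.scores)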
# Calculation with the princomp function
library(stats)
# cor is a logical flag: cor = TRUE performs PCA on the sample correlation matrix R
test.pr <- princomp(student, cor=TRUE)
# cor = FALSE performs PCA on the sample covariance matrix S
# loadings is a logical flag: loadings = TRUE prints the loadings
summary(test.pr,loadings=TRUE)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1.8817805 0.55980636 0.28179594 0.25711844
## Proportion of Variance 0.8852745 0.07834579 0.01985224 0.01652747
## Cumulative Proportion 0.8852745 0.96362029 0.98347253 1.00000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4
## X1 -0.497 0.543 -0.450 0.506
## X2 -0.515 -0.210 -0.462 -0.691
## X3 -0.481 -0.725 0.175 0.461
## X4 -0.507 0.368 0.744 -0.232
# Interpreting the results
#---- Standard deviation: standard deviation of each component; its square is the variance, i.e. the eigenvalue
#---- Proportion of Variance: proportion of variance explained
#---- Cumulative Proportion: cumulative proportion of variance explained
# The cumulative proportion of the first two components already reaches 96%, so the other
# two components can be dropped, achieving the goal of dimensionality reduction
# This gives the component expressions (X'1..X'4 are the standardized variables, since cor = TRUE)
# Z1 = -0.497X'1 - 0.515X'2 - 0.481X'3 - 0.507X'4
# Z2 =  0.543X'1 - 0.210X'2 - 0.725X'3 + 0.368X'4
# The loadings output gives the loadings, i.e. the coefficients of the principal components
# with respect to the original variables (the Q matrix)
loadings(test.pr)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4
## X1 -0.497 0.543 -0.450 0.506
## X2 -0.515 -0.210 -0.462 -0.691
## X3 -0.481 -0.725 0.175 0.461
## X4 -0.507 0.368 0.744 -0.232
##
## Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings 1.00 1.00 1.00 1.00
## Proportion Var 0.25 0.25 0.25 0.25
## Cumulative Var 0.25 0.50 0.75 1.00
head(test.pr$scores)
## Comp.1 Comp.2 Comp.3 Comp.4
## [1,] 0.0699095 -0.23813701 -0.35509248 -0.26612014
## [2,] 1.5952634 -0.71847399 0.32813232 -0.11805665
## [3,] -2.8479315 0.38956679 -0.09731731 -0.27948249
## [4,] 0.7599699 0.80604335 -0.04945722 -0.16294930
## [5,] -2.7396678 0.01718087 0.36012615 0.35865304
## [6,] 2.1058317 0.32284393 0.18600422 -0.03645608
# Draw the scree plot of the principal components and compute the predicted scores
screeplot(test.pr, type="lines")
p <- predict(test.pr)
p
## Comp.1 Comp.2 Comp.3 Comp.4
## [1,] 0.06990950 -0.23813701 -0.35509248 -0.266120139
## [2,] 1.59526340 -0.71847399 0.32813232 -0.118056646
## [3,] -2.84793151 0.38956679 -0.09731731 -0.279482487
## [4,] 0.75996988 0.80604335 -0.04945722 -0.162949298
## [5,] -2.73966777 0.01718087 0.36012615 0.358653044
## [6,] 2.10583168 0.32284393 0.18600422 -0.036456084
## [7,] -1.42105591 -0.06053165 0.21093321 -0.044223092
## [8,] -0.82583977 -0.78102576 -0.27557798 0.057288572
## [9,] -0.93464402 -0.58469242 -0.08814136 0.181037746
## [10,] 2.36463820 -0.36532199 0.08840476 0.045520127
## [11,] 2.83741916 0.34875841 0.03310423 -0.031146930
## [12,] -2.60851224 0.21278728 -0.33398037 0.210157574
## [13,] -2.44253342 -0.16769496 -0.46918095 -0.162987830
## [14,] 1.86630669 0.05021384 0.37720280 -0.358821916
## [15,] 2.81347421 -0.31790107 -0.03291329 -0.222035112
## [16,] 0.06392983 0.20718448 0.04334340 0.703533624
## [17,] -1.55561022 -1.70439674 -0.33126406 0.007551879
## [18,] 1.07392251 -0.06763418 0.02283648 0.048606680
## [19,] -2.52174212 0.97274301 0.12164633 -0.390667991
## [20,] -2.14072377 0.02217881 0.37410972 0.129548960
## [21,] -0.79624422 0.16307887 0.12781270 -0.294140762
## [22,] 0.28708321 -0.35744666 -0.03962116 0.080991989
## [23,] -0.25151075 1.25555188 -0.55617325 0.109068939
## [24,] 2.05706032 0.78894494 -0.26552109 0.388088643
## [25,] -3.08596855 -0.05775318 0.62110421 -0.218939612
## [26,] -0.16367555 0.04317932 0.24481850 0.560248997
## [27,] 1.37265053 0.02220972 -0.23378320 -0.257399715
## [28,] 2.16097778 0.13733233 0.35589739 0.093123683
## [29,] 2.40434827 -0.48613137 -0.16154441 -0.007914021
## [30,] 0.50287468 0.14734317 -0.20590831 -0.122078819
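# predict() is more useful with new observations. A sketch with a hypothetical new
# student (the measurements below are made up for illustration):
new.student <- data.frame(X1 = 150, X2 = 40, X3 = 72, X4 = 79)
predict(test.pr, newdata = new.student)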
In R, the function for PCA is princomp, from the stats package. As an example we analyze crime data for the US states; the USArrests dataset ships with base R (in the datasets package).
princomp(): perform principal component analysis (on either the correlation matrix or the covariance matrix)
summary(): extract information about the principal components
loadings(): display the loadings from a principal component or factor analysis
predict(): compute principal component scores
screeplot(): draw the scree plot of the principal components
biplot(): plot the observations on the principal components together with the directions of the original variables (see the short sketch after the scatter plot below)
library(stats)
head(USArrests)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
summary(pc.cr <- princomp(USArrests, cor = TRUE))
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1.5748783 0.9948694 0.5971291 0.41644938
## Proportion of Variance 0.6200604 0.2474413 0.0891408 0.04335752
## Cumulative Proportion 0.6200604 0.8675017 0.9566425 1.00000000
# Proportion of variance contributed by each component: Comp.1 + Comp.2 clearly account for more than 85%, so the first two components can represent the whole dataset, reducing it from four dimensions to two
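# A scree plot (a sketch, not part of the original analysis) shows the same drop-off visually
screeplot(pc.cr, type = "lines")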
# Examine how much each original variable contributes to each principal component
loadings(pc.cr)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4
## Murder -0.536 0.418 -0.341 0.649
## Assault -0.583 0.188 -0.268 -0.743
## UrbanPop -0.278 -0.873 -0.378 0.134
## Rape -0.543 -0.167 0.818
##
## Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings 1.00 1.00 1.00 1.00
## Proportion Var 0.25 0.25 0.25 0.25
## Cumulative Var 0.25 0.50 0.75 1.00
# The loadings above translate directly into the component expressions
# Comp.1 = -0.536 * Murder + (-0.583) * Assault + (-0.278) * UrbanPop + (-0.543) * Rape
# Comp.2 =  0.418 * Murder +   0.188  * Assault + (-0.873) * UrbanPop + (-0.167) * Rape
# (applied to the standardized variables, since cor = TRUE)
# Comp.1 and Comp.2 can then represent each state, and a two-dimensional plot shows how the states cluster.
# scores contains each state's score on the four principal components
head(pc.cr$scores)
## Comp.1 Comp.2 Comp.3 Comp.4
## Alabama -0.9855659 1.1333924 -0.44426879 0.156267145
## Alaska -1.9501378 1.0732133 2.04000333 -0.438583440
## Arizona -1.7631635 -0.7459568 0.05478082 -0.834652924
## Arkansas 0.1414203 1.1197968 0.11457369 -0.182810896
## California -2.5239801 -1.5429340 0.59855680 -0.341996478
## Colorado -1.5145629 -0.9875551 1.09500699 0.001464887
# Extract the first two components and convert them to a data.frame for the plot below
stats.arrests <- data.frame(pc.cr$scores[, -c(3:4)])
head(stats.arrests)
## Comp.1 Comp.2
## Alabama -0.9855659 1.1333924
## Alaska -1.9501378 1.0732133
## Arizona -1.7631635 -0.7459568
## Arkansas 0.1414203 1.1197968
## California -2.5239801 -1.5429340
## Colorado -1.5145629 -0.9875551
# Plot the distribution of the states to see which are outliers and which cluster together
library(ggplot2)
ggplot(stats.arrests, aes(x = Comp.1, y = Comp.2)) +
xlab("First Component") + ylab("Second Component") +
geom_text(alpha = 0.75, label = rownames(stats.arrests), size = 4)
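The biplot() function listed earlier provides a built-in alternative to the scatter plot above: it plots the states on the first two components together with arrows showing the directions of the original variables. A one-line sketch (not part of the original):
biplot(pc.cr, cex = 0.7)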
We now show what singular value decomposition can do by processing an image, using the raster and jpeg packages in R.
## Load the image and display it
library(raster)
library(jpeg)
raster.photo <- raster("C:/Users/abdata/Pictures/img/1082114.png")
photo.flip <- flip(raster.photo, direction = "y")
## Convert the data to matrix form
photo.raster <- t(as.matrix(photo.flip))
dim(photo.raster)
## [1] 256 256
image(photo.raster, col = grey(seq(0, 1, length = 256))) ## render in grayscale
# Singular value decomposition
photo.svd <- svd(photo.raster)
d <- diag(photo.svd$d)
v <- as.matrix(photo.svd$v)
u <- photo.svd$u
# Reconstruct using only the first singular value, shown in the figure below
u1 <- as.matrix(u[, 1])
d1 <- as.matrix(d[1, 1])
v1 <- as.matrix(v[, 1])
photo1 <- u1 %*% d1 %*% t(v1)
image(photo1, col = grey(seq(0, 1, length = 256)))
# Reconstruct using the first fifty singular values; this largely recovers the original image, shown in the figure below
u2 <- as.matrix(u[, 1:50])
d2 <- as.matrix(d[1:50, 1:50])
v2 <- as.matrix(v[, 1:50])
photo2 <- u2 %*% d2 %*% t(v2)
image(photo2, col = grey(seq(0, 1, length = 256)))
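# A minimal sketch (not in the original) of why fifty singular values suffice: the share
# of the image's total "energy" (sum of squared singular values) retained by the first k values.
energy <- cumsum(photo.svd$d^2) / sum(photo.svd$d^2)
round(energy[c(1, 10, 50)], 4)
plot(energy, type = "l", xlab = "number of singular values", ylab = "cumulative share of energy")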