恩格尔定律

摘要

传统的线性回归模型描述了因变量的条件均值分布与自变量X的关系,为了和分位数回归相区别,把传统的回归又称为均值回归(Mean Regression)。其中,OLS是估计回归系数的最基本的方法。

分位数回归是估计一组回归变量X与被解释变量Y的分位数之间关系的建模方法。

问题的提出:恩格尔定律

德国统计学家恩格尔(Engel)使用收集的235个比利时家庭的收入与食物支出数据,得出了著名的恩格尔定律:收入越高的家庭将其收入用于食物支出的比例越低。

数据集

#install.packages('quantreg')
library(quantreg)
data(engel)
dim(engel)
## [1] 235   2
head(engel)
##     income  foodexp
## 1 420.1577 255.8394
## 2 541.4117 310.9587
## 3 901.1575 485.6800
## 4 639.0802 402.9974
## 5 750.8756 495.5608
## 6 945.7989 633.7978
class(engel)
## [1] "data.frame"
typeof(engel)
## [1] "list"
mode(engel)
## [1] "list"

模型方法

#install.packages('quantreg')
library(quantreg)
# data(engel)
# dim(engel)
# head(engel)
# class(engel)
# typeof(engel)
# mode(engel)


# plot of engel data and some rq lines see KB(1982) for references to data
data(engel)
attach(engel)
plot(income,foodexp,xlab="Household Income",ylab="Food Expenditure",type = "n", cex=.5)
points(income,foodexp,cex=.5,col="blue")


#中位数回归
q_0.5 <- rq(foodexp ~ income, tau=0.5, data=engel)
summary(q_0.5)
## 
## Call: rq(formula = foodexp ~ income, tau = 0.5, data = engel)
## 
## tau: [1] 0.5
## 
## Coefficients:
##             coefficients lower bd  upper bd 
## (Intercept)  81.48225     53.25915 114.01156
## income        0.56018      0.48702   0.60199
#分位数(.05,.1,.25,.75,.9,.95)上的分位数回归
taus <- c(.05,.1,.25,.75,.9,.95)
xx <- seq(min(income),max(income),100)
f1 <- rq((foodexp)~(income),tau=taus)
f2 <- coef(f1)
summary(f1)
## 
## Call: rq(formula = (foodexp) ~ (income), tau = taus)
## 
## tau: [1] 0.05
## 
## Coefficients:
##             coefficients lower bd  upper bd 
## (Intercept) 124.88004     98.30212 130.51695
## income        0.34336      0.34333   0.38975
## 
## Call: rq(formula = (foodexp) ~ (income), tau = taus)
## 
## tau: [1] 0.1
## 
## Coefficients:
##             coefficients lower bd  upper bd 
## (Intercept) 110.14157     79.88753 146.18875
## income        0.40177      0.34210   0.45079
## 
## Call: rq(formula = (foodexp) ~ (income), tau = taus)
## 
## tau: [1] 0.25
## 
## Coefficients:
##             coefficients lower bd  upper bd 
## (Intercept)  95.48354     73.78608 120.09847
## income        0.47410      0.42033   0.49433
## 
## Call: rq(formula = (foodexp) ~ (income), tau = taus)
## 
## tau: [1] 0.75
## 
## Coefficients:
##             coefficients lower bd  upper bd 
## (Intercept)  62.39659     32.74488 107.31362
## income        0.64401      0.58016   0.69041
## 
## Call: rq(formula = (foodexp) ~ (income), tau = taus)
## 
## tau: [1] 0.9
## 
## Coefficients:
##             coefficients lower bd  upper bd 
## (Intercept)  67.35087     37.11802 103.17399
## income        0.68630      0.64937   0.74223
## 
## Call: rq(formula = (foodexp) ~ (income), tau = taus)
## 
## tau: [1] 0.95
## 
## Coefficients:
##             coefficients lower bd upper bd
## (Intercept) 64.10396     46.26495 83.57896
## income       0.70907      0.67390  0.73444
summary(f2)
##    tau= 0.05          tau= 0.10          tau= 0.25         tau= 0.75     
##  Min.   :  0.3434   Min.   :  0.4018   Min.   : 0.4741   Min.   : 0.644  
##  1st Qu.: 31.4775   1st Qu.: 27.8367   1st Qu.:24.2265   1st Qu.:16.082  
##  Median : 62.6117   Median : 55.2717   Median :47.9788   Median :31.520  
##  Mean   : 62.6117   Mean   : 55.2717   Mean   :47.9788   Mean   :31.520  
##  3rd Qu.: 93.7459   3rd Qu.: 82.7066   3rd Qu.:71.7312   3rd Qu.:46.958  
##  Max.   :124.8800   Max.   :110.1416   Max.   :95.4835   Max.   :62.397  
##    tau= 0.90         tau= 0.95      
##  Min.   : 0.6863   Min.   : 0.7091  
##  1st Qu.:17.3524   1st Qu.:16.5578  
##  Median :34.0186   Median :32.4065  
##  Mean   :34.0186   Mean   :32.4065  
##  3rd Qu.:50.6847   3rd Qu.:48.2552  
##  Max.   :67.3509   Max.   :64.1040
f <- coef(rq((foodexp)~(income),tau=taus))  # 提取分位数回归系数
yy <- cbind(1,xx)%*%f                       # 计算分位数回归拟合值
for(i in 1:length(taus)){
  lines(xx,yy[,i],col = "gray")
}
abline(lm(foodexp ~ income),col="red",lty = 2)
abline(rq(foodexp ~ income), col="blue")
legend(3500,620,c("mean (LSE) fit", "median (LAE) fit"), col = c("red","blue"), lty = c(2,1))

#Example of plotting of coefficients and their confidence bands
plot(summary(rq(foodexp~income,tau = 1:49/50,data=engel)))

#Example to illustrate inequality constrained fitting
n <- 100
p <- 5
X <- matrix(rnorm(n*p),n,p)
y <- .95*apply(X,1,sum)+rnorm(n)
#constrain slope coefficients to lie between zero and one
R <- cbind(0,rbind(diag(p),-diag(p)))
r <- c(rep(0,p),-rep(1,p))
rq(y~X,R=R,r=r,method="fnc")
## Call:
## rq(formula = y ~ X, method = "fnc", R = R, r = r)
## 
## Coefficients:
## (Intercept)          X1          X2          X3          X4          X5 
##   0.1819726   0.9323909   1.0000000   0.8925110   1.0000000   0.9920904 
## 
## Degrees of freedom: 100 total; 94 residual
# ?rq