\[ y_i = \beta_0 + f(x_{1i}) + f(x_{2i}) + \cdots + f(x_{pi}) + u_i \]
Datos: Wage
Wage and other data for a group of 3000 male workers in the Mid-Atlantic region.
library(ISLR)
datos = Wage
datos = datos[datos$wage<250,]
plot(datos$age,datos$wage, cex = 0.5, col = "darkgrey", ylab = "wage (x 1000 $)", xlab = "age")
library(splines)
m1 = lm(wage ~ ns(year,4)+ns(age,5)+education,data=datos)
summary(m1)
##
## Call:
## lm(formula = wage ~ ns(year, 4) + ns(age, 5) + education, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104.554 -16.709 -0.828 15.741 93.005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.555 3.560 13.078 < 2e-16 ***
## ns(year, 4)1 7.604 2.645 2.875 0.004065 **
## ns(year, 4)2 3.443 2.255 1.527 0.126968
## ns(year, 4)3 8.706 3.206 2.715 0.006662 **
## ns(year, 4)4 6.269 1.828 3.429 0.000614 ***
## ns(age, 5)1 41.282 3.184 12.966 < 2e-16 ***
## ns(age, 5)2 40.911 3.822 10.703 < 2e-16 ***
## ns(age, 5)3 27.691 3.307 8.372 < 2e-16 ***
## ns(age, 5)4 55.514 8.001 6.938 4.88e-12 ***
## ns(age, 5)5 9.027 6.167 1.464 0.143360
## education2. HS Grad 9.972 1.828 5.456 5.29e-08 ***
## education3. Some College 21.405 1.929 11.097 < 2e-16 ***
## education4. College Grad 33.420 1.924 17.374 < 2e-16 ***
## education5. Advanced Degree 48.128 2.122 22.683 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.43 on 2907 degrees of freedom
## Multiple R-squared: 0.3181, Adjusted R-squared: 0.3151
## F-statistic: 104.3 on 13 and 2907 DF, p-value: < 2.2e-16
Con el paquete gam() podemos representar los regresores:
library(gam)
## Loading required package: foreach
## Loaded gam 1.16
par(mfrow=c(1,3))
plot.Gam(m1, se=TRUE, col="red")
m2 = gam(wage ~ s(year,4) + s(age,5) + education, data=datos)
summary(m2)
##
## Call: gam(formula = wage ~ s(year, 4) + s(age, 5) + education, data = datos)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -104.212 -16.706 -1.002 15.696 92.999
##
## (Dispersion Parameter for gaussian family taken to be 698.3749)
##
## Null Deviance: 2978963 on 2920 degrees of freedom
## Residual Deviance: 2030176 on 2907 degrees of freedom
## AIC: 27434.32
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(year, 4) 1 22112 22112 31.662 2.009e-08 ***
## s(age, 5) 1 125401 125401 179.561 < 2.2e-16 ***
## education 4 615188 153797 220.221 < 2.2e-16 ***
## Residuals 2907 2030176 698
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(year, 4) 3 1.240 0.2934
## s(age, 5) 4 47.265 <2e-16 ***
## education
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
El contraste anova se refiere a: H0: relación lineal, H1: realación no lineal. Luego parece que para year es buena idea no utilizar una spline.
par(mfrow=c(1,3))
plot.Gam(m2, se=TRUE, col="red")
m3 = gam(wage ~ year + s(age,5) + education, data=datos)
summary(m3)
##
## Call: gam(formula = wage ~ year + s(age, 5) + education, data = datos)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -104.5115 -16.6740 -0.9186 15.5037 92.0293
##
## (Dispersion Parameter for gaussian family taken to be 698.5517)
##
## Null Deviance: 2978963 on 2920 degrees of freedom
## Residual Deviance: 2032785 on 2910 degrees of freedom
## AIC: 27432.07
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## year 1 22108 22108 31.648 2.023e-08 ***
## s(age, 5) 1 124681 124681 178.485 < 2.2e-16 ***
## education 4 614955 153739 220.082 < 2.2e-16 ***
## Residuals 2910 2032785 699
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## year
## s(age, 5) 4 47.412 < 2.2e-16 ***
## education
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m4 = gam(wage ~ year + lo(age,span = 0.7) + education, data=datos)
summary(m4)
##
## Call: gam(formula = wage ~ year + lo(age, span = 0.7) + education,
## data = datos)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -102.788 -16.491 -1.199 15.501 92.317
##
## (Dispersion Parameter for gaussian family taken to be 704.772)
##
## Null Deviance: 2978963 on 2920 degrees of freedom
## Residual Deviance: 2052853 on 2912.791 degrees of freedom
## AIC: 27455.19
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## year 1.0 20771 20771 29.471 6.142e-08 ***
## lo(age, span = 0.7) 1.0 124814 124814 177.098 < 2.2e-16 ***
## education 4.0 638054 159513 226.333 < 2.2e-16 ***
## Residuals 2912.8 2052853 705
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## year
## lo(age, span = 0.7) 1.2 131.91 < 2.2e-16 ***
## education
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Para seleccionar el modelo vamos a calcular el MSE:
source("MSE.R")
model = list(m1,m2,m3,m4)
MSEi = numeric(4)
for (i in 1:4){
predi = predict(model[[i]], newdata = datos)
MSEi[i] = MSE(datos$wage, predi)
}
plot(MSEi, type = "b")