Datos: Wage
Wage and other data for a group of 3000 male workers in the Mid-Atlantic region.
# Load the Wage data set from the ISLR package (3000 male workers from the
# Mid-Atlantic region) and inspect its structure.
library(ISLR)
datos <- Wage
str(datos)
## 'data.frame': 3000 obs. of 11 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
# Scatter plot of wage vs. age on the full sample; small grey points keep
# the dense cloud readable.
plot(datos$age,datos$wage, cex = 0.5, col = "darkgrey")
Parece que hay dos grupos diferenciados: los que ganan más de 250.000 $ y los que ganan menos. Vamos a trabajar con los que ganan menos.
# Keep only the lower-earning group: wage < 250 (wage is expressed in
# thousands of dollars, so this drops the >$250k earners seen in the plot).
datos <- datos[datos$wage < 250, ]
plot(datos$age, datos$wage, cex = 0.5, col = "darkgrey")
Modelo:
\[ y_i = \beta_0 + \beta_1 x_i + \beta_2 x_i^2 + \beta_3 x_i^3 + \cdots + \beta_d x_i^d + u_i \]
Hay varias maneras de implementarlo en R:
# Degree-4 polynomial regression, option 1: wrap each power in I() so that
# ^ is interpreted arithmetically instead of as a formula operator.
m1 <- lm(wage ~ age + I(age^2) + I(age^3) + I(age^4), data = datos)
summary(m1)
##
## Call:
## lm(formula = wage ~ age + I(age^2) + I(age^3) + I(age^4), data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93.565 -20.689 -2.015 17.584 116.228
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.620e+02 4.563e+01 -3.550 0.000391 ***
## age 1.948e+01 4.477e+00 4.350 1.41e-05 ***
## I(age^2) -5.150e-01 1.569e-01 -3.283 0.001039 **
## I(age^3) 6.113e-03 2.334e-03 2.619 0.008869 **
## I(age^4) -2.800e-05 1.250e-05 -2.240 0.025186 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.16 on 2916 degrees of freedom
## Multiple R-squared: 0.1094, Adjusted R-squared: 0.1082
## F-statistic: 89.55 on 4 and 2916 DF, p-value: < 2.2e-16
# Option 2: build the design-matrix columns directly with cbind(). The fit
# is identical to m1, but the coefficient names printed by summary() are
# much less readable (see the output below).
m2 <- lm(wage ~ cbind(age, age^2, age^3, age^4), data = datos)
summary(m2)
##
## Call:
## lm(formula = wage ~ cbind(age, age^2, age^3, age^4), data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93.565 -20.689 -2.015 17.584 116.228
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.620e+02 4.563e+01 -3.550 0.000391
## cbind(age, age^2, age^3, age^4)age 1.948e+01 4.477e+00 4.350 1.41e-05
## cbind(age, age^2, age^3, age^4) -5.150e-01 1.569e-01 -3.283 0.001039
## cbind(age, age^2, age^3, age^4) 6.113e-03 2.334e-03 2.619 0.008869
## cbind(age, age^2, age^3, age^4) -2.800e-05 1.250e-05 -2.240 0.025186
##
## (Intercept) ***
## cbind(age, age^2, age^3, age^4)age ***
## cbind(age, age^2, age^3, age^4) **
## cbind(age, age^2, age^3, age^4) **
## cbind(age, age^2, age^3, age^4) *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.16 on 2916 degrees of freedom
## Multiple R-squared: 0.1094, Adjusted R-squared: 0.1082
## F-statistic: 89.55 on 4 and 2916 DF, p-value: < 2.2e-16
# Option 3: poly() with raw = TRUE yields the same raw-power basis as m1/m2
# (the default raw = FALSE would use orthogonal polynomials instead).
# Use TRUE rather than T: T is an ordinary variable and can be reassigned.
m3 <- lm(wage ~ poly(age, degree = 4, raw = TRUE), data = datos)
summary(m3)
##
## Call:
## lm(formula = wage ~ poly(age, degree = 4, raw = T), data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93.565 -20.689 -2.015 17.584 116.228
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.620e+02 4.563e+01 -3.550 0.000391 ***
## poly(age, degree = 4, raw = T)1 1.948e+01 4.477e+00 4.350 1.41e-05 ***
## poly(age, degree = 4, raw = T)2 -5.150e-01 1.569e-01 -3.283 0.001039 **
## poly(age, degree = 4, raw = T)3 6.113e-03 2.334e-03 2.619 0.008869 **
## poly(age, degree = 4, raw = T)4 -2.800e-05 1.250e-05 -2.240 0.025186 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.16 on 2916 degrees of freedom
## Multiple R-squared: 0.1094, Adjusted R-squared: 0.1082
## F-statistic: 89.55 on 4 and 2916 DF, p-value: < 2.2e-16
Vamos a dibujar la curva y los intervalos de confianza (aproximadamente al 95%, es decir, ±2·SE):
# Evaluate the fitted curve on an integer grid of ages and draw approximate
# 95% confidence bands (fit +/- 2 * standard error).
age_grid <- seq(from = min(datos$age), to = max(datos$age), by = 1)
yp <- predict(m1, newdata = data.frame(age = age_grid), se = TRUE)
yp1 <- yp$fit - 2 * yp$se.fit
yp2 <- yp$fit + 2 * yp$se.fit
# Bug fix: range(age) looked up a global `age` that does not exist -- the
# xlim argument is not evaluated inside `data`; qualify it with datos$age.
plot(wage ~ age, data = datos, xlim = range(datos$age), cex = 0.5, col = "darkgrey")
title("Polinomio de grado 4")
lines(age_grid, yp$fit, lwd = 2, col = "blue")
lines(age_grid, yp1, col = "blue", lty = 3)
lines(age_grid, yp2, col = "blue", lty = 3)
# Best-subset selection over polynomial terms up to degree 12, then plot
# adjusted R^2 against model size to choose the polynomial degree.
library(leaps)
grado_max <- 12
m_best <- regsubsets(wage ~ poly(age, degree = grado_max), data = datos, nvmax = grado_max)
m_best_summary <- summary(m_best)
plot(m_best_summary$adjr2, xlab = "Grado del polinomio", ylab = "R2 ajustado", type = "b")
# Project helpers (defined in separate files):
#   cross_val_pos(): builds the train/test index lists for k-fold CV;
#   regsubsets_predict(), MSE(): prediction and mean-squared-error helpers.
source("cross_val_pos.R")
source("regsubsets_predict.R")
source("MSE.R")
# Fold setup: fixed seed so the random partition is reproducible.
num_folds <- 10
set.seed(1)
pos <- cross_val_pos(nrow(datos), num_folds)
# mse_cv[i, j] = test MSE of fold i for a degree-j polynomial (preallocated).
mse_cv <- matrix(0, nrow = num_folds, ncol = grado_max)
for (i in seq_len(num_folds)) {
  # training and validation data for this fold
  datos_train <- datos[pos$train[[i]], ]
  datos_test <- datos[pos$test[[i]], ]
  for (j in seq_len(grado_max)) {
    m_cv <- lm(wage ~ poly(age, degree = j), data = datos_train)
    pred <- predict(m_cv, newdata = datos_test)
    mse_cv[i, j] <- MSE(datos_test$wage, pred)
  }
}
# Average the per-fold MSEs for each degree and plot the CV curve.
mse_cv_med <- apply(mse_cv, 2, mean)
plot(mse_cv_med, type = "b")