1 Introducción

Datos: Wage

Wage and other data for a group of 3000 male workers in the Mid-Atlantic region.

# Load the Wage data set (3000 male workers, Mid-Atlantic region)
# and inspect its structure.
library(ISLR)
datos <- Wage
str(datos)
## 'data.frame':    3000 obs. of  11 variables:
##  $ year      : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
##  $ age       : int  18 24 45 43 50 54 44 30 41 52 ...
##  $ maritl    : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
##  $ race      : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
##  $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
##  $ region    : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ jobclass  : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ health    : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
##  $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
##  $ logwage   : num  4.32 4.26 4.88 5.04 4.32 ...
##  $ wage      : num  75 70.5 131 154.7 75 ...
plot(datos$age,datos$wage, cex = 0.5, col = "darkgrey")

Parece que hay dos grupos diferenciados: los que ganan más de 250.000 $ al año (wage ≥ 250, ya que wage se mide en miles de dólares) y los que ganan menos. Vamos a trabajar con los que ganan menos.

# Keep only the lower-earning group (wage below 250, i.e. under
# $250,000/year since wage is in thousands of dollars) and replot.
datos <- datos[datos$wage < 250, ]
plot(datos$age, datos$wage, cex = 0.5, col = "darkgrey")

2 Regresión polinómica

Modelo:

\[ y_i = \beta_0 + \beta_1 x_i + \beta_2 x_i^2 + \beta_3 x_i^3 + \cdots + \beta_d x_i^d + u_i \]

  • el modelo se estima con mínimos cuadrados, utilizando como regresores: \(x_i, x_i^2, x_i^3, \cdots, x_i^d\).
  • todas las cuestiones de inferencia estudiadas en el tema de regresión lineal son válidas aquí también.

Hay varias maneras de implementarlos en R:

# Degree-4 polynomial fit written out term by term; I() protects each
# power from formula interpretation. (Formula text kept as-is so the
# printed Call matches the output below.)
m1 <- lm(wage ~ age + I(age^2) + I(age^3) + I(age^4), data = datos)
summary(m1)
## 
## Call:
## lm(formula = wage ~ age + I(age^2) + I(age^3) + I(age^4), data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -93.565 -20.689  -2.015  17.584 116.228 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.620e+02  4.563e+01  -3.550 0.000391 ***
## age          1.948e+01  4.477e+00   4.350 1.41e-05 ***
## I(age^2)    -5.150e-01  1.569e-01  -3.283 0.001039 ** 
## I(age^3)     6.113e-03  2.334e-03   2.619 0.008869 ** 
## I(age^4)    -2.800e-05  1.250e-05  -2.240 0.025186 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.16 on 2916 degrees of freedom
## Multiple R-squared:  0.1094, Adjusted R-squared:  0.1082 
## F-statistic: 89.55 on 4 and 2916 DF,  p-value: < 2.2e-16
# Same degree-4 fit, building the design matrix explicitly with cbind();
# identical coefficients to m1, only the coefficient labels differ.
m2 <- lm(wage ~ cbind(age, age^2, age^3, age^4), data = datos)
summary(m2)
## 
## Call:
## lm(formula = wage ~ cbind(age, age^2, age^3, age^4), data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -93.565 -20.689  -2.015  17.584 116.228 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)                        -1.620e+02  4.563e+01  -3.550 0.000391
## cbind(age, age^2, age^3, age^4)age  1.948e+01  4.477e+00   4.350 1.41e-05
## cbind(age, age^2, age^3, age^4)    -5.150e-01  1.569e-01  -3.283 0.001039
## cbind(age, age^2, age^3, age^4)     6.113e-03  2.334e-03   2.619 0.008869
## cbind(age, age^2, age^3, age^4)    -2.800e-05  1.250e-05  -2.240 0.025186
##                                       
## (Intercept)                        ***
## cbind(age, age^2, age^3, age^4)age ***
## cbind(age, age^2, age^3, age^4)    ** 
## cbind(age, age^2, age^3, age^4)    ** 
## cbind(age, age^2, age^3, age^4)    *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.16 on 2916 degrees of freedom
## Multiple R-squared:  0.1094, Adjusted R-squared:  0.1082 
## F-statistic: 89.55 on 4 and 2916 DF,  p-value: < 2.2e-16
# Same degree-4 fit using poly(); raw = TRUE requests the plain
# (non-orthogonal) basis age, age^2, age^3, age^4, so the coefficients
# match m1 and m2 exactly.
# FIX: spell out TRUE instead of T — T is an ordinary variable that can
# be reassigned, so relying on it is fragile.
m3 <- lm(wage ~ poly(age, degree = 4, raw = TRUE), data = datos)
summary(m3)
## 
## Call:
## lm(formula = wage ~ poly(age, degree = 4, raw = T), data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -93.565 -20.689  -2.015  17.584 116.228 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -1.620e+02  4.563e+01  -3.550 0.000391 ***
## poly(age, degree = 4, raw = T)1  1.948e+01  4.477e+00   4.350 1.41e-05 ***
## poly(age, degree = 4, raw = T)2 -5.150e-01  1.569e-01  -3.283 0.001039 ** 
## poly(age, degree = 4, raw = T)3  6.113e-03  2.334e-03   2.619 0.008869 ** 
## poly(age, degree = 4, raw = T)4 -2.800e-05  1.250e-05  -2.240 0.025186 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.16 on 2916 degrees of freedom
## Multiple R-squared:  0.1094, Adjusted R-squared:  0.1082 
## F-statistic: 89.55 on 4 and 2916 DF,  p-value: < 2.2e-16

Vamos a dibujar la curva y las bandas de confianza puntuales aproximadas al 95 % (ajuste ± 2·SE):

# Prediction grid spanning the observed age range, one point per year.
age_grid <- seq(from = min(datos$age), to = max(datos$age), by = 1)
yp <- predict(m1, newdata = data.frame(age = age_grid), se = TRUE)
# Approximate 95% pointwise confidence band: fit +/- 2 * SE.
yp1 <- yp$fit - 2 * yp$se.fit
yp2 <- yp$fit + 2 * yp$se.fit
# FIX: the original used xlim = range(age); xlim is evaluated in the
# calling environment, where no object 'age' exists (only datos$age),
# so plot() failed with "object 'age' not found".
plot(wage ~ age, data = datos, xlim = range(datos$age), cex = 0.5,
     col = "darkgrey")
title("Polinomio de grado 4")
lines(age_grid, yp$fit, lwd = 2, col = "blue")
lines(age_grid, yp1, col = "blue", lty = 3)
lines(age_grid, yp2, col = "blue", lty = 3)

3 Selección del grado máximo del polinomio

3.1 Usando best-subset, forward-stepwise o backward-stepwise

library(leaps)
grado_max <- 12
# Exhaustive best-subset search over the columns of an orthogonal
# polynomial basis of degree grado_max; nvmax allows models up to the
# full degree.
m_best <- regsubsets(wage ~ poly(age, degree = grado_max), data = datos,
                     nvmax = grado_max)
m_best_summary <- summary(m_best)
# Adjusted R^2 of the best model of each size.
plot(m_best_summary$adjr2, xlab = "Grado del polinomio",
     ylab = "R2 ajustado", type = "b")

3.2 Usando subconjunto de validación o cross-validation

# Project helpers: fold index generator, predict() method for
# regsubsets objects, and the MSE function.
source("cross_val_pos.R")
source("regsubsets_predict.R")
source("MSE.R")

# Fold assignments (seed fixed for reproducibility).
num_folds <- 10
set.seed(1)
pos <- cross_val_pos(nrow(datos), num_folds)

# mse_cv[i, j]: test MSE of the degree-j polynomial evaluated on fold i.
# Preallocated up front; filled in the nested loop below.
mse_cv <- matrix(0, nrow = num_folds, ncol = grado_max)
# FIX: use seq_len() instead of 1:n (1:n is unsafe when n == 0).
for (i in seq_len(num_folds)) {
  # Training and validation subsets for fold i.
  datos_train <- datos[pos$train[[i]], ]
  datos_test <- datos[pos$test[[i]], ]

  for (j in seq_len(grado_max)) {
    # Fit a degree-j polynomial on the training folds and score it
    # on the held-out fold.
    m_cv <- lm(wage ~ poly(age, degree = j), data = datos_train)
    pred <- predict(m_cv, newdata = datos_test)
    mse_cv[i, j] <- MSE(datos_test$wage, pred)
  }
}

# Average the test MSE over folds for each degree and plot the curve.
mse_cv_med <- apply(mse_cv, 2, mean)
plot(mse_cv_med, type = "b")