1 Datos

library(ISLR)
# Work on a copy of the Credit data set (made available by the ISLR package
# loaded above) and print a compact overview of its columns and types.
datos=Credit
str(datos)

## 'data.frame':    400 obs. of  12 variables:
##  $ ID       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Income   : num  14.9 106 104.6 148.9 55.9 ...
##  $ Limit    : int  3606 6645 7075 9504 4897 8047 3388 7114 3300 6819 ...
##  $ Rating   : int  283 483 514 681 357 569 259 512 266 491 ...
##  $ Cards    : int  2 3 4 3 2 4 2 2 5 3 ...
##  $ Age      : int  34 82 71 36 68 77 37 87 66 41 ...
##  $ Education: int  11 15 11 11 16 10 12 9 13 19 ...
##  $ Gender   : Factor w/ 2 levels " Male","Female": 1 2 1 2 1 1 2 1 2 2 ...
##  $ Student  : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
##  $ Married  : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 1 1 1 1 2 ...
##  $ Ethnicity: Factor w/ 3 levels "African American",..: 3 2 2 2 3 3 1 2 3 1 ...
##  $ Balance  : int  333 903 580 964 331 1151 203 872 279 1350 ...

2 Variables cualitativas

2.1 Variable cualitativa con dos niveles

# Check how Gender is stored; lm() dummy-codes factor columns automatically.
class(datos[["Gender"]])

## [1] "factor"

# List the factor levels; the first one is the reference level in lm().
levels(datos[["Gender"]])

## [1] " Male"  "Female"

\[ Balance = \beta_0 + \beta_1 X_1 + u \]

Male: X1 = 0

Female: X1 = 1

# Regress Balance on the two-level factor Gender. The first level (" Male")
# is the reference, so the intercept is the mean Balance of that group and
# the GenderFemale coefficient is the Female-minus-Male difference in means.
m1 = lm(Balance ~ Gender, data = datos)
summary(m1)

## 
## Call:
## lm(formula = Balance ~ Gender, data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -529.54 -455.35  -60.17  334.71 1489.20 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    509.80      33.13  15.389   <2e-16 ***
## GenderFemale    19.73      46.05   0.429    0.669    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 460.2 on 398 degrees of freedom
## Multiple R-squared:  0.0004611,  Adjusted R-squared:  -0.00205 
## F-statistic: 0.1836 on 1 and 398 DF,  p-value: 0.6685

Male (X1 = 0): \(Balance = \beta_0\) El crédito medio de los hombres es 509.80

Female (X1 = 1): \(Balance = \beta_0 + \beta_1\) El crédito medio de las mujeres es 509.80 + 19.73 = 529.53 (≈ 529.54; la diferencia se debe al redondeo de los coeficientes)

Cambiamos el orden del factor

# Build a copy of Gender whose reference (first) level is "Female", then
# verify the new level order.
Gender1 = relevel(datos[["Gender"]], ref = "Female")
levels(Gender1)

## [1] "Female" " Male"

# Refit the same model with the releveled factor. Gender1 is a free-standing
# vector (not a column of datos), so the response is referenced as
# datos$Balance and no data argument is given. The intercept is now the
# Female mean and the slope changes sign with respect to m1.
m2 = lm(datos$Balance ~ Gender1)
summary(m2)

## 
## Call:
## lm(formula = datos$Balance ~ Gender1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -529.54 -455.35  -60.17  334.71 1489.20 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    529.54      31.99  16.554   <2e-16 ***
## Gender1 Male   -19.73      46.05  -0.429    0.669    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 460.2 on 398 degrees of freedom
## Multiple R-squared:  0.0004611,  Adjusted R-squared:  -0.00205 
## F-statistic: 0.1836 on 1 and 398 DF,  p-value: 0.6685

Female (X1 = 0): \(Balance = \beta_0\) El crédito medio de las mujeres es 529.54

Male (X1 = 1): \(Balance = \beta_0 + \beta_1\) El crédito medio de los hombres es 529.54 - 19.73 = 509.81 (≈ 509.80; la diferencia se debe al redondeo de los coeficientes)

2.2 Variable cualitativa con tres niveles

# Check that Ethnicity is stored as a factor.
class(datos[["Ethnicity"]])

## [1] "factor"

# List the three levels; the first one acts as the reference level in lm().
levels(datos[["Ethnicity"]])

## [1] "African American" "Asian"            "Caucasian"

Modelo general:

\[ Balance = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + u \]

Modelo para “African American”: X1 = 0, X2 = 0

\[ Balance = \beta_0 + u \]

Modelo para “Asian”: X1 = 1, X2 = 0

\[ Balance = \beta_0 + \beta_1 + u \]

Modelo para “Caucasian”: X1 = 0, X2 = 1

\[ Balance = \beta_0 + \beta_2 + u \]

# Regress Balance on the three-level factor Ethnicity. Two dummy variables
# are created; "African American" (the first level) is the reference, so
# each coefficient is the difference in mean Balance with respect to it.
m3 = lm(Balance ~ Ethnicity, data = datos)
summary(m3)

## 
## Call:
## lm(formula = Balance ~ Ethnicity, data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -531.00 -457.08  -63.25  339.25 1480.50 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          531.00      46.32  11.464   <2e-16 ***
## EthnicityAsian       -18.69      65.02  -0.287    0.774    
## EthnicityCaucasian   -12.50      56.68  -0.221    0.826    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 460.9 on 397 degrees of freedom
## Multiple R-squared:  0.0002188,  Adjusted R-squared:  -0.004818 
## F-statistic: 0.04344 on 2 and 397 DF,  p-value: 0.9575

Según los p-valores, no hay diferencias significativas entre “African American” y “Asian”, ni tampoco entre “African American” y “Caucasian”.

Podemos hacer otras comparaciones utilizando:

# Option 1: keep the level set but move "Asian" to the reference position.
Ethnicity1 = relevel(datos[["Ethnicity"]], ref = "Asian")

# Option 2: rebuild the factor with an explicit level order; the first
# level listed ("Caucasian") becomes the reference.
Ethnicity2 = factor(
  datos[["Ethnicity"]],
  levels = c("Caucasian", "Asian", "African American")
)

3 Variables cualitativas y cuantitativas

\[ Balance = \beta_0 + \beta_1 Income + \beta_2 X_1 + u \]

donde:

X1 = 0: No student
X1 = 1: Student

Si X1 = 0: \(Balance = \beta_0 + \beta_1 Income\)

Si X1 = 1: \(Balance = (\beta_0 + \beta_2) + \beta_1 Income\)

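Tenemos dos rectas, con la misma pendiente (\(\beta_1\)) y distinto término independiente (\(\beta_0\) para los no estudiantes y \(\beta_0 + \beta_2\) para los estudiantes).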

# Additive model with one quantitative regressor (Income) and one two-level
# factor (Student): a common slope and a Student-specific intercept shift.
# NOTE: this assignment reuses the name m3, replacing the Ethnicity model
# fitted earlier in the script.
m3 = lm(Balance ~ Income + Student, data = datos)
summary(m3)

## 
## Call:
## lm(formula = Balance ~ Income + Student, data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -762.37 -331.38  -45.04  323.60  818.28 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 211.1430    32.4572   6.505 2.34e-10 ***
## Income        5.9843     0.5566  10.751  < 2e-16 ***
## StudentYes  382.6705    65.3108   5.859 9.78e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 391.8 on 397 degrees of freedom
## Multiple R-squared:  0.2775, Adjusted R-squared:  0.2738 
## F-statistic: 76.22 on 2 and 397 DF,  p-value: < 2.2e-16

# Scatter plot of Balance against Income; the Student factor picks the point
# colour through its integer codes.
plot(datos$Income, datos$Balance, col = datos$Student)
# Fitted line for non-students: intercept and Income slope of m3.
abline(a = coef(m3)[["(Intercept)"]], b = coef(m3)[["Income"]])
# Fitted line for students: same slope, intercept shifted by StudentYes.
abline(
  a = coef(m3)[["(Intercept)"]] + coef(m3)[["StudentYes"]],
  b = coef(m3)[["Income"]],
  col = "red"
)

¿Podemos representar con un único modelo dos rectas con distinta pendiente, una para estudiantes y otra para no estudiantes?

\[ Balance = \beta_0 + \beta_1 Income + \beta_2 X_1 + \beta_3 X_1 Income + u \]

Si X1 = 0: \(Balance = \beta_0 + \beta_1 Income\)

Si X1 = 1: \(Balance = (\beta_0 + \beta_2) + (\beta_1 + \beta_3) Income\)

Tenemos dos rectas, con distinta pendiente (\(\beta_1\) y \(\beta_1 + \beta_3\)) y distinto término independiente (\(\beta_0\) y \(\beta_0 + \beta_2\)).

# Model with interaction: in a formula, Income*Student expands to both main
# effects plus the Income:Student term, so each Student group gets its own
# intercept and its own Income slope.
m4 = lm(Balance ~ Income*Student, data = datos)
summary(m4)

## 
## Call:
## lm(formula = Balance ~ Income * Student, data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -773.39 -325.70  -41.13  321.65  814.04 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       200.6232    33.6984   5.953 5.79e-09 ***
## Income              6.2182     0.5921  10.502  < 2e-16 ***
## StudentYes        476.6758   104.3512   4.568 6.59e-06 ***
## Income:StudentYes  -1.9992     1.7313  -1.155    0.249    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 391.6 on 396 degrees of freedom
## Multiple R-squared:  0.2799, Adjusted R-squared:  0.2744 
## F-statistic:  51.3 on 3 and 396 DF,  p-value: < 2.2e-16

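Según los resultados del análisis, el término de interacción no es significativo (p-valor = 0.249), por lo que no hay evidencia de que las rectas tengan distinta pendiente.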

La interacción en R se define utilizando los dos puntos (:). Por tanto, el modelo anterior es equivalente a poner:

# Same model as Income*Student, written out term by term: both main effects
# plus the explicit Income:Student interaction.
m5 = lm(Balance ~ Income + Student + Income:Student, data = datos)
summary(m5)

## 
## Call:
## lm(formula = Balance ~ Income + Student + Income:Student, data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -773.39 -325.70  -41.13  321.65  814.04 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       200.6232    33.6984   5.953 5.79e-09 ***
## Income              6.2182     0.5921  10.502  < 2e-16 ***
## StudentYes        476.6758   104.3512   4.568 6.59e-06 ***
## Income:StudentYes  -1.9992     1.7313  -1.155    0.249    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 391.6 on 396 degrees of freedom
## Multiple R-squared:  0.2799, Adjusted R-squared:  0.2744 
## F-statistic:  51.3 on 3 and 396 DF,  p-value: < 2.2e-16

Regresión Lineal con variables cualitativas

Javier Cara

Curso 2018-19

1 Datos

2 Variables cualitativas

2.1 Variable cualitativa con dos niveles

2.2 Variable cualitativa con tres niveles

3 Variables cualitativas y cuantitativas