library(rpart)                                   # recursive partitioning trees
d = read.table('datos/coches.txt', header = T)   # car data set
d = na.omit(d)                                   # drop rows with missing values
d$origen = factor(d$origen, labels = c("USA","Europa","Japon"))  # encode origin as a factor
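We fit a first regression tree. The call that produced the listing below is not shown in the original; a plausible reconstruction, assuming the default control parameters of rpart, is:
t1 = rpart(consumo ~ ., data = d, method = "anova")
t1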
## n= 391
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 391 5910.74200 11.227620
## 2) cc< 3482 234 856.23930 8.709402
## 4) cv< 84.5 129 216.99220 7.658915
## 8) ano>=76.5 81 81.80247 7.049383 *
## 9) ano< 76.5 48 54.31250 8.687500 *
## 5) cv>=84.5 105 322.00000 10.000000
## 10) ano>=78.5 33 67.33333 8.666667 *
## 11) ano< 78.5 72 169.11110 10.611110
## 22) peso< 921 48 69.91667 9.958333 *
## 23) peso>=921 24 37.83333 11.916670 *
## 3) cc>=3482 157 1358.94300 14.980890
## 6) cv< 143.5 82 358.01220 13.109760
## 12) peso< 1242 60 185.00000 12.500000 *
## 13) peso>=1242 22 89.86364 14.772730 *
## 7) cv>=143.5 75 399.94670 17.026670
## 14) peso< 1453.5 46 132.80430 16.065220 *
## 15) peso>=1453.5 29 157.17240 18.551720
## 30) ano>=73.5 7 12.00000 16.000000 *
## 31) ano< 73.5 22 85.09091 19.363640 *
Each row of the table reports the node number, the splitting rule, the number of observations n, the deviance, and the fitted value yval. For the root node, the deviance and yval can be recovered directly from the data.
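A minimal check (consumo is the response variable):
sum((d$consumo - mean(d$consumo))^2)   # deviance of the root node (TSS)
mean(d$consumo)                        # fitted value (yval) of the root node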
## [1] 5910.742
## [1] 11.22762
t2 = rpart(consumo ~ ., data = d, method = "anova",
control = rpart.control(minsplit = 10, minbucket = 5, cp = 0.05))
plot(t2, margin = 0.02)
text(t2, cex=.75)
With this control the tree stops growing much earlier:
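(The listing below presumably comes from printing the fitted object; the original call is not shown.)
t2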
## n= 391
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 391 5910.7420 11.227620
## 2) cc< 3482 234 856.2393 8.709402
## 4) cv< 84.5 129 216.9922 7.658915 *
## 5) cv>=84.5 105 322.0000 10.000000 *
## 3) cc>=3482 157 1358.9430 14.980890
## 6) cv< 143.5 82 358.0122 13.109760 *
## 7) cv>=143.5 75 399.9467 17.026670 *
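The value printed below matches the relative improvement achieved by splitting node 3, recomputed here from the (rounded) deviances in the table:
(1358.9430 - 358.0122 - 399.9467) / 5910.742   # improvement of node 3's split over TSS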
## [1] 0.1016776
which is greater than the cp = 0.05 threshold, so the split is performed.
We can grow a deeper tree:
t3 = rpart(consumo ~ ., data = d, method = "anova",
control = rpart.control(minsplit = 10, minbucket = 5, cp = 0.01))
plot(t3, margin = 0.02)
text(t3, cex=.75)
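(As before, the listing below presumably comes from printing the fitted object.)
t3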
## n= 391
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 391 5910.74200 11.227620
## 2) cc< 3482 234 856.23930 8.709402
## 4) cv< 84.5 129 216.99220 7.658915
## 8) ano>=76.5 81 81.80247 7.049383 *
## 9) ano< 76.5 48 54.31250 8.687500 *
## 5) cv>=84.5 105 322.00000 10.000000
## 10) ano>=78.5 33 67.33333 8.666667 *
## 11) ano< 78.5 72 169.11110 10.611110
## 22) peso< 921 48 69.91667 9.958333 *
## 23) peso>=921 24 37.83333 11.916670 *
## 3) cc>=3482 157 1358.94300 14.980890
## 6) cv< 143.5 82 358.01220 13.109760
## 12) peso< 1242 60 185.00000 12.500000 *
## 13) peso>=1242 22 89.86364 14.772730 *
## 7) cv>=143.5 75 399.94670 17.026670
## 14) peso< 1453.5 46 132.80430 16.065220 *
## 15) peso>=1453.5 29 157.17240 18.551720
## 30) ano>=73.5 7 12.00000 16.000000 *
## 31) ano< 73.5 22 85.09091 19.363640 *
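The value below matches the relative improvement of the split of node 7, the largest improvement among the splits that t2 rejected; recomputed from the (rounded) deviances:
(399.9467 - 132.8043 - 157.1724) / 5910.742   # improvement of node 7's split over TSS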
## [1] 0.01860512
\[ R^2 = 1 - \frac{RSS}{TSS} \]
where recall that RSS = deviance(node) and TSS = deviance(root).
The ratio RSS/TSS is called the relative error, and the "x" prefix (as in xerror) indicates that the quantity has been computed by cross-validation.
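The cp table is printed with printcp (a reconstruction of the call; the original is not shown):
printcp(t3)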
##
## Regression tree:
## rpart(formula = consumo ~ ., data = d, method = "anova", control = rpart.control(minsplit = 10,
## minbucket = 5, cp = 0.01))
##
## Variables actually used in tree construction:
## [1] ano cc cv peso
##
## Root node error: 5910.7/391 = 15.117
##
## n= 391
##
## CP nsplit rel error xerror xstd
## 1 0.625228 0 1.00000 1.00607 0.067288
## 2 0.101677 1 0.37477 0.39733 0.032330
## 3 0.053673 2 0.27310 0.31494 0.025133
## 4 0.018605 3 0.21942 0.25252 0.022073
## 5 0.014475 4 0.20082 0.24345 0.021894
## 6 0.014067 5 0.18634 0.24080 0.021982
## 7 0.013683 6 0.17228 0.23260 0.021767
## 8 0.010381 7 0.15859 0.21896 0.021282
## 9 0.010165 8 0.14821 0.21511 0.021024
## 10 0.010000 9 0.13805 0.21590 0.021003
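As a check (a hand calculation, not part of the original output), the relative error after the first split can be recomputed from the deviances printed earlier:
(856.2393 + 1358.9430) / 5910.742   # rel error at nsplit = 1: approx. 0.37477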
The trees we have seen so far are grown top-down, from the root node to the leaves. An alternative strategy is to grow a very deep tree and then prune it; the tree is thus built bottom-up.
First we grow a deep tree:
t4 = rpart(consumo ~ ., data = d, method = "anova",
control = rpart.control(minsplit = 2, cp = 0.001))   # minsplit = 2: grow an (almost) maximal tree
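Its cp table, presumably printed with printcp as before:
printcp(t4)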
##
## Regression tree:
## rpart(formula = consumo ~ ., data = d, method = "anova", control = rpart.control(minsplit = 2,
## cp = 0.001))
##
## Variables actually used in tree construction:
## [1] acel ano cc cv peso
##
## Root node error: 5910.7/391 = 15.117
##
## n= 391
##
## CP nsplit rel error xerror xstd
## 1 0.6252277 0 1.000000 1.00657 0.067152
## 2 0.1016765 1 0.374772 0.40245 0.032587
## 3 0.0536730 2 0.273096 0.32739 0.026627
## 4 0.0186051 3 0.219423 0.26357 0.024510
## 5 0.0144746 4 0.200818 0.23812 0.021297
## 6 0.0140674 5 0.186343 0.23076 0.020878
## 7 0.0136831 6 0.172276 0.22985 0.020895
## 8 0.0103813 7 0.158593 0.20392 0.019197
## 9 0.0101648 8 0.148211 0.19410 0.020199
## 10 0.0092203 9 0.138046 0.18630 0.020144
## 11 0.0076586 10 0.128826 0.18234 0.020563
## 12 0.0054585 11 0.121168 0.18074 0.020264
## 13 0.0054152 12 0.115709 0.18033 0.020097
## 14 0.0041331 13 0.110294 0.18083 0.019981
## 15 0.0040529 14 0.106161 0.17516 0.017895
## 16 0.0040451 15 0.102108 0.16663 0.015460
## 17 0.0036915 16 0.098063 0.16430 0.015434
## 18 0.0033877 17 0.094371 0.16400 0.015355
## 19 0.0028197 18 0.090984 0.16353 0.015765
## 20 0.0027295 19 0.088164 0.16062 0.015613
## 21 0.0027098 20 0.085434 0.16180 0.015614
## 22 0.0023979 21 0.082725 0.16326 0.015668
## 23 0.0022840 23 0.077929 0.16015 0.015569
## 24 0.0022488 24 0.075645 0.15941 0.015577
## 25 0.0022074 25 0.073396 0.15715 0.015316
## 26 0.0021466 27 0.068981 0.15790 0.015311
## 27 0.0020556 28 0.066835 0.15790 0.015311
## 28 0.0017764 29 0.064779 0.15850 0.015480
## 29 0.0017623 30 0.063003 0.15697 0.015522
## 30 0.0016892 31 0.061240 0.15547 0.015538
## 31 0.0015987 32 0.059551 0.15857 0.015747
## 32 0.0015410 34 0.056354 0.15673 0.015611
## 33 0.0015032 35 0.054813 0.15698 0.015659
## 34 0.0014138 36 0.053310 0.15555 0.016079
## 35 0.0013535 37 0.051896 0.15338 0.015640
## 36 0.0013535 38 0.050542 0.15185 0.015422
## 37 0.0010659 39 0.049189 0.15003 0.015416
## 38 0.0010503 40 0.048123 0.15028 0.015430
## 39 0.0010026 42 0.046022 0.15030 0.015430
## 40 0.0010000 43 0.045020 0.15026 0.015349
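The cross-validated error (xerror) can be plotted against tree size; the plot referred to below was presumably produced with plotcp:
plotcp(t4)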
Sometimes this plot has a minimum, in which case we should select the corresponding tree. Otherwise, we choose the size at which the error stabilizes.
According to the plot and the table above, a tree with 12 leaves (11 splits) seems reasonable.
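To prune to that size we look up the cp value in row 12 of the table; a sketch, assuming the standard cptable indexing shown above:
t4$cptable[12, "CP"]   # cp value of the 12-leaf tree (nsplit = 11)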
## [1] 0.005458475
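Pruning with this cp value yields the 12-leaf tree; the pruned object used below is not defined in the original, so a reconstruction:
t4_prune = prune(t4, cp = t4$cptable[12, "CP"])   # keep only splits with CP >= this value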
Finally, we predict the consumption of a new car:
xp = data.frame(cc = 4500, cv = 110, peso = 950, acel = 13,
ano = 70, origen = "Europa", cilindros = 8)   # a new observation
predict(t4_prune, newdata = xp)               # predicted consumption
## 1
## 13.42857