##########7b. Exercícios de Regressão Múltipla######################################## #####################Uma estimativa da incerteza na previsão do modelo################ babies= read.table(file="babies.txt", header=TRUE, sep=" ") babies plot(bwt~gestation, data=babies) ##Para tirar os dados faltantes babies.1= babies[babies$gestation!=999,] babies.1 plot(bwt~gestation, data=babies.1) ##Vetor representando a variavel preditora x= seq(min(babies.1$gestation),max(babies.1$gestation),length.out=100 ) x ##Media dos valores m.babies.1= mean(babies.1$gestation) m.babies.1 ##Soma dos desvios quadraticos dos valores de x s.dq.babies.1= sum((babies.1$gestation-m.babies.1)^2) s.dq.babies.1 ##Numero de observacoes. babies.length= length(babies.1$gestation) babies.length ##Variancia de y var.y.babies.1= var(babies.1$bwt) var.y.babies.1 ##Erro padrao da previsao do modelo. erro.babies.lm= sqrt(var.y.babies.1*(1/babies.length+((x-m.babies.1)^2)/s.dq.babies.1)) erro.babies.lm ##Modelo com o peso do bebê (kg) em funçao do tempo de gestação (dias). bab.kgday= lm(babies.1$bwt~babies.1$gestation) summary(bab.kgday) ##Coeficientes do modelo bab.kgday coef.bab.kgday= coef(bab.kgday) coef.bab.kgday ##Valores preditos de y usando a expressao y=a+(b*x). y= coef.bab.kgday[1]+coef.bab.kgday[2]*x y ##Cálculo do grau de liberdade gl= n-2 gl ##Multiplicando o erro padrao da previsao do modelo pelo valor t de Student. quan.t= qt(0.95,gl) quan.t ##Intervalos de confiança. int.conf1= y + (erro.babies.lm*quan.t) int.conf1 int.conf2= y - (erro.babies.lm*quan.t) int.conf2 ##Gráfico avaliando a incerteza do bab.kgday com um intervalo de ##confianca de 95%. plot(bwt~gestation, data=babies.1) abline(bab.kgday, col="blue",lwd=2) curve((coef.bab.kgday[1]+(coef.bab.kgday[2]*x))+(erro.babies.lm*quan.t),add=T,col="blue",lwd=2,n=100) curve((coef.bab.kgday[1]+(coef.bab.kgday[2]*x))-(erro.babies.lm*quan.t),add=T,col="blue",lwd=2,n=100) ######################Galileu estava certo?############################################################ init.h = c(600, 700, 800, 950, 1100, 1300, 1500) h.d = c(253, 337, 395, 451, 495, 534, 573) mod1 <- lm(h.d~init.h) mod2 <- lm(h.d~init.h+I(init.h^2)) mod3 <- lm(h.d~init.h+I(init.h^2)+I(init.h^3)) coef2 <- coef(mod2) coef3 <- coef(mod3) anova(mod3, mod2) # Como o p valor menor que 0.05, podemos afirmar que o modelo 3 explica # melhor a variancia dos dados do modelo 2 plot(h.d~init.h) abline(mod1, lty=2, col="green") curve(coef2[1]+coef2[2]*x+coef2[3]*x^2, add=TRUE, lty=2, col="magenta") curve(coef3[1]+coef3[2]*x+coef3[3]*x^2+coef3[4]*x^3, add=TRUE, lty=2, col="navy") ###################Massa de Recém Nascidos############################################################# babies= read.table("babies.txt", header=T, sep=" ") head(babies) tail(babies) #Eliminando dados faltantes do dataframe para cada coluna, e verificando quantos #registros foram eliminados em cada etapa bbs.l <- subset(babies, babies$gestation!=999) length(bbs.l$gestation) bbs.li <- subset(bbs.l, bbs.l$parity!=9) length(bbs.li$gestation) bbs.lim <- subset(bbs.li, bbs.li$age!=99) length(bbs.lim$gestation) bbs.limp <- subset(bbs.lim, bbs.lim$height!=99) length(bbs.limp$gestation) bbs.limpo <- subset(bbs.limp, bbs.limp$weight!=999) length(bbs.limpo$gestation) bebes <- subset(bbs.limpo, bbs.limpo$smoke!=9) length(bebes$gestation) plot(bebes) #Considerando multiplas preditoras mod1 = lm(bwt~gestation, data=bebes) anova(mod1) mod2 = lm(bwt~gestation + parity, data=bebes) anova(mod1,mod2) mod3 = lm(bwt~gestation + parity + age, data=bebes) anova(mod2,mod3) #"age" nao explica a variaçao dos dados mod4 = lm(bwt~gestation + parity + height, data=bebes) anova(mod2,mod4) mod5 = lm(bwt~gestation + parity + height + weight, data=bebes) anova(mod4,mod5) mod6 = lm(bwt~gestation + parity + height + weight + smoke, data=bebes) anova(mod5,mod6) #Considerando interacoes - modelo complexo mod0 = lm(bwt~gestation + parity + height + weight + smoke + gestation:parity + gestation:height + gestation:weight + gestation:smoke + parity:height + parity:weight + parity:smoke + height:weight + height:smoke + weight:smoke + gestation:parity:height:weight:smoke, data=bebes) anova(mod0) anova(mod6,mod0) #o modelo mais complexo explica melhor a variacao #Modelo simples em funçao do melhor modelo de previsao da massa dos bebes ao nascer: mod = lm(bwt ~ gestation + parity + height + weight + smoke + gestation:smoke, data=bebes) summary(mod)