######################################################
##Uma estimativa da incerteza na previsão do modelo###
######################################################

# List the files in the current working directory (before it is changed).
dir()

# Switch to the exercise folder so the data file can be read by its bare name.
setwd("C:/Users/Renan Parmigiani/Documents/MESTRADO/Disciplina R/exercicio_7b")

# Read the space-separated table; the first line holds the column names and
# as.is = TRUE keeps character columns from being converted to factors.
bebes. <- read.table(
  file = "babies.txt",
  sep = " ",
  header = TRUE,
  as.is = TRUE
)

# Quick inspection of the raw data: summary statistics, structure, first rows.
summary(bebes.)
str(bebes.)
head(bebes.)
summary(bebes.)

# Recode the sentinel "unknown" codes (9, 99 and 999) as NA.
# Each column is addressed by name instead of by position, so the recoding
# cannot silently hit the wrong column if the file layout changes.
# `%in%` is used instead of `==` because it never returns NA: a mask that
# contains NA would make the assignment fail.
bebes.$gestation[bebes.$gestation %in% 999] <- NA
bebes.$age[bebes.$age %in% 99] <- NA
bebes.$height[bebes.$height %in% 99] <- NA
bebes.$weight[bebes.$weight %in% 999] <- NA
bebes.$smoke[bebes.$smoke %in% 9] <- NA
summary(bebes.)

# Drop every row that has at least one NA; `bebes` is the cleaned data frame
# used by the rest of the script.
bebes <- na.exclude(bebes.)
summary(bebes$gestation)
class(bebes)

# Convert birth weight from ounces to kilograms (1 oz = 0.0283495 kg).
# NOTE: this overwrites the column in place, so run it only once per load.
bebes$bwt <- bebes$bwt * 0.0283495

# Simple linear regression of birth weight on gestation length.
# The formula uses bare column names together with `data = bebes` (instead of
# `bebes$...` inside the formula) so that term labels stay readable and the
# fitted model can be used with predict(newdata = ...).
mod.bebe <- lm(bwt ~ gestation, data = bebes)

# Coefficients by position: cf[1] is the intercept, cf[2] the gestation slope.
cf <- coef(mod.bebe)

# Build 100 hypothetical gestation lengths spanning the observed range.
summary(bebes$gestation)

# The draws are sorted because they are later passed to lines(), which joins
# points in the order given; unsorted x values would draw chords back and
# forth instead of a smooth confidence band. The limits are taken from the
# data rather than copied by hand from the summary above.
x <- sort(runif(
  100,
  min = min(bebes$gestation),
  max = max(bebes$gestation)
))

# Birth weight predicted by the fitted line for each hypothetical x
# ([[ ]] drops the coefficient names so y is a plain numeric vector).
y <- cf[[1]] + cf[[2]] * x
summary(y)

# Standard error of the fitted mean at each hypothetical x:
#   se(y_hat) = sqrt(s2 * (1/n + (x - mean(x_obs))^2 / Sxx))
# where s2 is the RESIDUAL variance of the regression (residual sum of
# squares over n - 2 degrees of freedom). Using var(bebes$bwt), the total
# variance of the response, would overstate the uncertainty.
media <- mean(bebes$gestation)
n <- length(bebes$gestation)
s2 <- sum(residuals(mod.bebe)^2) / (n - 2)
sxx <- sum((bebes$gestation - media)^2)
se.y <- sqrt(s2 * (1 / n + (x - media)^2 / sxx))

# Offsets of the 95% confidence limits around the fitted value, using the
# t distribution with n - 2 degrees of freedom. The lower offset is negative
# and the upper one positive; both are meant to be added to y.
i.conf.inf <- se.y * qt(p = 0.025, df = n - 2)
i.conf.sup <- se.y * qt(p = 0.975, df = n - 2)

# Draw everything: observations, fitted line and the two confidence limits.
plot(
  x = bebes$gestation,
  y = bebes$bwt,
  col = 2,
  xlab = "tempo de gestação (dias)",
  ylab = "massa do bebe (Kg)"
)

# Fitted regression line.
abline(reg = mod.bebe, col = "blue")

# Lower limit first, then upper limit; each offset is added to the fitted
# values y before drawing.
invisible(lapply(
  list(i.conf.inf, i.conf.sup),
  function(offset) lines(x = x, y = y + offset, col = "green")
))

##########################
##Galileu estava Certo?###
##########################

# Launch heights (init.h) and the corresponding distances (h.d).
init.h <- c(600, 700, 800, 950, 1100, 1300, 1500)
h.d <- c(253, 337, 395, 451, 495, 534, 573)

# Nested polynomial models of increasing degree: linear, quadratic, cubic.
mod1 <- lm(h.d ~ init.h)
mod2 <- update(mod1, . ~ . + I(init.h^2))
mod3 <- update(mod2, . ~ . + I(init.h^3))

# Sequential F tests over all three nested models. The last row is the same
# quadratic-vs-cubic test as anova(mod2, mod3); the extra row also tests the
# linear model against the quadratic one, which the conclusion below
# refers to.
anova(mod1, mod2, mod3)

# Author's conclusion: yes, the third-degree polynomial explains the relation
# between launch height and distance better than the first- and second-degree
# polynomials.

#############################
###Massa de Recém-Nascidos###
#############################

# Open a fresh graphics window. dev.new() is used instead of x11() because it
# picks the default device of the current platform, whereas x11() is only
# available where an X11/windows device exists.
dev.new()

# Scatterplot matrix of every pair of columns, for a visual check of which
# variables look most correlated.
plot(bebes, pch = "*")

# One simple regression of birth weight per candidate predictor.
# Formulas use bare column names with `data = bebes` so the ANOVA tables show
# clean term labels. The trailing comments record the outcome the author
# reported for each test.
mod.1 <- lm(bwt ~ gestation, data = bebes)
anova(mod.1)                        # significant
mod.2 <- lm(bwt ~ parity, data = bebes)
anova(mod.2)                        # not significant
mod.3 <- lm(bwt ~ age, data = bebes)
anova(mod.3)                        # not significant
mod.4 <- lm(bwt ~ height, data = bebes)
anova(mod.4)                        # significant
mod.5 <- lm(bwt ~ weight, data = bebes)
anova(mod.5)                        # significant
mod.6 <- lm(bwt ~ smoke, data = bebes)
anova(mod.6)                        # significant

# Additive model with all six predictors.
mod.7 <- lm(
  bwt ~ age + gestation + height + weight + smoke + parity,
  data = bebes
)
anova(mod.7)

# Refit without each of the weak predictors and compare against the full
# additive model (mod.7).
mod.8 <- update(mod.7, . ~ . - age)     # model without age
anova(mod.8, mod.7)  # author's result: dropping age makes no difference
mod.9 <- update(mod.7, . ~ . - parity)  # model without parity
anova(mod.9, mod.7)  # author's result: dropping parity makes no difference

# From here on only gestation, height, weight and smoke are used.

# Most complex model: the four retained predictors with every possible
# interaction among them (up to the four-way term).
mod.10 <- lm(bwt ~ gestation * height * weight * smoke, data = bebes)
anova(mod.10)
summary(mod.10)

# Same model without the four-way interaction: `^3` expands to all main
# effects plus every two-way and three-way interaction.
mod.11 <- lm(bwt ~ (gestation + height + weight + smoke)^3, data = bebes)
anova(mod.11, mod.10)  # author's result: the models differ

# Model without the terms that had a high p-value (> 0.7) in summary(mod.10):
# weight:smoke, gestation:weight:smoke and height:weight:smoke are dropped.
# NOTE(review): the four-way interaction is kept although lower-order terms it
# contains are removed, so this model is not hierarchical - confirm intended.
mod.12 <- lm(
  bwt ~ gestation + height + weight + smoke +
    gestation:height + gestation:weight + height:weight +
    gestation:smoke + height:smoke +
    gestation:height:weight + gestation:height:smoke +
    gestation:height:weight:smoke,
  data = bebes
)
anova(mod.12, mod.10)  # author's result: the models differ, keep the complex one

# The final model is mod.10.