# Edps 589 # Fall 2019 # c.j.anderson # # Titanic data and multiple logistic regression # # ###################################################### # Setup # ###################################################### # library(vcd) library(ResourceSelection) library(pROC) library(DescTools) library(rms) # an alternative package that does logistic regression # Will be especially useful for ordinal data. For now # I am using it to get Rsq. setwd("C:\\Users\\cja\\Documents\\Dropbox\\edps 589\\5 logistic regression") #setwd("C:\\Users\\cja\\Dropbox\\edps 589\\5 logistic regression") #setwd("D:\\Dropbox\\edps 589\\5 logistic regression") ##################################################### # Read in data # ##################################################### t <- read.csv(file="titanic_data_R.csv", header=TRUE,sep=",") names(t) head(t) dim(t) ##################################################### # Some descriptive statistics # ##################################################### N <- length(t$pclass) table(t$survived) table(t$pclass) table(t$sex) mean(t$age) summary(t$age) #################################################### # Models # #################################################### # I will use default R coding (i.e., dummy coding with 1st value # the reference category) t$pclass <- as.factor(t$pclass) # saturated summary( model0 <- glm(survived ~ pclass + sex + age + pclass*sex + pclass*age + sex*age + pclass*sex*age,data=t,family=binomial) ) # all 2-way summary( model1 <- glm(survived ~ pclass + sex + age + pclass*sex + pclass*age + sex*age, data=t, family=binomial) ) hoslem.test(model1$y, fitted(model1), g=10) anova(model1,model0) ( model1.alt <- lrm(survived ~ pclass + sex + age + pclass*sex + pclass*age + sex*age,data=t) ) # PS,PA summary( model2 <- glm(survived ~ pclass + sex + age + pclass*sex + pclass*age, data=t, family=binomial) ) hoslem.test(model2$y, fitted(model2), g=10) anova(model2,model1) ( model2.alt <- lrm(survived ~ pclass + sex + age + pclass*sex + pclass*age,data=t) ) # PS,SA summary( model3 <- glm(survived ~ pclass + sex + age + pclass*sex + sex*age, data=t, family=binomial) ) hoslem.test(model3$y, fitted(model3), g=10) anova(model3,model1) ( model3.alt <- lrm(survived ~ pclass + sex + age + pclass*sex + sex*age,data=t) ) # PA,SA summary( model4 <- glm(survived ~ pclass + sex + age + pclass*age + sex*age, data=t, family=binomial) ) hoslem.test(model4$y, fitted(model4), g=10) anova(model4,model1) ( model4.alt <- lrm(survived ~ pclass + sex + age + pclass*age + sex*age ,data=t) ) # P,S,A summary( model5 <- glm(survived ~ pclass + sex + age , data=t, family=binomial) ) ( model5.alt <- lrm(survived ~ pclass + sex + age,data=t) ) #### Graph the best model: model1 # sub-set data and fitted values, t$fitted <- model1$fitted female1 <- t[ (which(t$sex=="female" & t$pclass=="1")),] female2 <- t[ (which(t$sex=="female" & t$pclass=="2")),] female3 <- t[ (which(t$sex=="female" & t$pclass=="3")),] male1 <- t[ (which(t$sex=="male" & t$pclass=="1")),] male2 <- t[ (which(t$sex=="male" & t$pclass=="2")),] male3 <- t[ (which(t$sex=="male" & t$pclass=="3")),] j <- order(female1$age) plot(female1$age[j],female1$fitted[j], type='l', lwd=2, ylim=c(0,1), xlim=c(1,80), col="blue", xlab="Age of Passenger", ylab="Fitted Probability of Survival", main="Titanic: survived~pclass+sex+age+pclass*sex+pclass*age+sex*age" ) j <- order(female2$age) lines(female2$age[j],female2$fitted[j],col="darkblue",lwd=2) j <- order(female3$age) lines(female3$age[j],female3$fitted[j],col="cyan",lwd=2) j <- order(male1$age) lines(male1$age[j],male1$fitted[j],col="orange",lwd=2) j <- order(male2$age) lines(male2$age[j],male2$fitted[j],col="red",lwd=2) j <- order(male3$age) lines(male3$age[j],male3$fitted[j],col="green",lwd=2) legend(57.5,.92,legend=c("female 1st", "female 2nd","female 3rd","male 1st", "male 2nd", "male 3rd"), col=c("blue","darkblue","cyan","orange","red","green"), lty=1,lwd=2, cex=1)