# Intro to glm for dichotomous data: High School and Beyond # Edps 589 # Fall 2018 # C.J.Anderson # # Read in data in csv formate # Fit the following models: # linear probability # logit # probit # Various nice graphs # # HSB data (N=600) # # id ='ID number ' # sex ='Gender: 1=male, 2=female' # ses ='Socio-Economic Status' # sctyp ='School Type: 1=public, 2=private' # hsp = 'High School Program' # locus ='Locus of Control' # concpt ='Self Concept' # mot ='Motivation' # car ='Career Choice' # rdg ='Reading T-Score' # wrtg ='Writing T-Score' # math ='Math T-Score' # sci ='Science T-Score' # civ ='Civics T-Score'; # # Variable I created for lecture # Group = group based on sum 5 achievement variables # Program = 1 for academic, 0 for general and votech # xmean = mean of 5 achievement variables # ...and some micsl as needed# library(data.table) # maybe I don't need this library(dplyr) library(MASS) # needed for negatiave binomial glm library(vcd) library(vcdExtra) library(ggplot2) #setwd("D:\\Dropbox\\edps 589\\4 glm") setwd("C:\\Users\\cja\\Dropbox\\edps 589\\4 glm") hsb <- read.csv(file="hsb_data.csv", header=TRUE, sep=",") names(hsb) head(hsb) #Give meaningful names to levels # Grouping according to sum of 5 achievement levels: hsb$fgroup <- as.factor(hsb$group) setattr(hsb$fgroup,"levels",c("<200", "200-225", "226-250", "251-275", "276-300","301-325","326-350")) # -- check --- table(hsb$group,hsb$fgroup) # High school programs hsb$fprogram <- as.factor(hsb$program) setattr(hsb$fprogram,"levels",c("non-academic","academic")) # -- check table(hsb$hsp,hsb$fprogram) # Gender hsb$gender <- as.factor(hsb$sex) setattr(hsb$gender,"levels",c("male","female") ) # -- check table(hsb$gender,hsb$sex) # Frequencies hsb.tab <- table(hsb$fprogram,hsb$fgroup) addmargins(hsb.tab) # proportions X 100 = percents # -- overall -- prop.table(table(hsb$fprogram,hsb$fgroup)) * 100 # -- row -- prop.table(table(hsb$fprogram,hsb$fgroup),1) * 100 # -- column -- this is in the notes prop.table(table(hsb$fprogram,hsb$fgroup),2)*100 # get mean scores for each group (for plotting and other things) grp.mean <- aggregate(hsb$achieve,by=list(hsb$fgroup),FUN=mean) colnames(grp.mean)<- c("fgroup","grp.mean") hsb <- merge(hsb,grp.mean,by="fgroup") # This produces side-by-side box plots plot(hsb$fprogram,hsb$grp.mean, main="Distributions of Group Mean Achievement Scores", ylab="Group Mean of Sum of 5 Achievement Scores", xlab="High School Program Type") # # Graphing the relationship between program and achievement # # -- grouped data # --- compute proportions which will be used to graph groupled data n.academic <- aggregate(hsb$program,by=list(hsb$fgroup),FUN=sum) n.group <- table(hsb$fgroup) n.group <- as.matrix(n.group,nrow=7,ncol=1) p.academic <- n.academic[,2]/n.group # --- The graph plot(grp.mean[,2],p.academic, type="p", main="Plot of Collapsed Data", xlab="Group Mean of sum of 5 Achievement Tests", ylab="Proportion in Academic Program", cex=1.75, col="black", pch=20 ) # -- un-grouped data...jittered point to show them a bit better. plot(hsb$achieve,jitter(hsb$program,0.1), main="Whether in Academic Program vs Mean Achievement", xlab="Mean of sum of 5 Achievement Tests", ylab="Probabilty in Academic Program", cex=1.75, col="black", pch=20 ) # -- ungroups data with loess (smooth) fitted plotted lw1 <- loess(program ~ achieve,data=hsb) # fit loess to data plot(hsb$achieve,jitter(hsb$program,0.1), main="Whether in Academic Program vs Mean Achievement", xlab="Mean of sum of 5 Achievement Tests", ylab="Probabilty in Academic Program", cex=1.75, col="black", pch=20 ) j <- order(hsb$xmean) lines(hsb$achieve[j],lw1$fitted[j],col="blue",lwd=3) text(300,.435,"Fitted Loess Curve",col="blue") arrows(270,.435,250,.435,length=.2,col="blue") # # Test of indepenedence # chisq.test(hsb.tab,correct=FALSE) assocstats(hsb.tab) # Any kind of association ("general") CMHtest(hsb.tab,correct=FALSE) # Ordinal/row mean scores differ # # Model the relationship on grouped data (for pedological reasons) # hsb.data <- read.table("hsb_table_data.txt",header=TRUE) hsb.data$nonac <- hsb.data$ncases - hsb.data$academic hsb.data$p.acd <- hsb.data$academic/hsb.data$ncases hsb.data # --- For putting in fitted values plot(hsb.data$achieve,hsb.data$p.acd, type="p", main="Observed and Fitted Proportion in Academic", xlab="Group Mean of sum of 5 Achievement Tests", ylab="Proportion in Academic Program", cex=1.75, col="black", pch=20 ) # Linear Probability Model ( mod.linear <- glm(academic/ncases ~ achieve, data=hsb.data, family=binomial(link="identity")) ) summary(mod.linear) lines(hsb.data$achieve,fitted(mod.linear),col="red") hsb.data$linear <- fitted(mod.linear) # logit (logistic regression) Model # -- null ( mod.logit0 <- glm(academic/ncases ~ 1, data=hsb.data, family=binomial, control=list(epsilon=1e-10) ) ) # -- non null ( mod.logit <- glm(academic/ncases ~ achieve, data=hsb.data, family=binomial) ) summary(mod.logit) lines(hsb.data$achieve,fitted(mod.logit),col="blue") hsb.data$logit <- fit ted(mod.logit) anova(mod.logit0,mod.logit) # Probit model ( mod.probit <- glm(academic/ncases ~ achieve, data=hsb.data, family=binomial(link="probit")) ) summary(mod.probit) lines(hsb.data$achieve,fitted(mod.probit),col="green") hsb.data$probit <- fitted(mod.probit) legend("topleft", legend=c("Linear", "Logit","Probit"), col=c("red", "blue","green"), lty=1, cex=1) # Fit to uncollapsed data linear.all <- glm(program ~ achieve,data=hsb,family=binomial(link="identity")) summary(linear.all) logit.all <- glm(program ~ achieve,data=hsb,family=binomial(link="logit")) summary(logit.all) probit.all <- glm(program ~ achieve,data=hsb,family=binomial(link="probit")) summary(probit.all) # -- ungrouped data with loess (smooth) fitted plotted lw1 <- loess(program ~ achieve,data=hsb) # fit loess to data plot(hsb$achieve,jitter(hsb$program,0.1), main="Whether in Academic Program vs Mean Achievement", xlab="Mean of sum of 5 Achievement Tests", ylab="Probabilty in Academic Program", cex=1.75, col="black", pch=20 ) j <- order(hsb$xmean) lines(hsb$achieve[j],lw1$fitted[j],col="black",lwd=3) text(300,.435,"Fitted Loess Curve",col="black") arrows(270,.435,250,.435,length=.2,col="black") lines(hsb$achieve,fitted(logit.all),col="red") lines(hsb$achieve,fitted(probit.all),col="green") text(325,.75,"Logit",col="red") arrows(325,.78,325,.85,length=.2,col="red") text(225,.1,"Probit",col="green") arrows(220,.1,190,.1,length=.2,col="green") # plot logistic with 95% confidence bands on means achievement.range <- seq(from=164,to=350,by=1) pred.logit <- predict(logit.all, huh <- data.frame(achieve = achievement.range), type="response", se.fit=TRUE ) upper <- pred.logit$fit + 1.96*pred.logit$se.fit lower <- pred.logit$fit - 1.96*pred.logit$se.fit # -- ungrouped data with loess (smooth) fitted plotted lw1 <- loess(program ~ achieve,data=hsb) # fit loess to data plot(hsb$achieve,fitted(logit.all),type='n', main="Data(Loess) vs Logit Fitted w/ 95% Bands", xlab="Mean of sum of 5 Achievement Tests", ylab="Probabilty in Academic Program", cex=1.75, pch=20 ) polygon(c(achievement.range,rev(achievement.range)) , c(upper,rev(lower)), col=rgb(0,0,1,0.2), border=NA ) j <- order(hsb$xmean) lines(hsb$achieve[j],lw1$fitted[j],col="green",lwd=3) lines(hsb$achieve[j],fitted(logit.all)[j],col="blue") legend("topleft",legend=c("Loess","Logit","95% CI"), col=c("green","blue","azure"), lty=1, cex=1)