# Edps 589 # Fall 2019 # C.J.Anderson # # HSB: Multiple regression of program type = school + type + ses # # 1. Set up and read in data (long format, one line per case) # 2. Create data frame for 3-way table # 3 Fit logit models to 3-way table # -- no effect of ses or sctyp on p(academic) # -- additive effects logit (or homoegeneous association) # 4. Logit models fit as poisson log-linear regression models # -- using glm, which only gives G2 # -- using loglm, which gives both pearson's X2 and G2 # 5. Change data to a wide format # 6. Compute X2 and G2 using their definitions # 7. Identification constraints # a. alternative dummy codes # b. effect codes # 8. Same odds ratio regardless of ID constraints # 9. Is SES ordinal or nominal? # 10. More odds ratios # 11. Model individual level: math, ses and schooltyp # a. program ~ math # b. program ~ math + ses (factor) # c. program ~ math + ses (numeric) # d. program ~ math + ses (numeric) + schtyp # e. program ~ math + ses (numeric) + schtyp + ses*schtyp # 12. Regression Diagnostics on model 11e. # a. Hosmer-Lemshow # b. QQ plot of residuals # c. ROC and concordance -- need to use trick for multiple logistic regression # 13. More predictors ## library(vcd) library(ResourceSelection) library(pROC) library(DescTools) setwd("C:\\Users\\cja\\Documents\\Dropbox\\edps 589\\5 logistic regression") #setwd("C:\\Users\\cja\\Dropbox\\edps 589\\5 logistic regression") #setwd("D:\\Dropbox\\edps 589\\5 logistic regression") ##################################################### # 1. Read in data -- long format # ##################################################### hsb <- read.csv(file="hsb_data.csv", header=TRUE, sep=",") names(hsb) head(hsb) ##################################################### # 2. Create data frame for 3-way table # ##################################################### ########################################## # Long table # ########################################## var.levels <- expand.grid(academic=c("no", "yes"), sctyp=c("public","private"), ses = c("low","middle","high")) ( hsb3way <- data.frame(var.levels, count=c(91, 40, 4, 4, 138, 111, 14, 36, 44, 82, 1, 35 ))) ########################################## # Wide table (see below for alternative) # ########################################## var.levels2 <- expand.grid(sctyp=c("public","private"), ses = c("low","middle","high")) hsb3way.no <- data.frame(var.levels2, no=c(40, 4, 111, 36, 82, 35 )) hsb3way.yes <- data.frame(var.levels2, yes=c(91,4,136,14,44,1)) hsb3way.wide <- merge(hsb3way.yes,hsb3way.no, by=c("sctyp","ses")) hsb3way.wide$ncases <- hsb3way.wide$yes + hsb3way.wide$no hsb3way.wide ###################################################### # 3. Some logit models # ###################################################### # logit --- (joint) independence summary( i.mod <- glm(yes/ncases ~ 1 , weights=ncases, data=hsb3way.wide,family=binomial) ) # logit --- homogeneous association summary( h.logit <- glm(yes/ncases ~ sctyp + ses , weights=ncases, data=hsb3way.wide,family=binomial) ) X2 <- sum(residuals(h.logit,type="pearson")^2) ####################################################### # 4. Logit models as poisson regression models # ####################################################### # As a poisson --- one way to get X2 goodness-of-fit statistics summary( h.poi <- glm(count ~ sctyp + academic + ses + sctyp*academic + sctyp*ses + ses*academic, data=hsb3way,family=poisson) ) X2 <- sum(residuals(h.poi,type="pearson")^2) # This will give both G2 and X2 library(MASS) summary( h.poi2 <- loglm(count ~ sctyp + academic + ses + sctyp*academic + sctyp*ses + ses*academic, data=hsb3way) ) ###################################################### # 5. Change format of data to a wide format # # Alternate way to get wide format # ###################################################### yes <- hsb3way[ which(hsb3way$academic=="yes"),] no <- as.data.frame(hsb3way[ which(hsb3way$academic=="no"),4]) names(no) <- c("no") wide <- cbind(yes,no) wide$ncases <- wide$count+wide$no ###################################################### # 6. Computing X2 and G2 by using their definitions # ###################################################### logit.bin <- glm(academic ~ sctyp + ses, family = binomial, data = hsb3way, weight=count) # There must be an easier way to get G2 and X2 oddnumber <- function(x) x%%2 != 0 i <- oddnumber(1:12) wide$pi <- logit.bin$fitted[i] Xsq.yes <- (wide$count- wide$pi*wide$ncases)**2/(wide$pi*wide$ncases) Xsq.no <- (wide$no - (1-wide$pi)*wide$ncases)**2/((1-wide$pi)*wide$ncases) (Xsq <- sum(Xsq.yes + Xsq.no)) Gsq.yes <- 2*wide$count *log(wide$count/(wide$ncases*wide$pi)) Gsq.no <- 2*wide$no * log(wide$no/(wide$ncases*(1-wide$pi))) ( Gsq <- sum(Gsq.yes+Gsq.no) ) ################################################################## # 7. Identification Constraints # # --- to see why parameter estimates are different from SAS, # # see page 15 or so of lecture notes. It all has to do # # with ID constraints. The interpretation doesn't change. # # Below I re-fit model with alternative codings # ################################################################## ############################# # 7.a Alternate dummy codes # ############################# # Change ID constraints to match SAS: # -- change how Dummy codes are defined hsb3way$Dsctyp <- ifelse(hsb3way$sctyp=="public",1,0) hsb3way$Dlow <- ifelse(hsb3way$ses=="low",1,0) hsb3way$Dmid <- ifelse(hsb3way$ses=="middle",1,0) summary( logit.bin2 <- glm(academic ~ Dlow + Dmid + Dsctyp, family = binomial, data = hsb3way, weight=count) ) ############################# # 7.b Using Effect codes # ############################# hsb3way$Esctyp <- ifelse(hsb3way$sctyp=="public",1,-1) hsb3way$Elow <- ifelse(hsb3way$ses=="low",1,0) hsb3way$Elow <- ifelse(hsb3way$ses=="high",-1,hsb3way$Elow) hsb3way$Emid <- ifelse(hsb3way$ses=="middle",1,0) hsb3way$Emid <- ifelse(hsb3way$ses=="high",-1,hsb3way$Emid) summary( logit.bin3 <- glm(academic ~ Elow + Emid + Esctyp, family = binomial, data = hsb3way, weight=count) ) ############################################################## # 8. Same odds ratios regardless of ID constraints # ############################################################## # --- holding sctype constant: public vs private odds ratios (odds.dummy1 <- exp(0 - logit.bin$coefficients[4])) (odds.dummy2 <- exp(logit.bin2$coefficients[4]- 0)) (odds.effect3 <- exp(logit.bin3$coefficients[4]+logit.bin3$coefficients[4])) # -- try to get the rest that are in the notes ################################################################## # 9. SAS as nominal vs ordinal variable # ################################################################## # -- Create scores for ses hsb3way$xses <- as.numeric(hsb3way$ses) # -- ses as factor summary( logit.factor <- glm(academic ~ ses + sctyp, family = binomial, data = hsb3way, weight=count) ) # -- note: -"residual deviance"/2 = -2log(like) # -- ses as numeric summary( logit.numeric <- glm(academic ~ xses + sctyp, family = binomial, data = hsb3way, weight=count) ) # -- compute goodness-of-fit statistics for logit.numeric wide$pi.numeric <- logit.numeric$fitted[i] Xsq.yes <- (wide$count - wide$pi.numeric*wide$ncases)**2/(wide$pi.numeric*wide$ncases) Xsq.no <- (wide$no - (1-wide$pi.numeric)*wide$ncases)**2/((1-wide$pi.numeric)*wide$ncases) (Xsq <- sum(Xsq.yes + Xsq.no)) Gsq.yes <- 2*wide$count *log(wide$count/(wide$ncases*wide$pi.numeric)) Gsq.no <- 2*wide$no * log(wide$no/(wide$ncases*(1-wide$pi.numeric))) ( Gsq <- sum(Gsq.yes+Gsq.no) ) # -- LR test of restriction on values for Ses anova(logit.numeric, logit.factor) ############################################################## # 10. Computing odds ratios using model with numerical SES # ############################################################## # Note logit.numeric$coefficients # -- Holding schtyp constant, odds of academic are exp(logit.numeric$coefficients[2]) # the odds for an increase of SES by 1 unit # -- odds academic of low vs high ses exp((3-1)*logit.numeric$coefficients[2]) # -- try to get the others ############################################################# # 11. Individual level data: math, ses and school type # ############################################################# ####################### # 11.a program ~ math # ####################### summary(logit.mod1 <- glm(program ~ math, data=hsb,family = binomial)) j <- order(hsb$math) plot(hsb$math[j],logit.mod1$fitted[j], type='l', lwd=2, ylim=c(0,1), col="blue", main="Model 1: program ~ math") ###################### # 11.b ses as nominal# ###################### hsb$ses <- as.factor(hsb$ses) summary(logit.mod2 <- glm(program ~ math + ses, data=hsb,family = binomial)) # - subset for graphing hsb$fitted2 <- logit.mod2$fitted low <- hsb[ which(hsb$ses=="1"),] mid <- hsb[ which(hsb$ses=="2"),] hi <- hsb[ which(hsb$ses=="3"),] j <- order(low$math) plot(low$math[j],low$fitted2[j], type='l', lwd=2, ylim=c(0,1), xlim=c(30,80), col="blue", main="Model 2: program ~ math + ses (as factor) " ) j <- order(mid$math) lines(mid$math[j],mid$fitted2[j],col="green",lwd=2) j <- order(hi$math) lines(hi$math[j],hi$fitted2[j],col="red",lwd=2) legend("topleft",legend=c("high","middle","low"), col=c("red","green","blue"), lty=1,lwd=2, cex=1) #################### # 11.c ordinal ses # #################### hsb$ses <- as.numeric(hsb$ses) summary(logit.mod3 <- glm(program ~ math + ses, data=hsb,family = binomial)) # - subset for graphing hsb$fitted3 <- logit.mod3$fitted low <- hsb[ which(hsb$ses=="1"),] mid <- hsb[ which(hsb$ses=="2"),] hi <- hsb[ which(hsb$ses=="3"),] j <- order(low$math) plot(low$math[j],low$fitted3[j], type='l', lwd=2, ylim=c(0,1), xlim=c(30,80), col="blue", main="Model 3: program ~ math + ses (numeric) " ) j <- order(mid$math) lines(mid$math[j],mid$fitted3[j],col="green",lwd=2) j <- order(hi$math) lines(hi$math[j],hi$fitted3[j],col="red",lwd=2) legend("topleft",legend=c("high","middle","low"), col=c("red","green","blue"), lty=1,lwd=2, cex=1) #################### # 11.d + schtyp # #################### hsb$sctyp <- as.factor(hsb$sctyp) summary(logit.mod4 <- glm(program ~ math + ses + sctyp, data=hsb,family = binomial)) # - subset for graphing hsb$fitted4 <- logit.mod4$fitted low.public <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="1")),] low.private <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="2")),] mid.public <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="1")),] mid.private <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="2")),] hi.public <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="1")),] hi.private <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="2")),] j <- order(low.public$math) plot(low.public$math[j],low.public$fitted4[j], type='l', lwd=2, ylim=c(0,1), xlim=c(30,80), col="blue", main="Model 4: program ~ math + ses (numeric) + school type " ) j <- order(low.private$math) lines(low.private$math[j],low.private$fitted4[j],col="darkblue",lwd=2) j <- order(mid.public$math) lines(mid.public$math[j],mid.public$fitted4[j],col="green",lwd=2) j <- order(mid.private$math) lines(mid.private$math[j],mid.private$fitted4[j],col="darkgreen",lwd=2) j <- order(hi.public$math) lines(hi.public$math[j],hi.public$fitted4[j],col="red",lwd=2) j <- order(hi.private$math) lines(hi.private$math[j],hi.private$fitted4[j],col="darkred",lwd=2) legend("bottomright",legend=c("private high SES", "private middle SES","private low SES","public high SES", "public middle SES", "public low SES "), col=c("darkred","darkgreen","darkblue","red","green","blue"), lty=1,lwd=2, cex=1) #################### # 11.d + schtyp # #################### hsb$sctyp <- as.factor(hsb$sctyp) summary(logit.mod5 <- glm(program ~ math + ses + sctyp + ses*sctyp, data=hsb,family = binomial)) # - subset for graphing hsb$fitted5 <- logit.mod5$fitted low.public <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="1")),] low.private <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="2")),] mid.public <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="1")),] mid.private <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="2")),] hi.public <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="1")),] hi.private <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="2")),] j <- order(low.public$math) plot(low.public$math[j],low.public$fitted5[j], type='l', lwd=2, ylim=c(0,1), xlim=c(30,80), col="blue", main="Model 5: program ~ math + ses + schyp + ses*schtyp " ) j <- order(low.private$math) lines(low.private$math[j],low.private$fitted5[j],col="darkblue",lwd=2) j <- order(mid.public$math) lines(mid.public$math[j],mid.public$fitted5[j],col="green",lwd=2) j <- order(mid.private$math) lines(mid.private$math[j],mid.private$fitted5[j],col="darkgreen",lwd=2) j <- order(hi.public$math) lines(hi.public$math[j],hi.public$fitted5[j],col="red",lwd=2) j <- order(hi.private$math) lines(hi.private$math[j],hi.private$fitted5[j],col="darkred",lwd=2) legend("bottomright",legend=c("private high SES", "private middle SES","private low SES","public high SES", "public middle SES", "public low SES "), col=c("darkred","darkgreen","darkblue","red","green","blue"), lty=1,lwd=2, cex=1) ################################################################### # 12 Some diagnostics on model 5 # ################################################################### ######################## # 12 a Hosmer-Lemeshow # ######################## hoslem.test(logit.mod5$y, fitted(logit.mod5), g=10) ######################## # 12b qq plot residuals--- looks different from SAS# ######################## # Pearson (adjusted standardized residuals resid <- rstandard(logit.mod5, pearson=TRUE) qqnorm(resid, main="QQ plot of Pearson Residuals", col="blue") ########################### # 12c. ROC and concordance# ########################### # Concordance index Cstat(logit.mod1,resp=hsb$program) Cstat(logit.mod2,resp=hsb$program) Cstat(logit.mod3,resp=hsb$program) Cstat(logit.mod4,resp=hsb$program) Cstat(logit.mod5,resp=hsb$program) # For ROC w/ multivariate logitistic regression roc.data5 <-roc(logit.mod5$y , logit.mod5$fitted.values,ci=T) names(roc.data5) plot((1-roc.data5$specificities),roc.data5$sensitivities, type='l', lwd=2, col="blue", main="ROC for Models 5, 1 and chance", xlab="1-sensitivity (false positives)", ylab="Specificity (true positives)" ) roc.data1 <- roc(logit.mod1$y, logit.mod1$fitted.values,ci=T) lines((1-roc.data1$specificities), roc.data1$sensitivities, col="pink", lwd=2) text(.1,.99,"model 5: c=.79",col="blue") text(.1,.94,"model 1: c=.75",col="pink") text(.1,.89,"chance: c=.50",col="black") lines(c(0,1),c(0,1), lty=2) legend("bottomright", legend=c("math + ses + schyp + ses*schtyp", "math","chance"), col=c("blue","pink","black"), lty=c(1,1,2), lwd=2) ######################################################################### # 13 More predictors and problem of multicolinearity # ######################################################################### summary(logit.mod6 <- glm(program ~ math + ses + sctyp + ses*sctyp + ses*math + math*sctyp + math*ses*sctyp, data=hsb,family = binomial)) # center all variables hsb$cmath <- scale(hsb$math,center=TRUE,scale=FALSE) hsb$cses <- scale(hsb$ses,center=TRUE,scale=FALSE) hsb$sctyp <- as.numeric(hsb$sctyp) hsb$csctyp <- scale(hsb$sctyp,center=TRUE,scale=FALSE) summary(logit.mod7 <- glm(program ~ cmath + cses + csctyp + cses*csctyp + cses*cmath + cmath*csctyp +cmath*cses*csctyp, data=hsb,family = binomial)) summary(logit.mod8 <- glm(program ~ math + ses + sctyp + ses*sctyp + rdg + sci + wrtg + civ, data=hsb,family = binomial)) # Examine correlations among achievement tests var.names <- c("math","rdg","sci","wrtg","civ") var.data <- hsb[var.names] cor.mat <- cor(var.data,method="pearson") round(cor.mat,2) # Note eigen-vectors can be reflected (i.e., multiply by -1) eigen(cor.mat)