# Edps 589
# Fall 2019
# C.J.Anderson
#
#  HSB:  Multiple regression of program type = school + type + ses
#
#  1. Set up and read in data (long format, one line per case)
#  2. Create data frame for 3-way table
#  3 Fit logit models to 3-way table
#    --  no effect of ses or sctyp on p(academic)
#    --  additive effects logit (or homoegeneous association)
#  4. Logit models fit as poisson log-linear regression models
#     -- using glm, which only gives G2
#     -- using loglm, which gives both pearson's X2 and G2
#  5. Change data to a wide format
#  6. Compute X2 and G2 using their definitions
#  7. Identification constraints
#     a. alternative dummy codes
#     b. effect codes
#  8. Same odds ratio regardless of ID constraints
#  9. Is SES ordinal or nominal?
# 10. More odds ratios 
# 11. Model individual level:  math, ses and schooltyp
#     a. program ~ math
#     b. program ~ math + ses (factor)
#     c. program ~ math + ses (numeric)
#     d. program ~ math + ses (numeric) + schtyp
#     e. program ~ math + ses (numeric) + schtyp + ses*schtyp
# 12. Regression Diagnostics on model 11e.
#     a. Hosmer-Lemshow
#     b. QQ plot of residuals
#     c. ROC and concordance -- need to use trick for multiple logistic regression
# 13. More predictors 
##

library(vcd)
library(ResourceSelection)
library(pROC)               
library(DescTools)

setwd("C:\\Users\\cja\\Documents\\Dropbox\\edps 589\\5 logistic regression")
#setwd("C:\\Users\\cja\\Dropbox\\edps 589\\5 logistic regression")
#setwd("D:\\Dropbox\\edps 589\\5 logistic regression")

#####################################################
# 1. Read in data -- long format                    #
#####################################################

hsb <- read.csv(file="hsb_data.csv", header=TRUE, sep=",")
names(hsb)
head(hsb)

#####################################################
# 2. Create data frame for 3-way table              #
#####################################################

########################################## 
# Long table                             #
##########################################
var.levels <- expand.grid(academic=c("no", "yes"),
                          sctyp=c("public","private"),
			  ses = c("low","middle","high"))
( hsb3way <- data.frame(var.levels, count=c(91, 40,  4,  4, 138, 111,
                                            14, 36, 44, 82,   1,  35 )))				  
##########################################
# Wide table (see below for alternative) # 
##########################################
var.levels2 <- expand.grid(sctyp=c("public","private"),
			  ses = c("low","middle","high"))
hsb3way.no <- data.frame(var.levels2, no=c(40, 4, 111,  36, 82,   35 ))	 					  
hsb3way.yes <- data.frame(var.levels2, yes=c(91,4,136,14,44,1))					  
hsb3way.wide <- merge(hsb3way.yes,hsb3way.no, by=c("sctyp","ses"))
hsb3way.wide$ncases <- hsb3way.wide$yes + hsb3way.wide$no
hsb3way.wide

######################################################
# 3. Some logit models                               #
######################################################											
# logit --- (joint) independence
summary(  i.mod <- glm(yes/ncases ~ 1 , weights=ncases, data=hsb3way.wide,family=binomial)  )

# logit ---  homogeneous association
summary(  h.logit <- glm(yes/ncases ~ sctyp + ses , weights=ncases, data=hsb3way.wide,family=binomial)  )
X2 <- sum(residuals(h.logit,type="pearson")^2)


#######################################################
# 4. Logit models as poisson regression models        #
#######################################################
# As a poisson --- one way to get  X2 goodness-of-fit statistics
summary( h.poi <- glm(count ~ sctyp + academic + ses + sctyp*academic + sctyp*ses + ses*academic, data=hsb3way,family=poisson) )
X2 <- sum(residuals(h.poi,type="pearson")^2)

# This will give both G2 and X2
library(MASS)
summary( h.poi2 <- loglm(count ~ sctyp + academic + ses + sctyp*academic + sctyp*ses + ses*academic, data=hsb3way) )

######################################################
# 5. Change format of data to a wide format          #
#     Alternate way to get wide format               #
######################################################

yes <- hsb3way[ which(hsb3way$academic=="yes"),]
no <- as.data.frame(hsb3way[ which(hsb3way$academic=="no"),4])
names(no) <- c("no")
wide <- cbind(yes,no)
wide$ncases <- wide$count+wide$no

######################################################
# 6. Computing X2 and G2 by using their definitions  #
######################################################

logit.bin <- glm(academic ~ sctyp + ses, family = binomial, data = hsb3way, weight=count)

# There must be an easier way to get G2 and X2
oddnumber <- function(x) x%%2 != 0 
i <- oddnumber(1:12)
wide$pi <- logit.bin$fitted[i]
Xsq.yes <- (wide$count-    wide$pi*wide$ncases)**2/(wide$pi*wide$ncases)
Xsq.no <-  (wide$no   - (1-wide$pi)*wide$ncases)**2/((1-wide$pi)*wide$ncases)
(Xsq <- sum(Xsq.yes + Xsq.no))

Gsq.yes <- 2*wide$count *log(wide$count/(wide$ncases*wide$pi))
Gsq.no <- 2*wide$no * log(wide$no/(wide$ncases*(1-wide$pi)))
( Gsq <- sum(Gsq.yes+Gsq.no) )

##################################################################
# 7. Identification Constraints                                  #
#  --- to see why parameter estimates are different from SAS,    # 
#      see page 15 or so of lecture notes.  It all has to do     #
#      with ID constraints.  The interpretation doesn't change.  # 
#      Below I re-fit model with alternative codings             #
##################################################################

#############################     
# 7.a Alternate dummy codes #
#############################
#  Change ID constraints to match SAS: 

# -- change how Dummy codes are defined
hsb3way$Dsctyp <- ifelse(hsb3way$sctyp=="public",1,0)
hsb3way$Dlow   <- ifelse(hsb3way$ses=="low",1,0)
hsb3way$Dmid   <- ifelse(hsb3way$ses=="middle",1,0)

summary( logit.bin2 <- glm(academic ~  Dlow + Dmid + Dsctyp, family = binomial, data = hsb3way, weight=count) )

#############################  
# 7.b  Using Effect codes   #
#############################
hsb3way$Esctyp <- ifelse(hsb3way$sctyp=="public",1,-1)
hsb3way$Elow   <- ifelse(hsb3way$ses=="low",1,0)
hsb3way$Elow   <- ifelse(hsb3way$ses=="high",-1,hsb3way$Elow)
hsb3way$Emid   <- ifelse(hsb3way$ses=="middle",1,0)
hsb3way$Emid   <- ifelse(hsb3way$ses=="high",-1,hsb3way$Emid)

summary( logit.bin3 <- glm(academic ~  Elow + Emid + Esctyp, family = binomial, data = hsb3way, weight=count) )

##############################################################
# 8. Same odds ratios regardless of ID constraints           #
##############################################################
# --- holding sctype constant:  public vs private odds  ratios

(odds.dummy1 <- exp(0 - logit.bin$coefficients[4]))

(odds.dummy2 <-  exp(logit.bin2$coefficients[4]- 0))
 
(odds.effect3 <-  exp(logit.bin3$coefficients[4]+logit.bin3$coefficients[4]))
   
# -- try to get the rest that are in the notes
   
##################################################################
# 9. SAS as nominal vs ordinal variable                          #
##################################################################

# --  Create scores for ses
hsb3way$xses  <- as.numeric(hsb3way$ses)

# -- ses as factor
summary( logit.factor <- glm(academic ~  ses + sctyp, family = binomial, data = hsb3way, weight=count) )

# -- note:  -"residual deviance"/2 = -2log(like)

# -- ses as numeric
summary( logit.numeric <- glm(academic ~  xses + sctyp, family = binomial, data = hsb3way, weight=count) )

# -- compute goodness-of-fit statistics for logit.numeric
wide$pi.numeric <- logit.numeric$fitted[i]
Xsq.yes <- (wide$count -    wide$pi.numeric*wide$ncases)**2/(wide$pi.numeric*wide$ncases)
Xsq.no <-  (wide$no   - (1-wide$pi.numeric)*wide$ncases)**2/((1-wide$pi.numeric)*wide$ncases)
(Xsq <- sum(Xsq.yes + Xsq.no))

Gsq.yes <- 2*wide$count *log(wide$count/(wide$ncases*wide$pi.numeric))
Gsq.no <- 2*wide$no * log(wide$no/(wide$ncases*(1-wide$pi.numeric)))
( Gsq <- sum(Gsq.yes+Gsq.no) )

# -- LR test of restriction on values for Ses
anova(logit.numeric, logit.factor)

##############################################################
# 10. Computing odds ratios using model with numerical SES    #
##############################################################
# Note
logit.numeric$coefficients

# -- Holding schtyp constant, odds of academic are 
exp(logit.numeric$coefficients[2])
# the odds for an increase of SES by 1 unit

# -- odds academic of low vs high ses
exp((3-1)*logit.numeric$coefficients[2])

# -- try to get the others

	   
#############################################################
# 11. Individual level data: math, ses and school type      #
#############################################################	 
#######################
# 11.a program ~ math #
#######################
summary(logit.mod1 <- glm(program ~ math, data=hsb,family = binomial)) 
j <- order(hsb$math)	                                
plot(hsb$math[j],logit.mod1$fitted[j], 
    type='l',
	lwd=2,
	ylim=c(0,1),
	col="blue",
	main="Model 1: program ~ math")
######################
# 11.b ses as nominal#
######################
hsb$ses <- as.factor(hsb$ses)
summary(logit.mod2 <- glm(program ~ math + ses, data=hsb,family = binomial)) 

# - subset for graphing
hsb$fitted2 <- logit.mod2$fitted
low <- hsb[ which(hsb$ses=="1"),]
mid <- hsb[ which(hsb$ses=="2"),]
hi  <- hsb[ which(hsb$ses=="3"),]
j <- order(low$math)	                                
plot(low$math[j],low$fitted2[j], 
    type='l',
	lwd=2,
	ylim=c(0,1),
	xlim=c(30,80),
	col="blue",
	main="Model 2: program ~ math + ses (as factor) "
	)
j <- order(mid$math)	                                
lines(mid$math[j],mid$fitted2[j],col="green",lwd=2)
j <- order(hi$math)	                                
lines(hi$math[j],hi$fitted2[j],col="red",lwd=2)
legend("topleft",legend=c("high","middle","low"), 
       col=c("red","green","blue"), lty=1,lwd=2, cex=1)
	   
####################
# 11.c ordinal ses #
####################   
hsb$ses <- as.numeric(hsb$ses)
summary(logit.mod3 <- glm(program ~ math + ses, data=hsb,family = binomial)) 

# - subset for graphing
hsb$fitted3 <- logit.mod3$fitted
low <- hsb[ which(hsb$ses=="1"),]
mid <- hsb[ which(hsb$ses=="2"),]
hi  <- hsb[ which(hsb$ses=="3"),]
j <- order(low$math)	                                
plot(low$math[j],low$fitted3[j], 
    type='l',
	lwd=2,
	ylim=c(0,1),
	xlim=c(30,80),
	col="blue",
	main="Model 3: program ~ math + ses (numeric) "
	)
j <- order(mid$math)	                                
lines(mid$math[j],mid$fitted3[j],col="green",lwd=2)
j <- order(hi$math)	                                
lines(hi$math[j],hi$fitted3[j],col="red",lwd=2)
legend("topleft",legend=c("high","middle","low"), 
       col=c("red","green","blue"), lty=1,lwd=2, cex=1)
	   
####################
# 11.d + schtyp    #
####################   
hsb$sctyp <- as.factor(hsb$sctyp)
summary(logit.mod4 <- glm(program ~ math + ses + sctyp, data=hsb,family = binomial)) 

# - subset for graphing
hsb$fitted4 <- logit.mod4$fitted
low.public  <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="1")),]
low.private <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="2")),]
mid.public  <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="1")),]
mid.private <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="2")),]
hi.public   <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="1")),]
hi.private  <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="2")),]
j <- order(low.public$math)	                               
plot(low.public$math[j],low.public$fitted4[j], 
    type='l',
	lwd=2,
	ylim=c(0,1),
	xlim=c(30,80),
	col="blue",
	main="Model 4: program ~ math + ses (numeric) + school type  "
	)
j <- order(low.private$math)	                                
lines(low.private$math[j],low.private$fitted4[j],col="darkblue",lwd=2)
j <- order(mid.public$math)	                                
lines(mid.public$math[j],mid.public$fitted4[j],col="green",lwd=2)
j <- order(mid.private$math)	                                
lines(mid.private$math[j],mid.private$fitted4[j],col="darkgreen",lwd=2)
j <- order(hi.public$math)	                                
lines(hi.public$math[j],hi.public$fitted4[j],col="red",lwd=2)
j <- order(hi.private$math)	                                
lines(hi.private$math[j],hi.private$fitted4[j],col="darkred",lwd=2)
legend("bottomright",legend=c("private high SES", "private middle SES","private low SES","public high SES", "public middle SES", "public low SES "), 
       col=c("darkred","darkgreen","darkblue","red","green","blue"), lty=1,lwd=2, cex=1)
	   

####################
# 11.d + schtyp    #
####################   
hsb$sctyp <- as.factor(hsb$sctyp)
summary(logit.mod5 <- glm(program ~ math + ses + sctyp + ses*sctyp, data=hsb,family = binomial)) 

# - subset for graphing
hsb$fitted5 <- logit.mod5$fitted
low.public  <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="1")),]
low.private <- hsb[ (which(hsb$ses==1 & hsb$sctyp=="2")),]
mid.public  <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="1")),]
mid.private <- hsb[ (which(hsb$ses==2 & hsb$sctyp=="2")),]
hi.public   <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="1")),]
hi.private  <- hsb[ (which(hsb$ses==3 & hsb$sctyp=="2")),]
j <- order(low.public$math)	                               
plot(low.public$math[j],low.public$fitted5[j], 
    type='l',
	lwd=2,
	ylim=c(0,1),
	xlim=c(30,80),
	col="blue",
	main="Model 5: program ~ math + ses + schyp + ses*schtyp "
	)
j <- order(low.private$math)	                                
lines(low.private$math[j],low.private$fitted5[j],col="darkblue",lwd=2)
j <- order(mid.public$math)	                                
lines(mid.public$math[j],mid.public$fitted5[j],col="green",lwd=2)
j <- order(mid.private$math)	                                
lines(mid.private$math[j],mid.private$fitted5[j],col="darkgreen",lwd=2)
j <- order(hi.public$math)	                                
lines(hi.public$math[j],hi.public$fitted5[j],col="red",lwd=2)
j <- order(hi.private$math)	                                
lines(hi.private$math[j],hi.private$fitted5[j],col="darkred",lwd=2)

legend("bottomright",legend=c("private high SES", "private middle SES","private low SES","public high SES", "public middle SES", "public low SES "), 
       col=c("darkred","darkgreen","darkblue","red","green","blue"), lty=1,lwd=2, cex=1)
	   
	
###################################################################
# 12 Some diagnostics on model 5                                  #
###################################################################
########################
# 12 a Hosmer-Lemeshow #
########################
hoslem.test(logit.mod5$y, fitted(logit.mod5), g=10)

########################
# 12b qq plot residuals--- looks different from SAS#
########################
# Pearson (adjusted standardized residuals
resid <- rstandard(logit.mod5, pearson=TRUE)
qqnorm(resid,
       main="QQ plot of  Pearson  Residuals",
	   col="blue")

###########################
# 12c. ROC and concordance#
###########################
# Concordance index
Cstat(logit.mod1,resp=hsb$program)
Cstat(logit.mod2,resp=hsb$program)
Cstat(logit.mod3,resp=hsb$program)
Cstat(logit.mod4,resp=hsb$program)
Cstat(logit.mod5,resp=hsb$program)


# For ROC w/ multivariate logitistic regression
roc.data5 <-roc(logit.mod5$y , logit.mod5$fitted.values,ci=T)
names(roc.data5)
plot((1-roc.data5$specificities),roc.data5$sensitivities,
      type='l',
	  lwd=2,
	  col="blue",
     main="ROC for Models 5, 1 and chance",
	 xlab="1-sensitivity (false positives)",
	 ylab="Specificity (true positives)"
	 )
roc.data1 <- roc(logit.mod1$y, logit.mod1$fitted.values,ci=T)
lines((1-roc.data1$specificities), roc.data1$sensitivities, col="pink", lwd=2)
text(.1,.99,"model 5: c=.79",col="blue")
text(.1,.94,"model 1: c=.75",col="pink")
text(.1,.89,"chance:  c=.50",col="black")
lines(c(0,1),c(0,1), lty=2)
legend("bottomright",
        legend=c("math + ses + schyp + ses*schtyp", "math","chance"),
		col=c("blue","pink","black"), lty=c(1,1,2), lwd=2)

#########################################################################
# 13 More predictors and problem of multicolinearity                    #
#########################################################################
summary(logit.mod6  <- glm(program ~ math + ses + sctyp + ses*sctyp + ses*math + math*sctyp + math*ses*sctyp, data=hsb,family = binomial)) 

# center all variables
hsb$cmath <- scale(hsb$math,center=TRUE,scale=FALSE)
hsb$cses <- scale(hsb$ses,center=TRUE,scale=FALSE)
hsb$sctyp <- as.numeric(hsb$sctyp)
hsb$csctyp <- scale(hsb$sctyp,center=TRUE,scale=FALSE)

summary(logit.mod7 <- glm(program ~ cmath + cses + csctyp + cses*csctyp + cses*cmath + cmath*csctyp +cmath*cses*csctyp, data=hsb,family = binomial)) 

summary(logit.mod8 <-  glm(program ~ math + ses + sctyp + ses*sctyp + rdg + sci + wrtg + civ, data=hsb,family = binomial)) 

# Examine correlations among achievement tests
var.names <- c("math","rdg","sci","wrtg","civ")
var.data <- hsb[var.names]
cor.mat <- cor(var.data,method="pearson")
round(cor.mat,2)

# Note eigen-vectors can be reflected (i.e., multiply by -1)
eigen(cor.mat)