# edps 589
# Fall 2018
# c.j.anderson
#
# Proportional odds & basline for high school and beyond data
#   o Recoding
#   o Proportional odds model (4 different methods)
#      (note I also did baseline using vglm)
#   o Graphing
#   o Baseline model for comparision
#
#
# for baseline multinomial logistic regression
library(nnet)     
# for proportional odds model
library(MASS)     # plor
library(VGAM)     # vglm
library(rms)      # lrm
library(ordinal)  # clm

library(car)      # Anova   (note captial "A")
library(dplyr)    # for select function


#setwd("C:\\Users\\cja\\Documents\\Dropbox\\edps 589\\8 Multicategory_logit")
setwd("D:\\Dropbox\\edps 589\\8 Multicategory_logit")

hsb <- read.table("hsb_data.txt",header=TRUE)

# Re-code hsp so that it corresponds to what is in the notes:
hsb$hsprog <- 0
hsb$hsprog[hsb$hsp==2] <- 1
hsb$hsprog[hsb$hsp==1] <- 2
hsb$hsprog[hsb$hsp==3] <- 3

# But I would rather the shorted name: hsp
# ...and I need one as factor and other as numeric
#    (which we need, depends on the package)
hsb$hsp <- hsb$hsprog


#####################################
# Proportional odds model           #
# (a) clm from ordinal package      #
# (b) polr from MASS package        #
# (c) lrm from the rms package      #
# (d) vglm from the VGAM package    #
#####################################


######
# (a) clm from ordinal package 
######
# response has to be a factor
hsb$hsp <- as.factor(hsb$hsp)
po <- clm(hsp ~ achieve, data=hsb) 
anova(po)

# test proportional odds assumption (before really looking at model)
#  --- the test statistics is in column label "LRT"
nominal_test(po)

# Not let's look at parameters
summary(po)

# Fitted probabilities for each type of hsp:
hsb$po.fit <- po$fitted

######
# (b) polr from MASS package        
######
# response has to be a factor
hsb$hsp <- as.factor(hsb$hsp)

summary(po.polr <- polr(hsp ~ achieve, data=hsb,Hess=TRUE )

# See what's available (check manual for defintions of these
names(po.polr)


# calculate and store p values
ctable <- coef(summary(po.polr))
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2

# combined table of coefficeints, se, t and pvalues
(ctable <- cbind(ctable, "p value" = p))

# default method gives profiled CIs
(ci <- confint(po.polr, level=.99) 

# odds ratios
exp(coef(po.polr))

fit <- po.polr$fitted.values

######
# (c) lrm from the rms package      #  
######

# I'm not sure how to use this one (yet)

######
# (d) vglm from the VGAM package
####### 
# Note:  response should be numeric (ordered)
summary( po.vglm1 <- vglm(hsprog ~ achieve,family=cumulative(parallel=TRUE), data=hsb) )

summary( po.vglm2 <- vglm(hsprog ~ achieve,family=cumulative(parallel=FALSE), data=hsb) )


# Difference in deviances of these two models (same as given by SAS):
lr <-   1082.413 - 1081.608 
# p for testing proportional odds assumption
1 - pchisq(lr,1)

# Compare with 
summary( multinom.vglm <- vglm(hsprog ~ achieve,family=multinomial, data=hsb) )


############################################################################
# Graphs
############################################################################
# get just the academic
academ <- subset(filter(hsb, hsp==1),select=c("hsp","achieve","po.fit"))
j <- order(academ$achieve)
plot(academ$achieve[j],academ$po.fit[j],
     type='l',  lwd=2,
	 col="red",
	 ylim=c(0,1),
	 ylab="Predicted probability",
	 xlab="Achievement Scores",
	 main="Proportional Odds Model fit to HSB Data"
	 )
general <- subset(filter(hsb, hsp==2),select=c(hsp,achieve,po.fit))
j <- order(general$achieve)
lines(general$achieve[j],general$po.fit[j],col="green", lwd=2)
votech <- subset(filter(hsb, hsp==3),select=c(hsp,achieve,po.fit))
j <- order(votech$achieve)
lines(votech$achieve[j],votech$po.fit[j],col="blue",  lwd=2)
legend("topleft",inset=.02,legend=c("Academic","General","VoTech"), 
       col=c("red","green","blue"), lwd=2, cex=1)

	   
# Fitted cumulative probabilities
ag <- rbind(academ,general)
ag$pAG <- exp(-5.547988 +0.1330308*ag$achieve)/(1+exp(-5.547988 +0.1330308*ag$achieve))
j <- order(academ$achieve)
plot(academ$achieve[j],academ$po.fit[j],
     type='l',  lwd=2,
	 col="blue",
	 ylim=c(0,1),
	 ylab="Predicted Cumulative probability",
	 xlab="Achievement Scores",
	 main="Proportional Odds Model fit to HSB Data"
	 )
j <- order(ag$achieve)
lines(ag$achieve[j],ag$pAG[j],col="red", lwd=2)
legend("topleft",inset=.02,legend=c("P(academic or general)","P(academic)"), 
       col=c("red","blue"), lwd=2, cex=1)

#############################################################
# Baseline                                                  #
#############################################################

# But I would rather the shorted name: hsp
hsb$hsp <- hsb$hsprog

## This is taking votech as baseline
summary(base1 <- multinom(hsp ~ achieve, data = hsb) )
# z=sqrt(wald) statistics for parameters
(z <- summary(base1)$coefficients/summary(base1)$standard.errors)
(p <- (1 - pnorm(abs(z), 0, 1)) * 2)


# estimated odds ratios for 1 unit change in achieve:
# -- general vs votech
exp(0.0599778)
# -- academic vs votech
exp(0.16985890)
# -- general vs acahdemic
exp(0.0599778-0.16985890)

# estimated odds ratios for 10 unit change in achievement:
# -- general vs votech
exp(0.0599778*10)
# -- academic vs votech
exp(0.16985890*10)
# -- general vs academic
exp((0.0599778-0.16985890)*10)