# edps 589
# Fall 2018
# c.j.anderson
#
# baseline multinomial logit:   HSB 
#
#

library(nnet)     # for baseline multinomial logistic regression
library(dplyr)    # for select function
library(tidyr)    # for converting from wide to long format

setwd("C:\\Users\\cja\\Dropbox\\edps 589\\8 Multicategory_logit")
#setwd("D:\\Dropbox\\edps 589\\8 Multicategory_logit")

hsb <- read.table("hsb_data.txt",header=TRUE)

####################################################
# Model 1:  Simple                                 #
####################################################
hsb$hsp <- as.factor(hsb$hsp)

summary( mlogit <- multinom(hsp ~ achieve, data = hsb) )

# Change the reference category to VoTech, which is hsp=3.  
#  This will give same parameters as are in the lecture notes

hsb$hsp <- relevel(hsb$hsp, ref = "3")
summary( mlogit <- multinom(hsp ~ achieve, data = hsb) )

# estimated odds ratios for 1 unit change in achieve:
# -- general vs votech
exp(0.0599778)
# -- academic vs votech
exp(0.16985890)
# -- general vs acahdemic
exp(0.0599778-0.16985890)

# estimated odds ratios for 10 unit change in achievement:
# -- general vs votech
exp(0.0599778*10)
# -- academic vs votech
exp(0.16985890*10)
# -- general vs acahdemic
exp((0.0599778-0.16985890)*10)

names(mlogit)

# same fitted probabilites  
head(mlogit$fitted)
hsb$fitV <- mlogit$fitted[,1]
hsb$fitG <- mlogit$fitted[,2]
hsb$fitA <- mlogit$fitted[,3]


hsb <- hsb[order(hsb$achieve),]
# What probabilities look like	 
plot(hsb$achieve,hsb$fitV, type="l",lwd=2,col="blue",
     ylim=c(0,1),
	 ylab="Fitted Probabilities",
	 xlab="Achievement",
     main="Fitted Probabilities from Baseline Model")
lines(hsb$achieve,hsb$fitG, type="l",lwd=2,col="green")
lines(hsb$achieve,hsb$fitA, type="l",lwd=2,col="red")
legend("topleft",legend=c("P(VoTech)","P(General)", "P(Academic)"),
       col=c("blue","green","red"), lwd=2)
	 
###########################################
# Add SES as nominal and then as ordinal  #
#   explanatory varaible                  #
###########################################

# -- SES nominal
hsb$nses <- as.factor(hsb$ses)
hsb$nses <- relevel(hsb$nses, ref = "3")

summary(mlogit2 <- multinom(hsp ~ achieve + nses, data=hsb))

# --- some differences between this and SAS (even with 
#     same reference categories; however, the deviances
#     between SAS and R match.

# z=sqrt(wald) statistics for parameters
(z <- summary(mlogit2)$coefficients/summary(mlogit2)$standard.errors)
(p <- (1 - pnorm(abs(z), 0, 1)) * 2)


# -- SES ordinal
summary(mlogit3 <- multinom(hsp2 ~ achieve + ses, data=hsb))

# -- NOTE: anova does not work with multinom objects
# -- test of SES nominal vs ordinal
(lrt <- mlogit3$deviance -mlogit2$deviance)
1-pchisq(lrt,2)

# -- test achieve conditional on SES ordinal
(lrt <- mlogit$deviance -mlogit3$deviance)
1-pchisq(lrt,2)

# z=sqrt(wald) statistics for parameters
(z <- summary(mlogit3)$coefficients/summary(mlogit3)$standard.errors)
(p <- (1 - pnorm(abs(z), 0, 1)) * 2)

############################################
# Fit model using long format and glm as a #
# poisson regression                       #
############################################

hsb$id <- as.factor( seq(1:length(hsb$ses) ))
t <- table(hsb$id,hsb$hsp)
hsb.hsp <- as.data.frame(t)
names(hsb.hsp) <- c("id","program","y")

long <- merge(hsb, hsb.hsp, by=c("id"))

long$id <- as.factor(long$id)
long$program <- as.factor(long$program)

# -- take a little longer but works fine
summary(long.mod <- glm(y ~ id + program + program*achieve, data=long, family=poisson))

############################################
# Test whether slopes can be same          #
############################################

## Method 1: create a new variable that equates
##           the slopes for general and votech

# no restrictions
summary( logit.3slopes <- glm(y ~  id + program + program*achieve, data=long, family=poisson))

# restrictions
long$program2 <- ifelse(long$program=="2", 0, 1)
table(long$program2,long$program)

summary( logit.2slopes <- glm(y ~  id + program + program2*achieve, data=long, family=poisson))

anova(logit.2slopes,logit.3slopes,test="LRT")

## Method 2: Sub-set data and just run logistic
##           regression of binary data

hsb.VG <- sample(hsb[(hsb$hsp != "2"),])
summary(VGmod <- glm(hsp ~ achieve,data=hsb.VG,family=binomial))
anova(VGmod,test="LRT")