#
#  Edps/Psych/Stat  587
#  Spring 2020
#  Carolyn J Anderson
# 
#  All analyses in lecture on Random Intercept & Slope Models in R   (High School & Beyond Ones)
#
#  
#  I relied heavliy on Introduction to Multilevel Modeling in R by Grover, Guillermono and Hudson (2015)
#   downloaded from https://rstudio-pubs-static.s3.amazonaws.com/78961_fe5b5c6a77f446eca899afbb32bd1dc7.html
#
#  For p-value and different degrees of freedom using lmerTest package see 
#       http://mindingthebrain.blogspot.com/2014/02/three-ways-to-get-parameter-specific-p.html
#  Below I use lmerTest, which give you Satterthwaited df and p-values for HLM (these are same 
#    as given by SAS)
#
#
# Load Packages:
#          lme4  ---- fits hlm models (and multilevel logistic)
#          lattice--- multiple plots per page
#          lmrTest--- Satterthwaited df and t-tests for fixed paramters
#          texreg --- for nicer output
#          optimx --- change the optimizer that lmer uses
#

library(lme4)
library(lmerTest)
library(lattice)
library(texreg)   # nice lmer output there are others, e.g., stargazer and sjplot
library(optimx)

# Set working dircetory
#
#setwd('C:/Users/cja/Dropbox/edps587/lectures/4 randomslope')
setwd('D:/Dropbox/edps587/lectures/4 randomslope')

############################################################################
#  Data set up: merge hsb1 and hsb2                                        #
############################################################################
#
# Read in student level data
#
hsb1 <- read.table(file="HSB1data.txt", header=TRUE)

# Top & bottom of the file
#
head(hsb1)
tail(hsb1)


#
# Read in school level data
#
hsb2 <- read.table(file="HSB2data.txt", header=TRUE)

# Top & bottom of the file 
#
head(hsb2)
tail(hsb2)

#                                                               #
# Make long matrix                                              #
#                                                               #   
#  Already Sorted by school id                                  #
#                                                               #
# Merging student & school level files by school id ('id')      #
#                                                               #
hsb <- merge(hsb1,hsb2, by=c('id'))
head(hsb)


#########################################################################
# Some information about the data    (e.g., descriptive statistics      #
#########################################################################

#
# Number of schools
(nschools <- nrow(hsb2))



#
# Overall summary statistics
#
summary(hsb$mathach)
sd(hsb$mathach)

summary(hsb$ses)
sd(hsb$ses)

#
# Summary descriptive statistics by School
#

mmean <- aggregate(mathach~id, data=hsb, "mean")
msd <- aggregate(mathach~id, data=hsb, sd)
mmin <- aggregate(mathach~id, data=hsb, min)
mmax <- aggregate(mathach~id, data=hsb, max)
math.stat <- cbind(mmean,msd[,2], mmin[,2], mmax[,2])

names(math.stat) <- c('Mean Math', 'SD math', 'Min', 'Max')
math.stat


####################################################
# Some graphing                                    #
####################################################

#
# Plot of mean ses by id --> variability between schools
#

plot(hsb$id, hsb$meanses, type='p', pch=21,
     main='HSB: Group Mean SES by School ID',
     xlab='School ID',
     ylab='Group Mean SES')

#
# Plot of student ses by id --> variability within schools
#

plot(hsb$id, hsb$ses, type='p', pch=21,
     main='HSB: Student SES by School ID',
     xlab='School ID',
     ylab='Group Mean SES')

#
#  Just one school
#

school.1224 <- hsb[ which(hsb$id==1224), ]


plot(school.1224$ses, school.1224$mathach, type='p', pch=21,
     main="HSB: Math by SES for School 1224",
     xlab='Student SES',
     ylab='Math Achievement')
reg.1224 <- lm(school.1224$mathach~school.1224$ses, data=school.1224)
abline(reg.1224)

#
# Spanel plot of a random sample of 20
#

# Will need this for graph 

hsb$cses <- hsb$ses - hsb$meanses

# Random sample
groups <- unique(hsb$id)[sample(1:160,20)]

subset <- hsb[hsb$id%in%groups,]

xyplot(mathach ~ ses | id, data=subset, col.line='black', 
                      type=c('p'),
                      main='Varability in Math ~ SES relationship')
  

#
# panel plot of a random sample of 20 with regression lines
#

xyplot(mathach ~ ses | id, data=subset, col.line='black', 
                      type=c('p','r'),
                      main='Varability in Math ~ SES relationship')
 
#
#  Panel Now with school centered ses
#
 
xyplot(mathach ~ cses | id, data=subset, col.line='black', 
                      type=c('p','r'),
                      main='Math Achievement by School Centered SES',
                      xlab='School Centered SES',
                      ylab='Math Achievement Test Scores')
  


# 
# All regression lines in one plot
#
#
# page 3 Plot all regressions in one plot
#
# need school ID equal to 1 - 160
sch_id <- seq(1:nschools)
id <- unique(hsb$id)
sch_info <- as.data.frame(cbind(id,sch_id))
hsb <- merge(hsb,sch_info,by=c("id"))

# First make frame for plot
plot(hsb$ses,hsb$mathach,type = 'n', 
     ylim = c(-10, 30),
     xlim = c(-3, 3),
     cex.main = 1.5,
     xlab = 'SES', 
     ylab = "Math Scores",
     main = "Separate Regression for Each School "
     )

# now fit and add regression lines
for(i in 1:nschools){
  sub <- hsb[which(hsb$sch_id==i),]
  fitted <- fitted(lm(mathach~ses,data=sub))
  lines(sub$ses,fitted,col=i)
}

######################################################
# ~ page 45  Graph of estimated variances of random  # 
# intercept and intercept & slope                    #
######################################################
x <- seq(from=-2, to=2, by=.1)

total<- 8 + 2*(-0.5)*x + (1.5)*(x**2) + 30

plot(x,total,type = 'l', 
     cex.main = 1.5,
     col='blue',
     xlim=c(-2,2),
     xlab = 'Value of x',  
     ylab = "Estimated Variance of x",
     main = "Variance Function: Illustration of Heteroscedasticity"
     )



###########################################################   
# Create and add mean meanminority & meanfemale to data   #
###########################################################

# The data file already has mean SES but need mean center ses
hsb$ses.centered <- hsb$ses - hsb$meanses

meanminority  <- aggregate(minority ~ id , data=hsb, mean)
names(meanminority) <- c('id', 'meanminority')

hsb <- merge(hsb,meanminority, by =c('id'))



meanfemale  <- aggregate(female ~ id , data=hsb, mean)
names(meanfemale) <- c('id', 'meanfemale')

hsb <- merge(hsb,meanfemale, by =c('id'))


# If you want to save data so that you don't got through steps
#   to create is again, you can write the results to a file.
#   There are other file formats that you can use (e.g., csv)
#
#write.table(hsb, 'hsb.txt', row.names=F, na='.')



######################################################################
# Model fitting                                                      
#
# Note that "Std.Dev." in the Random effects table produced by lmer 
#   is NOT the standard error of the variance estmates.  The latter 
#   is not computed in lmer but it is computed in SAS.  These is some 
#   controversy over whether se of variance estimates should even be
#   computed. We do not use them, so not problem.
######################################################################

#
# Random inercept / empty HLM / Null model / RANDOM effects ANOVA
#
#

summary(model0 <- lmer(mathach ~ 1 + (1 | id), data=hsb, REML=FALSE))

screenreg(list(model0))


# Cut and paste
icc <- 8.553/(8.553+39.148)
icc

# Or extract the information to compute

vars <- as.data.frame(VarCorr(model0))[4]
total <- sum(vars)
tau00 <- vars[1,1]
icc <- tau00/total
icc

# Use the following function that I wrote:

icc.lmer <- function(modl){
  vars <- as.data.frame(VarCorr(modl))[4]
  total <- sum(vars)
  tau00 <- vars[1,1]
  icc <- tau00/total
  return(icc)
  }

# example of how to use this
icc.lmer(model0)


#
#  Random intercept and slope for centered ses
#

summary(model1 <- lmer(mathach ~ 1 + ses.centered + (1 | id), data=hsb, REML=FALSE))

icc.lmer(model1)

# for easier comparison
screenreg(list(model0,model1),single.row=TRUE,custom.model.names=c("Null","RI cses"))

# for latex table
texreg(list(model1),single.row=TRUE,custom.model.names=c("RI cses"))

#
#  Random intercept and  slope & meanses for ses
#
summary(model2 <- lmer(mathach ~ 1 + ses.centered  + (1 + ses.centered| id), data=hsb, REML=FALSE))
as.data.frame(VarCorr(model2))[4]
deviance(model2)

# for easier comparison
screenreg(list(model0,model1,model2),single.row=TRUE,custom.model.names=c("Null","RI", "RI + S"))

# and for me, latex code
texreg(list(model0,model1,model2),single.row=TRUE,custom.model.names=c("Null","RI","RI and S"))

############################
# ~ page 20 plot           #
############################
par(mfrow=c(1,1)) 

fixed <- fixef(model2)
mu2 <- fixed[1] + fixed[2]*hsb$ses.centered 

plot(hsb$ses.centered, mu2, type='l',
     col='black',
     xlim=c(-2, 2),
     ylim=c(0, 25),
     xlab='School Mean Centered SES',
     ylab='Fitted Math Scores',
     main='HSB: Population Average Regressions (Model 2)'
)

################################################################
# ~ page 21, Sample of fitted school specific regression lines #
################################################################

random <- ranef(model2, level=0:2)
U0j <- random$id[,1]
U1j <- random$id[,2]

plot(hsb$mathach ~ hsb$ses.centered, type = 'n', 
     cex.main = 1.5,
     xlab = 'School Mean Centered SES',  
     ylab = "Fitted Math Scores",
     main = "HSB:  Fitted School Regression Lines (private,white)"
     )
	 
# draw in scho0l specific lines:  Private, white
for (i in 1:40){
    abline(a=(fixed[1]+ U0j[i]), b=(fixed[2]+U1j[i]),  col="blue")
  }

##############################################
# ~ p 22 Side by Side Comparison             #
##############################################
par(mfrow=c(1,2))
plot(hsb$ses.centered, mu2, type='l',
     col='black',
     xlim=c(-2, 2),
     ylim=c(0, 25),
     xlab='School Mean Centered SES',
     ylab='Fitted Math Scores',
     main='HSB: Population Average Regressions (Model 2)'
)

plot(hsb$mathach ~ hsb$ses.centered, type = 'n', 
     cex.main = 1.5,
     xlab = 'School Mean Centered SES',  
     ylab = "Fitted Math Scores",
     main = "HSB:  Fitted School Regression Lines"
     )
# draw in scho0l specific lines:  Private, white
for (i in 1:40){
    abline(a=(fixed[1]+ U0j[i]), b=(fixed[2]+U1j[i]),  col='blue')
  }


########################################################
#  ~ Page 23, estimated variance functions for 2 models#
########################################################
par(mfrow=c(1,1))
ses2 <- seq(from=-2, to= 3, by=.1)
vars.ri <- as.data.frame(VarCorr(model1))[4]
total.ri <- rep(sum(vars.ri), 61)

vars.ris <- as.data.frame(VarCorr(model2))[4]
total.ris<- vars.ris[1,] + 2*vars.ris[3,]*ses2 + vars.ris[2,]*(ses2**2) + vars.ris[4,]

plot(ses2,total.ris,type = 'l', 
     cex.main = 1.5,
     col='blue',
     xlim=c(-2,3),
     ylim=c(45,52),
     xlab = 'School Mean Centered SES',  
     ylab = "Estimated Variance of Math Scores",
     main = "HSB: Variance Function"
     )
# random inercept
     abline(a=sum(vars.ri), b=0, col='red')

legend(-2,52, 
       c('Random Intercept & Slope','Random Intercept'),
       lty=c(1,1),
       lwd=c(2.5,2.5),
       col=c('blue','red'))



#################################################################
#  Discrete Level 1 predictors                                  #
#################################################################

# Note female and minority are dummy coded

model.d1 <- lmer(mathach ~ 1 + ses.centered  + female + minority+ (1 + ses.centered| id), data=hsb, REML=FALSE)
summary(model.d1)
as.data.frame(VarCorr(model.d1))[4]
n2lnlike <- -2* logLik(model.d1)
n2lnlike


screenreg(list(model1,model2,model.d1),custom.model.names=c("RI","RI and RS","Discrete fixed"))

# Plot of population average model

fixed <- fixef(model.d1)
fem_mi <- fixed[1] + fixed[2]*hsb$ses.centered + fixed[3] + fixed[4]

plot(hsb$ses.centered, fem_mi, type='l',
     col='black',
     ylim = c(0,25),
     xlim = c(-2,2),
     xlab='School Centered SES',
     ylab='Fitted Math Scores',
     main='Population Average Regressions (discrete fixed)'
)
abline(a=(fixed[1] + fixed[3]), b=fixed[2], col='red')
abline(a=(fixed[1] + fixed[4]), b=fixed[2], col='blue')
abline(a=(fixed[1]), b=fixed[2], col='green')
legend(-2,25,
       c('Female, Minority', 'Female, non-Minority', 'Male, Minority', 'Male, non-Minority'),
       lty=c(1,1,1,1),
       col=c('black', 'red', 'blue', 'green')
    )

#
# Plot of variance function
#
ses2 <- seq(from=-2, to= 3, by=.1)
vars.d1 <- as.data.frame(VarCorr(model.d1))[4]
total.d1<- vars.d1[1,] + 2*vars.d1[3,]*ses2 + vars.d1[2,]*(ses2**2) + vars.d1[4,]

plot(ses2,total.d1,type = 'l', 
     cex.main = 1.5,
     col='blue',
     xlim=c(-2,3),
     xlab = 'School Mean Centered SES',  
     ylab = "Estimated Variance of Math Scores",
     main = "HSB: Variance Function (discrete fixed)"
     )

#####################################################
# ~ page 66, models fit to data                     #              
#   Showing effect of random categorical predictors #
#####################################################
hsb$gender <- as.factor(hsb$female)
hsb$race   <- as.factor(hsb$minority)
hsb$boy <- as.factor(-1*(hsb$female-1))
hsb$male <- -1*(hsb$female-1)

summary(model.00 <- lmer(mathach ~ 1 + ses.centered  + gender + race + (1 + ses.centered| id), data=hsb, REML=FALSE))
as.data.frame(VarCorr(model.00))[4]


# Model b1 
summary(model.b1 <- lmer(mathach ~ 1 + ses.centered  + female + race + (1 + ses.centered + female| id), data=hsb, REML=FALSE))
as.data.frame(VarCorr(model.b1))[4]

# Alternative Model b1 
summary(model.ab1 <- lmer(mathach ~ 1 + ses.centered  + gender + race + (1 + ses.centered + gender| id), data=hsb, REML=FALSE))
as.data.frame(VarCorr(model.ab1))[4]

# Model b2
summary(model.b2 <- lmer(mathach ~ 1 + ses.centered  + hsb$male + race + (1 + ses.centered + hsb$male| id), data=hsb, REML=FALSE))
as.data.frame(VarCorr(model.b2))[4]

# Model c.
summary(model.c <- lmer(mathach ~ 1 + ses.centered  + hsb$male + race + (0 + ses.centered + male + female + minority| id), data=hsb, REML=FALSE))
as.data.frame(VarCorr(model.b2))[4]


# Model: random slopes for all--- THIS FAILS as it SHOULD
summary(model.all <- lmer(mathach ~ 1 + ses.centered  + female + minority + (1 + ses.centered + female + minority | id), data=hsb, REML=FALSE))


# Model c + minority --- FAILS IN R BUT NOT SAS
summary(model.c <- lmer(mathach ~ 1 + ses.centered  + female + minority + (0 + ses.centered  + female + race| id), data=hsb, REML=FALSE))


###################################################################################### 
# You're on your own for the figures --- cut, paste & edit code for similar figures  #
######################################################################################


#########################################################
# p118ish Cross level interactions                      #
#########################################################
summary(model.3 <- lmer(mathach ~ 1 + ses.centered  + meanses + ses.centered*meanses + (1 + ses.centered| id), data=hsb, REML=FALSE))
as.data.frame(VarCorr(model.3))[4]

# drop non-significant interactions
summary(model.4 <- lmer(mathach ~ 1 + ses.centered + meanses  + (1 + ses.centered| id), data=hsb, REML=FALSE))

#
# p127 Big model  --- R starts to get slow, can switch to micro-soft (free) R which is about 4 times faster.
#
#
# this fails
summary(model.5 <- lmer(mathach ~ 1 + ses.centered  + female + minority + meanses  + size + sector 
               + pracad + disclim + himinty +(1 + ses.centered + female + minority| id), data=hsb, REML=FALSE))
#
# However rescale school size which has a very different scale
#
hsb$xsize <- scale(hsb$size,center=FALSE,scale=TRUE)
summary(model.5 <- lmer(mathach ~ 1 + ses.centered  + female + minority + meanses  + xsize + sector 
               + pracad + disclim + himinty +(1 + ses.centered + female + minority| id), data=hsb, REML=FALSE))
as.data.frame(VarCorr(model.5))[4]

#
# Other "tricks" to get models to converge is topic for another lecture,
# but I did need an additional one to get model 6 to converge.

#
# p127 Bigger model  --- add sector as cross-level (i.e. predictor at level 2) & drop random gender.
#                        Do see some differences in estimates of tau relative to SAS.
#
# Needed to center himinity and change optimzier to get convergence
#
hsb$xhiminty <- scale(hsb$himinty,center=FALSE,scale=TRUE)
summary(model.6 <- lmer(mathach ~ 1 + ses.centered  + female + minority + meanses  + xsize + sector 
               + pracad + disclim + xhiminty + ses.centered*sector + minority*sector 
               + (1 + ses.centered + minority| id), data=hsb, REML=FALSE,
                  control = lmerControl(optimizer ="Nelder_Mead") ))
as.data.frame(VarCorr(model.6))[4]


#
# p127 A bit simpler model  --- only random intercept
#

summary(model.7 <- lmer(mathach ~ 1 + ses.centered  + female + minority + meanses  + xsize + sector 
               + pracad + disclim  + ses.centered*sector + minority*sector 
               + (1| id), data=hsb, REML=FALSE))
as.data.frame(VarCorr(model.7))[4]