# Edps/Psych/Stat 587
# Computer lab 4 template
# Spring 2019
#
# C.J.Anderson
#
#
#
# Packages:
# lme4
# lmerTest
# lattice
# ggplot2
# stringi (this needs to be installed before HLMdiag or
# HLMdiag won't load properly) 
# texreg
# optimx

# (install) and load packages
library(lme4)
library(lmerTest)
library(lattice)
library(ggplot2)
library(stringi)
library(HLMdiag)
library(texreg)
library(optimx)
#############################################################################
# read in data into R
# the first row of the data matrix is the variable names so there is a header
# change to your path
#############################################################################

lab4<-read.table("D:/Dropbox/edps587/Lab_and_homework/Computer lab4/lab3.txt",header=TRUE)

#look at first few rows of data set
head(lab4)

##################################################
# Set-up -- from before
##################################################

# Use if statment to create "boy"

lab4$boy = ifelse(lab4$gender=="boy", 1, 0)

# Create Third
lab4$third <- ifelse(lab4$grade==3, 1, 0)

# School mean Center math and school mean math

grpMmath <- as.data.frame(aggregate(math~idschool, data=lab4, "mean"))
names(grpMmath) <- c('idschool', 'grpMmath')
lab4 <- merge(lab4,grpMmath, by=c('idschool'))

lab4$grpCmath <- lab4$math - lab4$grpMmath

#
# make school id a factor variable
#
lab4$idschool<-as.factor(lab4$idschool)

#
#  Re-scale for latter use
#
lab4$gcmath <- scale(lab4$grpCmath, center=FALSE, scale=TRUE)
lab4$gmmath <- scale(lab4$grpMmath, center=FALSE, scale=TRUE)


################# # #
# Comand to pause between figures where ENTER
# gives next one.
#
# par(ask=TRUE)
################

####################################################
# Get random sample of 25 schools
####################################################
#
#   Look at individuals within schools
#
####################################################

groups <- unique(lab4$idschool)[sample(1:145,25)]

subset <- lab4[lab4$idschool%in%groups,]

#   Plot 2:  Lattice plot with regressions
xyplot(science ~ math | idschool, data=subset, col.line='black', 
type=c('p','r'), 
main='Variability in Science ~ math relationship')

#   Plot 3:  Lattice plot with smooth
xyplot(science ~ math | idschool, data=subset, col.line='black', 
type=c('p','smooth'), 
main='Variability in Science ~ math relationship')


############################################################
# All regressions in one plot: science ~ math
############################################################
ggplot(lab4, aes(x=math, y=science, col=idschool,type='l'),
       main='Regressions all in one plot')  + geom_smooth(method="lm", se=FALSE ) +  theme(legend.position="none")

	   
y <- aggregate(science~hoursTV, data=lab4, "mean")
y        # take a look at them


plot(y[,1], y[,2], type='b', 
     ylab='Mean Science',
     xlab='Hours Watching TV',
     main='Marginal of Science x HoursTV')


# Linear regressions
ggplot(lab4, aes(x=hoursTV, y=science, col=idschool,type='l'))  + geom_smooth(method="lm", se=FALSE )  +  theme(legend.position="none")

# smooth regressions
ggplot(lab4, aes(x=hoursTV, y=science, col=idschool,type='l'))  + geom_smooth(method="loess", se=FALSE )  +  theme(legend.position="none")

########################################################
# Only have to do this once for the regressions.  
#  This sets up index to identify each school's data.  
########################################################
nclusters <- length(unique(lab4$idschool))   # number of schools
nj <- table(lab4$idschool)   # number per school
nj <- as.data.frame(nj)
names(nj) <- c("idschool","nj")

# For a simple model: science ~ math

ssmodel <- (0)
sstotal <- (0)
R2 <- matrix(99,nrow=nclusters,ncol=2)  # create matrix to hold Rsquares

index <-1
for (i in (unique(lab4$idschool))) {
    sub <- lab4[ which(lab4$idschool==i),]
    model0 <- lm(science ~ math, data=sub)
    a <- anova(model0)                        # need sums of squares
    ssmodel <- ssmodel + a[1,2]               # takes first one fo model
    sstotal <- sstotal + sum(a[,2])           # takes sum of all SS for total
    R2[index,1:2] <- as.numeric(cbind(i,summary(model0)$r.squared))  # collect R2 in object R2
	index <- index+1
}

R2meta <- ssmodel/sstotal

R2 <- as.data.frame(R2)
names(R2) <- c("idschool","R2")
R2.mod1 <- merge(R2,nj,by=c("idschool"))

plot(R2.mod1$nj,R2.mod1$R2,type='p', 
     ylim=c(0,1),
     ylab="R squared",
     xlab="n_j (number of observations per school)"	, 
     main="Science ~ math")      # scatter plot
abline(h=R2meta,col='blue')      # adds horzontal line for R2meta
text(70,0.95,'R2meta=.36',col="blue")       # text of R2meta's value


#
# fit regress using hours as numeric:  sicence ~ math + hoursTV + hourscomputergames
#
ssmodel <- (0)
sstotal <- (0)
R2 <- matrix(99,nrow=nclusters,ncol=2)  # create matrix to hold Rsquares
index <- 1
for (i in (unique(lab4$idschool))) {
    sub <- lab4[ which(lab4$idschool==i),]
    model0 <- lm(science ~ 1 + math + hoursTV +hourscomputergames,data=sub)
    a <- anova(model0)
    ssmodel <- ssmodel + a[1,2] + a[2,2] + a[3,2] 
    sstotal <- sstotal + sum(a[,2])
    R2[index,1:2]<- as.numeric(cbind(i,summary(model0)$r.squared))
	index <- index+1
}

R2meta <- ssmodel/sstotal

R2 <- as.data.frame(R2)
names(R2) <- c("idschool","R2")
R2.mod2 <- merge(R2,nj,by=c("idschool"))
plot(R2.mod2$nj, R2.mod2$R2, type='p',
     ylim=c(0,1),
     ylab="R squared",
     xlab="n_j (number of observations per school)"	, 
     main='Hours TV as numeric Variable')
abline(h=R2meta, col='blue')               # adds line for R2meta
text(70,0.95,'R2meta=.39',col="blue")      # text of R2meta value


#
# makes hours a factor & give new name

lab4$hrsTV <- as.factor(lab4$hoursTV)
lab4$hrscg  <- as.factor(lab4$hourscomputergames)

#
#  Fit regression to each school using hours as factor: science ~ math + hrsTV + hrscg
#
ssmodel <- (0)
sstotal <- (0)
R2 <- matrix(99,nrow=nclusters,ncol=2)  # create matrix to hold Rsquares
index <- 1
for (i in (unique(lab4$idschool))) {
    sub <- lab4[ which(lab4$idschool==i),]
    model0 <- lm(science ~ 1 +  math + hrsTV+ hrscg,data=sub)
    a <- anova(model0)
    ssmodel <- ssmodel + a[1,2] + a[2,2] + a[3,2] 
    sstotal <- sstotal + sum(a[,2])
    R2[index,1:2]<- as.numeric(cbind(i,summary(model0)$r.squared))
	index <- index+1
}

R2meta <- ssmodel/sstotal
R2 <- as.data.frame(R2)
names(R2) <- c("idschool","R2")
R2.mod3 <- merge(R2,nj,by=c("idschool"))
plot(R2.mod3$nj, R2.mod3$R2, type='p',
     ylim=c(0,1),
     ylab="R squared",
     xlab="n_j (number of observations per school)"	, 
     main='Hours TV as categorical Variable')
abline(h=R2meta, col='blue')               # adds line for metaR2
text(70,0.95,'R2meta=.47',col="blue")      # add text for meta R2

#
#  Compare models: hours discrete by hours numeric
#
plot(R2.mod2$R2,R2.mod3$R2, pch=20,cex=1.4,
     ylim=c(0,1),
     xlim=c(0,1),
     ylab='Hours as Discrete',
     xlab='Hours as Numeric',
     main='Hours Discrete vs Numeric')
abline(a=0,b=1, col='blue')                 # adds a reference line


############################
# Diagonstic plots
############################


lab4$hoursTV <- as.numeric(lab4$hoursTV)
lab4$hrsTV <- ifelse(lab4$hoursTV<5,0,1)
table(lab4$hrsTV,lab4$hoursTV)              # to check re-coding

lab4$hourscomputergames <- as.numeric(lab4$hourscomputergames )
lab4$hrscg <- ifelse(lab4$hourscomputergames<5,0,1)
table(lab4$hrscg,lab4$hourscomputergames)   # to check re-coding

modelxx <- lmer(science ~ 1 + gcmath + gmmath + hrsTV + hrscg    + ( 1 + gcmath|idschool), lab4, REML=FALSE,
   control = lmerControl(optimizer ="Nelder_Mead")) 

   
plot(modelxx, xlab='Fitted Conditional', 
              ylab='Pearson Residuals')

fit <- fitted(modelxx)  # get conditional fitted values	

xfit <- seq(-40, 40, length=50)  # sets range & number quantiles
yfit <- dnorm(xfit, mean=0, sd=7.177)      # should be normal
yfit <- yfit*diff(h$mids[1:2])*length(res1)# use mid-points 
lines(xfit, yfit, col='darkblue', lwd=2)   # draws normal
	

mdfit <- mdffits(modelxx,group="idschool")
dotplot_diag(x=mdfit, cutoff="internal", name="mdffits", 
	ylab=("MDFIts"), xlab=("School"))

screenreg(modelxx)