# Example of modeling Bully data from Espelage et al.: 
#    Poisson, Negative Binomial & ZIP models
# Edps 589
# Fall 2018
# C.J.Anderson
#
library(MASS)  # for negative binomial
library(pscl)  # for zero inflated

setwd("D:/Users/cja/Dropbox/edps 589/4 glm")

bully <- read.table("bully_data.txt",header=TRUE)

# Some basic descriptive statistics
(n <- length(bully$nom))
(ybar <- mean(bully$nom))
(s <- sd(bully$nom))
(s2 <- var(bully$nom))
summary(bully$nom)

# Histogram of bully nominations
hist(bully$nom,
     main="Distribution of Number of Bully Nominations",
     xlab="Number of Nominations that a student is Bully",
     breaks=50,
     col="cyan")
text(41.2,200,"N = 291")	 
text(40,190,"Mean = 2.49")	 
text(41,180,"sd = 6.42")
text(41.1,170,"s2 = 41.26")
text(39.1,160,"min = 0")
text(39.1,150,"max = 50")


# Histogram of bully nominations with null poisson overlay
y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)	 
	 
hist(bully$nom, prob=TRUE, col="cyan", breaks=50, 
     main="Bully Nominations w/ Poisson Overlayed",
     xlab="Number of Bully Nominations")     # prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2,  lwd=1.5)                 # add a density estimate 

#
# 1st model is poisson model
#
mod1 <- glm(nom ~ bsc, data=bully, family=poisson)
summary(mod1)
bully$pred1 <- fitted.values(mod1)
pred <- predict(mod1, newdata = NULL,
            type = c("link", "response", "terms"),
            se.fit = TRUE
        )
bully$lower1 <- exp(pred$fit  - 1.96*pred$se.fit)
bully$upper1 <- exp(pred$fit  + 1.96*pred$se.fit)
bully$pred1  <- exp(pred$fit)

bully.sort <- bully[order(bully$bsc),]
plot(bully.sort$bsc,bully.sort$nom,type='p',pch=20, col="blue",
     main='Data, Fitted Line, and 95% Confidence', 
     xlab='Bully Scale Score', ylab='Fitted Count')
lines(bully.sort$bsc,bully.sort$pred1,col="darkgreen",lty=1,lwd=2)
lines(bully.sort$bsc,bully.sort$lower1,col="darkgreen",lty=2)
lines(bully.sort$bsc,bully.sort$upper1,col="darkgreen",lty=2)

##############################################
# Other models       -- digression           #
##############################################
# identity link in poisson
summary(mod.Pidentity <- glm(nom ~ bsc, data=bully, family=poisson("identity")))
mod.Pidentity$aic

# identity link in negative binomial
summary(nb.indentity <- glm.nb(nom ~ bsc, data=bully, link=identity))
nb.indentity$aic
##############################################

# But the line is for fitted mean..so another way to look at the
# goodness of fit is to group data based on bully scale score
# and re-fit the model, etc.    
# 
# This is ONLY for assessing fit (fit to individual level
#  is what should be report and interpreted
#

# Group data and get a look-see at how good it may be fitting the data
bully$grp <- cut(bully$bsc, quantile(bully$bsc, c(0, 1/7, 2/7, 3/7, 4/7, 5/7, 6/7, 1))) 	
grp.bsc <- aggregate(bully$bsc,by=list(bgrp=bully$grp),FUN=mean)
grp.nom <- aggregate(bully$nom,by=list(bgrp=bully$grp),FUN=mean)
grpdata <- cbind(grp.bsc,grp.nom)
grpdata <- as.data.frame(grpdata)
names(grpdata) <- c("grp","mbsc","grp2","mnom")
plot(grpdata$mbsc,grpdata$snum)


mod1b <- glm(mnom ~ mbsc, data=grpdata, family=poisson)
summary(mod1b)
#-- test of fit
1 - pchisq(mod1b$deviance,mod1b$df.residual)   # look good

#  S3 method for class 'glm'
pred <- predict(mod1b, newdata = NULL,
            type = c("link", "response", "terms"),
            se.fit = TRUE
        )
### this looks difference from SAS but numerical results are 
### spot on (i.e. equal for all quanties so ...

lower <- exp(pred$fit  - 1.96*pred$se.fit)
upper <- exp(pred$fit  + 1.96*pred$se.fit)
fit   <- exp(pred$fit)

plot(grpdata$mbsc,grpdata$mnom, 
     pch=20, col="blue", 
     main='Grouped: Means bully scale by Nominations (95% Conifence)',
     xlab='mean bully scale score',
     ylab='observed and fitted number nominations',
     ylim=c(0,35),
     xlim=c(1,4) )
lines(grpdata$mbsc,fit,col='darkgreen',lty=1,lwd=2)
lines(grpdata$mbsc,lower,col='darkgreen',lty=2)
lines(grpdata$mbsc,upper,col='darkgreen',lty=2)





#
# Fitted Marginal distribution (back to individual level)
#
ss <- seq(1:length(bully$bsc))
yvalues <- seq(1:51)
fitprob <- matrix(99,nrow=291,ncol=51)
for (i in ss) {
  for (yindex in yvalues) { 
    yobs <- yindex-1
    mu <- exp(-0.66308 + 0.81434*bully$bsc[i] )
    fitprob[i,yindex] = ( exp(-mu) * mu**yobs)/(factorial(yobs))
  }
}
# summing over y values (colums) should all equal 1
rowSums(fitprob)

# mean over kids (rows) is what I want to plot
marginals <- colSums(fitprob)/length(bully$bsc)

# Histogram of bully nominations with null poisson overlay
y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)	 
	 
hist(bully$nom, prob=TRUE, col="cyan", breaks=50, 
     main="Bully Nominations with Poisson Overlayed",
     xlab="Number of Bully Nominations")     # prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2,  lwd=1)                 # add a density estimate 
lines(y,marginals, col="black", lty=1, lwd=2)              # mod 1, poisson with predictor
text(30,0.68,"Data",col="cyan")
text(36,0.65,"Poisson no predcitors",col="darkgreen")
text(39.4,0.62,"Poisson nom ~ bullyscale score",col="black")


######################## Negative Binomial ######################
library(MASS)

# Fit model to individual level data
mod2 <- glm.nb(nom ~ bsc, data=bully)  # dispersion phi = 1/"disperson from R"
summary(mod2)

# Using my grouped data
mod2b <- glm.nb(mnom ~ mbsc, data=grpdata)
summary(mod2b)
#-- test of fit
1 - pchisq(mod2b$deviance,mod2b$df.residual)   # looks good

#  S3 method for class 'glm'
pred <- predict(mod2b, newdata = NULL,
            type = c("link", "response", "terms"),
            se.fit = TRUE         )

lower <- exp(pred$fit  - 1.96*pred$se.fit)
upper <- exp(pred$fit  + 1.96*pred$se.fit)
fit   <- exp(pred$fit)

plot(grpdata$mbsc,grpdata$mnom, 
     pch=20, col="blue", 
     main='NegBin to Grouped: Means bully scale by Nominations (95%)',
     xlab='mean bully scale score',
     ylab='observed and fitted number nominations',
     ylim=c(0,35),
     xlim=c(1,4) )
lines(grpdata$mbsc,fit,col='darkgreen',lty=1,lwd=2)
lines(grpdata$mbsc,lower,col='darkgreen',lty=2)
lines(grpdata$mbsc,upper,col='darkgreen',lty=2)                 # doesn't look same as SAS


#

# Fitted Marginal distribution (back to individual level)
#
# This seems to be off a bit...
# 
ss <- seq(1:length(bully$bsc))
yvalues <- seq(1:51)
fitprob2 <- matrix(99,nrow=291,ncol=51)               # For negative binomial
phi = 0.2858
for (i in ss) {
  for (yindex in yvalues) { 
    yobs <- yindex-1
    mu <- exp(-1.1940 + 1.0921*bully$bsc[i] )
    term1 <- lgamma(mu+phi)/(factorial(yobs)*lgamma(phi))
    term2 <- ( phi/(mu+phi))**phi 
    term3 <- ( mu/(mu+phi))**yobs  
    fitprob2[i,yindex] = term1*term2*term3
  }
}

# mean over kids (rows) is what I want to plot
marginals2 <- colSums(fitprob2)/length(bully$bsc)

# Histogram of bully nominations with null poisson overlay
y <- seq(1:51)-1
d.poisson <- dpois(y, ybar)	 
	 
hist(bully$nom, prob=TRUE, col="cyan", breaks=30, 
     main="Bully Nominations with Fitted Overlayed",
     xlab="Number of Bully Nominations",
     )     # prob=TRUE for probabilities not counts
lines(y,d.poisson, col="darkgreen", lty=2,  lwd=1)         # add a density estimate 
lines(y,marginals, col="black", lty=1, lwd=1)              # mod 1, poisson with predictor
lines(y,marginals2,col="red",lty=1, lwd=2)                 # negative binomial
text(30,0.68,"Data",col="cyan")                            # zip
text(31,0.65,"Poisson no predcitors",col="darkgreen")
text(32.4,0.62,"Poisson nom ~ bullyscale score",col="black")
text(32.4,0.59,"NegBin nom ~ bullyscale score",col="red")


##################### ZIP #######################3
library(pscl)

summary(mod3 <- zeroinfl(nom ~ bsc | 1, data = bully))


summary(mod4 <- zeroinfl(nom ~ bsc | bsc, data = bully))



ss <- seq(1:length(bully$bsc))
yvalues <- seq(1:51)
fitprob4 <- matrix(99,nrow=291,ncol=51)               # For zip (mod4)

for (i in ss) {
  for (yindex in yvalues) { 
    yobs <- yindex-1
    mu.zip <- exp(0.4967 + 0.59609*bully$bsc[i] )
    pie <- exp(1.4967 - 0.7716*bully$bsc[i])/( 1 + exp(1.4967 - 0.7716*bully$bsc[i]))
    if (yobs==0) {
       fitprob4[i,yindex] <- pie + (1-pie)*exp(-mu.zip)
    } else {
       fitprob4[i,yindex] <- (1-pie)*exp(-mu.zip)*(mu.zip**yobs)/factorial(yobs)
    }
  }
}


# mean over kids (rows) is what I want to plot
marginals4 <- colSums(fitprob4)/length(bully$bsc)
lines(y,marginals4,col="violet", lty=4, lwd=2)


#
#  The marginal predictions for zip and negbin seem to be a bit off.  Note sure why.
#