# Intro to glm:   difference between T-cells counts in Hodgkins and non-Hodgkins 
# Edps 589
# Fall 2018
# C.J.Anderson
#
# Notes: R/glm uses iterative weighted least squares or Fisher scoring algorithm; that is, it uses
#            the EXPECTED Hessian matrix (i.e., information or matrix of 2nd partial derivatives).
#
#        SAS/GENMOD uses (ridge stabilized) Newton-Raphson algorthim; that is, it uses the 
#             OBSERVED Hessian matrix.
#
#        The 2 algorithms are essentaly the same except for how the Hessian is computed.
#
#        The estimated parameters should be the same, but the estimated standard errors (and covariance
#             matrix) of parameter estimates will differ a bit.
#			 

library(MASS)               # needed for negatiave binomial glm

#setwd("D:\\Dropbox\\edps 589\\4 glm")
setwd("C:\\Users\\cja\\Documents\\Dropbox\\edps 589\\4 glm")

h <- read.table("hodgkins_data.txt",header=TRUE)


# normal (default link is identity)...I added it to show how to include it.
# The scale parameter is sigma

normal.0 <- glm(count ~ 1,data=h,family=gaussian(link="identity"))
summary(normal.0)
normal.1 <- glm(count ~ disease,data=h,family=gaussian(link="identity"))
summary(normal.1)

# Normal with log link
normal.0 <- glm(count ~ 1,data=h,family=gaussian(link="log"))
summary(normal.0)
normal.1 <- glm(count ~ disease,data=h,family=gaussian(link="log"))
summary(normal.1)

# Gamma (default link is inverse)

gamma.0 <- glm(count ~ 1,data=h,family=Gamma)
summary(gamma.0)
gamma.1 <- glm(count ~ 1 + disease,data=h,family=Gamma)
summary(gamma.1)

invGauss.0 <- glm(count ~ 1,data=h,family=inverse.gaussian)
summary(invGauss.0)
invGauss.1 <- glm(count ~ disease,data=h,family=inverse.gaussian)
summary(invGauss.1)

# poisson
# Scale parameter = 1
poisson.0 <- glm(count ~ 1,data=h,family=poisson)
summary(poisson.0)
poisson.1 <- glm(count ~ disease,data=h,family=poisson)
summary(poisson.1)

# negbin
# Scale parameter = k = dispersion is k.  SAS estimates 1/k)
negbin.0 <- glm.nb(count ~ 1,data=h)
summary(negbin.0)
negbin.1 <- glm.nb(count ~ disease,data=h)
summary(negbin.1)

# Negative binomial is much better relative to poisson?
anova(poisson.1,negbin.1)


# Why?   Show overdispersion in data
aggregate(count ~ disease, data=h, FUN="mean")
aggregate(count ~ disease, data=h, FUN="var")