# Homework 6
# Edps 589
# Fall 2019
# c.j.anderson
#
#  Logistic regression
#    problem 1 crab data
#
#
library(ResourceSelection)      # Hosmer-Lemeshow
library(DescTools)              # concordance index
library(pROC)                   # ROC and conordance index (for 1 predictor variable)
library(car)                    # Influence diagnostics

###################################################################
# Problem 1                                                       #
###################################################################

### Set-up and read data
setwd("C:\\Users\\cja\\Dropbox\\edps 589\\homework")

crabs <- read.table("crab_data.txt",header=TRUE)
head(crabs)

#################################
# 1(a)                          #
#################################

## Need to create binary variable
crabs$sat <- ifelse(crabs$satell>0,1,0)

## Model weight in kg
crabs$weight <- crabs$weight/1000

summary(mod1 <- glm(sat ~ weight, data=crabs, family=binomial))

## to cut down on writing
a <- mod1$coefficient[1]
b <- mod1$coefficient[2]

##############################
# 1(b)                       #
##############################

## attach fitted values to main data set
crabs$fit <- mod1$fitted

## To find min and max 
summary(crabs$weight)

## i.  pi at minimum weight
crabs$fit[crabs$weight==min(crabs$weight)]

## ii.  pi at mean weight-- there is no value
##    at the exact mean, so use equation to 
##    get this
(mweigh<- mean(crabs$weight))
(pi <- exp(a + b*mweigh)/(1+ exp(a+ b*mweigh)))

# iii. pi at maximum weight
crabs$fit[crabs$weight==max(crabs$weight)]

#############################
# Graph of data and fitted  #
#############################
plot(crabs$weight,jitter(crabs$sat,0.1),
     type="p",
	 pch=21,
	 xlab="Weight of Crab in kg",
	 ylab="Probability of Any Satellites",
	 main="Observed and Fitted for Whether Any Satellites")
j <- order(crabs$weight)
lw1 <- loess(sat ~ weight,data=crabs)                   # fit loess to data
lines(crabs$weight[j],lw1$fitted[j],col="cyan",lwd=1)
lines(crabs$weight[j],crabs$fit[j], col="red", lwd=2)
legend("bottomright",legend=c("Data","Loess","Logistic"), 
       col=c("black","cyan","red"), lty=1, cex=1)
	   
	   
##############################
# 1(c)                       #
##############################

## Weight were pi=.5
-(a/b)

## Slope at pi=.5
(slope <- b*.5*(1-.5))


##############################
# 1(d)                       #
##############################

## i. slope at weight=204kg
(pi <- exp(a+b*2.04)/(1+exp(a+b*2.04)))

## ii. 
slope/10


##############################
# 1(e)                       #
##############################

## Odds ratio
(or <- exp(b))

## 95% CI for beta
(lo <- b - 1.96*(.3767))
(hi <- b + 1.96*(.3767))

## 95% for exp(beta), odds ratio
(lower <- exp(lo))
(upper <- exp(hi))

##############################
# 1(f)                       #
##############################
(wald <- (b/.3767)**2)
1-pchisq(wald,1)

## likelihood ratio for Ho:beta=0
anova(mod1,test="LRT")


###############################
# 1(g) Group data and then    #
#      fit model              #
###############################

levels <- c(-Inf, 1.749, 1.999, 2.249, 2.499, 2.7499, 2.999, 3.2499, Inf)
labels <- c("<1.74","1.75-2.00","2.00-2.25","2.25-2.50","2.50-2.75","2.75-3.00","3.00-3.25",">3.25")
crabs$grp.weight = cut(crabs$weight, levels, labels = labels)
# - number of satell and no satell per group
(data.table <- table(crabs$grp.weight,crabs$sat))

# - number per group
grp.n <- table(crabs$grp.weight)
grp.n <- c(grp.n,grp.n)

# - mean per group
grp.mean <- aggregate(crabs$weight,by=list(crabs$grp.weight),FUN=mean)

# - put together
data.g <- cbind(data.table,grp.n,grp.mean)
names(data.g)<-c("range","sat","count","ncases","ignore","mean.weight")

data.g <- data.g[9:16,]

summary(mod.g <- glm(sat ~ mean.weight, weight=count, data=data.g, family=binomial))
data.g$phat <- mod.g$fitted
data.g$phat[1:8] <- 1-data.g$phat[9:16]
data.g$p <- data.g$count/data.g$ncases

data.g$fhat <- data.g$phat * data.g$ncases

### Not sure why this differs from what I did in SAS
(X2 <- sum((data.g$count-data.g$fhat)**2/data.g$fhat))
1-pchisq(X2,7)

Gcomp  <- data.g$count*log(data.g$count/data.g$fhat)
G2 <- 2*(sum(Gcomp[1:7])+sum(Gcomp[9:16]))
1-pchisq(G2,7)

############################
# 1(h)                     #
############################

######## There is more here that requested by problem

#####################################
# Hosmer-Lemeshow                   #
# Uses ResourceSelection package    #
#####################################
hoslem.test(mod1$y, fitted(mod1), g=10)

#####################################
#  - Cocordance index               #
#####################################
Cstat(mod1,resp=crabs$sat)

#####################################
# Plot of ROC curve                 #
#####################################

roc1 <- roc(crabs$sat ~ crabs$weight, auc=TRUE, ci=TRUE, plot=TRUE)
names(roc1)
# - area usnder curve (i.e. concordance)
roc1$auc
# - concordance index (area under curve) and confidence interval for it.
roc1$ci
# - a 2nd ROC plot
plot(roc1,
     main="ROC Curve for Logit Model fit to Crab data (c=.73)",
	 col="red")
	 
	 
######################################
# qqplot residuals                   #
######################################
# - what comes from glm
qqnorm(mod1$residuals)


#####################################
# Influence                         #
#####################################
inf <- influence.measures(mod1)
names(inf)

summary(inf)    # will pick out potentially influential ones


influencePlot(mod1)

influenceIndexPlot(mod1, vars=c("Cook","Studentized","hat"), id.n=5)

# To see what is here:
head(inf$infmat)

## could do plots of other statistics using the following:

inf.data <- as.data.frame(inf$infmat)
index <- seq(1:length(crabs$sat))
plot(index,inf.data$dfb.wght,type='h')