# Homework 6 # Edps 589 # Fall 2019 # c.j.anderson # # Logistic regression # problem 1 crab data # # library(ResourceSelection) # Hosmer-Lemeshow library(DescTools) # concordance index library(pROC) # ROC and conordance index (for 1 predictor variable) library(car) # Influence diagnostics ################################################################### # Problem 1 # ################################################################### ### Set-up and read data setwd("C:\\Users\\cja\\Dropbox\\edps 589\\homework") crabs <- read.table("crab_data.txt",header=TRUE) head(crabs) ################################# # 1(a) # ################################# ## Need to create binary variable crabs$sat <- ifelse(crabs$satell>0,1,0) ## Model weight in kg crabs$weight <- crabs$weight/1000 summary(mod1 <- glm(sat ~ weight, data=crabs, family=binomial)) ## to cut down on writing a <- mod1$coefficient[1] b <- mod1$coefficient[2] ############################## # 1(b) # ############################## ## attach fitted values to main data set crabs$fit <- mod1$fitted ## To find min and max summary(crabs$weight) ## i. pi at minimum weight crabs$fit[crabs$weight==min(crabs$weight)] ## ii. pi at mean weight-- there is no value ## at the exact mean, so use equation to ## get this (mweigh<- mean(crabs$weight)) (pi <- exp(a + b*mweigh)/(1+ exp(a+ b*mweigh))) # iii. pi at maximum weight crabs$fit[crabs$weight==max(crabs$weight)] ############################# # Graph of data and fitted # ############################# plot(crabs$weight,jitter(crabs$sat,0.1), type="p", pch=21, xlab="Weight of Crab in kg", ylab="Probability of Any Satellites", main="Observed and Fitted for Whether Any Satellites") j <- order(crabs$weight) lw1 <- loess(sat ~ weight,data=crabs) # fit loess to data lines(crabs$weight[j],lw1$fitted[j],col="cyan",lwd=1) lines(crabs$weight[j],crabs$fit[j], col="red", lwd=2) legend("bottomright",legend=c("Data","Loess","Logistic"), col=c("black","cyan","red"), lty=1, cex=1) ############################## # 1(c) # ############################## ## Weight were pi=.5 -(a/b) ## Slope at pi=.5 (slope <- b*.5*(1-.5)) ############################## # 1(d) # ############################## ## i. slope at weight=204kg (pi <- exp(a+b*2.04)/(1+exp(a+b*2.04))) ## ii. slope/10 ############################## # 1(e) # ############################## ## Odds ratio (or <- exp(b)) ## 95% CI for beta (lo <- b - 1.96*(.3767)) (hi <- b + 1.96*(.3767)) ## 95% for exp(beta), odds ratio (lower <- exp(lo)) (upper <- exp(hi)) ############################## # 1(f) # ############################## (wald <- (b/.3767)**2) 1-pchisq(wald,1) ## likelihood ratio for Ho:beta=0 anova(mod1,test="LRT") ############################### # 1(g) Group data and then # # fit model # ############################### levels <- c(-Inf, 1.749, 1.999, 2.249, 2.499, 2.7499, 2.999, 3.2499, Inf) labels <- c("<1.74","1.75-2.00","2.00-2.25","2.25-2.50","2.50-2.75","2.75-3.00","3.00-3.25",">3.25") crabs$grp.weight = cut(crabs$weight, levels, labels = labels) # - number of satell and no satell per group (data.table <- table(crabs$grp.weight,crabs$sat)) # - number per group grp.n <- table(crabs$grp.weight) grp.n <- c(grp.n,grp.n) # - mean per group grp.mean <- aggregate(crabs$weight,by=list(crabs$grp.weight),FUN=mean) # - put together data.g <- cbind(data.table,grp.n,grp.mean) names(data.g)<-c("range","sat","count","ncases","ignore","mean.weight") data.g <- data.g[9:16,] summary(mod.g <- glm(sat ~ mean.weight, weight=count, data=data.g, family=binomial)) data.g$phat <- mod.g$fitted data.g$phat[1:8] <- 1-data.g$phat[9:16] data.g$p <- data.g$count/data.g$ncases data.g$fhat <- data.g$phat * data.g$ncases ### Not sure why this differs from what I did in SAS (X2 <- sum((data.g$count-data.g$fhat)**2/data.g$fhat)) 1-pchisq(X2,7) Gcomp <- data.g$count*log(data.g$count/data.g$fhat) G2 <- 2*(sum(Gcomp[1:7])+sum(Gcomp[9:16])) 1-pchisq(G2,7) ############################ # 1(h) # ############################ ######## There is more here that requested by problem ##################################### # Hosmer-Lemeshow # # Uses ResourceSelection package # ##################################### hoslem.test(mod1$y, fitted(mod1), g=10) ##################################### # - Cocordance index # ##################################### Cstat(mod1,resp=crabs$sat) ##################################### # Plot of ROC curve # ##################################### roc1 <- roc(crabs$sat ~ crabs$weight, auc=TRUE, ci=TRUE, plot=TRUE) names(roc1) # - area usnder curve (i.e. concordance) roc1$auc # - concordance index (area under curve) and confidence interval for it. roc1$ci # - a 2nd ROC plot plot(roc1, main="ROC Curve for Logit Model fit to Crab data (c=.73)", col="red") ###################################### # qqplot residuals # ###################################### # - what comes from glm qqnorm(mod1$residuals) ##################################### # Influence # ##################################### inf <- influence.measures(mod1) names(inf) summary(inf) # will pick out potentially influential ones influencePlot(mod1) influenceIndexPlot(mod1, vars=c("Cook","Studentized","hat"), id.n=5) # To see what is here: head(inf$infmat) ## could do plots of other statistics using the following: inf.data <- as.data.frame(inf$infmat) index <- seq(1:length(crabs$sat)) plot(index,inf.data$dfb.wght,type='h')