# Example of modeling Bully data from Espelage et al.: # Poisson, Negative Binomial & ZIP models # Edps 589 # Fall 2018 # C.J.Anderson # library(MASS) # for negative binomial library(pscl) # for zero inflated setwd("D:/Users/cja/Dropbox/edps 589/4 glm") bully <- read.table("bully_data.txt",header=TRUE) # Some basic descriptive statistics (n <- length(bully$nom)) (ybar <- mean(bully$nom)) (s <- sd(bully$nom)) (s2 <- var(bully$nom)) summary(bully$nom) # Histogram of bully nominations hist(bully$nom, main="Distribution of Number of Bully Nominations", xlab="Number of Nominations that a student is Bully", breaks=50, col="cyan") text(41.2,200,"N = 291") text(40,190,"Mean = 2.49") text(41,180,"sd = 6.42") text(41.1,170,"s2 = 41.26") text(39.1,160,"min = 0") text(39.1,150,"max = 50") # Histogram of bully nominations with null poisson overlay y <- seq(1:51)-1 d.poisson <- dpois(y, ybar) hist(bully$nom, prob=TRUE, col="cyan", breaks=50, main="Bully Nominations w/ Poisson Overlayed", xlab="Number of Bully Nominations") # prob=TRUE for probabilities not counts lines(y,d.poisson, col="darkgreen", lty=2, lwd=1.5) # add a density estimate # # 1st model is poisson model # mod1 <- glm(nom ~ bsc, data=bully, family=poisson) summary(mod1) bully$pred1 <- fitted.values(mod1) pred <- predict(mod1, newdata = NULL, type = c("link", "response", "terms"), se.fit = TRUE ) bully$lower1 <- exp(pred$fit - 1.96*pred$se.fit) bully$upper1 <- exp(pred$fit + 1.96*pred$se.fit) bully$pred1 <- exp(pred$fit) bully.sort <- bully[order(bully$bsc),] plot(bully.sort$bsc,bully.sort$nom,type='p',pch=20, col="blue", main='Data, Fitted Line, and 95% Confidence', xlab='Bully Scale Score', ylab='Fitted Count') lines(bully.sort$bsc,bully.sort$pred1,col="darkgreen",lty=1,lwd=2) lines(bully.sort$bsc,bully.sort$lower1,col="darkgreen",lty=2) lines(bully.sort$bsc,bully.sort$upper1,col="darkgreen",lty=2) ############################################## # Other models -- digression # ############################################## # identity link in poisson summary(mod.Pidentity <- glm(nom ~ bsc, data=bully, family=poisson("identity"))) mod.Pidentity$aic # identity link in negative binomial summary(nb.indentity <- glm.nb(nom ~ bsc, data=bully, link=identity)) nb.indentity$aic ############################################## # But the line is for fitted mean..so another way to look at the # goodness of fit is to group data based on bully scale score # and re-fit the model, etc. # # This is ONLY for assessing fit (fit to individual level # is what should be report and interpreted # # Group data and get a look-see at how good it may be fitting the data bully$grp <- cut(bully$bsc, quantile(bully$bsc, c(0, 1/7, 2/7, 3/7, 4/7, 5/7, 6/7, 1))) grp.bsc <- aggregate(bully$bsc,by=list(bgrp=bully$grp),FUN=mean) grp.nom <- aggregate(bully$nom,by=list(bgrp=bully$grp),FUN=mean) grpdata <- cbind(grp.bsc,grp.nom) grpdata <- as.data.frame(grpdata) names(grpdata) <- c("grp","mbsc","grp2","mnom") plot(grpdata$mbsc,grpdata$snum) mod1b <- glm(mnom ~ mbsc, data=grpdata, family=poisson) summary(mod1b) #-- test of fit 1 - pchisq(mod1b$deviance,mod1b$df.residual) # look good # S3 method for class 'glm' pred <- predict(mod1b, newdata = NULL, type = c("link", "response", "terms"), se.fit = TRUE ) ### this looks difference from SAS but numerical results are ### spot on (i.e. equal for all quanties so ... lower <- exp(pred$fit - 1.96*pred$se.fit) upper <- exp(pred$fit + 1.96*pred$se.fit) fit <- exp(pred$fit) plot(grpdata$mbsc,grpdata$mnom, pch=20, col="blue", main='Grouped: Means bully scale by Nominations (95% Conifence)', xlab='mean bully scale score', ylab='observed and fitted number nominations', ylim=c(0,35), xlim=c(1,4) ) lines(grpdata$mbsc,fit,col='darkgreen',lty=1,lwd=2) lines(grpdata$mbsc,lower,col='darkgreen',lty=2) lines(grpdata$mbsc,upper,col='darkgreen',lty=2) # # Fitted Marginal distribution (back to individual level) # ss <- seq(1:length(bully$bsc)) yvalues <- seq(1:51) fitprob <- matrix(99,nrow=291,ncol=51) for (i in ss) { for (yindex in yvalues) { yobs <- yindex-1 mu <- exp(-0.66308 + 0.81434*bully$bsc[i] ) fitprob[i,yindex] = ( exp(-mu) * mu**yobs)/(factorial(yobs)) } } # summing over y values (colums) should all equal 1 rowSums(fitprob) # mean over kids (rows) is what I want to plot marginals <- colSums(fitprob)/length(bully$bsc) # Histogram of bully nominations with null poisson overlay y <- seq(1:51)-1 d.poisson <- dpois(y, ybar) hist(bully$nom, prob=TRUE, col="cyan", breaks=50, main="Bully Nominations with Poisson Overlayed", xlab="Number of Bully Nominations") # prob=TRUE for probabilities not counts lines(y,d.poisson, col="darkgreen", lty=2, lwd=1) # add a density estimate lines(y,marginals, col="black", lty=1, lwd=2) # mod 1, poisson with predictor text(30,0.68,"Data",col="cyan") text(36,0.65,"Poisson no predcitors",col="darkgreen") text(39.4,0.62,"Poisson nom ~ bullyscale score",col="black") ######################## Negative Binomial ###################### library(MASS) # Fit model to individual level data mod2 <- glm.nb(nom ~ bsc, data=bully) # dispersion phi = 1/"disperson from R" summary(mod2) # Using my grouped data mod2b <- glm.nb(mnom ~ mbsc, data=grpdata) summary(mod2b) #-- test of fit 1 - pchisq(mod2b$deviance,mod2b$df.residual) # looks good # S3 method for class 'glm' pred <- predict(mod2b, newdata = NULL, type = c("link", "response", "terms"), se.fit = TRUE ) lower <- exp(pred$fit - 1.96*pred$se.fit) upper <- exp(pred$fit + 1.96*pred$se.fit) fit <- exp(pred$fit) plot(grpdata$mbsc,grpdata$mnom, pch=20, col="blue", main='NegBin to Grouped: Means bully scale by Nominations (95%)', xlab='mean bully scale score', ylab='observed and fitted number nominations', ylim=c(0,35), xlim=c(1,4) ) lines(grpdata$mbsc,fit,col='darkgreen',lty=1,lwd=2) lines(grpdata$mbsc,lower,col='darkgreen',lty=2) lines(grpdata$mbsc,upper,col='darkgreen',lty=2) # doesn't look same as SAS # # Fitted Marginal distribution (back to individual level) # # This seems to be off a bit... # ss <- seq(1:length(bully$bsc)) yvalues <- seq(1:51) fitprob2 <- matrix(99,nrow=291,ncol=51) # For negative binomial phi = 0.2858 for (i in ss) { for (yindex in yvalues) { yobs <- yindex-1 mu <- exp(-1.1940 + 1.0921*bully$bsc[i] ) term1 <- lgamma(mu+phi)/(factorial(yobs)*lgamma(phi)) term2 <- ( phi/(mu+phi))**phi term3 <- ( mu/(mu+phi))**yobs fitprob2[i,yindex] = term1*term2*term3 } } # mean over kids (rows) is what I want to plot marginals2 <- colSums(fitprob2)/length(bully$bsc) # Histogram of bully nominations with null poisson overlay y <- seq(1:51)-1 d.poisson <- dpois(y, ybar) hist(bully$nom, prob=TRUE, col="cyan", breaks=30, main="Bully Nominations with Fitted Overlayed", xlab="Number of Bully Nominations", ) # prob=TRUE for probabilities not counts lines(y,d.poisson, col="darkgreen", lty=2, lwd=1) # add a density estimate lines(y,marginals, col="black", lty=1, lwd=1) # mod 1, poisson with predictor lines(y,marginals2,col="red",lty=1, lwd=2) # negative binomial text(30,0.68,"Data",col="cyan") # zip text(31,0.65,"Poisson no predcitors",col="darkgreen") text(32.4,0.62,"Poisson nom ~ bullyscale score",col="black") text(32.4,0.59,"NegBin nom ~ bullyscale score",col="red") ##################### ZIP #######################3 library(pscl) summary(mod3 <- zeroinfl(nom ~ bsc | 1, data = bully)) summary(mod4 <- zeroinfl(nom ~ bsc | bsc, data = bully)) ss <- seq(1:length(bully$bsc)) yvalues <- seq(1:51) fitprob4 <- matrix(99,nrow=291,ncol=51) # For zip (mod4) for (i in ss) { for (yindex in yvalues) { yobs <- yindex-1 mu.zip <- exp(0.4967 + 0.59609*bully$bsc[i] ) pie <- exp(1.4967 - 0.7716*bully$bsc[i])/( 1 + exp(1.4967 - 0.7716*bully$bsc[i])) if (yobs==0) { fitprob4[i,yindex] <- pie + (1-pie)*exp(-mu.zip) } else { fitprob4[i,yindex] <- (1-pie)*exp(-mu.zip)*(mu.zip**yobs)/factorial(yobs) } } } # mean over kids (rows) is what I want to plot marginals4 <- colSums(fitprob4)/length(bully$bsc) lines(y,marginals4,col="violet", lty=4, lwd=2) # # The marginal predictions for zip and negbin seem to be a bit off. Note sure why. #