# Poisson regression for rates (lung cancer data) # Edps 589 # Fall 2018 # C.J.Anderson library(MASS) setwd("D:\\Dropbox\\edps 589\\4 glm") lc <- read.table("lung_cancer_data.txt",header=TRUE) # Some things I'll need later lc$rate <- lc$cases/lc$population lc$agesq <- lc$age_midpt**2 lc$frederic <- ifelse(lc$city=="Fredericia",1,0) lc # Graphs ignoring which city data is from plot(lc$age,lc$rate,type="p",pch=20,lex=2,main="Lung Cancer Rate x Age Group") plot(lc$age_midpt,lc$rate,type="p",pch=20,lex=2,main="Lung Cancer Rate x Mid-point of Age Group") # Graph of data by city plot(lc$age_midpt[lc$city=="Fredericia"], lc$rate[lc$city=="Fredericia"], type="b" ,col="red", ylim=c(0,0.025), main="Lung Cancer rate by age for 5 Cities", xlab="Age Midpoint", ylab="Rate of Lung Cancer" ) lines(lc$age_midpt[lc$city=="Horsens"], lc$rate[lc$city=="Horsens"], type="b" ,col="blue" ) lines(lc$age_midpt[lc$city=="Kolding"], lc$rate[lc$city=="Kolding"], type="b" ,col="green" ) lines(lc$age_midpt[lc$city=="Vejle"], lc$rate[lc$city=="Vejle"], type="b" ,col="orange" ) legend("topleft",legend=c("Frederica","Horsens","Kolding","Vejle"), col=c("red","blue","green","orange" ), lwd=c(1,1,1,1,1)) ##### # Modeling the data: note that deviance and df are the same as SAS. # Parameter estimates differ due to different dummy coding by SAS # and R. If you compute # # Model 1: 2 nominal/catgorial predictors model1 <- glm(cases ~ offset(log(population)) + city + age, data=lc, family=poisson) summary(model1) anova(model1) # Model 2: nominal city, ordinal age model2 <- glm(cases ~ offset(log(population)) + city + age_midpt, data=lc, family=poisson) summary(model2) anova(model2) # Graph of data by city # Add in lines for fited values fit2 <- fitted(model2) lc <- cbind(lc,fit2) lc$fitrate <= lc$fit2/lc$population plot(lc$age_midpt[lc$city=="Fredericia"], lc$rate[lc$city=="Fredericia"], type="p" ,col="red", ylim=c(0,0.025), main="Data & Fitted Values: Lung Cancer rate by age for 4 Cities", xlab="Age Midpoint", ylab="Rate of Lung Cancer = cases/population" ) lines(lc$age_midpt[lc$city=="Fredericia"], lc$fitrate[lc$city=="Fredericia"],col="red",type="l",lwd=2) lines(lc$age_midpt[lc$city=="Horsens"], lc$fitrate[lc$city=="Horsens"], type="l" ,col="blue",lwd=2 ) lines(lc$age_midpt[lc$city=="Kolding"], lc$fitrate[lc$city=="Kolding"], type="l" ,col="green",lwd=2 ) lines(lc$age_midpt[lc$city=="Vejle"], lc$fitrate[lc$city=="Vejle"], type="l" ,col="orange",lwd=2 ) lines(lc$age_midpt[lc$city=="Horsens"], lc$rate[lc$city=="Horsens"], type="p" ,col="blue" ) lines(lc$age_midpt[lc$city=="Kolding"], lc$rate[lc$city=="Kolding"], type="p" ,col="green" ) lines(lc$age_midpt[lc$city=="Vejle"], lc$rate[lc$city=="Vejle"], type="p" ,col="orange" ) legend("topleft",legend=c("Frederica","Horsens","Kolding","Vejle"), col=c("red","blue","green","orange" ), lwd=c(1,1,1,1,1)) # Model 3: Try age^2 model3 <- glm(cases ~ offset(log(population)) + city + age_midpt + agesq, data=lc, family=poisson) summary(model3) anova(model3) fit3 <- fitted(model3) lc <- cbind(lc,fit3) lc$fitrate3 <- lc$fit3/lc$population plot(lc$age_midpt[lc$city=="Fredericia"], lc$rate[lc$city=="Fredericia"], type="p" ,col="red", ylim=c(0,0.025), main="Data & Fitted Values: Lung Cancer rate by age for 4 Cities", xlab="Age Midpoint", ylab="Rate of Lung Cancer = cases/population" ) lines(lc$age_midpt[lc$city=="Fredericia"], lc$fitrate3[lc$city=="Fredericia"],col="red",type="l",lwd=2) lines(lc$age_midpt[lc$city=="Horsens"], lc$fitrate3[lc$city=="Horsens"], type="l" ,col="blue",lwd=2 ) lines(lc$age_midpt[lc$city=="Kolding"], lc$fitrate3[lc$city=="Kolding"], type="l" ,col="green",lwd=2 ) lines(lc$age_midpt[lc$city=="Vejle"], lc$fitrate3[lc$city=="Vejle"], type="l" ,col="orange",lwd=2 ) lines(lc$age_midpt[lc$city=="Horsens"], lc$rate[lc$city=="Horsens"], type="p" ,col="blue" ) lines(lc$age_midpt[lc$city=="Kolding"], lc$rate[lc$city=="Kolding"], type="p" ,col="green" ) lines(lc$age_midpt[lc$city=="Vejle"], lc$rate[lc$city=="Vejle"], type="p" ,col="orange" ) legend("topleft",legend=c("Frederica","Horsens","Kolding","Vejle"), col=c("red","blue","green","orange" ), lwd=c(1,1,1,1,1)) # Simply by collapsing over cities, that is, Fredericia vs others # Note that this is one of the first things I did (I knew what would # work having played with these data before model4 <- glm(cases ~ offset(log(population)) + frederic + age_midpt + agesq, data=lc, family=poisson) summary(model4) anova(model4) # Graph of data and fitted values fit4 <- fitted(model4) lc <- cbind(lc,fit4) lc$fitrate4<- lc$fit4/lc$population plot(lc$age_midpt[lc$city=="Fredericia"], lc$rate[lc$city=="Fredericia"], type="p" ,col="red", ylim=c(0,0.025), main="Data & Fitted Values: Lung Cancer rate by age for 4 Cities", xlab="Age Midpoint", ylab="Rate of Lung Cancer = cases/population" ) lines(lc$age_midpt[lc$city=="Fredericia"], lc$fitrate4[lc$city=="Fredericia"],col="red",type="l",lwd=2) lines(lc$age_midpt[lc$city=="Horsens"], lc$fitrate4[lc$city=="Horsens"], type="l" ,col="blue",lwd=2 ) lines(lc$age_midpt[lc$city=="Kolding"], lc$fitrate4[lc$city=="Kolding"], type="l" ,col="green",lwd=2 ) lines(lc$age_midpt[lc$city=="Vejle"], lc$fitrate4[lc$city=="Vejle"], type="l" ,col="orange",lwd=2 ) lines(lc$age_midpt[lc$city=="Horsens"], lc$rate[lc$city=="Horsens"], type="p" ,col="blue" ) lines(lc$age_midpt[lc$city=="Kolding"], lc$rate[lc$city=="Kolding"], type="p" ,col="green" ) lines(lc$age_midpt[lc$city=="Vejle"], lc$rate[lc$city=="Vejle"], type="p" ,col="orange" ) legend("topleft",legend=c("Frederica","others"), col=c("red","blue"), lwd=c(1,1))