# Edps 590BAY --------- post this one online ---------- # Fall 2019 # C.J.Anderson # # Using data from 1 school from nels to illustrate simple linear regression # Graphing data # ols # Gibbs sampling with y ~ normal # Gibbs sampling with y ~ t "robust" # Model checking # Outlier # Bayesian p-values # Looking at posterior distribution # # library(coda) library(plotly) # needed for "select" command...data manipuatio libary(rjags) library(runjags) setwd("C:\\Users\\cja\\Dropbox\\edps 590BAY\\Lectures\\7 Linear Regression") # # Read in data and look at it # nels <- read.table("nels_school_62821.txt",header=TRUE) head(nels) ### Graph(s) of data # Plot of data plot(nels$homework,nels$math,type='p', main="One School from NELS", xlab="Time Spent Doing Homework", ylab="Math Scores", cex=1.5, pch=1) # # Simple linear regression & graph # # Get the means to add to plot schMathHmwk <- aggregate(nels$math, list(nels$homework), mean) lines(schMathHmwk$Group.1,schMathHmwk$x,type='p',pch=19,col="blue") # Add in ols regression line ols.lm <- lm(math~homework,data=nels) summary(ols.lm) coef(ols.lm) abline(ols.lm,col="blue",lwd=2) text(3,45,"y=59.21 + 1.09(Homework)",col="blue",cex=2) densData <- dnorm(100, df = as.data.frame(nels) plot_ly(df, x = Y, y = X, z = Z, group = X, type = "scatter3d", mode = "lines") ####################### rjags ################################ library(rjags) set.seed(75) dataList <- list(y=nels$math, x=nels$homework, N=length(nels$math), sdY = sd(nels$math) ) nelsLR1 = "model { for (i in 1:N){ y[i] ~ dnorm(mu[i] , precision) mu[i] <- b0 + b1*x[i] } b0 ~ dnorm(0 , 1/(100*sdY^2) ) b1 ~ dnorm(0 , 1/(100*sdY^2) ) sigma ~ dunif( 1E-3, 1E+30 ) precision <- 1/sigma^2 } " writeLines(nelsLR1, con="nelsLR1.txt") b0Init = mean(nels$math) b1Init = 0 sigmaInit = sd(nels$math) initsList = list(b0=b0Init, b1=b1Init, sigma=sigmaInit ) jagsNelsLR1 <- jags.model(file="nelsLR1.txt", # compiles and intializes model data=dataList, inits=initsList, n.chains=4, n.adapt=500) # adaptive phase for 500 iterations update (jagsNelsLR1, n.iter=500) # burn in of 500 iterations # gets samples from all chains for 4000 iterations Samples1 <- coda.samples(jagsNelsLR1, variable.names=c("b0","b1","sigma"), n.iter=4000) # output summary information summary(Samples1) plot(Samples1) autocorr.plot(Samples1,auto.layout=TRUE) geweke.diag(Samples1,frac1=0.1,frac2=0.5) effectiveSize(Samples1) cumuplot(Samples1,probs=c(.25,.50,.75),lwd=c(1,2),lty=c(2,1),col=c("blue","red")) gelman.diag(Samples1) gelman.plot(Samples1) HPDinterval(Samples1) print(Samples1) ###################### runjags ######################## library(runjags) # Needs inital values for each of the 4 chains: initsList = list(list("b0"=mean(nels$math), "b1"=sd(nels$math), "sigma"=sd(nels$math)**2), list("b0"=rnorm(1,-2,4), "b1"=1 , "sigma"=2), list("b0"=rnorm(1,0,1), "b1"=8 ,"sigma"=5), list("b0"=rnorm(1,2,2), "b1"=.5 ,"sigma"=15 ) ) out.runjags <- run.jags(model=nelsLR1, monitor=c("b0","b1","sigma","dic"), data=dataList, n.chains=4, inits=initsList) print(out.runjags) par(ask=TRUE) print(out.runjags) par(ask=TRUE) plot(out.runjags) gelman.plot(out.runjags) ############################################################################ ############ Robust linear regression: swap out normal and use t ########### ############################################################################ ############## rjags library(rjags) set.seed(75) dataList <- list(y=nels$math, x=nels$homework, N=length(nels$math), sdY = sd(nels$math) ) tMod1 = "model {for (i in 1:N){ y[i] ~ dt(mu[i] , precision,nu) mu[i] <- b0 + b1*x[i] } b0 ~ dnorm(0 , 1/(100*sdY^2) ) b1 ~ dnorm(0 , 1/(100*sdY^2) ) sigma ~ dunif(0, 1E+10 ) precision <- 1/sigma^2 nuMinusOne ~ dexp(1/29) nu <- nuMinusOne+1 } " writeLines(tMod1, con="tMod1.txt") initsList = list("b0"=mean(nels$math), "b1"=sd(nels$math), "sigma"=sd(nels$math)**2) jagsModelLm2 <- jags.model(file="tMod1.txt", # compiles and intializes model data=dataList, inits=initsList, n.chains=4, n.adapt=1000) # adaptive phase for 1000 iterations update (jagsModelLm2, n.iter=1000) # burn in of 1000 iterations # contains samples from all chains with 500 iterations tSamples2 <- coda.samples(jagsModelLm2, variable.names=c("b0","b1","sigma","nu"), n.iter=2000) # output summary information summary(tSamples2) plot(tSamples2) autocorr.plot(tSamples2,auto.layout=TRUE) geweke.diag(tSamples2,frac1=0.1,frac2=0.5) effectiveSize(tSamples2) gelman.diag(tSamples2) cumuplot(tSamples2,probs=c(.25,.50,.75),lwd=c(1,2),lty=c(2,1),col=c("blue","red")) gelman.plot(tSamples2) HPDinterval(tSamples2) #### runjags library(runjags) initsList = list(list("b0"=mean(nels$math), "b1"=sd(nels$math), "sigma"=sd(nels$math)**2), list("b0"=rnorm(1,-2,4), "b1"=1 , "sigma"=2), list("b0"=rnorm(1,0,1), "b1"=8 ,"sigma"=5), list("b0"=rnorm(1,2,2), "b1"=.5 ,"sigma"=15 ) ) out.runjags <- run.jags(model=tMod1, monitor=c("b0","b1","sigma","nu","dic"), data=dataList, n.chains=4, inits=initsList) print(out.runjags) plot(out.runjags) autocorr.plot(out.runjags,auto.layout=TRUE) geweke.diag(out.runjags,frac1=0.1,frac2=0.5) effectiveSize(out.runjags) gelman.diag(out.runjags) cumuplot(out.runjags,probs=c(.25,.50,.75),lwd=c(1,2),lty=c(2,1),col=c("blue","red")) gelman.plot(out.runjags) HPDinterval(out.runjags) #################### Basic checks ######################### # Compare these to parameters mean(nels$math) sd(nels$math) min(nels$math) max(nels$math) summary(nels$math) ################# found an outlier ################## par(mfrow=c(2,2)) hist(nels$math,main="Outlier on math") plot(nels$homework,nels$math,main="NELS: Looks like an outlier") lines(1,min(nels$math),type='p',col="red", pch=20,cex=2) ## Try analysis without the outlier nels2 <- subset(nels, math>min(nels$math)) min(nels2$math) dataList <- list(y=nels2$math, x=nels2$homework, N=length(nels2$math),We sdY = sd(nels2$math) ) ModelLm1x = "model { for (i in 1:N){ y[i] ~ dnorm(mu[i] , precision) mu[i] <- b0 + b1*x[i] } b0 ~ dnorm(0 , 1/(100*sdY^2) ) b1 ~ dnorm(0 , 1/(100*sdY^2) ) sigma ~ dunif( 1E-3, 1E+30 ) precision <- 1/sigma^2 } " writeLines(ModelLm1x, con="ModelLm1x.txt") b0Init = mean(nels2$math) b1Init = 0 sigmaInit = sd(nels2$math) initsList = list(b0=b0Init, b1=b1Init, sigma=sigmaInit ) jagsModelLm1x <- jags.model(file="ModelLm1x.txt", # compiles and intializes model data=dataList, inits=initsList, n.chains=4, n.adapt=1000) # adaptive phase for 500 iterations update (jagsModelLm1x, n.iter=1000) # burn in of 1000 iterations # gets samples from all chains for 2000 iterations Samples1x <- coda.samples(jagsModelLm1x, variable.names=c("b0","b1","sigma"), n.iter=2000) # output summary information summary(Samples1x) plot(Samples1x) ################## 200 draws from posterior distribution ################# n <- length(nels2$math) replications <- 200 yrep <- matrix(99,nrow=n,ncol=replications) for (s in 1:replications){ b0 <- rnorm(1,60.131,sd=1.3714) b1 <- rnorm(1,0.89476,sd=0.36696) # b0 <- rnorm(1,60.131,sd=1) # b1 <- rnorm(1,0.89476,sd=1)des for (i in 1:n){ yrep[i,s] = b0 + b1*nels2$homework[i] + rnorm(1,0,5.0667) } } # descriptive statistics yhats <- apply(yrep,2,"mean") ymin <- apply(yrep,2,"min") ymax <- apply(yrep,2,"max") ysd <- apply(yrep,2,"sd") # Bayesian p-values pmean <- mean(mean(nels2$math) < yhats) pmin <- mean(min(nels2$math) < ymin) pmax <- mean(max(nels2$math) < ymax) psd <- mean(sd(nels2$math) < ysd) # Graphs of descriptive statistics from posterior predictive distribution par(mfrow=c(2,2)) hist(ymin,breaks=10,main=paste("Simulated N=200 Minimums", "\nBayesian P-value =", round(pmin, 2))) lines(c(min(nels2$math),min(nels2$math)),c(0,40),col="blue",lwd=2) hist(ymax,breaks=10,main=paste("Simulated N=200 Maximums", "\nBayesian P-value =", round(pmax, 2))) lines(c(max(nels2$math),max(nels2$math)),c(0,40),col="blue",lwd=2) hist(yhats,breaks=10,main=paste("Simulated N=200 Means", "\nBayesian P-value =", round(pmean, 2))) lines(c(mean(nels2$math),mean(nels2$math)),c(0,40),col="blue",lwd=2) hist(ysd,breaks=10,main=paste("Simulated N=200 SDs", "\nBayesian P-value =", round(psd, 2))) lines(c(sd(nels2$math),sd(nels2$math)),c(0,40),col="blue",lwd=2) # Graphs of data and posterior predictive distribution ypred <- apply(yrep,1,"mean") par(mfrow=c(2,2)) hist(nels2$math,main="Data Distribution",breaks=10) hist(ypred,main="Predicted Posterior Distribution",breaks=10) plot(nels2$homework,nels2$math,main="Data: math x homework") plot(nels2$homework, ypred,type='p',main='Predictions: math x homework',ylim=c(59,80)) schMathHmwk <- aggregate(nels2$math, list(nels2$homework), "mean") lines(schMathHmwk$Group.1,schMathHmwk$x,type='p',pch=19,col="blue") ############################################################################################################# ###### Alternative way to get value from posterior predictive distribution: discrepancies ################### ## ## dataList <- list(y=nels2$math, x=nels2$homework, N=length(nels2$math), sdY = sd(nels2$math) ) ModelLm1x = "model { ## Likelihood for (i in 1:N){ y[i] ~ dnorm(mu[i] , precision) mu[i] <- b0 + b1*x[i] res[i] <- y[i] - mu[i] # Data residuals emp.new[i] ~ dnorm(mu[i],precision) # New Predicted y res.new[i] <- emp.new[i] - mu[i] # New Predicted residuals } ## Prior b0 ~ dnorm(0 , 1/(100*sdY^2) ) b1 ~ dnorm(0 , 1/(100*sdY^2) ) sigma ~ dunif( 1E-3, 1E+30 ) precision <- 1/sigma^2 ## Derived parameters fit <- sum(res[]) # sum data residuals this iterations fit.new <- sum(res.new[]) # sum predicted residuals fit.mean <- mean(emp.new[]) # mean of posterior predictions } " writeLines(ModelLm1x, con="ModelLm1x.txt") b0Init = mean(nels2$math) b1Init = 0 sigmaInit = sd(nels2$math) initsList = list(b0=b0Init, b1=b1Init, sigma=sigmaInit ) jagsModelLm1x <- jags.model(file="ModelLm1x.txt", data=dataList, inits=initsList, n.chains=4, n.adapt=1000) update (jagsModelLm1x, n.iter=1000) # gets samples from all chains for 2000 iterations Samples1x <- coda.samples(jagsModelLm1x, variable.names=c("b0","b1","sigma","fit","fit.new","fit.mean","emp.new"), thin.iter=2000) # output summary information summary(Samples1x) plot(Samples1x) ## Posterior predictive check of residuals ## Get the betas in a matrix head(Samples1x) ffit <- as.array(Samples1x) # change from mcmc to array object b <- rbind(ffit[,,1],ffit[,,2],ffit[, , 3], ffit[, , 4]) # one from each chain # put values into vectors or array: b0 <- b[,1] b1 <- b[,2] pred <- b[,3:68] fit <- b[,69] fit.mean <- b[,70] fit.new <- b[,71] sigma <- b[,72] mean(nels2$math) mean(fit.mean) # Bayesian pvalue for data vs predictions: bpval <- mean(nels2$math < pred) bpval