# Utility programs for use with the book, # Kruschke, J. K. (2015). Doing Bayesian Data Analysis, Second Edition: # A Tutorial with R, JAGS, and Stan. Academic Press / Elsevier. # This file contains several functions that are called by other programs # or can be called directly by the user. To load all the functions into # R's working memory, at R's command line type: # source("DBDA2E-utilities.R") #------------------------------------------------------------------------------ bookInfo = "Kruschke, J. K. (2015). Doing Bayesian Data Analysis, Second Edition:\nA Tutorial with R, JAGS, and Stan. Academic Press / Elsevier." bannerBreak = "\n*********************************************************************\n" cat(paste0(bannerBreak,bookInfo,bannerBreak,"\n")) #------------------------------------------------------------------------------ # Check that required packages are installed: want = c("parallel","rjags","runjags","compute.es") have = want %in% rownames(installed.packages()) if ( any(!have) ) { install.packages( want[!have] ) } # Load rjags. Assumes JAGS is already installed. try( library(rjags) ) # Load runjags. Assumes JAGS is already installed. try( library(runjags) ) try( runjags.options( inits.warning=FALSE , rng.warning=FALSE ) ) # set default number of chains and parallelness for MCMC: library(parallel) # for detectCores(). nCores = detectCores() if ( !is.finite(nCores) ) { nCores = 1 } if ( nCores > 4 ) { nChainsDefault = 4 # because JAGS has only 4 rng's. runjagsMethodDefault = "parallel" } if ( nCores == 4 ) { nChainsDefault = 3 # save 1 core for other processes. runjagsMethodDefault = "parallel" } if ( nCores < 4 ) { nChainsDefault = 3 runjagsMethodDefault = "rjags" # NOT parallel } #------------------------------------------------------------------------------ # Functions for opening and saving graphics that operate the same for # Windows and Macintosh and Linux operating systems. At least, that's the hope! openGraph = function( width=7 , height=7 , mag=1.0 , ... ) { if ( .Platform$OS.type != "windows" ) { # Mac OS, Linux tryInfo = try( X11( width=width*mag , height=height*mag , type="cairo" , ... ) ) if ( class(tryInfo)=="try-error" ) { lineInput = readline("WARNING: Previous graphics windows will be closed because of too many open windows.\nTO CONTINUE, PRESS IN R CONSOLE.\n") graphics.off() X11( width=width*mag , height=height*mag , type="cairo" , ... ) } } else { # Windows OS tryInfo = try( windows( width=width*mag , height=height*mag , ... ) ) if ( class(tryInfo)=="try-error" ) { lineInput = readline("WARNING: Previous graphics windows will be closed because of too many open windows.\nTO CONTINUE, PRESS IN R CONSOLE.\n") graphics.off() windows( width=width*mag , height=height*mag , ... ) } } } saveGraph = function( file="saveGraphOutput" , type="pdf" , ... ) { if ( .Platform$OS.type != "windows" ) { # Mac OS, Linux if ( any( type == c("png","jpeg","jpg","tiff","bmp")) ) { sptype = type if ( type == "jpg" ) { sptype = "jpeg" } savePlot( file=paste0(file,".",type) , type=sptype , ... ) } if ( type == "pdf" ) { dev.copy2pdf(file=paste0(file,".",type) , ... ) } if ( type == "eps" ) { dev.copy2eps(file=paste0(file,".",type) , ... ) } } else { # Windows OS file=paste0(file,".",type) savePlot( file=file , type=type , ... ) } } #------------------------------------------------------------------------------ # Functions for computing limits of HDI's: HDIofMCMC = function( sampleVec , credMass=0.95 ) { # Computes highest density interval from a sample of representative values, # estimated as shortest credible interval. # Arguments: # sampleVec # is a vector of representative values from a probability distribution. # credMass # is a scalar between 0 and 1, indicating the mass within the credible # interval that is to be estimated. # Value: # HDIlim is a vector containing the limits of the HDI sortedPts = sort( sampleVec ) ciIdxInc = ceiling( credMass * length( sortedPts ) ) nCIs = length( sortedPts ) - ciIdxInc ciWidth = rep( 0 , nCIs ) for ( i in 1:nCIs ) { ciWidth[ i ] = sortedPts[ i + ciIdxInc ] - sortedPts[ i ] } HDImin = sortedPts[ which.min( ciWidth ) ] HDImax = sortedPts[ which.min( ciWidth ) + ciIdxInc ] HDIlim = c( HDImin , HDImax ) return( HDIlim ) } HDIofICDF = function( ICDFname , credMass=0.95 , tol=1e-8 , ... ) { # Arguments: # ICDFname is R's name for the inverse cumulative density function # of the distribution. # credMass is the desired mass of the HDI region. # tol is passed to R's optimize function. # Return value: # Highest density iterval (HDI) limits in a vector. # Example of use: For determining HDI of a beta(30,12) distribution, type # HDIofICDF( qbeta , shape1 = 30 , shape2 = 12 ) # Notice that the parameters of the ICDFname must be explicitly named; # e.g., HDIofICDF( qbeta , 30 , 12 ) does not work. # Adapted and corrected from Greg Snow's TeachingDemos package. incredMass = 1.0 - credMass intervalWidth = function( lowTailPr , ICDFname , credMass , ... ) { ICDFname( credMass + lowTailPr , ... ) - ICDFname( lowTailPr , ... ) } optInfo = optimize( intervalWidth , c( 0 , incredMass ) , ICDFname=ICDFname , credMass=credMass , tol=tol , ... ) HDIlowTailPr = optInfo$minimum return( c( ICDFname( HDIlowTailPr , ... ) , ICDFname( credMass + HDIlowTailPr , ... ) ) ) } HDIofGrid = function( probMassVec , credMass=0.95 ) { # Arguments: # probMassVec is a vector of probability masses at each grid point. # credMass is the desired mass of the HDI region. # Return value: # A list with components: # indices is a vector of indices that are in the HDI # mass is the total mass of the included indices # height is the smallest component probability mass in the HDI # Example of use: For determining HDI of a beta(30,12) distribution # approximated on a grid: # > probDensityVec = dbeta( seq(0,1,length=201) , 30 , 12 ) # > probMassVec = probDensityVec / sum( probDensityVec ) # > HDIinfo = HDIofGrid( probMassVec ) # > show( HDIinfo ) sortedProbMass = sort( probMassVec , decreasing=TRUE ) HDIheightIdx = min( which( cumsum( sortedProbMass ) >= credMass ) ) HDIheight = sortedProbMass[ HDIheightIdx ] HDImass = sum( probMassVec[ probMassVec >= HDIheight ] ) return( list( indices = which( probMassVec >= HDIheight ) , mass = HDImass , height = HDIheight ) ) } #------------------------------------------------------------------------------ # Function(s) for plotting properties of mcmc coda objects. DbdaAcfPlot = function( codaObject , parName=varnames(codaObject)[1] , plColors=NULL ) { if ( all( parName != varnames(codaObject) ) ) { stop("parName must be a column name of coda object") } nChain = length(codaObject) if ( is.null(plColors) ) plColors=1:nChain xMat = NULL yMat = NULL for ( cIdx in 1:nChain ) { acfInfo = acf(codaObject[,c(parName)][[cIdx]],plot=FALSE) xMat = cbind(xMat,acfInfo$lag) yMat = cbind(yMat,acfInfo$acf) } matplot( xMat , yMat , type="o" , pch=20 , col=plColors , ylim=c(0,1) , main="" , xlab="Lag" , ylab="Autocorrelation" ) abline(h=0,lty="dashed") EffChnLngth = effectiveSize(codaObject[,c(parName)]) text( x=max(xMat) , y=max(yMat) , adj=c(1.0,1.0) , cex=1.25 , labels=paste("ESS =",round(EffChnLngth,1)) ) } DbdaDensPlot = function( codaObject , parName=varnames(codaObject)[1] , plColors=NULL ) { if ( all( parName != varnames(codaObject) ) ) { stop("parName must be a column name of coda object") } nChain = length(codaObject) # or nchain(codaObject) if ( is.null(plColors) ) plColors=1:nChain xMat = NULL yMat = NULL hdiLims = NULL for ( cIdx in 1:nChain ) { densInfo = density(codaObject[,c(parName)][[cIdx]]) xMat = cbind(xMat,densInfo$x) yMat = cbind(yMat,densInfo$y) hdiLims = cbind(hdiLims,HDIofMCMC(codaObject[,c(parName)][[cIdx]])) } matplot( xMat , yMat , type="l" , col=plColors , main="" , xlab="Param. Value" , ylab="Density" ) abline(h=0) points( hdiLims[1,] , rep(0,nChain) , col=plColors , pch="|" ) points( hdiLims[2,] , rep(0,nChain) , col=plColors , pch="|" ) text( mean(hdiLims) , 0 , "95% HDI" , adj=c(0.5,-0.2) ) EffChnLngth = effectiveSize(codaObject[,c(parName)]) MCSE = sd(as.matrix(codaObject[,c(parName)]))/sqrt(EffChnLngth) text( max(xMat) , max(yMat) , adj=c(1.0,1.0) , cex=1.25 , paste("MCSE =\n",signif(MCSE,3)) ) } diagMCMC = function( codaObject , parName=varnames(codaObject)[1] , saveName=NULL , saveType="jpg" ) { DBDAplColors = c("skyblue","black","royalblue","steelblue") openGraph(height=5,width=7) par( mar=0.5+c(3,4,1,0) , oma=0.1+c(0,0,2,0) , mgp=c(2.25,0.7,0) , cex.lab=1.5 ) layout(matrix(1:4,nrow=2)) # traceplot and gelman.plot are from CODA package: require(coda) coda::traceplot( codaObject[,c(parName)] , main="" , ylab="Param. Value" , col=DBDAplColors ) tryVal = try( coda::gelman.plot( codaObject[,c(parName)] , main="" , auto.layout=FALSE , col=DBDAplColors ) ) # if it runs, gelman.plot returns a list with finite shrink values: if ( class(tryVal)=="try-error" ) { plot.new() print(paste0("Warning: coda::gelman.plot fails for ",parName)) } else { if ( class(tryVal)=="list" & !is.finite(tryVal$shrink[1]) ) { plot.new() print(paste0("Warning: coda::gelman.plot fails for ",parName)) } } DbdaAcfPlot(codaObject,parName,plColors=DBDAplColors) DbdaDensPlot(codaObject,parName,plColors=DBDAplColors) mtext( text=parName , outer=TRUE , adj=c(0.5,0.5) , cex=2.0 ) if ( !is.null(saveName) ) { saveGraph( file=paste0(saveName,"Diag",parName), type=saveType) } } diagStanFit = function( stanFit , parName , saveName=NULL , saveType="jpg" ) { codaFit = mcmc.list( lapply( 1:ncol(stanFit) , function(x) { mcmc(as.array(stanFit)[,x,]) } ) ) DBDAplColors = c("skyblue","black","royalblue","steelblue") openGraph(height=5,width=7) par( mar=0.5+c(3,4,1,0) , oma=0.1+c(0,0,2,0) , mgp=c(2.25,0.7,0) , cex.lab=1.5 ) layout(matrix(1:4,nrow=2)) # traceplot is from rstan package require(rstan) traceplot(stanFit,pars=parName,nrow=1,ncol=1)#,main="",ylab="Param. Value",col=DBDAplColors) # gelman.plot are from CODA package: require(coda) tryVal = try( coda::gelman.plot( codaObject[,c(parName)] , main="" , auto.layout=FALSE , col=DBDAplColors ) ) # if it runs, gelman.plot returns a list with finite shrink values: if ( class(tryVal)=="try-error" ) { plot.new() print(paste0("Warning: coda::gelman.plot fails for ",parName)) } else { if ( class(tryVal)=="list" & !is.finite(tryVal$shrink[1]) ) { plot.new() print(paste0("Warning: coda::gelman.plot fails for ",parName)) } } DbdaAcfPlot(codaFit,parName,plColors=DBDAplColors) DbdaDensPlot(codaFit,parName,plColors=DBDAplColors) mtext( text=parName , outer=TRUE , adj=c(0.5,0.5) , cex=2.0 ) if ( !is.null(saveName) ) { saveGraph( file=paste0(saveName,"Diag",parName), type=saveType) } } #------------------------------------------------------------------------------ # Functions for summarizing and plotting distribution of a large sample; # typically applied to MCMC posterior. normalize = function( v ){ return( v / sum(v) ) } require(coda) # loaded by rjags, but redundancy doesn't hurt summarizePost = function( paramSampleVec , compVal=NULL , ROPE=NULL , credMass=0.95 ) { meanParam = mean( paramSampleVec ) medianParam = median( paramSampleVec ) dres = density( paramSampleVec ) modeParam = dres$x[which.max(dres$y)] mcmcEffSz = round( effectiveSize( paramSampleVec ) , 1 ) names(mcmcEffSz) = NULL hdiLim = HDIofMCMC( paramSampleVec , credMass=credMass ) if ( !is.null(compVal) ) { pcgtCompVal = ( 100 * sum( paramSampleVec > compVal ) / length( paramSampleVec ) ) } else { compVal=NA pcgtCompVal=NA } if ( !is.null(ROPE) ) { pcltRope = ( 100 * sum( paramSampleVec < ROPE[1] ) / length( paramSampleVec ) ) pcgtRope = ( 100 * sum( paramSampleVec > ROPE[2] ) / length( paramSampleVec ) ) pcinRope = 100-(pcltRope+pcgtRope) } else { ROPE = c(NA,NA) pcltRope=NA pcgtRope=NA pcinRope=NA } return( c( Mean=meanParam , Median=medianParam , Mode=modeParam , ESS=mcmcEffSz , HDImass=credMass , HDIlow=hdiLim[1] , HDIhigh=hdiLim[2] , CompVal=compVal , PcntGtCompVal=pcgtCompVal , ROPElow=ROPE[1] , ROPEhigh=ROPE[2] , PcntLtROPE=pcltRope , PcntInROPE=pcinRope , PcntGtROPE=pcgtRope ) ) } plotPost = function( paramSampleVec , cenTend=c("mode","median","mean")[1] , compVal=NULL, ROPE=NULL, credMass=0.95, HDItextPlace=0.7, xlab=NULL , xlim=NULL , yaxt=NULL , ylab=NULL , main=NULL , cex=NULL , cex.lab=NULL , col=NULL , border=NULL , showCurve=FALSE , breaks=NULL , ... ) { # Override defaults of hist function, if not specified by user: # (additional arguments "..." are passed to the hist function) if ( is.null(xlab) ) xlab="Param. Val." if ( is.null(cex.lab) ) cex.lab=1.5 if ( is.null(cex) ) cex=1.4 if ( is.null(xlim) ) xlim=range( c( compVal , ROPE , paramSampleVec ) ) if ( is.null(main) ) main="" if ( is.null(yaxt) ) yaxt="n" if ( is.null(ylab) ) ylab="" if ( is.null(col) ) col="skyblue" if ( is.null(border) ) border="white" # convert coda object to matrix: if ( class(paramSampleVec) == "mcmc.list" ) { paramSampleVec = as.matrix(paramSampleVec) } summaryColNames = c("ESS","mean","median","mode", "hdiMass","hdiLow","hdiHigh", "compVal","pGtCompVal", "ROPElow","ROPEhigh","pLtROPE","pInROPE","pGtROPE") postSummary = matrix( NA , nrow=1 , ncol=length(summaryColNames) , dimnames=list( c( xlab ) , summaryColNames ) ) # require(coda) # for effectiveSize function postSummary[,"ESS"] = effectiveSize(paramSampleVec) postSummary[,"mean"] = mean(paramSampleVec) postSummary[,"median"] = median(paramSampleVec) mcmcDensity = density(paramSampleVec) postSummary[,"mode"] = mcmcDensity$x[which.max(mcmcDensity$y)] HDI = HDIofMCMC( paramSampleVec , credMass ) postSummary[,"hdiMass"]=credMass postSummary[,"hdiLow"]=HDI[1] postSummary[,"hdiHigh"]=HDI[2] # Plot histogram. cvCol = "darkgreen" ropeCol = "darkred" if ( is.null(breaks) ) { if ( max(paramSampleVec) > min(paramSampleVec) ) { breaks = c( seq( from=min(paramSampleVec) , to=max(paramSampleVec) , by=(HDI[2]-HDI[1])/18 ) , max(paramSampleVec) ) } else { breaks=c(min(paramSampleVec)-1.0E-6,max(paramSampleVec)+1.0E-6) border="skyblue" } } if ( !showCurve ) { par(xpd=NA) histinfo = hist( paramSampleVec , xlab=xlab , yaxt=yaxt , ylab=ylab , freq=F , border=border , col=col , xlim=xlim , main=main , cex=cex , cex.lab=cex.lab , breaks=breaks , ... ) } if ( showCurve ) { par(xpd=NA) histinfo = hist( paramSampleVec , plot=F ) densCurve = density( paramSampleVec , adjust=2 ) plot( densCurve$x , densCurve$y , type="l" , lwd=5 , col=col , bty="n" , xlim=xlim , xlab=xlab , yaxt=yaxt , ylab=ylab , main=main , cex=cex , cex.lab=cex.lab , ... ) } cenTendHt = 0.9*max(histinfo$density) cvHt = 0.7*max(histinfo$density) ROPEtextHt = 0.55*max(histinfo$density) # Display central tendency: mn = mean(paramSampleVec) med = median(paramSampleVec) mcmcDensity = density(paramSampleVec) mo = mcmcDensity$x[which.max(mcmcDensity$y)] if ( cenTend=="mode" ){ text( mo , cenTendHt , bquote(mode==.(signif(mo,3))) , adj=c(.5,0) , cex=cex ) } if ( cenTend=="median" ){ text( med , cenTendHt , bquote(median==.(signif(med,3))) , adj=c(.5,0) , cex=cex , col=cvCol ) } if ( cenTend=="mean" ){ text( mn , cenTendHt , bquote(mean==.(signif(mn,3))) , adj=c(.5,0) , cex=cex ) } # Display the comparison value. if ( !is.null( compVal ) ) { pGtCompVal = sum( paramSampleVec > compVal ) / length( paramSampleVec ) pLtCompVal = 1 - pGtCompVal lines( c(compVal,compVal) , c(0.96*cvHt,0) , lty="dotted" , lwd=2 , col=cvCol ) text( compVal , cvHt , bquote( .(round(100*pLtCompVal,1)) * "% < " * .(signif(compVal,3)) * " < " * .(round(100*pGtCompVal,1)) * "%" ) , adj=c(pLtCompVal,0) , cex=0.8*cex , col=cvCol ) postSummary[,"compVal"] = compVal postSummary[,"pGtCompVal"] = pGtCompVal } # Display the ROPE. if ( !is.null( ROPE ) ) { pInROPE = ( sum( paramSampleVec > ROPE[1] & paramSampleVec < ROPE[2] ) / length( paramSampleVec ) ) pGtROPE = ( sum( paramSampleVec >= ROPE[2] ) / length( paramSampleVec ) ) pLtROPE = ( sum( paramSampleVec <= ROPE[1] ) / length( paramSampleVec ) ) lines( c(ROPE[1],ROPE[1]) , c(0.96*ROPEtextHt,0) , lty="dotted" , lwd=2 , col=ropeCol ) lines( c(ROPE[2],ROPE[2]) , c(0.96*ROPEtextHt,0) , lty="dotted" , lwd=2 , col=ropeCol) text( mean(ROPE) , ROPEtextHt , bquote( .(round(100*pLtROPE,1)) * "% < " * .(ROPE[1]) * " < " * .(round(100*pInROPE,1)) * "% < " * .(ROPE[2]) * " < " * .(round(100*pGtROPE,1)) * "%" ) , adj=c(pLtROPE+.5*pInROPE,0) , cex=1 , col=ropeCol ) postSummary[,"ROPElow"]=ROPE[1] postSummary[,"ROPEhigh"]=ROPE[2] postSummary[,"pLtROPE"]=pLtROPE postSummary[,"pInROPE"]=pInROPE postSummary[,"pGtROPE"]=pGtROPE } # Display the HDI. lines( HDI , c(0,0) , lwd=4 , lend=1 ) text( mean(HDI) , 0 , bquote(.(100*credMass) * "% HDI" ) , adj=c(.5,-1.7) , cex=cex ) text( HDI[1] , 0 , bquote(.(signif(HDI[1],3))) , adj=c(HDItextPlace,-0.5) , cex=cex ) text( HDI[2] , 0 , bquote(.(signif(HDI[2],3))) , adj=c(1.0-HDItextPlace,-0.5) , cex=cex ) par(xpd=F) # return( postSummary ) } #------------------------------------------------------------------------------ # Shape parameters from central tendency and scale: betaABfromMeanKappa = function( mean , kappa ) { if ( mean <=0 | mean >= 1) stop("must have 0 < mean < 1") if ( kappa <=0 ) stop("kappa must be > 0") a = mean * kappa b = ( 1.0 - mean ) * kappa return( list( a=a , b=b ) ) } betaABfromModeKappa = function( mode , kappa ) { if ( mode <=0 | mode >= 1) stop("must have 0 < mode < 1") if ( kappa <=2 ) stop("kappa must be > 2 for mode parameterization") a = mode * ( kappa - 2 ) + 1 b = ( 1.0 - mode ) * ( kappa - 2 ) + 1 return( list( a=a , b=b ) ) } betaABfromMeanSD = function( mean , sd ) { if ( mean <=0 | mean >= 1) stop("must have 0 < mean < 1") if ( sd <= 0 ) stop("sd must be > 0") kappa = mean*(1-mean)/sd^2 - 1 if ( kappa <= 0 ) stop("invalid combination of mean and sd") a = mean * kappa b = ( 1.0 - mean ) * kappa return( list( a=a , b=b ) ) } gammaShRaFromMeanSD = function( mean , sd ) { if ( mean <=0 ) stop("mean must be > 0") if ( sd <=0 ) stop("sd must be > 0") shape = mean^2/sd^2 rate = mean/sd^2 return( list( shape=shape , rate=rate ) ) } gammaShRaFromModeSD = function( mode , sd ) { if ( mode <=0 ) stop("mode must be > 0") if ( sd <=0 ) stop("sd must be > 0") rate = ( mode + sqrt( mode^2 + 4 * sd^2 ) ) / ( 2 * sd^2 ) shape = 1 + mode * rate return( list( shape=shape , rate=rate ) ) } #------------------------------------------------------------------------------ # Make some data files for examples... createDataFiles=FALSE if ( createDataFiles ) { source("HtWtDataGenerator.R") N=300 m = HtWtDataGenerator( N , rndsd=47405 ) write.csv( file=paste0("HtWtData",N,".csv") , row.names=FALSE , m ) # Function for generating normal data with normal outliers: genYwithOut = function( N , pcntOut=15 , sdOut=3.0 ) { inl = rnorm( N-ceiling(pcntOut/100*N) ) out = rnorm( ceiling(pcntOut/100*N) ) inl = (inl-mean(inl))/sd(inl) out = (out-mean(out))/sd(out) * sdOut return(c(inl,out)) } # Two-group IQ scores with outliers set.seed(47405) y1 = round(pmax(50,genYwithOut(63,20,3.5)*17.5+106)) y2 = round(pmax(50,genYwithOut(57,20,3.5)*10+100)) write.csv( file="TwoGroupIQ.csv" , row.names=FALSE , data.frame( Score=c(y1,y2) , Group=c(rep("Smart Drug",length(y1)), rep("Placebo",length(y2))) ) ) # One-group log-normal set.seed(47405) z = rnorm(123) logY = (z-mean(z))/sd(z) * 0.5 + 5.5 # logY has mean 5.5 and sd 0.5 y = round( exp(logY) , 2 ) write.csv( file="OneGroupLogNormal.csv" , row.names=FALSE , cbind(y) ) # One-group gamma desiredMode = 250 desiredSD = 100 desiredRate = (desiredMode+sqrt(desiredMode^2+4*desiredSD^2))/(2*desiredSD^2) desiredShape = 1+desiredMode*desiredRate set.seed(47405) y = round( rgamma( 153 , shape=desiredShape , rate=desiredRate ) , 2 ) write.csv( file="OneGroupGamma.csv" , row.names=FALSE , cbind(y) ) } # end if createDataFiles