############################################################### # R code to analyze repeat count data using # Bayesian N-mixture models in JAGS # # Written by Bill Peterman # August 2013 ############################################################## # This analysis uses JAGS, but the code can very easily be adapted to use # WinBUGS or OpenBUGS. # ****** Before starting, download and install JAGS (http://sourceforge.net/projects/mcmc-jags/files/) ****** # Run code below to check for, and install necessary R packages for the analysis libs=c("R2jags","reshape2") # Packages used in this code type=getOption("pkgType") CheckInstallPackage <- function(packages, repos="http://cran.r-project.org", depend=c("Depends", "Imports", "LinkingTo", "Suggests", "Enhances"), ...) { installed=as.data.frame(installed.packages()) for(p in packages) { if(is.na(charmatch(p, installed[,1]))) { install.packages(p, repos=repos, dependencies=depend, ...) } } } CheckInstallPackage(packages=libs) # Load packages require(reshape2) require(R2jags) # Set working directory to folder where your data file is stored setwd("C:/Rdata/") # Import data data (Randomly subsampled data, with replacement, from A. annulatum are provided) data <- read.csv("AnCon.dat.csv") names(data) ########################## # Data fields # pond.name = sequential pond name # id = a unique ID for each row. Note that because the data provided is sampled with replacement, # some ponds are included multiple times, although each has been given a unique ID # t1-t6 = temperature in Celsius during survey # d1-d6 = Julian day of survey # m1-m6 = Method used to capture larvae (0=Dipnet, 1=Minnow trap) # s1-s6 = The total number of salamander larvae captured on a survey day # fish = presence of fish # hydro = 4 category hydrology measure # ctail = Percentage of pond covered by cattails # open = Percentage of pond without vegetation or structure # pred = Number of invertebrate predator species captured during surveys of ponds # f.300 = Amount of forest (square meters) within a 300m radius of the focal pond # f.1km = Amount of forest (square meters) within a 1km radius of the focal pond # p.300 = Number of ponds within a 300m radius of the focal pond # d.forest = Distance (meters) to forest # cnpy = Percent canopy coverage over the pond # slope = slope od the pond basin # effort = number of traps/dipnet sweeps used during survey # area = surface area (square meters) of pond # age = estimated age (years) of each pond ########################## # Sort data by numeric ID data <- data[order(data$id),] # Extract variables for analysis # Counts at ponds vars <- paste("s", 1:6, sep="") y.data <- data[vars] id<- data[,2] y.data <- cbind(id, y.data) # Attach ID for later merging # Method of sampling vars <- paste("m", 1:6, sep="") method2 <- data[vars] id<- data[,2] method <- cbind(id, method2) # Temp data vars <- paste("t", 1:6, sep="") temp <- data[vars] id<- data[,2] temp <- cbind(id, temp) # Survey date vars <- paste("d", 1:6, sep="") date <- data[vars] id<- data[,2] date <- cbind(id, date) age <- data["age"] slope <- data["slope"] # offset to account for variable sampling effort at each site offset <- as.matrix(data["effort"]);colnames(offset) <- NULL; offset<-as.numeric(offset) fish <- as.matrix(data["fish"]); colnames(fish) <- NULL; fish<-as.numeric(fish) hydro <- as.matrix(data["hydro"]); colnames(hydro) <- NULL; hydro<-as.numeric(hydro) ctail <- data["ctail"] open <- data["open"] pred <- data["pred"] f.300 <- data["f.300"] f.1km <- data["f.1km"] p.300 <- data["p.300"] frst.dist <- data["d.frst"] cnpy <- data["cnpy"] #Impute means for missing values ctail[is.na(ctail)] <- colMeans(ctail, na.rm=TRUE) open[is.na(open)] <- colMeans(open, na.rm=TRUE) cnpy[is.na(cnpy)] <- colMeans(cnpy, na.rm=TRUE) age[is.na(age)] <- colMeans(age, na.rm=TRUE) slope[is.na(slope)] <- colMeans(slope, na.rm=TRUE) pred[is.na(pred)] <- colMeans(pred, na.rm=TRUE) # Scale and center each covariate ctail <- scale(ctail,center=T,scale=T) open <- scale(open,center=T,scale=T) cnpy <- scale(cnpy,center=T,scale=T) age <- scale(age,center=T,scale=T) slope <- scale(slope,center=T,scale=T) pred <- scale(pred,center=T,scale=T) frst.300 <- scale(f.300,center=T,scale=T) frst.1km <- scale(f.1km,center=T,scale=T) ponds.300 <- scale(p.300,center=T,scale=T) d.forest <- scale(frst.dist,center=T,scale=T) # Make quadratic term for canopy coverage cnpy2 <- cnpy[,1]^2 # Convert to long data format # This is necessary because we have unequal sampling among ponds. # Using the long format and nested indexing accomodates this y.long <- melt(data=y.data,na.rm=TRUE,value.name="count",id.vars=("id")) y.long <- y.long[order(y.long$id),] C <- as.numeric(y.long[,3]) # Count data in long format # This sample vector will index our for loop during the analysis sample <- as.numeric(y.long[,1]) m.long <- melt(data=method,na.rm=TRUE,value.name="method",id.vars=("id")) m.long <- m.long[order(m.long$id),] m <- as.numeric(m.long[,3]) d.long <- melt(data=date,na.rm=TRUE,value.name="date",id.vars=("id")) d.long <- d.long[order(d.long$id),] dl <- d.long[,3] date <- scale(dl,center=T,scale=T) # Scale and center now that all values are in single vector t.long <- melt(data=temp,na.rm=TRUE,value.name="temp",id.vars=("id")) t.long <- t.long[order(t.long$id),] temp <- t.long[,3] temp <- scale(temp,center=T,scale=T) # Scale and center now that all values are in single vector # Cattail is used as a detection covariate, so it needs values to correspond with each survey ctail.p <- cbind(id, ctail) ctail.p <- merge(y.long, ctail.p,by="id") ctail.p <- as.numeric(ctail.p[,4]) # Slope is used as a detection covariate, so it needs values to correspond with each survey slope.p <- cbind(id,slope) slope.p <- merge(y.long, slope.p,by="id") slope.p <- as.numeric(slope.p[,4]) # Create a cattail x method interaction ctail.method <- ctail.p*m pond <- as.numeric(data[,2]) # Unique ID for each pond n.ponds <- length(pond) # Total number of ponds in the data set n.samples <- length(sample) # Total number of sample periods in the data set # Explore data Y.max <- apply(y.data[,2:7], 1, function(x) max(x, na.rm = TRUE)) hist(Y.max) boxplot(Y.max) ####################################################### # Parameterize Bayesian model # Number of Abundance covariates ab.covs <- 10 # Number of detection covariates det.covs <- 6 # Bundle data win.data <- list( C = C, sample = sample, n.ponds = n.ponds, n.samples = n.samples, fish = fish, open = open[,1], cnpy = cnpy[,1], cnpy2 = cnpy2, frst.300 = frst.300[,1], ponds.300 = ponds.300[,1], pred = pred[,1], date = date[,1], ctail.p = ctail.p, method = m, temp = temp[,1], ab.covs = ab.covs, det.covs = det.covs, slope = slope[,1], slope.p = slope.p, age = age[,1], off = log(offset)) # Function to set initial starting values Nst <- apply(y.data[,2:7], 1, max, na.rm=TRUE) + 1 inits <- function(){list( N = Nst, alpha.lam=runif(ab.covs,-1,1), beta.p=runif(det.covs,-1,1))} # Parameters monitored params <- c("alpha.lam", "beta.p", "fit", "fit.new", "N", "mean.detection", "mean.N", "sd.p", "sd.lam") # Model parameterization largely follows Kery and Schaub (2012) sink("JAGS_model.txt") cat(" model{ # Priors # Abundance priors for (i in 1:ab.covs){ alpha.lam[i] ~ dnorm(0, 0.0001) } # Detection priors for (i in 1: det.covs){ beta.p[i] ~ dnorm(0, 0.0001) } # Random effects for abundance and detection during each survey for(i in 1:n.ponds){ eps[i] ~ dnorm(0, tau.lam) # Noise/uncertainty in abundance } tau.lam <- 1/ (sd.lam * sd.lam) sd.lam ~ dunif(0,5) tau.p ~ dgamma(0.01, 0.01) sd.p <- 1 / (tau.p) ########### # ABUNDANCE MODEL # Likelihood for(i in 1:n.ponds) { # Loop over ponds N[i] ~ dpois(lambda[i]) # This is a Poisson-log-normal distribution with the 'eps' random error term log(lambda[i]) <- off[i] + alpha.lam[1] + alpha.lam[2]*fish[i]+ alpha.lam[3]*open[i] + alpha.lam[4]*cnpy[i] + alpha.lam[5]*cnpy2[i] + alpha.lam[6]*pred[i]+ alpha.lam[7]*ponds.300[i] + alpha.lam[8]*frst.300[i] + alpha.lam[9]*age[i] + alpha.lam[10]*slope[i] + eps[i] } # DETECTION MODEL for(i in 1:n.samples){ # Loop over observations C[i] ~ dbin(p[i], N[sample[i]]) p[i] <- exp(lp.lim[i])/(1 + exp(lp.lim[i])) lp.lim[i] <- min(999, max(-999, lp[i])) # Help stabilize the logit lp[i]~dnorm(FIT[i], tau.p) # Random noise is detection modeled here FIT[i] <- beta.p[1] + beta.p[2]*date[sample[i]] + beta.p[3]*temp[sample[i]] + beta.p[4]*method[sample[i]] + beta.p[5]*ctail.p[i] + beta.p[6]*slope.p[sample[i]] # Assess model fit using Chi-squared discrepancy # Compute fit statistic for observed data eval[i]<-p[i] * N[sample[i]] E[i] <- pow((C[i] - eval[i]),2) / (eval[i] + 0.5) # Generate replicate data and compute fit stats for them C.new[i] ~ dbin(p[i], N[sample[i]]) E.new[i] <- pow((C.new[i] - eval[i]),2) / (eval[i] + 0.5) } # Derived quantities totalN<- sum(N[]) mean.detection <- mean(p[]) mean.N <- mean(N[]) fit <- sum(E[]) fit.new <- sum(E.new[]) } ", fill = TRUE) sink() ########################################################## ########################################################## saved.per.chain <- 100 nc <- 3 # Number of chains nb <- 100 # Iteration discarded as burn-in nt <- 10 # Rate to thin samples of MCMC chains (ni <- saved.per.chain*nt + nb) # Settings used in original analysis: saved=3000; nb=250000; nt=250 #JAGS load.module("glm") system.time(out.test <- jags(win.data, inits, params, "JAGS_model.txt", n.chains = nc, n.thin = nt, n.iter = ni, n.burnin = nb)) # Summarize posteriors (JAGS) options(scipen=3) print(out.test$BUGSoutput$summary[,c(1,2,3,7,8,9)], dig = 3); OUT.TABLE<- out.test$BUGSoutput$summary[,c(1,2,3,7,8,9)] write.table(as.data.frame(OUT.TABLE), file="out.test", sep="\t",row.names=T, col.names=T, quote=FALSE, append=FALSE) mean(out.test$BUGSoutput$sims.list$fit.new > out.test$BUGSoutput$sims.list$fit) mean(out.test$BUGSoutput$mean$fit) / mean(out.test$BUGSoutput$mean$fit.new) sum(out.test$BUGSoutput$mean$N>=1) ## Number Occupied Sites based on analysis sum(apply(y.data[,2:7], 1, sum, na.rm=TRUE) > 0) # Apparent distribution among visited sites # Plot posteriors par(mfrow = c(3, 2)) hist(out.test$BUGSoutput$sims.list$beta.p[,2], col = "gray", main = "beta 2", xlab = "date", las = 1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["beta.p[2]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["beta.p[2]",4], lwd = 2, col = "blue") hist(out.test$BUGSoutput$sims.list$beta.p[,3], col = "gray", main = "beta 3", xlab = "temp", las=1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["beta.p[3]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["beta.p[3]",4], lwd = 2, col = "blue") hist(out.test$BUGSoutput$sims.list$beta.p[,4], col = "gray", main = "beta 4", xlab = "method", las = 1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["beta.p[4]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["beta.p[4]",4], lwd = 2, col = "blue") hist(out.test$BUGSoutput$sims.list$beta.p[,5], col = "gray", main = "beta 5", xlab = "cattail", las = 1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["beta.p[5]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["beta.p[5]",4], lwd = 2, col = "blue") hist(out.test$BUGSoutput$sims.list$beta.p[,6], col = "gray", main = "beta 7", xlab = "slope", las = 1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["beta.p[6]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["beta.p[6]",4], lwd = 2, col = "blue") par(mfrow = c(2, 2)) hist(out.test$BUGSoutput$sims.list$alpha.lam[,2], col = "gray", main = "alpha 2", xlab = "fish", las = 1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[2]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[2]",4], lwd = 2, col = "blue") hist(out.test$BUGSoutput$sims.list$alpha.lam[,3], col = "gray", main = "alpha 3", xlab = "open", las=1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[3]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[3]",4], lwd = 2, col = "blue") hist(out.test$BUGSoutput$sims.list$alpha.lam[,4], col = "gray", main = "alpha 4", xlab = "canopy", las = 1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[4]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[4]",4], lwd = 2, col = "blue") hist(out.test$BUGSoutput$sims.list$alpha.lam[,5], col = "gray", main = "alpha 5", xlab = "canopy^2", las = 1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[5]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[5]",4], lwd = 2, col = "blue") hist(out.test$BUGSoutput$sims.list$alpha.lam[,6], col = "gray", main = "alpha 6", xlab = "# predators", las = 1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[6]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[6]",4], lwd = 2, col = "blue") hist(out.test$BUGSoutput$sims.list$alpha.lam[,7], col = "gray", main = "alpha 7", xlab = "ponds w/in 300m", las=1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[7]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[7]",4], lwd = 2, col = "blue") hist(out.test$BUGSoutput$sims.list$alpha.lam[,8], col = "gray", main = "alpha 8", xlab = "% forest w/in 300m", las = 1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[8]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[8]",4], lwd = 2, col = "blue") hist(out.test$BUGSoutput$sims.list$alpha.lam[,9], col = "gray", main = "alpha 9", xlab = "Age", las = 1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[9]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[9]",4], lwd = 2, col = "blue") hist(out.test$BUGSoutput$sims.list$alpha.lam[,10], col = "gray", main = "alpha 10", xlab = "Slope", las = 1) abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[10]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[10]",4], lwd = 2, col = "blue") # Evaluation of fit par(mfrow = c(1, 1)) plot(out.test$BUGSoutput$sims.list$fit, out.test$BUGSoutput$sims.list$fit.new, main = "A. ann", xlab = "Discrepancy actual data", ylab = "Discrepancy replicate data", frame.plot = FALSE) abline(0, 1, lwd = 2, col = "black")