###############################################################
# R code to analyze repeat count data using 
# Bayesian N-mixture models in JAGS
#                                                             
# Written by Bill Peterman                                    
# August 2013                                             
##############################################################

# This analysis uses JAGS, but the code can very easily be adapted to use
# WinBUGS or OpenBUGS. 

# ****** Before starting, download and install JAGS (http://sourceforge.net/projects/mcmc-jags/files/) ******

# Run code below to check for, and install necessary R packages for the analysis
libs=c("R2jags","reshape2") # Packages used in this code
type=getOption("pkgType")                           
CheckInstallPackage <- function(packages, repos="http://cran.r-project.org",
																depend=c("Depends", "Imports", "LinkingTo", "Suggests", "Enhances"), ...) {
	installed=as.data.frame(installed.packages())
	for(p in packages) {
		if(is.na(charmatch(p, installed[,1]))) { 
			install.packages(p, repos=repos, dependencies=depend, ...) 
		}
	}
} 
CheckInstallPackage(packages=libs)

# Load packages
require(reshape2)
require(R2jags)

# Set working directory to folder where your data file is stored
setwd("C:/Rdata/")


# Import data data (Randomly subsampled data, with replacement, from A. annulatum are provided)
data <- read.csv("AnCon.dat.csv")
names(data)

##########################
# Data fields
# pond.name = sequential pond name
# id = a unique ID for each row. Note that because the data provided is sampled with replacement, 
#      some ponds are included multiple times, although each has been given a unique ID
# t1-t6 = temperature in Celsius during survey
# d1-d6 = Julian day of survey
# m1-m6 = Method used to capture larvae (0=Dipnet, 1=Minnow trap)
# s1-s6 = The total number of salamander larvae captured on a survey day
# fish = presence of fish
# hydro = 4 category hydrology measure 
# ctail = Percentage of pond covered by cattails
# open = Percentage of pond without vegetation or structure
# pred = Number of invertebrate predator species captured during surveys of ponds
# f.300 = Amount of forest (square meters) within a 300m radius of the focal pond
# f.1km = Amount of forest (square meters) within a 1km radius of the focal pond
# p.300 = Number of ponds within a 300m radius of the focal pond
# d.forest = Distance (meters) to forest
# cnpy = Percent canopy coverage over the pond
# slope = slope od the pond basin
# effort = number of traps/dipnet sweeps used during survey
# area = surface area (square meters) of pond
# age = estimated age (years) of each pond
##########################

# Sort data by numeric ID
data <- data[order(data$id),] 

# Extract variables for analysis
# Counts at ponds
vars <- paste("s", 1:6, sep="")
y.data <- data[vars]
id<- data[,2]
y.data <- cbind(id, y.data) # Attach ID for later merging

# Method of sampling
vars <- paste("m", 1:6, sep="")
method2 <- data[vars]
id<- data[,2]
method <- cbind(id, method2)

# Temp data
vars <- paste("t", 1:6, sep="")
temp <- data[vars]
id<- data[,2]
temp <- cbind(id, temp)

# Survey date
vars <- paste("d", 1:6, sep="")
date <- data[vars]
id<- data[,2]
date <- cbind(id, date)

age <- data["age"]
slope <- data["slope"]
# offset to account for variable sampling effort at each site
offset <- as.matrix(data["effort"]);colnames(offset) <- NULL; offset<-as.numeric(offset)
fish <- as.matrix(data["fish"]); colnames(fish) <- NULL; fish<-as.numeric(fish)
hydro <- as.matrix(data["hydro"]); colnames(hydro) <- NULL; hydro<-as.numeric(hydro)
ctail <- data["ctail"]  
open <- data["open"] 
pred <- data["pred"]
f.300 <- data["f.300"]
f.1km <- data["f.1km"]
p.300 <- data["p.300"] 
frst.dist <- data["d.frst"]
cnpy <- data["cnpy"]

#Impute means for missing values
ctail[is.na(ctail)] <- colMeans(ctail, na.rm=TRUE)
open[is.na(open)] <- colMeans(open, na.rm=TRUE)
cnpy[is.na(cnpy)] <- colMeans(cnpy, na.rm=TRUE)
age[is.na(age)] <- colMeans(age, na.rm=TRUE)
slope[is.na(slope)] <- colMeans(slope, na.rm=TRUE)
pred[is.na(pred)] <- colMeans(pred, na.rm=TRUE)

# Scale and center each covariate
ctail <- scale(ctail,center=T,scale=T)
open <- scale(open,center=T,scale=T)
cnpy <- scale(cnpy,center=T,scale=T)
age <- scale(age,center=T,scale=T)
slope <- scale(slope,center=T,scale=T)
pred <- scale(pred,center=T,scale=T)
frst.300 <- scale(f.300,center=T,scale=T)
frst.1km <- scale(f.1km,center=T,scale=T)
ponds.300 <- scale(p.300,center=T,scale=T)
d.forest <- scale(frst.dist,center=T,scale=T)

# Make quadratic term for canopy coverage
cnpy2 <- cnpy[,1]^2

# Convert to long data format
# This is necessary because we have unequal sampling among ponds. 
# Using the long format and nested indexing accomodates this
y.long <- melt(data=y.data,na.rm=TRUE,value.name="count",id.vars=("id"))
y.long <- y.long[order(y.long$id),] 
C <- as.numeric(y.long[,3]) # Count data in long format

# This sample vector will index our for loop during the analysis
sample <- as.numeric(y.long[,1])

m.long <- melt(data=method,na.rm=TRUE,value.name="method",id.vars=("id"))
m.long <- m.long[order(m.long$id),]
m <- as.numeric(m.long[,3])

d.long <- melt(data=date,na.rm=TRUE,value.name="date",id.vars=("id"))
d.long <- d.long[order(d.long$id),]
dl <- d.long[,3]
date <- scale(dl,center=T,scale=T) # Scale and center now that all values are in single vector

t.long <- melt(data=temp,na.rm=TRUE,value.name="temp",id.vars=("id"))
t.long <- t.long[order(t.long$id),]
temp <- t.long[,3]
temp <- scale(temp,center=T,scale=T) # Scale and center now that all values are in single vector

# Cattail is used as a detection covariate, so it needs values to correspond with each survey
ctail.p <- cbind(id, ctail)
ctail.p <- merge(y.long, ctail.p,by="id")
ctail.p <- as.numeric(ctail.p[,4])

# Slope is used as a detection covariate, so it needs values to correspond with each survey
slope.p <- cbind(id,slope)
slope.p <- merge(y.long, slope.p,by="id")
slope.p <- as.numeric(slope.p[,4])

# Create a cattail x method interaction
ctail.method <- ctail.p*m

pond <- as.numeric(data[,2]) # Unique ID for each pond
n.ponds <- length(pond) # Total number of ponds in the data set
n.samples <- length(sample) # Total number of sample periods in the data set

# Explore data
Y.max <- apply(y.data[,2:7], 1, function(x) max(x, na.rm = TRUE))
hist(Y.max)
boxplot(Y.max)

#######################################################
# Parameterize Bayesian model
# Number of Abundance covariates
ab.covs <- 10

# Number of detection covariates
det.covs <- 6

# Bundle data
win.data <- list(
	C = C,
	sample = sample,
	n.ponds = n.ponds,
	n.samples = n.samples,
	fish = fish, 
	open = open[,1], 
	cnpy = cnpy[,1], 
	cnpy2 = cnpy2,
	frst.300 = frst.300[,1], 
	ponds.300 = ponds.300[,1],
	pred = pred[,1],
	date = date[,1], 
	ctail.p = ctail.p,
	method = m, 
	temp = temp[,1], 
	ab.covs = ab.covs,
	det.covs = det.covs,
	slope = slope[,1],
	slope.p = slope.p,
	age = age[,1],
	off = log(offset))

# Function to set initial starting values
Nst <- apply(y.data[,2:7], 1, max, na.rm=TRUE) + 1 
inits <- function(){list(
	N = Nst, 
	alpha.lam=runif(ab.covs,-1,1),
	beta.p=runif(det.covs,-1,1))}


# Parameters monitored
params <- c("alpha.lam", "beta.p", "fit", "fit.new", "N", "mean.detection", "mean.N", "sd.p", "sd.lam")


# Model parameterization largely follows Kery and Schaub (2012)
sink("JAGS_model.txt")
cat("
		model{
		# Priors
		# Abundance priors
		for (i in 1:ab.covs){
				alpha.lam[i] ~ dnorm(0, 0.0001)
		}
		
		# Detection priors
		for (i in 1: det.covs){
				beta.p[i] ~ dnorm(0, 0.0001)
		}
		
    # Random effects for abundance and detection during each survey
		for(i in 1:n.ponds){
		  eps[i] ~ dnorm(0, tau.lam) # Noise/uncertainty in abundance
		}
		
		tau.lam <- 1/ (sd.lam * sd.lam)	
		sd.lam ~ dunif(0,5)
		tau.p ~ dgamma(0.01, 0.01)
		sd.p <- 1 / (tau.p)
###########
		
		# ABUNDANCE MODEL	
		# Likelihood
		for(i in 1:n.ponds) {			# Loop over ponds
		N[i] ~ dpois(lambda[i])
		
    # This is a Poisson-log-normal distribution with the 'eps' random error term
  		log(lambda[i]) <-  off[i] + alpha.lam[1] + alpha.lam[2]*fish[i]+ alpha.lam[3]*open[i] + alpha.lam[4]*cnpy[i] + alpha.lam[5]*cnpy2[i] + alpha.lam[6]*pred[i]+ alpha.lam[7]*ponds.300[i] + alpha.lam[8]*frst.300[i] + alpha.lam[9]*age[i] + alpha.lam[10]*slope[i] + eps[i]		
		}	
		
		# DETECTION MODEL
		for(i in 1:n.samples){				# Loop over observations
		C[i] ~ dbin(p[i], N[sample[i]])
		
		p[i] <- exp(lp.lim[i])/(1 + exp(lp.lim[i]))   
		lp.lim[i] <- min(999, max(-999, lp[i])) # Help stabilize the logit
		lp[i]~dnorm(FIT[i], tau.p) # Random noise is detection modeled here
		FIT[i] <- beta.p[1] + beta.p[2]*date[sample[i]] + beta.p[3]*temp[sample[i]] + beta.p[4]*method[sample[i]] + beta.p[5]*ctail.p[i] + beta.p[6]*slope.p[sample[i]] 
		
		# Assess model fit using Chi-squared discrepancy
		# Compute fit statistic for observed data
		eval[i]<-p[i] * N[sample[i]]
		E[i] <- pow((C[i] - eval[i]),2) / (eval[i] + 0.5)
		# Generate replicate data and compute fit stats for them
		C.new[i] ~ dbin(p[i], N[sample[i]])
		E.new[i] <- pow((C.new[i] - eval[i]),2) / (eval[i] + 0.5)		
		
		}
		
		# Derived quantities
		totalN<- sum(N[])
		mean.detection <- mean(p[])
		mean.N <- mean(N[])
		fit <- sum(E[])
		fit.new <- sum(E.new[])
		
		}
		", fill = TRUE)
sink()
##########################################################
##########################################################
saved.per.chain <- 100
nc <- 3 # Number of chains
nb <- 100 # Iteration discarded as burn-in
nt <- 10 # Rate to thin samples of MCMC chains
(ni <- saved.per.chain*nt + nb)

# Settings used in original analysis: saved=3000; nb=250000; nt=250

#JAGS
load.module("glm")
system.time(out.test <- jags(win.data, inits, params, "JAGS_model.txt", n.chains = nc, n.thin = nt, n.iter = ni, n.burnin = nb))

# Summarize posteriors (JAGS)
options(scipen=3)
print(out.test$BUGSoutput$summary[,c(1,2,3,7,8,9)], dig = 3); OUT.TABLE<- out.test$BUGSoutput$summary[,c(1,2,3,7,8,9)]
write.table(as.data.frame(OUT.TABLE), file="out.test", sep="\t",row.names=T, col.names=T, quote=FALSE, append=FALSE)

mean(out.test$BUGSoutput$sims.list$fit.new > out.test$BUGSoutput$sims.list$fit)
mean(out.test$BUGSoutput$mean$fit) / mean(out.test$BUGSoutput$mean$fit.new)
sum(out.test$BUGSoutput$mean$N>=1) ## Number Occupied Sites based on analysis
sum(apply(y.data[,2:7], 1, sum, na.rm=TRUE) > 0)  	# Apparent distribution among visited sites

# Plot posteriors
par(mfrow = c(3, 2))
hist(out.test$BUGSoutput$sims.list$beta.p[,2], col = "gray", main = "beta 2", xlab = "date", las = 1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["beta.p[2]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["beta.p[2]",4], lwd = 2, col = "blue") 
hist(out.test$BUGSoutput$sims.list$beta.p[,3], col = "gray", main = "beta 3", xlab = "temp", las=1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["beta.p[3]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["beta.p[3]",4], lwd = 2, col = "blue") 
hist(out.test$BUGSoutput$sims.list$beta.p[,4], col = "gray", main = "beta 4", xlab = "method", las = 1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["beta.p[4]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["beta.p[4]",4], lwd = 2, col = "blue") 
hist(out.test$BUGSoutput$sims.list$beta.p[,5], col = "gray", main = "beta 5", xlab = "cattail", las = 1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["beta.p[5]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["beta.p[5]",4], lwd = 2, col = "blue") 
hist(out.test$BUGSoutput$sims.list$beta.p[,6], col = "gray", main = "beta 7", xlab = "slope", las = 1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["beta.p[6]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["beta.p[6]",4], lwd = 2, col = "blue") 

par(mfrow = c(2, 2))
hist(out.test$BUGSoutput$sims.list$alpha.lam[,2], col = "gray", main = "alpha 2", xlab = "fish", las = 1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[2]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[2]",4], lwd = 2, col = "blue") 
hist(out.test$BUGSoutput$sims.list$alpha.lam[,3], col = "gray", main = "alpha 3", xlab = "open", las=1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[3]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[3]",4], lwd = 2, col = "blue")
hist(out.test$BUGSoutput$sims.list$alpha.lam[,4], col = "gray", main = "alpha 4", xlab = "canopy", las = 1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[4]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[4]",4], lwd = 2, col = "blue")
hist(out.test$BUGSoutput$sims.list$alpha.lam[,5], col = "gray", main = "alpha 5", xlab = "canopy^2", las = 1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[5]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[5]",4], lwd = 2, col = "blue")
hist(out.test$BUGSoutput$sims.list$alpha.lam[,6], col = "gray", main = "alpha 6", xlab = "# predators", las = 1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[6]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[6]",4], lwd = 2, col = "blue")
hist(out.test$BUGSoutput$sims.list$alpha.lam[,7], col = "gray", main = "alpha 7", xlab = "ponds w/in 300m", las=1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[7]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[7]",4], lwd = 2, col = "blue")
hist(out.test$BUGSoutput$sims.list$alpha.lam[,8], col = "gray", main = "alpha 8", xlab = "% forest w/in 300m", las = 1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[8]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[8]",4], lwd = 2, col = "blue")
hist(out.test$BUGSoutput$sims.list$alpha.lam[,9], col = "gray", main = "alpha 9", xlab = "Age", las = 1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[9]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[9]",4], lwd = 2, col = "blue")
hist(out.test$BUGSoutput$sims.list$alpha.lam[,10], col = "gray", main = "alpha 10", xlab = "Slope", las = 1)
abline(v = 0, lwd = 3, col = "red"); abline(v = OUT.TABLE["alpha.lam[10]",3], lwd = 2, col = "blue"); abline(v = OUT.TABLE["alpha.lam[10]",4], lwd = 2, col = "blue")

# Evaluation of fit
par(mfrow = c(1, 1))
plot(out.test$BUGSoutput$sims.list$fit, out.test$BUGSoutput$sims.list$fit.new, main = "A. ann", xlab = "Discrepancy actual data", ylab = "Discrepancy replicate data", frame.plot = FALSE)
abline(0, 1, lwd = 2, col = "black")