diff --git a/DESCRIPTION b/DESCRIPTION index 8709a00a..7100d0a5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: geostan Title: Bayesian Spatial Analysis -Version: 0.7.0 -Date: 2024-09-17 +Version: 0.8.0 +Date: 2024-10-31 URL: https://connordonegan.github.io/geostan/ BugReports: https://github.com/ConnorDonegan/geostan/issues Authors@R: c( @@ -10,7 +10,7 @@ Authors@R: c( person("Mitzi", "Morris", role = "ctb"), person("Amy", "Tims", role = "ctb") ) -Description: For spatial data analysis; provides exploratory spatial analysis tools, spatial regression models, disease mapping models, model diagnostics, and special methods for inference with small area survey data (e.g., the America Community Survey (ACS)) and censored population health surveillance data. Models are pre-specified using the Stan programming language, a platform for Bayesian inference using Markov chain Monte Carlo (MCMC). References: Carpenter et al. (2017) ; Donegan (2021) ; Donegan (2022) ; Donegan, Chun and Hughes (2020) ; Donegan, Chun and Griffith (2021) ; Morris et al. (2019) . +Description: For spatial data analysis; provides exploratory spatial analysis tools, spatial regression, spatial econometric, and disease mapping models, model diagnostics, and special methods for inference with small area survey data (e.g., the America Community Survey (ACS)) and censored population health monitoring data. Models are pre-specified using the Stan programming language, a platform for Bayesian inference using Markov chain Monte Carlo (MCMC). References: Carpenter et al. (2017) ; Donegan (2021) ; Donegan (2022) ; Donegan, Chun and Hughes (2020) ; Donegan, Chun and Griffith (2021) ; Morris et al. (2019) . License: GPL (>= 3) Encoding: UTF-8 LazyData: true @@ -26,6 +26,7 @@ Imports: methods, graphics, stats, + spData, MASS, truncnorm, signs, diff --git a/NAMESPACE b/NAMESPACE index 84875be4..c2289224 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -47,6 +47,7 @@ export(shape2mat) export(sim_sar) export(sp_diag) export(spatial) +export(spill) export(stan_car) export(stan_esf) export(stan_glm) @@ -60,12 +61,14 @@ import(ggplot2) import(graphics) import(methods) import(stats) -importFrom(MASS,mvrnorm) importFrom(Matrix,Diagonal) importFrom(Matrix,Matrix) +importFrom(Matrix,Schur) importFrom(Matrix,colMeans) +importFrom(Matrix,diag) importFrom(Matrix,isSymmetric) importFrom(Matrix,rowSums) +importFrom(Matrix,solve) importFrom(Matrix,sparseMatrix) importFrom(Matrix,summary) importFrom(Matrix,t) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 82b3f6cc..0dca83e3 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -89,45 +89,154 @@ aple <- function(x, w, digits = 3) { #' #' @md #' -#' @param m The number of samples required. Defaults to \code{m=1} to return an \code{n}-length vector; if \code{m>1}, an \code{m x n} matrix is returned (i.e. each row will contain a sample of correlated values). +#' @param m The number of samples required. Defaults to \code{m=1} to return an \code{n}-length vector; if \code{m>1}, an \code{m x n} matrix is returned (i.e. each row will contain a sample of auto-correlated values). +#' #' @param mu An \code{n}-length vector of mean values. Defaults to a vector of zeros with length equal to \code{nrow(w)}. -#' @param w Row-standardized \code{n x n} spatial weights matrix. -#' @param rho Spatial autocorrelation parameter in the range (-1, 1). Typically a scalar value; otherwise an n-length numeric vector. -#' @param sigma Scale parameter (standard deviation). Defaults to \code{sigma = 1}. Typically a scalar value; otherwise an n-length numeric vector. +#' +#' @param rho Spatial autocorrelation parameter in the range (-1, 1). A single numeric value. +#' +#' @param sigma Scale parameter (standard deviation). Defaults to \code{sigma = 1}. A single numeric value. +#' +#' @param w \code{n x n} spatial weights matrix; typically row-standardized. +#' +#' @param type Type of SAR model: spatial error model ("SEM") or spatial lag model ("SLM"). +#' +#' @param quick Use power of matrix W to approximate the inverse term? +#' +#' @param K Number of matrix powers to use if `quick = TRUE`. +#' #' @param ... further arguments passed to \code{MASS::mvrnorm}. #' #' @return #' -#' If \code{m = 1} a vector of the same length as \code{mu}, otherwise an \code{m x length(mu)} matrix with one sample in each row. +#' If \code{m = 1} then `sim_sar` returns a vector of the same length as \code{mu}, otherwise an \code{m x length(mu)} matrix with one sample in each row. +#' +#' @details +#' +#' This function takes `n = nrow(w)` draws from the normal distribution using `rnorm` to obtain vector `x`; if `type = 'SEM', it then pre-multiplies `x` by the inverse of the matrix `(I - rho * W)` to obtain spatially autocorrelated values. For `type = 'SLM', the multiplier matrix is applied to `x + mu` to produce the desired values. #' -#' @details Calls \code{MASS::mvrnorm} internally to draw from the multivariate normal distribution. The covariance matrix is specified following the simultaneous autoregressive (SAR, aka spatial error) model. +#' The `quick` method approximates the matrix inversion using the method described by LeSage and Pace (2009, p. 40). #' #' @seealso \code{\link[geostan]{aple}}, \code{\link[geostan]{mc}}, \code{\link[geostan]{moran_plot}}, \code{\link[geostan]{lisa}}, \code{\link[geostan]{shape2mat}} #' #' @examples -#' data(georgia) -#' w <- shape2mat(georgia, "W") -#' x <- sim_sar(w = w, rho = 0.5) -#' aple(x, w) -#' -#' x <- sim_sar(w = w, rho = 0.7, m = 10) +#' # spatially autocorrelated data on a regular grid +#' library(sf) +#' row = 10 +#' col = 10 +#' sar_parts <- prep_sar_data2(row = row, col = col) +#' w <- sar_parts$W +#' x <- sim_sar(rho = 0.65, w = w) +#' dat <- data.frame(x = x) +#' +#' # create grid +#' sfc = st_sfc(st_polygon(list(rbind(c(0,0), c(col,0), c(col,row), c(0,0))))) +#' grid <- st_make_grid(sfc, cellsize = 1, square = TRUE) +#' st_geometry(dat) <- grid +#' plot(dat) +#' +#' # draw form SAR (SEM) model +#' z <- sim_sar(rho = 0.9, w = w) +#' moran_plot(z, w) +#' grid$z <- z +#' +#' # multiple sets of observations +#' # each row is one N-length draw from the SAR model +#' x <- sim_sar(rho = 0.7, w = w, m = 4) +#' nrow(w) #' dim(x) #' apply(x, 1, aple, w = w) -#' @importFrom MASS mvrnorm +#' apply(x, 1, mc, w = w) +#' +#' # Spatial lag model (SLM): y = rho*Wy + beta*x + epsilon +#' x <- sim_sar(rho = 0.5, w = w) +#' y <- sim_sar(mu = x, rho = 0.7, w = w, type = "SLM") +#' +#' # Spatial Durbin lag model (SLM with spatial lag of x) +#' # SDLM: y = rho*Wy + beta*x + gamma*Wx + epsilon +#' x = sim_sar(w = w, rho = 0.5) +#' mu <- -0.5*x + 0.5*(w %*% x)[,1] +#' y <- sim_sar(mu = mu, w = w, rho = 0.6, type = "SLM") #' -sim_sar <- function(m = 1, mu = rep(0, nrow(w)), w, rho, sigma = 1, ...) { +#' +#' @source +#' +#' LeSage, J. and Pace, R. K. (2009). *An Introduction to Spatial Econometrics*. CRC Press. +#' +#' @importFrom Matrix solve +sim_sar <- function(m = 1, + mu = rep(0, nrow(w)), + rho, + sigma = 1, + w, + type = c("SEM", "SLM"), + quick = FALSE, + K = 20, + ...) { check_sa_data(mu, w) - stopifnot(all(round(Matrix::rowSums(w), 10) == 1)) + type <- match.arg(type) + SLM <- grepl("SLM", type) N <- nrow(w) - if (!inherits(rho, "numeric") || !length(rho) %in% c(1, N)) stop("rho must be a single numeric value, or a k-length numeric vector where K=nrow(W).") - if (!inherits(sigma, "numeric") || !length(sigma) %in% c(1, N) || !all(sigma > 0)) stop("sigma must be a positive numeric value, or k-length numeric vector, with K=nrow(W).") - I <- diag(N) - S <- sigma^2 * solve( (I - rho * Matrix::t(w)) %*% (I - rho * w) ) - # or: # S <- crossprod(solve(I - rho * w)) * sigma^2 - x <- MASS::mvrnorm(n = m, mu = mu, Sigma = S, ...) - return(x) + stopifnot(inherits(rho, "numeric") & length(rho) == 1) + if (!inherits(sigma, "numeric") || length(sigma) != 1 || !all(sigma > 0)) stop("sigma must be a positive numeric value.") + + I <- Matrix::diag(N) + W <- as(w, "dMatrix") + + if (quick) { + # matrix powers to approximate the inverse + Multip <- I + rho * W + W_k <- W + for (j in 2:K) { + W_k <- W %*% W_k + Multip = Multip + rho^j * W_k + } + } else { + # matrix inverse + Multip <- Matrix::solve(I - rho * W) + } + + x <- t(sapply(1:m, function(s) { + rnorm(N, mean = 0, sd = sigma) + })) + + if (rho == 0) return (x) + + .rsem <- function(s) { + mu + (Multip %*% x[s,])[,1] + } + + .rslm <- function(s) { + z <- mu + x[s,] + (Multip %*% z)[,1] + } + + if (SLM == TRUE) { + res <- t(sapply(1:m, .rslm)) + } else { + res <- t(sapply(1:m, .rsem)) + } + + if (m == 1) res <- res[1,] + + return (res) } + + + + + + + + + + + + + + + #' Visual displays of spatial data and spatial models #' #' @description Visual diagnostics for areal data and model residuals @@ -717,9 +826,13 @@ count_neighbors <- function(z) { #' Row-standardize a matrix; safe for zero row-sums. #' #' @param C A matrix -#' @param warn Print `msg` if `warn = TRUE`. -#' @param msg A warning message to print. +#' +#' @param warn Print message `msg` if `warn = TRUE`. +#' +#' @param msg A warning message; used internally by geostan. +#' #' @return A row-standardized matrix, W (i.e., all row sums equal 1, or zero). +#' #' @examples #' A <- shape2mat(georgia) #' head(Matrix::summary(A)) @@ -728,10 +841,12 @@ count_neighbors <- function(z) { #' W <- row_standardize(A) #' head(Matrix::summary(W)) #' Matrix::rowSums(W) +#' #' @importFrom Matrix rowSums +#' #' @export -row_standardize <- function(C, warn = TRUE, msg = "Row standardizing connectivity matrix") { - stopifnot(inherits(C, "Matrix") | inherits(C, "matrix")) +row_standardize <- function(C, warn = FALSE, msg = "Row standardizing connectivity matrix") { + stopifnot(inherits(C, "Matrix") | inherits(C, "matrix")) if (warn) message(msg) Ni <- Matrix::rowSums(C) Ni[Ni == 0] <- 1 @@ -1066,7 +1181,7 @@ prep_icar_data <- function(C, scale_factor = NULL) { #' @export #' @md #' @importFrom rstan extract_sparse_parts -#' @importFrom Matrix isSymmetric Matrix rowSums summary +#' @importFrom Matrix isSymmetric Matrix rowSums summary Schur prep_car_data <- function(A, style = c("WCAR", "ACAR", "DCAR"), k = 1, @@ -1110,27 +1225,32 @@ prep_car_data <- function(A, C <- A / Ni M_diag <- 1 / Ni } + + stopifnot( Matrix::isSymmetric(C %*% Matrix::Diagonal(x = M_diag), check.attributes = FALSE) ) + if (stan_fn == "wcar_normal_lpdf") { - stopifnot( Matrix::isSymmetric(C %*% Matrix::Diagonal(x = M_diag), check.attributes = FALSE) ) car.dl <- rstan::extract_sparse_parts(A) names(car.dl) <- paste0("A_", names(car.dl)) car.dl$nA_w <- length(car.dl$A_w) car.dl$WCAR <- 1 } else { - stopifnot( Matrix::isSymmetric(C %*% Matrix::Diagonal(x = M_diag), check.attributes = FALSE) ) car.dl <- rstan::extract_sparse_parts(C) names(car.dl) <- paste0("A_", names(car.dl)) car.dl$nA_w <- length(car.dl$A_w) car.dl$WCAR <- 0 } + car.dl$Delta_inv <- 1 / M_diag car.dl$style <- style car.dl$log_det_Delta_inv = base::determinant(diag(car.dl$Delta_inv), log = TRUE)$modulus car.dl$n <- n + if (lambda) { MCM <- Matrix::Diagonal(x = 1 / sqrt(M_diag)) %*% C %*% Matrix::Diagonal(x = sqrt(M_diag)) stopifnot(Matrix::isSymmetric(MCM, check.attributes = FALSE)) - lambda <- sort(eigen(MCM)$values) + ##lambda <- sort(eigen(MCM)$values) + lambda <- Matrix::Schur(MCM, vectors = FALSE)$EValues + lambda <- sort(Re(lambda)) rho_lims <- 1 / range(lambda) if (!quiet) { r_rho_lims <- round( rho_lims, 3) @@ -1189,10 +1309,17 @@ prep_sar_data <- function(W, quiet = FALSE) { sar.dl <- rstan::extract_sparse_parts(W) names(sar.dl) <- paste0("W_", names(sar.dl)) sar.dl$nW_w <- length(sar.dl$W_w) - sar.dl$eigenvalues_w <- sort(as.numeric( eigen(W)$values )) + #if (quiet == TRUE) { + #evw <- suppressWarnings(eigen(W)$values) + #} else { + #evw <- eigen(W)$values + #} + evw <- Matrix::Schur(W, vectors = FALSE)$EValues + evw <- sort(Re(evw)) + sar.dl$eigenvalues_w <- evw sar.dl$n <- N sar.dl$W <- W - rho_lims <- 1/range(sar.dl$eigenvalues_w) + rho_lims <- 1/range(evw) if (!quiet) { r_rho_lims <- round(rho_lims, 3) message("Range of permissible rho values: ", r_rho_lims[1], ", ", r_rho_lims[2]) diff --git a/R/geostan_fit-methods.R b/R/geostan_fit-methods.R index 865b854e..737f35e6 100644 --- a/R/geostan_fit-methods.R +++ b/R/geostan_fit-methods.R @@ -67,7 +67,6 @@ print.geostan_fit <- function(x, cat("Likelihood function: ", x$family$family, "\n") cat("Link function: ", x$family$link, "\n") if (!is.null(x$diagnostic$Residual_MC)) cat("Residual Moran Coefficient: ", x$diagnostic$Residual_MC, "\n") - if (!is.null(x$diagnostic$WAIC)) cat("WAIC: ", x$diagnostic$WAIC, "\n") cat("Observations: ", x$N, "\n") if (x$ME$has_me) { cat("Data models (ME): ") diff --git a/R/impacts.R b/R/impacts.R index de318ce3..7ff8a3d3 100644 --- a/R/impacts.R +++ b/R/impacts.R @@ -1,95 +1,172 @@ +#' Spillover/diffusion effects for spatial lag models +#' #' @export #' @rdname impacts #' @md -#' @importFrom Matrix solve rowSums diag #' #' @param beta Coefficient for covariates (numeric vector) -#' @param gamma Coefficient for spatial lag of covariates (numeric vector) +#' +#' @param gamma Coefficient for spatial lag of covariates +#' #' @param W Spatial weights matrix -#' @param rho Estimate of spatial dependence parameter -#' @param quick Use short-cut method to calculate inverse of matrix (I - rho * W)? -#' @param K Degree of polynomial in the expansion to use for the 'quick' matrix inverse method. +#' +#' @param rho Spatial dependence parameter (single numeric value) +#' +#' @param method Use 'quick' for a computationally efficient approximation (after LeSage and Pace 2009, pp. 114--115) and 'proper' to compute the matrix inversion using `Matrix::solve`. +#' +#' @param K Degree of polynomial in the expansion to use for the 'quick' method. #' #' @details -#' The `spill` function is for quickly calculating spillover effects given point estimates of parameters. This is used internally by `impacts`, and can be applied iteratively to a matrix of MCMC samples (for beta, gamma, and rho) to obtain MCMC samples for impacts. +#' +#' These functions are for interpreting the results of the spatial lag model ('SLM') and the spatial Durbin lag model ('SDLM'). The models can be fitted using \link[geostan]{stan_sar}. The equation for these SAR models specifies simultaneous feedback between all units, such that changing the outcome in one location has a spill-over effect that may extend to all other locations (a ripple or diffusion effect); the induced changes will also react back onto the first unit. (This presumably takes time, even if the observations are cross-sectional.) +#' +#' Assuming that the model specification is in fact reasonable for your application, these spill-overs have to be incorporated into the interpretation of the regression coefficients of SLM and SDLM models. A unit change in the value of \code{X} in one location will impact \code{y} in that same place ('direct' impact) and will also impact \code{y} elsewhere through the diffusion process ('indirect' impact). The 'total' impact of a unit change in \code{X} is the sum of the direct and indirect effects (after LeSage and Pace 2009). +#' +#' The `spill` function is for quickly calculating average spillover effects given point estimates of parameters. (This can be used to verify the nearness of the approximate method to the proper method.) +#' +#' The `impacts` function calculates the (average) direct, indirect, and total effects once for every MCMC sample to produce samples from the posterior distribution for the impacts; the samples are returned together with a summary of the distribution (mean, median, and select quantiles). +#' +#' These methods are only needed for the spatial lag and spatial Durbin lag models (SLM and SDLM). Any other model with spatially lagged covariates (SLX) might be read as having indirect impacts (as always, interpretation is subject to plausibility per causal reasoning) but the impacts are then equal to the SLX coefficients, the direct effects are equal to beta (the coefficient for X), and the 'total' impact is their sum (see the example below for calculation with MCMC samples). +#' +#' @source +#' +#' LeSage, James and Pace, R. Kelley (2009). *Introduction to Spatial Econometrics*. CRC Press. +#' +#' LeSage, James (2014). What Regional Scientists Need to Know about Spatial Econometrics. *The Review of Regional Science* 44: 13-32 (2014 Southern Regional Science Association Fellows Address). +#' +#' @importFrom Matrix solve rowSums diag +#' +#' @examples +#' ## +#' ## SDLM data +#' ## #' -spill <- function(beta, gamma = 0, W, rho, quick = FALSE, K = 15) { +#' parts <- prep_sar_data2(row = 9, col = 9, quiet = TRUE) +#' W <- parts$W +#' x <- sim_sar(w=W, rho=.6) +#' Wx <- (W %*% x)[,1] +#' mu <- .5 * x + .25 * Wx +#' y <- sim_sar(w=W, rho=0.6, mu = mu, type = "SLM") +#' dat <- cbind(y, x) +#' +#' # impacts per the above parameters +#' spill(0.5, 0.25, 0.6, W) +#' +#' ## +#' ## impacts for SDLM +#' ## +#' +#' fit <- stan_sar(y ~ x, data = dat, sar = parts, +#' type = "SDLM", iter = 500, +#' slim = TRUE, quiet = TRUE) +#' +#' impax <- impacts(fit) +#' impax$summary +#' +#' # plot posterior distributions +#' og = par(mfrow = c(1, 3), +#' mar = c(3, 3, 1, 1)) +#' S <- impax$samples[[1]] +#' hist(S[,1], main = 'Direct') +#' hist(S[,2], main = 'Indirect') +#' hist(S[,3], main = 'Total') +#' par(og) +#' +#' +spill <- function(beta, gamma = 0, rho, W, method = c('quick', 'proper'), K = 20) { - N <- nrow(W) - I <- diag(rep(1, N)) + method <- match.arg(method) + + if (method == 'quick') { + T <- matrix(0, nrow = 2, ncol = K + 1) + T[1, 1] <- 1 + for (i in 2:K) T[1, i + 1] <- mean(diag_power(W, i)) + for (i in 2:(K+1)) T[2, i] <- mean(diag_power(W, i)) + return( impacts_multiplier(beta, gamma, rho, T, K) ) + } - if (short == TRUE) { + # matrix powers: slower method of approximation + ## M_tmp <- I + rho * W + ## W_k <- W + ## for (j in 2:K) { + ## W_k <- W %*% W_k + ## M_tmp = M_tmp + rho^j * W_k - M_tmp <- I + rho * W - W_k <- W - for (j in 2:K) { - W_k <- W %*% W_k - M_tmp = M_tmp + rho^j * W_k - } - - } else { - - imrw <- I - rho * W - M_tmp <- Matrix::solve(imrw) - - } + N <- nrow(W) + I <- diag(rep(1, N)) + imrw <- I - rho * W + M_tmp <- Matrix::solve(imrw) M <- M_tmp %*% (I * beta + W * gamma) dir = mean(Matrix::diag(M)) total <- mean(Matrix::rowSums(M)) indir <- total - dir - spills = c(dir, indir, total) + spills = c(direct = dir, + indirect = indir, + total = total) return (spills) } -#' Calculate local and global spillover effects -#' -#' @description -#' -#' This needs updating to handle SLM which has 'impacts' -#' but has no slx terms. -#' -#' @param object A fitted geostan SAR model that has spillover effects -#' -#' @param W Spatial weights matrix used to fit the model -#' -#' @param quick Used short-cut matrix inverse? -#' -#' @param K For the 'quick' method. Number of powers to use in the Taylor expansion for the short-cut matrix inverse. -#' -#' @param samples Return MCMC samples together with summary in a list? If \code{FALSE}, only the summary is returned. + +#' @param object A fitted spatial lag model (from `stan_sar`) #' #' @md #' #' @export #' -impacts <- function(object, W = object$C, quick = FALSE, K = 6, samples = TRUE) { - +impacts <- function(object, method = c('quick', 'proper'), K = 20) { + stopifnot(object$spatial$method == "SAR") - B <- as.matrix(object, "beta") - G <- as.matrix(object, "gamma") + stopifnot(grepl("SLM|SDLM", object$sar_type)) + method <- match.arg(method) + + W <- object$C rho <- as.matrix(object, "sar_rho")[,1] + B <- as.matrix(object, "beta") Blabs <- colnames(B) - Gidx <- match( gsub("^w.", "", colnames(G)), Blabs ) - M <- length(Gidx) + M <- ncol(B) S <- nrow(B) + G <- matrix(0, nrow = S, ncol = M) + has_gamma <- inherits(object$slx, "formula") + + if (has_gamma) { + gamma <- as.matrix(object, "gamma") + gamma_idx <- match( gsub("^w.", "", colnames(gamma)), Blabs ) + for (j in seq_along(gamma_idx)) G[ , gamma_idx[j] ] <- gamma[ , j ] + } + + impax <- vector("list", length = M) - for (m in 1:M) { - for (i in 1:S) { - impax[[m]] <- rbind( - impax[[m]], - spill( - beta = B[i,m], - gamma = G[i,Gidx[m]], - W = W, - rho = rho[i], - quick = quick, - K = K) - ) + if (method == "proper") { + + for (m in 1:M) { + impax[[m]] <- sapply(1:S, function(s) + spill(beta = B[s, m], + gamma = G[s, m], + rho = rho[s], + W = W, + method, + K = K) + ) |> + t() } + + } else { + + T <- matrix(0, nrow = 2, ncol = K + 1) + T[1, 1] <- 1 + for (i in 2:K) T[1, i + 1] <- mean(diag_power(W, i)) + for (i in 2:(K+1)) T[2, i] <- mean(diag_power(W, i)) + + for (m in 1:M) { + impax[[m]] <- sapply(1:S, function(s) + impacts_multiplier(B[s,m], G[s,m], rho[s], T, K)) |> + t() + } + } summary <- vector("list", length = M) @@ -102,13 +179,48 @@ impacts <- function(object, W = object$C, quick = FALSE, K = 6, samples = TRUE) res <- cbind(mean = est, median = est2, sd, lwr, upr) row.names(res) <- c('Direct', 'Indirect', 'Total') summary[[m]] <- res - names(summary)[m] <- Blabs[Gidx[m]] + names(summary)[m] <- Blabs } - if (samples == TRUE) return(list(summary = summary, samples = impax)) - - return(summary) + return(list(summary = summary, samples = impax)) } +#' After LeSage and Pace 2009 pp. 114--115 +#' @noRd +impacts_multiplier <- function(beta, gamma, rho, T, K) { + + g = numeric(K + 1) + for (i in 0:K) g[i + 1] <- rho^i + G <- diag(g) + + P <- cbind(beta, gamma) + + # direct + direct = sum(P %*% T %*% G) + + # total + total <- (beta + gamma) * sum(g) + + # indirect + indirect <- total - direct + + return (c(direct = direct, indirect = indirect, total = total)) +} +#' diagonal entries of matrix powers e..g, diag( W^{20} ) +#' @noRd +diag_power <- function(W, K = 20, trace = FALSE) { + N <- nrow(W) + ones <- matrix(1, nrow = N, ncol = 1) + P <- K - 1 + mat <- W + if (K > 2) { + for (j in 2:P) { + mat <- W %*% mat + } + } + dwk <- as.numeric( (W * t(mat)) %*% ones ) + if (trace) return (sum(dwk)) + return (dwk) +} diff --git a/R/posterior_predict.R b/R/posterior_predict.R index a3c9fa2e..32d49707 100644 --- a/R/posterior_predict.R +++ b/R/posterior_predict.R @@ -1,4 +1,4 @@ -#' Draw samples from the posterior predictive distribution +#' Sample from the posterior predictive distribution #' #' @description Draw samples from the posterior predictive distribution of a fitted \code{geostan} model. #' @@ -9,43 +9,75 @@ #' @param summary Should the predictive distribution be summarized by its means and central quantile intervals? If \code{summary = FALSE}, an `S` x `N` matrix of samples will be returned. If \code{summary = TRUE}, then a `data.frame` with the means and `100*width` credible intervals is returned. #' #' @param width Only used if \code{summary = TRUE}, to set the quantiles for the credible intervals. Defaults to `width = 0.95`. +#' +#' @param quick For SAR models only; `quick = TRUE` uses an approximation method for the inverse of matrix `(I - rho * W)`. +#' +#' @param K For SAR models only; number of matrix powers to for the matrix inverse approximation. +#' +#' @param preserve_order If `TRUE`, the order of posterior draws will remain fixed; the default is to permute the MCMC samples so that (with small sample size `S`) each successive call to `posterior_predict` will return a different sample from the posterior probability distribution. #' #' @param seed A single integer value to be used in a call to \code{\link[base]{set.seed}} before taking samples from the posterior distribution. #' #' @return A matrix of size S x N containing samples from the posterior predictive distribution, where S is the number of samples drawn and N is the number of observations. If `summary = TRUE`, a `data.frame` with N rows and 3 columns is returned (with column names `mu`, `lwr`, and `upr`). #' +#' @details +#' +#' The `quick = FALSE` method requires a call to `Matrix::solve(I - rho * W)` for each MCMC sample; the `quick = TRUE` method uses an approximation based on matrix powers (LeSage and Pace 2009). #' #' @examples -#' fit <- stan_glm(sents ~ offset(log(expected_sents)), +#' E <- sentencing$expected_sents +#' sentencing$log_E <- log(E) +#' fit <- stan_glm(sents ~ offset(log_E), #' re = ~ name, #' data = sentencing, #' family = poisson(), #' chains = 2, iter = 600) # for speed only +#' #' #' yrep <- posterior_predict(fit, S = 65) -#' plot(density(yrep[1,])) -#' for (i in 2:nrow(yrep)) lines(density(yrep[i,]), col = 'gray30') -#' lines(density(sentencing$sents), col = 'darkred', lwd = 2) +#' plot(density(yrep[1,] / E )) +#' for (i in 2:nrow(yrep)) lines(density(yrep[i,] / E), col = 'gray30') +#' lines(density(sentencing$sents / E), col = 'darkred', lwd = 2) +#' +#' sars <- prep_sar_data2(row = 9, col = 9) +#' W <- sars$W +#' y <- sim_sar(rho = .9, w = W) +#' fit <- stan_sar(y ~ 1, data = data.frame(y=y), sar = sars, +#' iter = 650, quiet = TRUE) +#' yrep <- posterior_predict(fit, S = 15) +#' #' @export posterior_predict <- function(object, S, summary = FALSE, width = 0.95, + quick = TRUE, + K = 20, + preserve_order = FALSE, seed ) { stopifnot(inherits(object, "geostan_fit")) family <- object$family$family if (!missing(seed)) set.seed(seed) - mu <- as.matrix(object, pars = "fitted") + mu <- as.matrix(object, pars = "fitted") + M <- nrow(mu) + + if (missing(S)) { + S <- M + } - M <- nrow(mu) - if (missing(S)) S <- M if (S > M) { warning (paste0("Cannot draw more samples than were taken from the posterior. Using S = ", M)) - S <- M - } - idx <- sample(M, S) + S <- M + } + + if (preserve_order) { + idx <- seq(S) + } else { + idx <- sample(M, S) + } + mu <- mu[idx,] if (family == "auto_gaussian") { @@ -59,7 +91,7 @@ posterior_predict <- function(object, if (object$spatial$method == "SAR") { rho <- as.matrix(object, "sar_rho")[idx, ] tau <- as.matrix(object, "sar_scale")[idx, ] - preds <- .pp_sar_normal(mu, rho, tau, object$sar_parts$W) + preds <- .pp_sar_normal(mu, rho, tau, object$sar_parts$W, quick = quick, K = K) } } @@ -97,22 +129,77 @@ posterior_predict <- function(object, M <- Matrix::Diagonal(x = 1 / car_parts$Delta_inv) C <- Matrix::Matrix(car_parts$C) t(sapply(1:nrow(mu), function(s) { - Sigma <- Matrix::solve(I - rho[s] * C) %*% M * tau[s]^2 - MASS::mvrnorm(n = 1, mu = mu[s,], Sigma = Sigma) + Sigma <- Matrix::solve(I - rho[ s ] * C) %*% M * tau[ s ]^2 + MASS::mvrnorm(n = 1, mu = mu[ s, ], Sigma = Sigma) })) } -#' @importFrom Matrix Diagonal Matrix solve -.pp_sar_normal <- function(mu, rho, tau, W) { + +.pp_sar_normal <- function(mu, rho, sigma, W, quick, K) { + + if (!quick) { + Draws <- sapply(1:nrow(mu), function(s) { + sim_sar(mu = mu[ s, ], + rho = rho[ s ], + sigma = sigma[ s ], + w = W, + type = "SEM", # Always SEM. + quick = FALSE) + }) |> + t() + + return (Draws) + } + N <- nrow(W) - I <- Matrix::Diagonal(N) - Wt <- Matrix::t(W) - t(sapply(1:nrow(mu), function(s) { - Sigma <- Matrix::solve((I - rho[s] * Wt) %*% (I - rho[s] * W)) - MASS::mvrnorm(n = 1, mu = mu[s,], Sigma = Sigma * tau[s]^2) - })) + S <- nrow(mu) + Q <- K + 1 + P = 0:K + I <- Matrix::diag(N) + M <- list() + M[[1]] <- I + M[[2]] <- W + W_k <- W + for (q in 3:Q) { + W_k <- W %*% W_k + M[[q]] <- W_k + } + + .rsem <- function(s) { + eps <- rnorm(N, mean = 0, sd = sigma[ S ]) + rho_powers <- rho[ s ]^P + Mpowers <- lapply(seq(Q), function(j) M[[j]] * rho_powers[j]) + Multiplier <- Reduce(`+`, Mpowers) + res <- mu[ s, ] + (Multiplier %*% eps)[,1] + return (res) + } + + Draws <- sapply(1:S, .rsem) |> + t() + + return (Draws) } + + +## #' @importFrom Matrix Diagonal Matrix solve +## .pp_sar_normal <- function(mu, rho, tau, W, quick, K) { +## ## N <- nrow(W) +## ## I <- Matrix::Diagonal(N) +## ## Wt <- Matrix::t(W) +## t(sapply(1:nrow(mu), function(s) { +## sim_sar(mu = mu[ s, ], +## rho = rho[ s ], +## sigma = tau[ s ], +## w = W, +## type = "SEM", # Always SEM. +## quick = quick, # change to FALSE when done with .sar2 +## K = k) +## #Sigma <- Matrix::solve((I - rho[s] * Wt) %*% (I - rho[s] * W)) +## #MASS::mvrnorm(n = 1, mu = mu[s,], Sigma = Sigma * tau[s]^2) +## })) +## } + .pp_gaussian <- function(mu, sigma) { t(sapply(1:nrow(mu), function(s) { rnorm(ncol(mu), mu[s,], sigma[s]) diff --git a/R/stan_car.R b/R/stan_car.R index 2782bf2b..fb52508d 100644 --- a/R/stan_car.R +++ b/R/stan_car.R @@ -150,7 +150,7 @@ #' @return An object of class class \code{geostan_fit} (a list) containing: #' \describe{ #' \item{summary}{Summaries of the main parameters of interest; a data frame.} -#' \item{diagnostic}{Widely Applicable Information Criteria (WAIC) with a measure of effective number of parameters (\code{eff_pars}) and mean log pointwise predictive density (\code{lpd}), and mean residual spatial autocorrelation as measured by the Moran coefficient.} +#' \item{diagnostic}{Residual spatial autocorrelation as measured by the Moran coefficient.} #' \item{stanfit}{an object of class \code{stanfit} returned by \code{rstan::stan}} #' \item{data}{a data frame containing the model data} #' \item{family}{the user-provided or default \code{family} argument used to fit the model} @@ -183,38 +183,56 @@ #' #' #' @examples +#' +#' ## +#' ## model mortality risk +#' ## +#' +#' # simple spatial model for log rates #' -#' # model mortality risk #' data(georgia) #' C <- shape2mat(georgia, style = "B") -#' cp <- prep_car_data(C) -#' +#' cars <- prep_car_data(C) +#' +#' fit <- stan_car(log(rate.male) ~ 1, data = georgia, +#' car_parts = cars, iter = 400, quiet = TRUE) +#' +#' # model diagnostics +#' sp_diag(fit, georgia) +#' +#' # A more appropriate model for mortality rates: +#' # hierarchical spatial Poisson model #' fit <- stan_car(deaths.male ~ offset(log(pop.at.risk.male)), -#' car_parts = cp, +#' car_parts = cars, #' data = georgia, #' family = poisson(), -#' iter = 800, chains = 1 # for example speed only -#' ) -#' rstan::stan_rhat(fit$stanfit) -#' rstan::stan_mcse(fit$stanfit) -#' print(fit) +#' iter = 400, quiet = TRUE) +#' +#' # model diagnostics #' sp_diag(fit, georgia) #' #' \donttest{ -#' ## DCAR specification (inverse-distance based) +#' +#' ## +#' ## Distance-based weights matrix: +#' ## the 'DCAR' model +#' ## +#' #' library(sf) #' A <- shape2mat(georgia, "B") #' D <- sf::st_distance(sf::st_centroid(georgia)) -#' A <- D * A -#' cp <- prep_car_data(A, "DCAR", k = 1) +#' D <- D * A +#' dcars <- prep_car_data(D, "DCAR", k = 1) #' -#' fit <- stan_car(deaths.male ~ offset(log(pop.at.risk.male)), +#' Dfit <- stan_car(deaths.male ~ offset(log(pop.at.risk.male)), #' data = georgia, -#' car = cp, +#' car = dcars, #' family = poisson(), -#' iter = 800, chains = 1 # for example speed only -#' ) -#' print(fit) +#' iter = 400, quiet = TRUE) +#' +#' sp_diag(Dfit, georgia, dcars$C) +#' dic(Dfit); dic(fit) +#' #' } #' #' @export @@ -259,9 +277,9 @@ stan_car <- function(formula, stopifnot(all(dim(C) == nrow(data))) } else { C <- car_parts$C - if (car_parts$WCAR == 0) { - message("Since you did not provide C, calculation of residual SA and any spatial-lag of X terms will use the matrix found in car_parts$C.") - } + # if (car_parts$WCAR == 0) { + # message("Since you did not provide C, calculation of residual SA and any spatial-lag of X terms will use the matrix found in car_parts$C.") + # } } tmpdf <- as.data.frame(data) n <- nrow(tmpdf) @@ -319,7 +337,7 @@ stan_car <- function(formula, W <- C if (!inherits(W, "sparseMatrix")) W <- as(W, "CsparseMatrix") xrs <- Matrix::rowSums(W) - if (!all(xrs == 1)) W <- row_standardize(W, msg = "Row standardizing connectivity matrix to calculate spatially lagged covaraite(s)") + if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") # efficient transform to CRS representation for W.list (via transpose) Wij <- as(W, "TsparseMatrix") Tw <- Matrix::sparseMatrix(i = Wij@j + 1, @@ -469,9 +487,6 @@ stan_car <- function(formula, rmc <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) out$diagnostic$Residual_MC <- rmc } - if (any(pars == 'fitted')) { - out$diagnostic$WAIC <- as.numeric(waic(out)[1]) - } return (out) } diff --git a/R/stan_esf.R b/R/stan_esf.R index 100a4045..a9412969 100644 --- a/R/stan_esf.R +++ b/R/stan_esf.R @@ -104,7 +104,7 @@ #' @return An object of class class \code{geostan_fit} (a list) containing: #' \describe{ #' \item{summary}{Summaries of the main parameters of interest; a data frame} -#' \item{diagnostic}{Widely Applicable Information Criteria (WAIC) with a measure of effective number of parameters (\code{eff_pars}) and mean log pointwise predictive density (\code{lpd}), and mean residual spatial autocorrelation as measured by the Moran coefficient.} +#' \item{diagnostic}{Residual spatial autocorrelation as measured by the Moran coefficient.} #' \item{data}{a data frame containing the model data} #' \item{EV}{A matrix of eigenvectors created with \code{w} and \code{geostan::make_EV}} #' \item{C}{The spatial weights matrix used to construct EV} @@ -273,7 +273,7 @@ stan_esf <- function(formula, x_full <- xraw } else { stopifnot(inherits(slx, "formula")) - W <- row_standardize(C, msg = "Row standardizing connectivity matrix to calculate spatially lagged covaraite(s)") + W <- row_standardize(C, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") W.list <- rstan::extract_sparse_parts(W) Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = W) dwx <- ncol(Wx) @@ -407,11 +407,8 @@ stan_esf <- function(formula, R <- resid(out, summary = FALSE) rmc <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) out$diagnostic$Residual_MC <- rmc - } - if (any(pars == 'fitted')) { - out$diagnostic$WAIC <- as.numeric(waic(out)[1]) - } - + } + return (out) } diff --git a/R/stan_glm.R b/R/stan_glm.R index 5b0a029e..1969ae53 100644 --- a/R/stan_glm.R +++ b/R/stan_glm.R @@ -163,7 +163,7 @@ #' An object of class class \code{geostan_fit} (a list) containing: #' \describe{ #' \item{summary}{Summaries of the main parameters of interest; a data frame} -#' \item{diagnostic}{Widely Applicable Information Criteria (WAIC) with a measure of effective number of parameters (\code{eff_pars}) and mean log pointwise predictive density (\code{lpd}), and mean residual spatial autocorrelation as measured by the Moran coefficient.} +#' \item{diagnostic}{Residual spatial autocorrelation as measured by the Moran coefficient.} #' \item{stanfit}{an object of class \code{stanfit} returned by \code{rstan::stan}} #' \item{data}{a data frame containing the model data} #' \item{family}{the user-provided or default \code{family} argument used to fit the model} @@ -187,15 +187,67 @@ #' Donegan, Connor (2021). Building spatial conditional autoregressive (CAR) models in the Stan programming language. *OSF Preprints*. \doi{10.31219/osf.io/3ey65}. #' #' @examples +#' ## +#' ## Linear regression model +#' ## +#' +#' N = 100 +#' x <- rnorm(N) +#' y <- .5 * x + rnorm(N) +#' dat <- cbind(y, x) +#' +#' # no. of MCMC samples +#' iter = 600 +#' +#' # fit model +#' fit <- stan_glm(y ~ x, data = dat, iter = iter, quiet = TRUE) +#' +#' # see results with MCMC diagnostics +#' print(fit) +#' +#' ## +#' ## Custom prior distributions +#' ## +#' +#' PL <- list( +#' intercept = normal(0, 1), +#' beta = normal(0, 1), +#' sigma = student_t(10, 0, 2) +#' ) +#' +#' fit2 <- stan_glm(y ~ x, data = dat, prior = PL, iter = iter, +#' quiet = TRUE) +#' +#' print(fit2) +#' +#' ## +#' ## Poisson model for count data +#' ## with county 'random effects' +#' ## +#' #' data(sentencing) #' +#' # note: 'name' is county identifier +#' head(sentencing) +#' +#' # denominator in standardized rate Y/E +#' # (observed count Y over expected count E) +#' # (use the log-denominator as the offest term) #' sentencing$log_e <- log(sentencing$expected_sents) +#' +#' # fit model #' fit.pois <- stan_glm(sents ~ offset(log_e), #' re = ~ name, #' family = poisson(), -#' data = sentencing, -#' chains = 2, iter = 800) # for speed only +#' data = sentencing, +#' iter = iter, quiet = TRUE) +#' +#' # Spatial autocorrelation/residual diagnostics +#' sp_diag(fit.pois, sentencing) #' +#' # summary of results with MCMC diagnostics +#' print(fit.pois) +#' #' # MCMC diagnostics plot: Rhat values should all by very near 1 #' rstan::stan_rhat(fit.pois$stanfit) #' @@ -206,15 +258,51 @@ #' # or for a particular parameter #' rstan::stan_ess(fit.pois$stanfit, "alpha_re") #' -#' # Spatial autocorrelation/residual diagnostics -#' sp_diag(fit.pois, sentencing) -#' -#' ## Posterior predictive distribution +#' ## +#' ## Visualize the posterior predictive distribution +#' ## +#' +#' # plot observed values and model replicate values #' yrep <- posterior_predict(fit.pois, S = 65) #' y <- sentencing$sents -#' plot(density(yrep[1,])) -#' for (i in 2:nrow(yrep)) lines(density(yrep[i,]), col = "gray30") +#' ltgray <- rgb(0.3, 0.3, 0.3, 0.5) +#' +#' plot(density(yrep[1,]), col = ltgray, +#' ylim = c(0, 0.014), xlim = c(0, 700), +#' bty = 'L', xlab = NA, main = NA) +#' +#' for (i in 2:nrow(yrep)) lines(density(yrep[i,]), col = ltgray) +#' #' lines(density(sentencing$sents), col = "darkred", lwd = 2) +#' +#' legend("topright", legend = c('Y-observed', 'Y-replicate'), +#' col = c('darkred', ltgray), lwd = c(1.5, 1.5)) +#' +#' # plot replicates of Y/E +#'E <- sentencing$expected_sents +#' +#' # set plot margins +#' old_pars <- par(mar=c(2.5, 3.5, 1, 1)) +#' +#' # plot yrep +#' plot(density(yrep[1,] / E), col = ltgray, +#' ylim = c(0, 0.9), xlim = c(0, 7), +#' bty = 'L', xlab = NA, ylab = NA, main = NA) +#' +#' for (i in 2:nrow(yrep)) lines(density(yrep[i,] / E), col = ltgray) +#' +#' # overlay y +#' lines(density(sentencing$sents / E), col = "darkred", lwd = 2) +#' +#' # legend, y-axis label +#' legend("topright", legend = c('Y-observed', 'Y-replicate'), +#' col = c('darkred', ltgray), lwd = c(1.5, 1.5)) +#' +#' mtext(side = 2, text = "Density", line = 2.5) +#' +#' # return margins to previous settings +#' par(old_pars) +#' #' @importFrom rstan extract_sparse_parts stan_glm <- function(formula, slx, @@ -305,7 +393,7 @@ stan_glm <- function(formula, W <- C if (!inherits(W, "sparseMatrix")) W <- as(W, "CsparseMatrix") xrs <- Matrix::rowSums(W) - if (!all(xrs == 1)) W <- row_standardize(W, msg = "Row standardizing connectivity matrix to calculate spatially lagged covaraite(s)") + if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") # efficient transform to CRS representation for W.list (via transpose) Wij <- as(W, "TsparseMatrix") Tw <- Matrix::sparseMatrix(i = Wij@j + 1, @@ -433,9 +521,6 @@ stan_glm <- function(formula, rmc <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) out$diagnostic$Residual_MC <- rmc } - if (any(pars == 'fitted')) { - out$diagnostic$WAIC <- as.numeric(waic(out)[1]) - } return(out) diff --git a/R/stan_icar.R b/R/stan_icar.R index 45489468..37e34e23 100644 --- a/R/stan_icar.R +++ b/R/stan_icar.R @@ -166,7 +166,7 @@ #' @return An object of class class \code{geostan_fit} (a list) containing: #' \describe{ #' \item{summary}{Summaries of the main parameters of interest; a data frame} -#' \item{diagnostic}{Widely Applicable Information Criteria (WAIC) with a measure of effective number of parameters (\code{eff_pars}) and mean log pointwise predictive density (\code{lpd}), and mean residual spatial autocorrelation as measured by the Moran coefficient.} +#' \item{diagnostic}{Residual spatial autocorrelation as measured by the Moran coefficient.} #' \item{stanfit}{an object of class \code{stanfit} returned by \code{rstan::stan}} #' \item{data}{a data frame containing the model data} #' \item{edges}{The edge list representing all unique sets of neighbors and the weight attached to each pair (i.e., their corresponding element in the connectivity matrix C} @@ -333,7 +333,7 @@ stan_icar <- function(formula, x_full <- xraw } else { stopifnot(inherits(slx, "formula")) - W <- row_standardize(C, msg = "Row standardizing connectivity matrix to calculate spatially lagged covaraite(s)") + W <- row_standardize(C, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") W.list <- rstan::extract_sparse_parts(W) Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = W) dwx <- ncol(Wx) @@ -459,9 +459,7 @@ stan_icar <- function(formula, rmc <- mean( apply(R, 1, mc, w = out$C, warn = FALSE, na.rm = TRUE) ) out$diagnostic$Residual_MC <- rmc } - if (any(pars == 'fitted')) { - out$diagnostic$WAIC <- as.numeric(waic(out)[1]) - } + return (out) } diff --git a/R/stan_sar.R b/R/stan_sar.R index df1ff40b..e0691bf2 100644 --- a/R/stan_sar.R +++ b/R/stan_sar.R @@ -1,6 +1,6 @@ #' Simultaneous autoregressive (SAR) models #' -#' @description Fit data to an spatial Gaussian SAR (spatial error) model, or model a vector of spatially-autocorrelated parameters using a SAR prior model. +#' @description Fit data to a simultaneous spatial autoregressive (SAR) model, or use the SAR model as the prior model for a parameter vector in a hierarchical model. #' #' @param formula A model formula, following the R \code{\link[stats]{formula}} syntax. Binomial models can be specified by setting the left hand side of the equation to a data frame of successes and failures, as in \code{cbind(successes, failures) ~ x}. #' @@ -15,13 +15,13 @@ #' #' @param data A \code{data.frame} or an object coercible to a data frame by \code{as.data.frame} containing the model data. #' -#' @param C Spatial weights matrix (conventionally referred to as \eqn{W} in the SAR model). Typically, this will be created using `geostan::shape2mat(shape, style = "W")`. This will be passed internally to \code{\link[geostan]{prep_sar_data}}, and will also be used to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms; it will automatically be row-standardized before calculating \code{slx} terms. See \code{\link[geostan]{shape2mat}}. +#' @param C Spatial weights matrix (conventionally referred to as \eqn{W} in the SAR model). Typically, this will be created using `geostan::shape2mat(shape, style = "W")`. This will be passed internally to \code{\link[geostan]{prep_sar_data}}, and will also be used to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms. See \code{\link[geostan]{shape2mat}}. #' -#' @param sar_parts Optional. If not provided, then \code{\link[geostan]{prep_sar_data}} will be used automatically to create `sar_parts` using the user-provided spatial weights matrix. +#' @param sar_parts List of data constructed by \code{\link[geostan]{prep_sar_data}}. If not provided, then `C` will automatically be passed to \code{\link[geostan]{prep_sar_data}} to create `sar_parts`. #' -#' @param family The likelihood function for the outcome variable. Current options are \code{auto_gaussian()}, \code{binomial(link = "logit")}, and \code{poisson(link = "log")}; if `family = gaussian()` is provided, it will automatically be converted to `auto_gaussian()`. +#' @param family The likelihood function for the outcome variable. Current options are \code{auto_gaussian()}, \code{binomial()} (with logit link function) and \code{poisson()} (with log link function); if `family = gaussian()` is provided, it will automatically be converted to `auto_gaussian()`. #' -#' @param type Type of SAR model (character string): spatial error model ('SEM'), spatial Durbin error model ('SDEM'), spatial Durbin model ('SDM'), or spatial lag of y model ('Lag'). Note that regression coefficients for the two 'spatial lag of y' models ('SDM' and 'Lag') do not admit of the usual interpretation; see Details below. +#' @param type Type of SAR model (character string): spatial error model ('SEM'), spatial Durbin error model ('SDEM'), spatial Durbin lag model ('SDLM'), or spatial lag model ('SLM'). see Details below. #' #' @param prior A named list of parameters for prior distributions (see \code{\link[geostan]{priors}}): #' \describe{ @@ -39,19 +39,19 @@ #' #' } #' -#' @param ME To model observational uncertainty (i.e. measurement or sampling error) in any or all of the covariates, provide a list of data as constructed by the \code{\link[geostan]{prep_me_data}} function. +#' @param ME To model observational uncertainty in any or all of the covariates (i.e. measurement or sampling error), provide a list of data constructed by the \code{\link[geostan]{prep_me_data}} function. #' -#' @param centerx To center predictors on their mean values, use `centerx = TRUE`. If the ME argument is used, the modeled covariate (i.e., latent variable), rather than the raw observations, will be centered. When using the ME argument, this is the recommended method for centering the covariates. +#' @param centerx To center predictors on their mean values, use `centerx = TRUE`. This increases sampling speed. If the ME argument is used, the modeled covariate (i.e., the latent variable), rather than the raw observations, will be centered. #' -#' @param censor_point Integer value indicating the maximum censored value; this argument is for modeling censored (suppressed) outcome data, typically disease case counts or deaths. +#' @param censor_point Integer value indicating the maximum censored value; this argument is for modeling censored (suppressed) outcome data, typically disease case counts or deaths which are left-censored to protect confidentiality when case counts are very low. #' #' @param prior_only Logical value; if \code{TRUE}, draw samples only from the prior distributions of parameters. #' @param chains Number of MCMC chains to use. -#' @param iter Number of samples per chain. +#' @param iter Number of MCMC samples per chain. #' @param refresh Stan will print the progress of the sampler every \code{refresh} number of samples. Set \code{refresh=0} to silence this. -#' @param pars Optional; specify any additional parameters you'd like stored from the Stan model. -#' @param keep_all If `keep_all = TRUE` then samples for all parameters in the Stan model will be kept; this is necessary if you want to do model comparison with Bayes factors and the `bridgesampling` package. -#' @param slim If `slim = TRUE`, then the Stan model will not collect the most memory-intensive parameters (including n-length vectors of fitted values, log-likelihoods, and ME-modeled covariate values). This will disable many convenience functions that are otherwise available for fitted \code{geostan} models, such as the extraction of residuals, fitted values, and spatial trends, WAIC, and spatial diagnostics, and ME diagnostics; many quantities of interest, such as fitted values and spatial trends, can still be calculated manually using given parameter estimates. The "slim" option is designed for data-intensive routines, such as regression with raster data, Monte Carlo studies, and measurement error models. For more control over which parameters are kept or dropped, use the `drop` argument instead of `slim`. +#' @param pars Specify any additional parameters you'd like stored from the Stan model. +#' @param keep_all If `keep_all = TRUE` then samples for all parameters in the Stan model will be kept; this is necessary if you want to do model comparison with Bayes factors using the `bridgesampling` package. +#' @param slim If `slim = TRUE`, then the Stan model will not save the most memory-intensive parameters (including n-length vectors of fitted values, other 'random effects', and ME-modeled covariate values). This will disable some convenience functions that are otherwise available for fitted \code{geostan} models, such as the extraction of residuals, fitted values, and spatial trends, spatial diagnostics, and ME diagnostics. The "slim" option is designed for data-intensive routines, such as regression with raster data, Monte Carlo studies, and measurement error models. #' @param drop Provide a vector of character strings to specify the names of any parameters that you do not want MCMC samples for. Dropping parameters in this way can improve sampling speed and reduce memory usage. The following parameter vectors can potentially be dropped from SAR models: #' \describe{ #' \item{fitted}{The N-length vector of fitted values} @@ -59,57 +59,64 @@ #' \item{log_lambda_mu}{Linear predictor inside the SAR model (for Poisson and binomial models)} #' \item{x_true}{N-length vector of 'latent'/modeled covariate values created for measurement error (ME) models.} #' } -#' Using `drop = c('fitted', 'alpha_re', 'x_true')` is equivalent to `slim = TRUE`. Note that if `slim = TRUE`, then `drop` will be ignored---so only use one or the other. +#' Using `drop = c('fitted', 'alpha_re', 'x_true', 'log_lambda_mu')` is equivalent to `slim = TRUE`. Note that if `slim = TRUE`, then `drop` will be ignored---so only use one or the other. #' @param control A named list of parameters to control the sampler's behavior. See \code{\link[rstan]{stan}} for details. #' #' @param ... Other arguments passed to \code{\link[rstan]{sampling}}. #' -#' @param quiet Controls (most) automatic printing to the console. By default, any prior distributions that have not been assigned by the user are printed to the console. If `quiet = TRUE`, these will not be printed. Using `quiet = TRUE` will also force `refresh = 0`. +#' @param quiet Controls (most) automatic printing to the console. By default, any prior distributions that have not been assigned by the user are printed to the console; if `quiet = TRUE`, these will not be printed. Using `quiet = TRUE` will also force `refresh = 0`. #' #' @details #' #' Discussions of SAR models may be found in Cliff and Ord (1981), Cressie (2015, Ch. 6), LeSage and Pace (2009), and LeSage (2014). The Stan implementation draws from Donegan (2021). #' -#' There are two SAR specification options which are commonly known as the spatial error ('SEM') and the spatial lag ('SLM') models. Additionally, it is common to include spatially-lagged covariates in these models; then the model is referred to as a spatial Durbin model ('spatial Durbin error model' or 'spatial Durbin lag model', SDEM/SDLM). You can also obtain the Durbin model by using either 'SEM' or 'SLM' and then adding all (or some) of your covariates to the \code{slx} (spatial lag of X) argument; in other words, the SDEM and SDLM options are simply for convenience. +#' There are two SAR specification options which are commonly known as the spatial error ('SEM') and the spatial lag ('SLM') models. When the spatial-lag of the covariates are included, then the model is referred to as a spatial Durbin model; depending on the model type, it becomes a spatial Durbin error model ('SDEM') or a spatial Durbin lag model ('SDLM'). +#' +#' The mathematics and typical interpretation of the SLM/SDLM is unusual and the conventional interpretation of regression coefficients does not apply! Use the \link[geostan]{impacts} method to interpret results from the SLM and SDLM models (that is, granted that this model form is at least plausible for the application). +#' +#' Use of the 'SDEM' and 'SDLM' options are for convenience only: you can also obtain the Durbin models using the \code{slx} (spatial lag of X) argument. The \code{slx} argument allows control over which covariates will be added in spatial-lag form; the Durbin options include the spatial lag of all covariates. #' #' The spatial error specification ('SEM') is -#' \deqn{y = \mu + ( I - \rho W)^{-1} \epsilon} +#' \deqn{y = \mu + ( I - \rho C)^{-1} \epsilon} #' \deqn{\epsilon \sim Gauss(0, \sigma^2)} -#' where \eqn{W} is the spatial weights matrix, \eqn{I} is the n-by-n identity matrix, and \eqn{\rho} is a spatial autocorrelation parameter. In words, the errors of the regression equation are spatially autocorrelated. +#' where \eqn{C} is the spatial connectivity matrix, \eqn{I} is the n-by-n identity matrix, and \eqn{\rho} is a spatial autocorrelation parameter. In words, the errors of the regression equation are spatially autocorrelated. #' #' Re-arranging terms, the model can also be written as follows: -#' \deqn{y = \mu + \rho W (y - \mu) + \epsilon} -#' which perhaps shows more intuitively the implicit spatial trend component, \eqn{\rho W (y - \mu)}. +#' \deqn{y = \mu + \rho C (y - \mu) + \epsilon} +#' which perhaps shows more intuitively the implicit spatial trend component, \eqn{\rho C (y - \mu)}. #' #' The second SAR specification type is the 'spatial lag of y' ('SLM'). This model describes a diffusion or contagion process: -#' \deqn{y = \rho W y + \mu + \epsilon} +#' \deqn{y = \rho C y + \mu + \epsilon} #' \deqn{\epsilon \sim Gauss(0, \sigma^2)} -#' This is attractive for modeling actual contagion processes. Here the 'spatial trend' part is simply \eqn{\rho W y}. +#' This is attractive for modeling actual contagion processes. Here the 'spatial trend' part is simply \eqn{\rho C y}. #' -#' Beware that the mathematics and typical interpretation of the SLM is unusual. The equation specifies feedback between all units, such that changing the outcome in one location has a spill-over effect that can extend to all other locations (a ripple effect). These spill-overs have to be incorporated into your understanding of the regression coefficients: a unit change in of \code{X} in one place will 'directly' impact \code{y} in that same place and will 'indirectly' impact \code{y} elsewhere through the diffusion process. The 'total' impact of a unit change in \code{X} needs to account for both direct and indirect effects. +#' Both model types have a covariance matrix of: #' -#' Most often, the SAR model is applied directly to observations (referred to below as the auto-normal or auto-Gaussian model). The SAR model can also be applied to a vector of parameters inside a hierarchical model. The latter enables spatial autocorrelation to be modeled when the observations are discrete counts with a natural denominator (e.g., hierarchical models for disease incidence rates). +#' \deqn{\Sigma = \sigma^2 (I - \rho C)^{-1}(I - \rho C')^{-1}.} #' +#' But the expected values of the models differ. The expected value for the SEM is the usual \eqn{\mu} (the intercept plus \code{X*beta}); the expected value of the SLM is \eqn{(I - rho C)^{-1} \mu}. +#' +#' Most often, the SAR model is applied directly to observations (referred to below as the auto-normal or auto-Gaussian model). The SAR model can also be applied to a vector of parameters inside a hierarchical model. The latter enables spatial or network autocorrelation to be modeled when the observations are discrete counts (e.g., hierarchical models for disease incidence rates). #' #' ### Auto-normal #' #' When \code{family = auto_gaussian()}, the SAR model is specified as follows: #' #' \deqn{y \sim Gauss(\mu, \Sigma)} -#' \deqn{\Sigma = \sigma^2 (I - \rho W)^{-1}(I - \rho W')^{-1}} +#' \deqn{\Sigma = \sigma^2 (I - \rho C)^{-1}(I - \rho C')^{-1}} # -#' where \eqn{\mu} is the mean vector (with intercept, covariates, etc.), \eqn{W} is a spatial weights matrix (usually row-standardized), and \eqn{\sigma} is a scale parameter. +#' where \eqn{\mu} is the mean vector (with intercept, covariates, etc.), \eqn{C} is a spatial weights or connectivity matrix (usually row-standardized), and \eqn{\sigma} is a scale parameter. #' #' The SAR model contains an implicit spatial trend (i.e., spatial autocorrelation) component \eqn{\phi} which is calculated as follows: #' \deqn{ -#' \phi = \rho W (y - \mu) +#' \phi = \rho C (y - \mu) #' } #' #' This term can be extracted from a fitted auto-Gaussian model using the \link[geostan]{spatial} method. #' #' When applied to a fitted auto-Gaussian model, the \link[geostan]{residuals.geostan_fit} method returns 'de-trended' residuals \eqn{R} by default. That is, #' \deqn{ -#' R = y - \mu - \rho W (y - \mu). +#' R = y - \mu - \rho C (y - \mu). #' } #' To obtain "raw" residuals (\eqn{y - \mu}), use `residuals(fit, detrend = FALSE)`. Similarly, the fitted values obtained from the \link[geostan]{fitted.geostan_fit} will include the spatial trend term by default. #' @@ -119,21 +126,21 @@ #' #' \deqn{y \sim Poisson(e^{O + \lambda})} #' \deqn{\lambda \sim Gauss(\mu, \Sigma)} -#' \deqn{\Sigma = \sigma^2 (I - \rho W)^{-1}(I - \rho W')^{-1}.} +#' \deqn{\Sigma = \sigma^2 (I - \rho C)^{-1}(I - \rho C')^{-1}.} #' -#' If the raw outcome consists of a rate \eqn{\frac{y}{p}} with observed counts \eqn{y} and denominator \eqn{p} (often this will be the size of the population at risk), then the offset term \eqn{O=log(p)} is the log of the denominator. +#' `O` is a constant/offset term. If the raw outcome consists of a rate \eqn{\frac{y}{p}} with observed counts \eqn{y} and denominator \eqn{p} (often this will be the size of the population at risk), then the offset term \eqn{O=log(p)} is the log of the denominator. #' #' This is often written (equivalently) as: #' #' \deqn{y \sim Poisson(e^{O + \mu + \phi})} #' \deqn{ \phi \sim Gauss(0, \Sigma) } -#' \deqn{ \Sigma = \sigma^2 (I - \rho W)^{-1}(I - \rho W')^{-1}.} +#' \deqn{ \Sigma = \sigma^2 (I - \rho C)^{-1}(I - \rho C')^{-1}.} #' #' For Poisson models, the \link[geostan]{spatial} method returns the parameter vector \eqn{\phi}. #' -#' In the Poisson SAR model, \eqn{\phi} contains a latent spatial trend as well as additional variation around it. If you would like to extract the latent/implicit spatial trend from \eqn{\phi}, you can do so by calculating: +#' In the Poisson SAR model, \eqn{\phi} contains a latent (smooth) spatial trend as well as additional variation around it. If you would like to extract the latent/implicit spatial trend from \eqn{\phi}, you can do so by calculating: #' \deqn{ -#' \rho W \phi. +#' \rho C \phi. #' } #' #' ### Binomial @@ -142,7 +149,7 @@ #' #' \deqn{y \sim Binomial(N, \lambda) } #' \deqn{logit(\lambda) \sim Gauss(\mu, \Sigma) } -#' \deqn{\Sigma = \sigma^2 (I - \rho W)^{-1}(I - \rho W')^{-1}.} +#' \deqn{\Sigma = \sigma^2 (I - \rho C)^{-1}(I - \rho C')^{-1}.} #' #' where outcome data \eqn{y} are counts, \eqn{N} is the number of trials, and \eqn{\lambda} is the rate of 'success'. Note that the model formula should be structured as: `cbind(sucesses, failures) ~ 1` (for an intercept-only model), such that `trials = successes + failures`. #' @@ -152,7 +159,7 @@ #' #' As is also the case for the Poisson model, \eqn{\phi} contains a latent spatial trend as well as additional variation around it. If you would like to extract the latent/implicit spatial trend from \eqn{\phi}, you can do so by calculating: #' \deqn{ -#' \rho W \phi. +#' \rho C \phi. #' } #' #' ## Additional functionality @@ -163,7 +170,7 @@ #' @return An object of class class \code{geostan_fit} (a list) containing: #' \describe{ #' \item{summary}{Summaries of the main parameters of interest; a data frame.} -#' \item{diagnostic}{Widely Applicable Information Criteria (WAIC) with a measure of effective number of parameters (\code{eff_pars}) and mean log pointwise predictive density (\code{lpd}), and mean residual spatial autocorrelation as measured by the Moran coefficient.} +#' \item{diagnostic}{Residual spatial autocorrelation as measured by the Moran coefficient.} #' \item{stanfit}{an object of class \code{stanfit} returned by \code{rstan::stan}} #' \item{data}{a data frame containing the model data} #' \item{family}{the user-provided or default \code{family} argument used to fit the model} @@ -198,32 +205,97 @@ #' LeSage, James, & Pace, Robert Kelley (2009). *Introduction to Spatial Econometrics*. Chapman and Hall/CRC. #' #' @examples -#' # model mortality risk +#' +#' ## +#' ## simulate SAR data on a regular grid +#' ## +#' +#' sars <- prep_sar_data2(row = 10, col = 10, quiet = TRUE) +#' w <- sars$W +#' +#' # draw x +#' x <- sim_sar(w = w, rho = 0.5) +#' +#' # draw y = mu + rho*W*(y - mu) + epsilon +#' # beta = 0.5, rho = 0.5 +#' y <- sim_sar(w = w, rho = .5, mu = 0.5 * x) +#' dat <- data.frame(y = y, x = x) +#' +#' ## +#' ## fit SEM +#' ## +#' +#' fit_sem <- stan_sar(y ~ x, data = dat, sar = sars, +#' chains = 1, iter = 800) +#' print(fit_sem) +#' +#' ## +#' ## data for SDEM +#' ## +#' +#' # mu = x*beta + wx*gamma; beta=1, gamma=-0.25 +#' x <- sim_sar(w = w, rho = 0.5) +#' mu <- 1 * x - 0.25 * (w %*% x)[,1] +#' y <- sim_sar(w = w, rho = .5, mu = mu) +#' # or for SDLM: +#' # y <- sim_sar(w = w, rho = 0.5, mu = mu, type = "SLM") +#' dat <- data.frame(y=y, x=x) +#' +#' # +#' ## fit models +#' ## +#' +#' # DSEM +#' # y = mu + rho*W*(y - mu) + epsilon +#' # mu = beta*x + gamma*Wx +#' fit_sdem <- stan_sar(y ~ x, data = dat, +#' sar_parts = sars, type = "SDEM", +#' iter = 800, chains = 1, +#' quiet = TRUE) +#' +#' # SDLM +#' # y = rho*Wy + beta*x + gamma*Wx + epsilon +#' fit_sdlm <- stan_sar(y ~ x, data = dat, +#' sar_parts = sars, +#' type = "SDLM", +#' iter = 800, +#' chains = 1, +#' quiet = TRUE) +#' +#' # compare by DIC +#' dic(fit_sdem) +#' dic(fit_sdlm) +#' +#' \donttest{ +#' ## +#' ## Modeling mortality rates +#' ## +#' +#' # simple spatial regression #' data(georgia) #' W <- shape2mat(georgia, style = "W") #' #' fit <- stan_sar(log(rate.male) ~ 1, #' C = W, #' data = georgia, -#' chains = 1, # for ex. speed only -#' iter = 700 +#' iter = 900 #' ) #' -#' rstan::stan_rhat(fit$stanfit) -#' rstan::stan_mcse(fit$stanfit) -#' print(fit) -#' plot(fit) +#' # view fitted vs. observed, etc. #' sp_diag(fit, georgia) #' -#' \donttest{ -#' # a more appropriate model for count data: +#' # A more appropriate model for count data: +#' # hierarchical spatial poisson model #' fit2 <- stan_sar(deaths.male ~ offset(log(pop.at.risk.male)), #' C = W, #' data = georgia, #' family = poisson(), #' chains = 1, # for ex. speed only -#' iter = 700 +#' iter = 900, +#' quiet = TRUE #' ) +#' +#' # view fitted vs. observed, etc. #' sp_diag(fit2, georgia) #' } #' @export @@ -338,8 +410,8 @@ stan_sar <- function(formula, W <- C if (!inherits(W, "sparseMatrix")) W <- as(W, "CsparseMatrix") - ## xrs <- Matrix::rowSums(W) - ## if (!all(xrs == 1)) W <- row_standardize(W, msg = "Row standardizing connectivity matrix to calculate spatially lagged covaraite(s)") + xrs <- Matrix::rowSums(W) + if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") # nb: in stan_sar, W is taken from sar_parts, not calculated here. Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = W, Durbin = Durbin) @@ -478,9 +550,7 @@ stan_sar <- function(formula, rmc <- mean( apply(R, 1, mc, w = C, warn = FALSE, na.rm = TRUE) ) out$diagnostic$Residual_MC <- rmc } - if (any(pars == 'fitted')) { - out$diagnostic$WAIC <- as.numeric(waic(out)[1]) - } + return (out) } diff --git a/README.Rmd b/README.Rmd index d1ef8d63..166b4b88 100644 --- a/README.Rmd +++ b/README.Rmd @@ -25,7 +25,7 @@ Introductions to the software can be found at [r-spatial.org](https://r-spatial. Features include: -* **Spatial regression and disease mapping** Statistical models for data recorded across areal units like states, counties, or census tracts. +* **Spatial regression and disease mapping** Statistical models for data recorded across areal units (states, counties, or census tracts) or networks, including spatial econometric models. * **Spatial analysis tools** Tools for visualizing and measuring spatial autocorrelation and map patterns, for exploratory analysis and model diagnostics. * **Observational uncertainty** Incorporate information on data reliability, such as standard errors of American Community Survey estimates, into any geostan model. * **Missing and Censored observations** Vital statistics and disease surveillance systems like CDC Wonder censor case counts that fall below a threshold number; geostan can model disease or mortality risk for small areas with censored observations or with missing observations. diff --git a/_pkgdown.yml b/_pkgdown.yml index 9cd41094..e24c8342 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -8,6 +8,7 @@ home: articles: - title: "Package vignettes" contents: + - spatial-analysis - spatial-weights-matrix - measuring-sa - raster-regression @@ -35,15 +36,19 @@ reference: - mc - moran_plot - n_eff - - sp_diag -- title: "Spatial models" + - sp_diag +- title: "Model prep" - contents: - make_EV - - starts_with("prep_") + - starts_with("prep_") +- title: "Spatial models" +- contents: - starts_with("stan_") -- title: "Methods and diagnostics for spatial models" +- title: "Methods and diagnostics" - contents: - matches("geostan_fit") + - impacts + - spill - me_diag - spatial - posterior_predict diff --git a/docs/404.html b/docs/404.html index 9f9248ab..9adc22b2 100644 --- a/docs/404.html +++ b/docs/404.html @@ -39,7 +39,7 @@ geostan - 0.7.0 + 0.8.0 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 0201ab44..ef459082 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ geostan - 0.7.0 + 0.8.0 diff --git a/docs/LICENSE.html b/docs/LICENSE.html index 7d800022..110a0745 100644 --- a/docs/LICENSE.html +++ b/docs/LICENSE.html @@ -17,7 +17,7 @@ geostan - 0.7.0 + 0.8.0 diff --git a/docs/articles/index.html b/docs/articles/index.html index b4287946..17104171 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,7 +17,7 @@ geostan - 0.7.0 + 0.8.0 @@ -53,7 +53,9 @@

Articles

Package vignettes

-
Spatial weights matrix
+
Spatial analysis with geostan
+
+
Spatial weights matrix
Exploratory spatial data analysis
diff --git a/docs/articles/measuring-sa.html b/docs/articles/measuring-sa.html index 9159e840..ad6cacd4 100644 --- a/docs/articles/measuring-sa.html +++ b/docs/articles/measuring-sa.html @@ -304,10 +304,7 @@

Model diagnostics#> *Setting prior parameters for alpha_tau #> Distribution: student_t #> df location scale -#> 1 10 0 3 -#> Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable. -#> Running the chains for more iterations may help. See -#> https://mc-stan.org/misc/warnings.html#bulk-ess +#> 1 10 0 3

For a summary of model results:

diff --git a/docs/articles/measuring-sa_files/figure-html/unnamed-chunk-14-1.png b/docs/articles/measuring-sa_files/figure-html/unnamed-chunk-14-1.png index 3a23650d..5e4a05e0 100644 Binary files a/docs/articles/measuring-sa_files/figure-html/unnamed-chunk-14-1.png and b/docs/articles/measuring-sa_files/figure-html/unnamed-chunk-14-1.png differ diff --git a/docs/articles/raster-regression.html b/docs/articles/raster-regression.html index 53048e94..d3b813bb 100644 --- a/docs/articles/raster-regression.html +++ b/docs/articles/raster-regression.html @@ -88,19 +88,20 @@

April 27, 2023

-

This vignette provides a tutorial for fitting spatial regression models to raster data using geostan. The term “raster” is used here to refer to any regularly spaced set of observations such that the data can be represented spatially by a rectangular grid. Remotely sensed imagery is a common form of raster data.

-

geostan can be used for spatial regression with fairly large raster data layers, although the functionality of these models will often be limited to the estimation of regression coefficients and spatial autocorrelation parameters. Limited experience thus far finds that geostan’s spatial autoregressive models can be fit to raster layers with two hundred thousand observations using a laptop computer and fewer than ten minutes of sampling time.

+

This vignette provides a tutorial for fitting spatial regression models to raster data using geostan. The term “raster” is used here to refer to any regularly spaced set of observations such that the data can be represented spatially by a rectangular grid or lattice. Remotely sensed imagery is a common form of raster data.

+

For an irregular spatial lattice and moderately big N, the best one can do (for now) is to save the sar_list or car_list in a file (using saveRDS(sar_list, "sar-parts.rds") or similar) so that it only needs to be calculated once.

Demonstration

-

Start by loading some necessary R packages.

+

Start by loading the geostan and sf packages:

We will create a small raster data layer for the purpose of illustration.

-row <- 40
+# creating a grid
+row <- 40
 col <- 30
 c(N <- row * col)
## [1] 1200
@@ -108,53 +109,51 @@

Demonstrationsfc = st_sfc(st_polygon(list(rbind(c(0,0), c(col,0), c(col,row), c(0,0))))) grid <- st_make_grid(sfc, cellsize = 1, square = TRUE) grid <- st_as_sf(grid) -W <- shape2mat(grid, style = "W", queen = FALSE)

-
## Warning in shape2mat(grid, style = "W", queen = FALSE): The 'queen' argument is
-## deprecated. Use 'method' instead.
-
## Contiguity condition: rook
-
## Number of neighbors per unit, summary:
-
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-##   2.000   4.000   4.000   3.883   4.000   4.000
-
## 
-## Spatial weights, summary:
-
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-##  0.2500  0.2500  0.2500  0.2575  0.2500  0.5000
-
-grid$z <- sim_sar(w = W, rho = 0.9)
-grid$y <- -0.5 * grid$z + sim_sar(w = W, rho = .9, sigma = .3)
-
-plot(grid[,'z'])
+ +# create connectivity matrix +W <- shape2mat(grid, style = "W", method = "rook", quiet = TRUE) + +# draw data from a spatial autoregressive model +set.seed(100) +grid$z <- sim_sar(rho = 0.8, w = W) +grid$y <- sim_sar(mu = -0.5 * grid$z, rho = .9, sigma = .3, w = W) +
+plot(grid[ , 'y' ])

The following R code will fit a spatial autoregressive model to these data:

-
+
 fit <- stan_sar(y ~ z, data = grid, C = W)
-

The stan_sar function will take the spatial weights matrix W and pass it through a function called prep_sar_data which will calculate the eigenvalues of the spatial weights matrix using base::eigen, as required for computational reasons. This step is prohibitive for large data sets (e.g., \(N = 100,000\)).

+

The stan_sar function will take the spatial weights matrix W and pass it through a function called prep_sar_data which will calculate the eigenvalues of the spatial weights matrix using Matrix::Schur. This step can be prohibitive for large data sets (e.g., \(N = 100,000\)).

The following code would normally be used to fit a conditional autoregressive (CAR) model:

-
+
 C <- shape2mat(grid, style = "B", queen = FALSE)
 car_list <- prep_car_data(C, "WCAR")
 fit <- stan_car(y ~ z, data = grid, car_parts = car_list)
-

Here, the prep_car_data function calculates the eigenvalues of the spatial weights matrix using base::eigen, which is not feasible for large N.

-

The prep_sar_data2 and prep_car_data2 functions are designed for raster layers. As input, they require the dimensions of the grid (number of rows and number of columns). The eigenvalues are produced very quickly using Equation 5 from Griffith (2000). The methods have certain restrictions. First, this is only applicable to raster layers—regularly spaced, rectangular grids of observations. Second, to define which observations are adjacent to one another, the “rook” criteria is used (spatially, only observations that share an edge are defined as neighbors to one another). Third, the spatial adjacency matrix will be row-standardized. This is standard (and required) for SAR models, and it corresponds to the “WCAR” specification of the CAR model (see Donegan 2022).

-

The following code will fit a SAR model to our grid data, and is suitable for much larger raster layers:

-
-sar_list <- prep_sar_data2(row = row, col = col)
-
## Range of permissible rho values: -1, 1
-
-fit <- stan_sar(y ~ z, 
+

Here, the prep_car_data function calculates the eigenvalues of the spatial weights matrix using Matrix::Schur, which is not feasible for large N.

+

The prep_sar_data2 and prep_car_data2 functions are designed for large raster layers. As input, they require the dimensions of the grid (number of rows and number of columns). The eigenvalues are produced very quickly using Equation 5 from Griffith (2000). The methods have certain restrictions. First, this is only applicable to raster layers—regularly spaced, rectangular grids of observations. Second, to define which observations are adjacent to one another, the “rook” criteria is used (spatially, only observations that share an edge are defined as neighbors to one another). Third, the spatial adjacency matrix will be row-standardized. This is common anyways for SAR and CAR models (it corresponds to the “WCAR” specification of the CAR model (see Donegan 2022)).

+

The following code will fit a SAR model to our grid data (without any use of shape2mat) and is suitable for larger raster layers:

+
+# create connectivity matrix and its eigenvalues
+sars <- prep_sar_data2(row = row, col = col, quiet = TRUE)
+
+# if you want the matrix
+W <- sars$W
+
+# fit model
+fit <- stan_sar(y ~ z, 
       data = grid,
-          centerx = TRUE,
-          sar_parts = sar_list,
-          iter = 500,
-          chains = 4,
-          slim = TRUE #,
-          # cores = 4, # for multi-core processing
-        )
+ centerx = TRUE, + sar_parts = sars, + iter = 500, + chains = 2, # for demo speed + # cores = 4, # multi-core processing + slim = TRUE + )
## 
 ## *Setting prior parameters for intercept
## Distribution: normal
##   location scale
-## 1    0.035     5
+## 1 0.18 5
## 
 ## *Setting prior parameters for beta
 ## Distribution: normal
@@ -173,8 +172,8 @@

Demonstration## ## SAMPLING FOR MODEL 'foundation' NOW (CHAIN 1). ## Chain 1: -## Chain 1: Gradient evaluation took 0.003847 seconds -## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 38.47 seconds. +## Chain 1: Gradient evaluation took 0.002153 seconds +## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 21.53 seconds. ## Chain 1: Adjust your expectations accordingly! ## Chain 1: ## Chain 1: @@ -182,15 +181,15 @@

Demonstration## Chain 1: Iteration: 251 / 500 [ 50%] (Sampling) ## Chain 1: Iteration: 500 / 500 [100%] (Sampling) ## Chain 1: -## Chain 1: Elapsed Time: 2.749 seconds (Warm-up) -## Chain 1: 1.958 seconds (Sampling) -## Chain 1: 4.707 seconds (Total) +## Chain 1: Elapsed Time: 3.107 seconds (Warm-up) +## Chain 1: 2.082 seconds (Sampling) +## Chain 1: 5.189 seconds (Total) ## Chain 1: ## ## SAMPLING FOR MODEL 'foundation' NOW (CHAIN 2). ## Chain 2: -## Chain 2: Gradient evaluation took 0.001092 seconds -## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 10.92 seconds. +## Chain 2: Gradient evaluation took 0.001209 seconds +## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 12.09 seconds. ## Chain 2: Adjust your expectations accordingly! ## Chain 2: ## Chain 2: @@ -198,44 +197,15 @@

Demonstration## Chain 2: Iteration: 251 / 500 [ 50%] (Sampling) ## Chain 2: Iteration: 500 / 500 [100%] (Sampling) ## Chain 2: -## Chain 2: Elapsed Time: 2.375 seconds (Warm-up) -## Chain 2: 1.965 seconds (Sampling) -## Chain 2: 4.34 seconds (Total) -## Chain 2: -## -## SAMPLING FOR MODEL 'foundation' NOW (CHAIN 3). -## Chain 3: -## Chain 3: Gradient evaluation took 0.001192 seconds -## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 11.92 seconds. -## Chain 3: Adjust your expectations accordingly! -## Chain 3: -## Chain 3: -## Chain 3: Iteration: 1 / 500 [ 0%] (Warmup) -## Chain 3: Iteration: 251 / 500 [ 50%] (Sampling) -## Chain 3: Iteration: 500 / 500 [100%] (Sampling) -## Chain 3: -## Chain 3: Elapsed Time: 2.396 seconds (Warm-up) -## Chain 3: 1.556 seconds (Sampling) -## Chain 3: 3.952 seconds (Total) -## Chain 3: -## -## SAMPLING FOR MODEL 'foundation' NOW (CHAIN 4). -## Chain 4: -## Chain 4: Gradient evaluation took 0.00098 seconds -## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 9.8 seconds. -## Chain 4: Adjust your expectations accordingly! -## Chain 4: -## Chain 4: -## Chain 4: Iteration: 1 / 500 [ 0%] (Warmup) -## Chain 4: Iteration: 251 / 500 [ 50%] (Sampling) -## Chain 4: Iteration: 500 / 500 [100%] (Sampling) -## Chain 4: -## Chain 4: Elapsed Time: 2.569 seconds (Warm-up) -## Chain 4: 1.491 seconds (Sampling) -## Chain 4: 4.06 seconds (Total) -## Chain 4:

-
-print(fit)      
+## Chain 2: Elapsed Time: 3.389 seconds (Warm-up) +## Chain 2: 2.063 seconds (Sampling) +## Chain 2: 5.452 seconds (Total) +## Chain 2: +
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
+## Running the chains for more iterations may help. See
+## https://mc-stan.org/misc/warnings.html#bulk-ess
+
+print(fit)
## Spatial Model Results 
 ## Formula: y ~ z
 ## Spatial method (outcome):  SAR 
@@ -244,23 +214,40 @@ 

Demonstration## Observations: 1200 ## Data models (ME): none ## Inference for Stan model: foundation. -## 4 chains, each with iter=500; warmup=250; thin=1; -## post-warmup draws per chain=250, total post-warmup draws=1000. +## 2 chains, each with iter=500; warmup=250; thin=1; +## post-warmup draws per chain=250, total post-warmup draws=500. ## ## mean se_mean sd 2.5% 20% 50% 80% 97.5% n_eff Rhat -## intercept 0.034 0.003 0.078 -0.121 -0.031 0.033 0.101 0.181 709 0.998 -## z -0.510 0.000 0.009 -0.527 -0.518 -0.510 -0.502 -0.490 1129 0.998 -## sar_rho 0.874 0.001 0.016 0.843 0.859 0.873 0.887 0.904 758 0.998 -## sar_scale 0.312 0.000 0.007 0.298 0.306 0.312 0.317 0.326 955 0.999 +## intercept 0.185 0.004 0.078 0.028 0.117 0.187 0.247 0.341 419 1.002 +## z -0.501 0.000 0.008 -0.517 -0.508 -0.501 -0.494 -0.484 478 0.997 +## sar_rho 0.888 0.001 0.013 0.862 0.877 0.890 0.900 0.914 495 0.998 +## sar_scale 0.306 0.000 0.006 0.295 0.301 0.306 0.311 0.319 595 0.999 ## -## Samples were drawn using NUTS(diag_e) at Tue Sep 17 21:10:00 2024. +## Samples were drawn using NUTS(diag_e) at Wed Oct 30 13:33:12 2024. ## For each parameter, n_eff is a crude measure of effective sample size, ## and Rhat is the potential scale reduction factor on split chains (at ## convergence, Rhat=1).

The user first creates the data list using prep_sar_data2 and then passes it to stan_sar using the sar_parts argument. Also, slim = TRUE is invoked to prevent the model from collecting N-length parameter vectors and quantities of interest (such as fitted values and log-likelihoods).

-

For large data sets and complex models, slim = TRUE can bring about computational improvements at the cost of losing some functionality (including the loss of convenience functions like sp_diag, me_diag, spatial, resid, and fitted). Many quantities of interest, such as fitted values and spatial trend terms, can still be calculated manually using the data and parameter estimates (intercept, coefficients, and spatial autocorrelation parameters).

-

The favorable MCMC diagnostics for this model (sufficiently large effective sample sizes n_eff, and Rhat values very near to 1), based on just 250 post-warmup iterations per chain with four MCMC chains, provides some indication as to how computationally efficient these spatial autoregressive models can be.

-

Also, note that Stan usually samples more efficiently when variables have been mean-centered. Using the centerx = TRUE argument in stan_sar (or any other model-fitting function in geostan) can be very helpful in this respect. Also note that the SAR models in geostan are (generally) no less computationally-efficient than the CAR models, and may even be slightly more efficient.

+
+
+

Discussion +

+

For large data sets and complex models, slim = TRUE can bring about computational improvements at the cost of losing access to some convenience functions (such as sp_diag, me_diag, spatial, resid, and fitted). Many quantities of interest, such as fitted values and spatial trend terms, can still be calculated manually using the data and parameter estimates (intercept, coefficients, and spatial autocorrelation parameters).

+

The favorable MCMC diagnostics for the above model based on just 200 post-warmup iterations per chain (sufficiently large effective sample sizes n_eff, and Rhat values very near to 1) provides some indication as to the computational efficiency of the models. The point is that the basic spatial autoregressive models can work well for larger data because you only need a modest number of MCMC samples.

+

Also, note that Stan usually samples more efficiently when variables have been mean-centered. Using the centerx = TRUE argument in stan_sar (or any other model-fitting function in geostan) can be very helpful in this respect. Also note that the SAR models in geostan are (generally) no less computationally-efficient than the CAR models, and may even be slightly more efficient.

+
+
+

Simulating spatial data +

+

To simulate spatially-autocorrelated data on a regular grid use the quick = TRUE argument in sim_sar:

+
+row = 100
+col = 100
+sar_list <- prep_sar_data2(row = row, col = col)
+W <- sar_list$W
+z <- sim_sar(rho = .8, w = W, quick = TRUE)
+y <- sim_sar(mu = -.5 * z, rho = .9, w = W, quick = TRUE)
+dat <- cbind(y, z)

References diff --git a/docs/articles/raster-regression_files/figure-html/unnamed-chunk-2-1.png b/docs/articles/raster-regression_files/figure-html/unnamed-chunk-2-1.png index 135ff5c2..c1a17d8b 100644 Binary files a/docs/articles/raster-regression_files/figure-html/unnamed-chunk-2-1.png and b/docs/articles/raster-regression_files/figure-html/unnamed-chunk-2-1.png differ diff --git a/docs/articles/spatial-analysis.html b/docs/articles/spatial-analysis.html new file mode 100644 index 00000000..c579978d --- /dev/null +++ b/docs/articles/spatial-analysis.html @@ -0,0 +1,640 @@ + + + + + + + +Spatial analysis with geostan • geostan + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + +

This vignette was first publisehd as a post on r-spatial.org.

+
+

Summary +

+

This vignette introduces the geostan R package for spatial analysis. The package is mainly oriented towards areal data, although some models may also be used for other spatial data types, such as network data. The package implements the spatial error/simultaneous spatial autoregressive (SAR) model, conditional autoregressive (CAR) model, and eigenvector spatial filter (ESF) models for spatial regression. A version of ESF modelling also appears in the ecology literature as principle coordinate analysis of neighbor matrices (PCNM) (Griffith and Peres-Neto 2006).

+

geostan also supports the application of the above regression methods to hierarchical models for count data, as is common in analyses of disease incidence or mortality in small areas (‘disease mapping’). Additional features of the software include models for sampling/measurement error in covariates and methods for handling censored count data, such as when mortality or disease counts have been censored for privacy. The models were built using the Stan modeling language, so all inference is completed using Markov chain Monte Carlo (MCMC) sampling (Stan Development Team 2023; Gabry et al. 2024). The spatial autoregressive models use custom-built Stan functions that speed up MCMC sampling considerably (Donegan 2021).

+

This post will walk through an example analysis using international data on life expectancy and per capita GDP. Package vignettes can be found with the online documentation, including an introduction to spatial weights matrices, exploratory spatial data analysis, spatial measurement error models, raster regression, and using geostan to build custom spatial models with Stan. A paper in the Journal of Open Source Software reports these and other features and provides the recommended citation when using geostan (Donegan 2022).

+
+
+

Installation +

+

geostan is currently on CRAN, although that may not always be the case. You can also install directly from geostan’s github repository.

+
+

From github +

+

You can install from the package github repository:

+
+if (!require('devtools')) install.packages('devtools')
+devtools::install_github("connordonegan/geostan")
+

If you are using Windows and installing using install_github, you may need to install Rtools first. Rtools is not needed when installing from CRAN. You may also contact the author by e-mail for a pre-compiled version that you can use without Rtools.

+

If you are using Mac and installing with install_github then you may need to install Xcode Command Line Tools first.

+
+
+

From CRAN +

+

Using your R console, you can install from CRAN as follows:

+
+install.packages("geostan")
+
+
+
+

Getting started +

+

To begin, load the geostan and sf packages into your R environment, as well as the world data:

+
+library(geostan)
+library(sf)
+data(world, package = "spData")
+

The world data contains life expectancy and gross domestic product (GDP) per capita (presumably measured in current $US) for 161 countries as of 2014, gathered from the World Bank. The rest of this post is going to be structured around a bivariate analysis of these variables.

+

We are going to apply the Robinson map projection for the countries:

+
+world <- st_transform(world, crs = 'ESRI:54030')
+

At least a couple of the missing values can be filled in using World Bank data, so we will do that:

+
+## https://data.worldbank.org
+france <- grep("France", world$name_long)
+world$gdpPercap[ france ] <- 43068
+world$lifeExp[ france ] <- 82
+
+norway <- grep("Norway", world$name_long)
+world$gdpPercap[ norway ] <- 97666
+world$lifeExp[ norway ] <- 82.1
+

And we will also remove Antarctica:

+
+world <- subset(world, name_long != "Antarctica")
+

Mapping the variables shows the familiar geography of high-, middle-, and low-income countries and a similar geography of longevity:

+
+# store geometry for countries
+world_geom <- st_geometry(world)
+
+# show two maps at once, with nice font
+ogpar <- par(mfrow = c(2, 1),
+             mar = rep(0, 4))
+
+# GDP per capita
+pars <- map_pars(world$gdpPercap / 1e3)
+plot(world_geom,
+     col = pars$col,
+     lwd = .2)
+legend("bottomleft",
+       fill = pars$pal,
+       title = 'GDP per capita\n($1,000s)',
+       legend = pars$lbls,
+       bty = 'n'
+)
+rm(pars)
+
+# life expectancy
+pars <- map_pars(world$lifeExp)
+plot(world_geom,
+  col = pars$col,
+  lwd = .2)
+legend("left",
+     fill = pars$pal,
+     title = 'Life Expectancy',
+     legend = pars$lbls,
+     bty = 'n'
+     )
+
+*Choropleth maps of GDP per capita and life expectancy.*

+Choropleth maps of GDP per capita and life expectancy. +

+
+
+par(ogpar)
+

The map_pars function breaks the variables into quantiles and returns breaks, colors, labels for the maps; it can be found at the end of the post. There will be no discussion of substantive (non-statistical) issues here, for which one can consult any number of texts on global power and inequality (such as Birn, Pillay, and Holz’s Textbook of International Health or Paul Farmer’s Infections and Inequalities).

+

By conventional methods, the correlation coefficient for life expectancy and log GDP per capita is 0.81:

+
+log_x <- log10( world$gdpPercap )
+y <- world$lifeExp
+cor.test(log_x, y)
+
## 
+##  Pearson's product-moment correlation
+## 
+## data:  log_x and y
+## t = 17.394, df = 160, p-value < 2.2e-16
+## alternative hypothesis: true correlation is not equal to 0
+## 95 percent confidence interval:
+##  0.7478143 0.8561778
+## sample estimates:
+##       cor 
+## 0.8087528
+

The conventional assessment is based on the proposition that we have 161 independent observations. The visible geography of the variables, and any level of social awareness, indicates that these are not independent observations. Rather, there are various functional regions of countries that share basic political-economic conditions. A lot, but not all, of the variation can be described as variation across continents and regions. We will want to account for this dependence using a spatial model (for background see Chun and Griffith 2012; Donegan 2024). The first step will be to construct a spatial weights matrix.

+
+
+

Adjacency matrix +

+

This section will illustrate use of two geostan functions for creating an revising a spatial weights matrix: shape2mat and edges. The shape2mat function may be helpful for some users but one can always do this using spdep or other methods, especially if shape2mat does not provide the exact method you’re looking for.

+

We are going to start by removing the 15 countries that are missing values:

+
+## remove missing values
+world <- subset(world, !is.na(gdpPercap) & !is.na(lifeExp))
+
+## leaving 162 observations
+nrow(world)
+
## [1] 162
+

Now we can apply the shape2mat function to obtain an adjacency matrix that encodes spatial adjacency relations for countries into a binary N-by-N matrix. The function uses spdep to find adjacency relations and returns results as a sparse matrix (using the Matrix package):

+
+A <- shape2mat(world, "B", method = "rook")
+
## Warning in spdep::poly2nb(shape, queen = queen, snap = snap): some observations have no neighbours;
+## if this seems unexpected, try increasing the snap argument.
+
## Warning in spdep::poly2nb(shape, queen = queen, snap = snap): neighbour object has 19 sub-graphs;
+## if this sub-graph count seems unexpected, try increasing the snap argument.
+
## Contiguity condition: rook
+
## Number of neighbors per unit, summary:
+
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   0.000   2.000   3.000   3.605   5.000  13.000
+
## 
+## Spatial weights, summary:
+
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       1       1       1       1       1       1
+

Visualizing the connections in the matrix is important for uncovering unexpected results. geostan’s edges function converts the matrix into a list of nodes and edges that we can plot. For this we need to supply the function with the adjacency matrix, A, and the associated spatial object, world:

+
+# edges with geometry
+E <- edges(A, shape = world) 
+graph <- st_geometry(E)
+
+ogpar <- par(mar = rep(0, 4))
+# plot countries
+plot(world_geom, lwd = .1)
+# add graph nodes
+plot(graph, add = TRUE, type = 'p')
+# add graph edges
+plot(graph, add = TRUE, type = 'l')
+

+
+par(ogpar)
+

This reveals quite a few unexpected results. French Guiana is stored in the world data as part of France (a multi-part polygon); this is correct of course but it leads to Brazil and Suriname being listed as neighbors of France, which is not sensible. Besides removing those connections, there are a number of island nations that we might want to connect to nearby places.

+

To connect Mozambique to Madagascar, we just replace the zeroes with ones in the slots that correspond to those countries. First we grab their index positions in the matrix:

+
+moz_idx <- grep("Mozambique", world$name_long)
+mad_idx <- grep("Madagascar", world$name_long)
+

And then we assign the correct slots in the matrix a value of 1 (or TRUE), remembering that the adjacency matrix is symmetric:

+
+A[moz_idx, mad_idx] <- A[mad_idx, moz_idx] <- TRUE
+

This can become tedious but it is important. Before moving on, we will make a series of adjustments. This will be made a bit easier with this convenience function:

+
+connect <- function(country_a, country_b,
+    names_vec = world$name_long, matrix = A, add = TRUE) {
+  stopifnot( country_a %in% names_vec )
+  stopifnot( country_b %in% names_vec )
+  a_idx <- which(names_vec == country_a)
+  b_idx <- which( names_vec == country_b)
+  matrix[a_idx, b_idx] <- matrix[b_idx, a_idx] <- add
+  return( matrix )
+}
+

The following are at least reasonable changes to make; they also ensure that every country has at least one neighbor:

+
+A <- connect("Mozambique", "Madagascar")
+A <- connect("Australia", "New Zealand")
+A <- connect("Philippines", "Malaysia")
+A <- connect("Japan", "Republic of Korea")
+A <- connect("Fiji", "Vanuatu")
+A <- connect("Solomon Islands", "Vanuatu")
+A <- connect("Solomon Islands", "Papua New Guinea")
+A <- connect("Australia", "Papua New Guinea")
+A <- connect("Haiti", "Jamaica")
+A <- connect("Bahamas", "United States")
+A <- connect("Dominican Republic", "Puerto Rico")
+A <- connect("Trinidad and Tobago", "Venezuela")
+A <- connect("Sri Lanka", "India")
+A <- connect("Cyprus", "Turkey")
+A <- connect("Cyprus", "Lebanon")
+A <- connect("Norway", "Iceland")
+
+## remove connections between South American and France
+A <- connect("Suriname", "France", add = FALSE)
+A <- connect("Brazil", "France", add = FALSE)
+

We should look at the revised adjacency matrix:

+
+graph <- st_geometry( edges(A, shape = world) )
+ogpar <- par(mar = rep(0, 4))
+plot(world_geom, lwd = .1)
+plot(graph, add = TRUE, type = 'p')
+plot(graph, add = TRUE, type = 'l')
+

+
+par(ogpar)
+

Sometimes it can help to examine the edge list interactively using a proper geographic information system like QGIS. For those who are familiar with (non-R) GIS software, you can save the edges list as a GeoPackage and then open it up in your GIS to examine the connections ‘by hand’ with a base map or other data:

+
+E <- edges(A, shape = world)
+st_write(E, "world.gpkg", layer = "edge list")
+
+
+

Non-spatial regression +

+

Fitting regression models with geostan is similar to using base R’s glm function: the user provides a model formula, data, and the model family or distribution. We can fit a normal linear model using the stan_glm function:

+
+fit_lm <- stan_glm(lifeExp ~ log(gdpPercap), data = world, iter = 800, quiet = TRUE)
+

The iter argument will be discussed below; we used iter = 800 (lower than the default) to keep this demo running quickly. We can examine parameter estimates by printing to the console:

+
+print(fit_lm)
+
## Spatial Model Results 
+## Formula: lifeExp ~ log(gdpPercap)
+## Spatial method (outcome):  none 
+## Likelihood function:  gaussian 
+## Link function:  identity 
+## Observations:  162 
+## 
+## Inference for Stan model: foundation.
+## 4 chains, each with iter=800; warmup=400; thin=1; 
+## post-warmup draws per chain=400, total post-warmup draws=1600.
+## 
+##                  mean se_mean    sd   2.5%    20%    50%    80%  97.5% n_eff
+## intercept      20.636   0.125 2.835 14.536 18.388 20.723 22.935 26.020   513
+## log(gdpPercap)  5.501   0.014 0.306  4.926  5.246  5.489  5.739  6.152   512
+## sigma           4.904   0.010 0.258  4.431  4.687  4.900  5.121  5.428   682
+##                 Rhat
+## intercept      1.001
+## log(gdpPercap) 1.001
+## sigma          1.001
+## 
+## Samples were drawn using NUTS(diag_e) at Wed Oct 30 14:14:46 2024.
+## For each parameter, n_eff is a crude measure of effective sample size,
+## and Rhat is the potential scale reduction factor on split chains (at 
+## convergence, Rhat=1).
+

The output printed to the console provides a summary of the posterior probability distributions of the model parameters. The distributions can also be visualized using plot(fit_lm):

+
+plot(fit_lm)
+
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+

+

The mean of the distribution is reported in the mean column. For those who are more familiar with concepts from sampling theory, the mean may be understood as the estimate of the parameter. Each distribution’s standard deviation is found in the sd column; this describes the width of the posterior distribution. The sd is analogous to the standard error of the estimate. The quantiles also summarize the width of the posterior distributions; the 2.5% and 97.5% values form a 95% credible interval for the parameter value.

+
+

MCMC output +

+

The effective sample size (ESS), n_eff, tells us how many independent MCMC samples the inference is based on after adjusting for serial autocorrelation in the MCMC samples. This is an important quantity to pay attention to and generally one might like to see these numbers above 400 (or around 100 samples per MCMC chain). The standard error of the mean, se_mean, reports how much MCMC sampling error to expect in the mean (se_mean is calculated using n_eff). The R-hat statistic, Rhat, should always be very close to 1, preferably less than 1.01. The R-hat diagnostic tests that the MCMC chains are all depicting the same distribution. If they diverge from one another, it either means that you need to draw a higher number of MCMC samples (run the chains for longer) or that there is a problem fitting the model to your data.

+

By default, geostan models run four independent MCMC chains for 2,000 iterations each, half of which is discarded as warm-up. The number of iterations is controlled by the iter argument, the default being iter = 2e3. For some models this may be too low and you will want to increase this. Other times this might be more than is needed in which case you can reduce the computation time by using fewer iterations. What matters most is not your number of iterations but your ESS and R-hat statistics. When it comes to reporting results, it is generally best to use at least the default of four MCMC chains (chains = 4).

+
+
+

Methods +

+

A number of familiar methods are available for working with geostan models including fitted, resid, and predict.

+

The fitted method returns a data.frame with summaries of the fitted values. The probability distribution for each fitted value is summarized by its posterior mean, standard deviation, and quantiles:

+
+fdf <- fitted(fit_lm)
+head(fdf)
+
##               mean        sd     2.5%      20%      50%      80%    97.5%
+## fitted[1] 70.22391 0.3870898 69.47704 69.90635 70.21942 70.53512 70.98524
+## fitted[2] 63.45509 0.5750963 62.28044 62.97688 63.46336 63.92982 64.60126
+## fitted[3] 79.33440 0.5945789 78.16096 78.84701 79.33105 79.80633 80.51190
+## fitted[4] 80.36143 0.6392709 79.13138 79.83129 80.36124 80.86828 81.63577
+## fitted[5] 76.02107 0.4691229 75.07082 75.62826 76.02685 76.40738 76.94502
+## fitted[6] 67.88134 0.4247286 67.06831 67.52983 67.87242 68.23277 68.74445
+

The resid method behaves similarly. Examining the Moran scatter plot using the residuals shows a moderate degree of positive SA as well as some skewness:

+
+rdf <- resid(fit_lm)
+moran_plot(rdf$mean, A)
+

+
+
+
+

Spatial regression +

+

Options for spatial regression models currently include conditional autoregressive (CAR), simultaneous autoregressive (SAR/spatial error), and eigenvector spatial filtering (ESF). For count data, common variations on the intrinsic autoregressive (ICAR) model are also available.

+

All of the spatial models require at least a spatial weights matrix as input. All additional requirements for data preparation are handled by geostan’s prep_ functions: prep_car_data, prep_icar_data, prep_sar_data.

+

The make_EV function is used to create Moran’s eigenvectors for ESF regression; if you want to create your own eigenvectors (say, following the PCNM method) you can provide those directly to the ESF model (see ?stan_esf).

+

For the CAR model, we always provide the binary adjacency matrix as input to prep_car_data. See the prep_car_data documentation for options. Here we will fit an intercept-only CAR model to the life expectancy data:

+
+cars <- prep_car_data(A, quiet = TRUE)
+fit_car <- stan_car(lifeExp ~ 1, data = world, car_parts = cars, iter = 800, quiet = TRUE)
+
+print(fit_car)
+
## Spatial Model Results 
+## Formula: lifeExp ~ 1
+## Spatial method (outcome):  CAR 
+## Likelihood function:  auto_gaussian 
+## Link function:  identity 
+## Residual Moran Coefficient:  -0.3561462 
+## Observations:  162 
+## 
+## Inference for Stan model: foundation.
+## 4 chains, each with iter=800; warmup=400; thin=1; 
+## post-warmup draws per chain=400, total post-warmup draws=1600.
+## 
+##             mean se_mean    sd   2.5%    20%    50%    80%  97.5% n_eff  Rhat
+## intercept 70.459   0.094 2.681 65.455 68.404 70.448 72.563 75.814   806 1.003
+## car_rho    0.981   0.000 0.011  0.956  0.972  0.982  0.990  0.996   766 1.003
+## car_scale  7.839   0.014 0.463  6.981  7.459  7.801  8.221  8.817  1050 0.999
+## 
+## Samples were drawn using NUTS(diag_e) at Wed Oct 30 14:14:49 2024.
+## For each parameter, n_eff is a crude measure of effective sample size,
+## and Rhat is the potential scale reduction factor on split chains (at 
+## convergence, Rhat=1).
+

Notice that using iter = 1000 was more than adequate for inference in this case.

+

The CAR model has a spatial dependence parameter car_rho. This parameter does not have an interpretation similar to a correlation coefficient, and it is often near 1; this is not a problem unless one misinterprets it or desires a value similar to the correlation coefficient. The spatial dependence parameter in the SAR model does provide that kind of interpretation.

+
+

A filtering approach +

+

Returning to the correlation coefficient estimated above, one way to adjust our estimate for spatial dependence is to filter out the spatial trend from each of the two variables and then calculate the correlation coefficient using the detrended values (Chun and Griffith 2012, 71). This spatial ‘filtering’ or ‘pre-whitening’ method is not particularly common in practice but its a good trick to know given the familiarity of the correlation coefficient. We will use it here to demonstrate some basic features of the software.

+

The spatial trend term can be extracted from any spatial geostan model using the spatial method.

+
+theta <- spatial(fit_car)$mean
+
+pars <- map_pars(theta)
+ogpar <- par(mar = rep(0, 4))
+plot(st_geometry(world),
+  col = pars$col,
+  lwd = .2)
+legend("left",
+     fill = pars$pal,
+     title = 'Spatial trend (LE)',
+     legend = pars$lbls,
+     bty = 'n'
+     )
+

+
+par(ogpar)
+

We can obtain detrended values most simply by taking the residuals from an intercept-only spatial model:

+
+# lifeExpt detrended
+dy <- resid(fit_car)$mean
+
+# log per capita GDP detrended
+fit_carx <- stan_car(log(gdpPercap) ~ 1, data = world, car = cars, iter = 1e3, quiet = TRUE)
+dx <- resid(fit_carx)$mean
+

Using cor.test with those provides an estimate of correlation adjusted for spatial autocorrelation:

+
+# adjusted correlation
+cor.test(dx, dy)
+
## 
+##  Pearson's product-moment correlation
+## 
+## data:  dx and dy
+## t = 9.1399, df = 160, p-value = 2.696e-16
+## alternative hypothesis: true correlation is not equal to 0
+## 95 percent confidence interval:
+##  0.4743189 0.6785916
+## sample estimates:
+##       cor 
+## 0.5856791
+

The adjusted estimate of .59 is considerably different from the naive estimate of .80 and is outside the naive confidence intervals. (The adjusted estimate is .62 if we use SAR models.)

+
+
+

A bivariate model +

+

Here we will use the SAR model to illustrate its use. Fitting the spatial error or SAR model requires nearly the same steps as above.

+

Unlike prep_car_data, be sure to row-standardize the adjacency matrix before passing it to prep_sar_data.

+
+W <- row_standardize(A)
+sars <- prep_sar_data(W)
+

When fitting the model, we are going to add centerx = TRUE to center the covariate. (Internally this will call center(x, center = TRUE, scale = FALSE).) This will not change coefficient estimates but it does often improve MCMC sampling efficiency, sometimes considerably so. It does change interpretation of the intercept: the intercept will be an estimate of the average life expectancy (or the expected life expectancy when all covariates are at their average values).

+
+fit_sar <- stan_sar(lifeExp ~ log(gdpPercap), data = world, sar_parts = sars, centerx = TRUE,
+                    iter = 800, quiet = TRUE)
+

Lets plot the results this time:

+
+plot(fit_sar)
+
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+

+

The spatial dependence parameter is around 0.7, which indicates moderately strong SA. The mean life expectancy is about 71 (probably somewhere between about 69 and 74). And the coefficient for log GDP is around 4 (or somewhere between 3 and 5). The residual variation has a standard deviation of around 3.6 years.

+

If we scale both variables before fitting the bivariate spatial regression model (so that their variances both equal 1) then we get approximately the same estimate as the adjusted correlation coefficient (above). The credible interval is slightly wider because uncertainty in rho is (appropriately) incorporated here:

+
+world <- transform(world,
+                  sx = scale(log(gdpPercap), scale = T, center = T),
+                  sy = scale(lifeExp, scale = T, center = T)
+                   )
+
+fit_scaled <- stan_sar(sy ~ sx, data = world, sar_parts = sars, iter = 800, quiet = TRUE)
+print(fit_scaled)
+
## Spatial Model Results 
+## Formula: sy ~ sx
+## Spatial method (outcome):  SAR (SEM) 
+## Likelihood function:  auto_gaussian 
+## Link function:  identity 
+## Residual Moran Coefficient:  -0.0532525 
+## Observations:  162 
+## 
+## Inference for Stan model: foundation.
+## 4 chains, each with iter=800; warmup=400; thin=1; 
+## post-warmup draws per chain=400, total post-warmup draws=1600.
+## 
+##            mean se_mean    sd   2.5%    20%   50%   80% 97.5% n_eff  Rhat
+## intercept 0.021   0.003 0.124 -0.218 -0.080 0.025 0.114 0.274  1397 1.001
+## sx        0.586   0.002 0.060  0.474  0.537 0.586 0.634 0.700  1146 1.004
+## sar_rho   0.701   0.002 0.057  0.577  0.656 0.705 0.746 0.808  1261 1.002
+## sar_scale 0.437   0.001 0.025  0.391  0.415 0.435 0.458 0.489  1193 1.000
+## 
+## Samples were drawn using NUTS(diag_e) at Wed Oct 30 14:14:57 2024.
+## For each parameter, n_eff is a crude measure of effective sample size,
+## and Rhat is the potential scale reduction factor on split chains (at 
+## convergence, Rhat=1).
+
+
+

Predicted values +

+

We can visualize the model results by plotting the expected life expectancy across the full range of GDP per capita. We use the predict function for this. As input, it requires our fitted model and a data.frame with covariate values.

+

We will start by creating a data.frame with GDP per capita values that span from the minimum to maximum values in the world data:

+
+gdp <- range(world$gdpPercap)
+min_gdp <- gdp[1]
+max_gdp <- gdp[2]
+pdf <- data.frame(gdpPercap = seq(min_gdp, max_gdp, length.out = 200))
+

The column names in this data.frame have to match the variable names that were present in the data that we first provided to the model. In this case, the name of the columns should match those from the world data. Likewise, we provide the new GDP data on its original (un-transformed) scale, just as we did when we fit the model using stan_sar (the log transformation will be applied by predict because it is specified in the model formula). Because we centered the covariate using the centerx = TRUE argument, we will also allow the predict function to handle the centering automatically using information that is stored with the fitted model (stan_sar$x_center).

+

Now we pass this new data to predict:

+
+preds <- predict(fit_sar, newdata = pdf)
+

The output includes our pdf data plus some new columns. The new columns provide a summary of the predicted values. As usual, the mean is the estimate and the estimate is accompanied by other values that can be used to taken as credible intervals for the predicted value. The output reflects uncertainty in the model parameter estimates.

+
+head(preds)
+
##   gdpPercap     mean       sd     2.5%      20%      50%      80%    97.5%
+## 1  597.1352 60.04600 1.527957 57.24746 58.73926 60.05580 61.36163 63.13021
+## 2 1201.4715 62.85738 1.324776 60.42041 61.71248 62.84481 63.99644 65.54702
+## 3 1805.8079 64.49581 1.222242 62.21818 63.44873 64.47933 65.53424 67.05662
+## 4 2410.1442 65.65660 1.158882 63.46673 64.66982 65.64620 66.61413 68.05121
+## 5 3014.4805 66.55628 1.116048 64.44358 65.62260 66.53537 67.48145 68.87081
+## 6 3618.8169 67.29101 1.085637 65.24257 66.38960 67.27671 68.19381 69.52115
+

These ‘predicted’ values represent the expectation of the outcome variable at the given level of the covariates. So we would expect actual observations to form a cloud of points around the ‘predicted’ values. To calculate these predicted values, the predict function only includes covariates and the intercept, it does not include any spatial autocorrelation components. Its purpose is mainly to examine implications of the coefficient estimates on recognizable scales of variation, not to predict values for particular places. (The log-linear model can also be interpreted in terms of percent changes in the covariate, such as ’a 10% increase in GDP per capita, e.g., from 10,000 to 11,000, is associated with around 4 * log(11/10) = 0.38 additional years of life expectancy on average.)

+
+# scale GDP
+preds <- transform(preds, gdpPercap = gdpPercap / 1e3)
+
+yrange <- c(min(preds$`2.5%`), max(preds$`97.5%`))
+plot(preds$gdpPercap, preds$mean,
+     t = 'l',
+     ylim = yrange,
+     axes = F, 
+     xlab = "GDP per capita ($1,000s)",
+     ylab = "Life expectancy")
+axis(1)
+axis(2)
+
+# add credible intervals
+lines(preds$gdpPercap, preds$`2.5%`, lty = 3)
+lines(preds$gdpPercap, preds$`97.5%`, lty = 3)
+

+

Per this dataset, about 50% of the world population lives in countries with GDP per capita below $12,300.

+
+
+
+

Future work and support +

+

You can submit any questions, requests, or issues on the package issues page or the discussions page. geostan is still actively being developed so users are encouraged to check the package news page for updates.

+

If you are interesting contributing to the package you are encouraged to send an e-mail to the author or use the discussions page. You can submit a pull request with any bug fixes. Contributions that would make the package more useful to fields other than geostan’s current focus (human geography and public health), such as ecology, would be especially welcome.

+
+
+

Appendix +

+
+# function for getting colors, breaks, and labels for mapping
+map_pars <- function(x, 
+                     brks = quantile(x, probs = seq(0, 1, by = 0.2), na.rm = TRUE), 
+                     cols = c("#A8554EFF", "gray95", "#5D74A5FF")) {
+
+  # put x values into bins
+  x_cut <- cut(x, breaks = brks, include.lowest = TRUE)
+  
+  # labels for each bin
+  lbls <- levels( cut(x, brks, include.lowest = TRUE) )
+  
+  # colors 
+  rank <- as.numeric( x_cut )  
+  max_rank <- max( rank , na.rm = TRUE )
+  pal_fun <- colorRampPalette( cols )
+  pal <- pal_fun( max_rank )
+  colors <-  pal[ rank ]
+
+  # return list
+  ls <- list(brks = brks, lbls = lbls, pal = pal, col = colors)
+  return( ls )
+}
+
+
+

References +

+
+
+

Chun, Yongwan, and Daniel A Griffith. 2012. “Spatial Statistics and Geostatistics: Theory and Applications for Geographic Information Science and Technology.”

+
+
+

Donegan, Connor. 2021. “Building Spatial Conditional Autoregressive (CAR) Models in the Stan Programming Language.” https://osf.io/3ey65/.

+
+
+

———. 2022. “Geostan: An R Package for Bayesian Spatial Analysis.” Journal of Open Source Software 7 (79): 4716. https://doi.org/10.21105/joss.04716.

+
+
+

———. 2024. “Plausible Reasoning and Spatial-Statistical Theory: A Critique of Recent Writings on ‘Spatial Confounding’.” Geographical Analysis Early view. https://doi.org/10.1111/gean.12408.

+
+
+

Gabry, Jonah, Ben Goodrich, Martin Lysy, and Andrew Johnson. 2024. Rstantools: Tools for Developing R Packages Interfacing with ’Stan’. https://CRAN.R-project.org/package=rstantools.

+
+
+

Griffith, Daniel A, and Pedro R Peres-Neto. 2006. “Spatial Modeling in Ecology: The Flexibility of Eigenfunction Spatial Analyses.” Ecology 87 (10): 2603–13.

+
+
+

Stan Development Team. 2023. Stan User’s Guide. https://mc-stan.org.

+
+
+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.7.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/spatial-analysis_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/spatial-analysis_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 00000000..ca349fd6 --- /dev/null +++ b/docs/articles/spatial-analysis_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-12-1.png b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-12-1.png new file mode 100644 index 00000000..8d30637d Binary files /dev/null and b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-12-1.png differ diff --git a/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-17-1.png b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-17-1.png new file mode 100644 index 00000000..e06de8ae Binary files /dev/null and b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-17-1.png differ diff --git a/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-21-1.png b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-21-1.png new file mode 100644 index 00000000..b274beff Binary files /dev/null and b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-21-1.png differ diff --git a/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-23-1.png b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-23-1.png new file mode 100644 index 00000000..6874bc16 Binary files /dev/null and b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-23-1.png differ diff --git a/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-27-1.png b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-27-1.png new file mode 100644 index 00000000..0e08e47f Binary files /dev/null and b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-27-1.png differ diff --git a/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-32-1.png b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-32-1.png new file mode 100644 index 00000000..1af4ea9b Binary files /dev/null and b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-32-1.png differ diff --git a/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-37-1.png b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-37-1.png new file mode 100644 index 00000000..046d30d4 Binary files /dev/null and b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-37-1.png differ diff --git a/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-8-1.png b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-8-1.png new file mode 100644 index 00000000..de0ab5b3 Binary files /dev/null and b/docs/articles/spatial-analysis_files/figure-html/unnamed-chunk-8-1.png differ diff --git a/docs/articles/spatial-me-models.html b/docs/articles/spatial-me-models.html index fb3fd4ed..ffd57ff3 100644 --- a/docs/articles/spatial-me-models.html +++ b/docs/articles/spatial-me-models.html @@ -190,19 +190,16 @@

Spatial ME model car_parts = cars, iter = 650, # for demo speed quiet = TRUE, - )

-
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
-## Running the chains for more iterations may help. See
-## https://mc-stan.org/misc/warnings.html#tail-ess
-
-print(fit)
+ ) + +print(fit)
## Spatial Model Results 
 ## Formula: deaths.male ~ offset(log(pop.at.risk.male)) + insurance
 ## Spatial method (outcome):  CAR 
 ## Likelihood function:  poisson 
 ## Link function:  log 
-## Residual Moran Coefficient:  0.0003330769 
-## WAIC:  1316.72 
+## Residual Moran Coefficient:  -0.0009738462 
+## WAIC:  1320.98 
 ## Observations:  159 
 ## Data models (ME): insurance
 ##  Data model (ME prior): CAR (auto Gaussian)
@@ -211,18 +208,18 @@ 

Spatial ME model## post-warmup draws per chain=325, total post-warmup draws=1300. ## ## mean se_mean sd 2.5% 20% 50% 80% 97.5% n_eff Rhat -## intercept -4.165 0.003 0.058 -4.272 -4.205 -4.167 -4.125 -4.046 508 1.006 -## insurance -2.727 0.024 0.734 -4.124 -3.359 -2.711 -2.125 -1.293 946 1.002 -## car_rho 0.868 0.003 0.089 0.645 0.799 0.888 0.944 0.984 998 0.999 -## car_scale 0.447 0.001 0.033 0.387 0.421 0.447 0.474 0.514 898 1.002 +## intercept -4.167 0.002 0.057 -4.276 -4.209 -4.168 -4.127 -4.045 978 1.001 +## insurance -2.759 0.022 0.739 -4.316 -3.388 -2.748 -2.119 -1.349 1181 1.001 +## car_rho 0.866 0.003 0.089 0.655 0.799 0.886 0.938 0.982 1110 0.999 +## car_scale 0.448 0.001 0.033 0.389 0.420 0.446 0.474 0.518 1019 1.000 ## -## Samples were drawn using NUTS(diag_e) at Tue Sep 17 21:10:14 2024. +## Samples were drawn using NUTS(diag_e) at Tue Oct 29 10:06:32 2024. ## For each parameter, n_eff is a crude measure of effective sample size, ## and Rhat is the potential scale reduction factor on split chains (at ## convergence, Rhat=1).

The CAR models in geostan can be highly efficient in terms of MCMC sampling, but the required number of iterations will depend on many characteristics of the model and data. Often the default iter = 2000 is more than sufficient (with cores = 4). To speed up sampling with multi-core processing, use cores = 4 (to sample 4 chains in parallel).

In the following section, methods for examining MCMC samples of the modeled covariate values \(\boldsymbol x\) will be illustrated. Note that the process of storing MCMC samples for \(\boldsymbol x\) can become computationally burdensome if you have multiple covariates and moderately large N. If you do not need the samples of \(\boldsymbol x\) you can use the drop argument, as in:

-
+
 fit2 <- stan_car(deaths.male ~ offset(log(pop.at.risk.male)) + insurance,
                 centerx = TRUE,
                 ME = ME_list,
@@ -240,7 +237,7 @@ 

Visual diagnostics\(z_i\) an the modeled values \(x_i\): \[\delta_i = z_i - x_i.\] We want to look for any strong spatial pattern in these \(\delta_i\) values, because that would be an indication of a bias. However, the magnitude of the \(\delta_i\) value is important to consider—there may be a pattern, but if the amount of shrinkage is very small, that pattern may not matter. (The model returns \(M\) samples from the posterior distribution of each \(x_i\); or, indexing by MCMC sample \(x_i^m\) (\(m\) is an index, not an exponent). The reported \(\delta_i\) values is the MCMC mean \(\delta_i = \frac{1}{M} \sum_m z_i - x_i^m\).)

Two figures are provided to evaluate patterns in \(\delta_i\): first, a Moran scatter plot and, second, a map.

-
+
 me_diag(fit, 'insurance', georgia)

In this case, the results do not look too concerning insofar as there are no conspicuous patterns. However, a number of the credible intervals on the modeled values are large, which indicates low data reliability. The fact that some of the \(\delta_i\) values are substantial also points to low data quality for those estimates.

@@ -250,54 +247,54 @@

Working with MCMC samples from

geostan consists of pre-compiled Stan models, and users can always access the Markov chain Monte Carlo (MCMC) samples returned by Stan. When extracted as a matrix of samples (as below), each row of the matrix represents a draw from the joint probability distribution for all model parameters, and each column consists of samples from the marginal distribution of each parameter.

The ME models return samples for \(x_i\) as well as the model parameters \(\mu\) (“mu_x_true”), \(\rho\) (“car_rho_x_true”), and \(\tau\) (“sigma_x_true”). We can access these using as.matrix (also as.array and as.data.frame).

-
+
 mu.x <- as.matrix(fit, pars = "mu_x_true")
 dim(mu.x)
## [1] 1300    1
-
+
 head(mu.x)
##           parameters
 ## iterations mu_x_true[1]
-##       [1,]     1.802968
-##       [2,]     1.780662
-##       [3,]     1.850830
-##       [4,]     1.824972
-##       [5,]     1.687569
-##       [6,]     1.782920
-
+##       [1,]     1.870591
+##       [2,]     1.752737
+##       [3,]     1.790702
+##       [4,]     1.804061
+##       [5,]     1.612196
+##       [6,]     1.919102
+
 mean(mu.x)
-
## [1] 1.787651
+
## [1] 1.787378

We can visualize these using plot or print a summary:

-
+
 print(fit$stanfit, pars = c("mu_x_true", "car_rho_x_true", "sigma_x_true"))
## Inference for Stan model: foundation.
 ## 4 chains, each with iter=650; warmup=325; thin=1; 
 ## post-warmup draws per chain=325, total post-warmup draws=1300.
 ## 
 ##                   mean se_mean   sd 2.5%  25%  50%  75% 97.5% n_eff Rhat
-## mu_x_true[1]      1.79       0 0.08 1.64 1.74 1.79 1.83  1.96  1004 1.00
-## car_rho_x_true[1] 0.91       0 0.06 0.75 0.88 0.92 0.96  0.99   673 1.01
-## sigma_x_true[1]   0.48       0 0.04 0.42 0.46 0.48 0.51  0.56  1122 1.00
+## mu_x_true[1]      1.79       0 0.07 1.65 1.75 1.79 1.83  1.93   790    1
+## car_rho_x_true[1] 0.91       0 0.06 0.77 0.87 0.92 0.95  0.99   965    1
+## sigma_x_true[1]   0.48       0 0.04 0.42 0.46 0.48 0.51  0.56  1031    1
 ## 
-## Samples were drawn using NUTS(diag_e) at Tue Sep 17 21:10:14 2024.
+## Samples were drawn using NUTS(diag_e) at Tue Oct 29 10:06:32 2024.
 ## For each parameter, n_eff is a crude measure of effective sample size,
 ## and Rhat is the potential scale reduction factor on split chains (at 
 ## convergence, Rhat=1).

To extract samples from the joint probability distribution for \(\boldsymbol x\), use the generic parameter name “x_true”:

-
+
 x <- as.matrix(fit, pars = "x_true")
 dim(x)
## [1] 1300  159

If we wanted to calculate the mean of each of these marginal distributions (one for every \(x_i\)), we could use apply with MARGIN = 2 to summarize by column:

-
+
 x.mu <- apply(x, 2, mean)
 head(x.mu)
## x_insurance[1] x_insurance[2] x_insurance[3] x_insurance[4] x_insurance[5] 
-##      0.8362762      0.8001541      0.8517274      0.8520282      0.9181405 
+##      0.8368136      0.7995135      0.8517007      0.8521054      0.9181175 
 ## x_insurance[6] 
-##      0.8701711
+## 0.8702437

The vector x.mu contains estimates (posterior means) for \(x_i\). We might want to use these to plot the residuals or fitted values against the predictor:

-
+
 res <- resid(fit)$mean
 plot(x.mu, res,
      xlab = 'Insurance rate',
@@ -315,7 +312,7 @@ 

Working with MCMC samples from

Non-spatial ME models

If the ME list doesn’t have a slot with car_parts, geostan will automatically use a non-spatial Student’s t model instead of the CAR model: \[ p( x | \mathcal{M}) = Student(\boldsymbol x | \nu, \mu, \sigma), \] with degrees of freedom \(\nu\), mean \(\mu\), and scale \(\sigma\).

-
+
 ME_nsp <- prep_me_data(
   se = data.frame(insurance = georgia$insurance.se),
   logit = TRUE
@@ -326,7 +323,7 @@ 

Non-spatial ME modelsMultiple covariates

To model multiple covariates, simply add them to the data frame of standard errors:

-
+
 georgia$college <- georgia$college / 100
 georgia$college.se <- georgia$college.se / 100
 
@@ -349,7 +346,7 @@ 

Log transforms

Income and other magnitudes are often log transformed. In that case, the survey standard errors also need to be transformed. The se_log function provide approximate standard errors for this purpose.

When using se_log, pass in the variable itself (untransformed) and its standard errors. Here is a full workflow for a spatial mortality model with covariate measurement error:

-
+
 data(georgia)
 
 # income in $1,000s
diff --git a/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-14-1.png b/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-14-1.png
index 823d5774..58a13893 100644
Binary files a/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-14-1.png and b/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-14-1.png differ
diff --git a/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-9-1.png b/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-9-1.png
index 08ce4001..85dde196 100644
Binary files a/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-9-1.png and b/docs/articles/spatial-me-models_files/figure-html/unnamed-chunk-9-1.png differ
diff --git a/docs/authors.html b/docs/authors.html
index 24475403..4b1e7983 100644
--- a/docs/authors.html
+++ b/docs/authors.html
@@ -17,7 +17,7 @@
       
       
         geostan
-        0.7.0
+        0.8.0
       
     
diff --git a/docs/index.html b/docs/index.html index 4de1d1c1..3592e456 100644 --- a/docs/index.html +++ b/docs/index.html @@ -18,7 +18,7 @@ - + Spillover/diffusion effects for spatial lag models — spill • geostan + + +
+
+ + + +
+
+ + +
+

Spillover/diffusion effects for spatial lag models

+
+ +
+
spill(beta, gamma = 0, rho, W, method = c("quick", "proper"), K = 20)
+
+impacts(object, method = c("quick", "proper"), K = 20)
+
+ +
+

Source

+

LeSage, James and Pace, R. Kelley (2009). Introduction to Spatial Econometrics. CRC Press.

+

LeSage, James (2014). What Regional Scientists Need to Know about Spatial Econometrics. The Review of Regional Science 44: 13-32 (2014 Southern Regional Science Association Fellows Address).

+
+
+

Arguments

+
beta
+

Coefficient for covariates (numeric vector)

+ + +
gamma
+

Coefficient for spatial lag of covariates

+ + +
rho
+

Spatial dependence parameter (single numeric value)

+ + +
W
+

Spatial weights matrix

+ + +
method
+

Use 'quick' for a computationally efficient approximation (after LeSage and Pace 2009, pp. 114--115) and 'proper' to compute the matrix inversion using Matrix::solve.

+ + +
K
+

Degree of polynomial in the expansion to use for the 'quick' method.

+ + +
object
+

A fitted spatial lag model (from stan_sar)

+ +
+
+

Details

+

These functions are for interpreting the results of the spatial lag model ('SLM') and the spatial Durbin lag model ('SDLM'). The models can be fitted using stan_sar. The equation for these SAR models specifies simultaneous feedback between all units, such that changing the outcome in one location has a spill-over effect that may extend to all other locations (a ripple or diffusion effect); the induced changes will also react back onto the first unit. (This presumably takes time, even if the observations are cross-sectional.)

+

Assuming that the model specification is in fact reasonable for your application, these spill-overs have to be incorporated into the interpretation of the regression coefficients of SLM and SDLM models. A unit change in the value of X in one location will impact y in that same place ('direct' impact) and will also impact y elsewhere through the diffusion process ('indirect' impact). The 'total' impact of a unit change in X is the sum of the direct and indirect effects (after LeSage and Pace 2009).

+

The spill function is for quickly calculating average spillover effects given point estimates of parameters. (This can be used to verify the nearness of the approximate method to the proper method.)

+

The impacts function calculates the (average) direct, indirect, and total effects once for every MCMC sample to produce samples from the posterior distribution for the impacts; the samples are returned together with a summary of the distribution (mean, median, and select quantiles).

+

These methods are only needed for the spatial lag and spatial Durbin lag models (SLM and SDLM). Any other model with spatially lagged covariates (SLX) might be read as having indirect impacts (as always, interpretation is subject to plausibility per causal reasoning) but the impacts are then equal to the SLX coefficients, the direct effects are equal to beta (the coefficient for X), and the 'total' impact is their sum (see the example below for calculation with MCMC samples).

+
+ +
+

Examples

+
##
+## SDLM data
+##
+
+parts <- prep_sar_data2(row = 9, col = 9, quiet = TRUE)
+W <- parts$W
+x <- sim_sar(w=W, rho=.6)
+Wx <- (W %*% x)[,1]
+mu <- .5 * x + .25 * Wx
+y <- sim_sar(w=W, rho=0.6, mu = mu, type = "SLM")
+dat <- cbind(y, x)
+
+# impacts per the above parameters
+spill(0.5, 0.25, 0.6, W)
+#>    direct  indirect     total 
+#> 0.6233251 1.2516337 1.8749589 
+
+##
+## impacts for SDLM
+##
+
+fit <- stan_sar(y ~ x, data = dat, sar = parts,
+                type = "SDLM", iter = 500,
+                slim = TRUE, quiet = TRUE) 
+
+impax <- impacts(fit)
+impax$summary
+#> $x
+#>               mean    median        sd         lwr       upr
+#> Direct   0.6449821 0.6490855 0.1049408  0.43419471 0.8582216
+#> Indirect 0.5566986 0.5468910 0.3373754 -0.06383069 1.2498130
+#> Total    1.2016807 1.1913060 0.3887596  0.45583982 2.0123733
+#> 
+
+# plot posterior distributions
+og = par(mfrow = c(1, 3),
+         mar = c(3, 3, 1, 1))
+S <- impax$samples[[1]]
+hist(S[,1], main = 'Direct')
+hist(S[,2], main = 'Indirect')
+hist(S[,3], main = 'Total')
+
+par(og)
+
+
+
+
+
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.7.

+
+ +
+ + + + + + + + diff --git a/docs/reference/index.html b/docs/reference/index.html index e36f038d..b0187c1e 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -17,7 +17,7 @@ geostan - 0.7.0 + 0.8.0
@@ -126,7 +126,7 @@

Spatial autocorrelation -

Spatial models

+

Model prep

@@ -157,7 +157,11 @@

Spatial models prep_sar_data2()

Prepare data for SAR model: raster analysis

- + +

Spatial models

+

+ +

stan_car()

Conditional autoregressive (CAR) models

@@ -178,7 +182,7 @@

Spatial models

Simultaneous autoregressive (SAR) models

-

Methods and diagnostics for spatial models

+

Methods and diagnostics

@@ -205,6 +209,10 @@

Methods and diagnostics for

sp_diag()

Visual displays of spatial data and spatial models

+ +

spill() impacts()

+ +

Spillover/diffusion effects for spatial lag models

me_diag()

@@ -212,7 +220,7 @@

Methods and diagnostics for

posterior_predict()

-

Draw samples from the posterior predictive distribution

+

Sample from the posterior predictive distribution

uniform() normal() student_t() gamma2() hs()

diff --git a/docs/reference/posterior_predict.html b/docs/reference/posterior_predict.html index 9c1ebcac..60674dd1 100644 --- a/docs/reference/posterior_predict.html +++ b/docs/reference/posterior_predict.html @@ -1,5 +1,5 @@ -Draw samples from the posterior predictive distribution — posterior_predict • geostanSample from the posterior predictive distribution — posterior_predict • geostan @@ -46,7 +46,7 @@
@@ -56,7 +56,16 @@

Draw samples from the posterior predictive distribution

-
posterior_predict(object, S, summary = FALSE, width = 0.95, car_parts, seed)
+
posterior_predict(
+  object,
+  S,
+  summary = FALSE,
+  width = 0.95,
+  quick = TRUE,
+  K = 20,
+  preserve_order = FALSE,
+  seed
+)
@@ -77,8 +86,16 @@

Arguments

Only used if summary = TRUE, to set the quantiles for the credible intervals. Defaults to width = 0.95.

-
car_parts
-

Data for CAR model specification; only required for stan_car with family = auto_gaussian().

+
quick
+

For SAR models only; quick = TRUE uses an approximation method for the inverse of matrix (I - rho * W).

+ + +
K
+

For SAR models only; number of matrix powers to for the matrix inverse approximation.

+ + +
preserve_order
+

If TRUE, the order of posterior draws will remain fixed; the default is to permute the MCMC samples so that (with small sample size S) each successive call to posterior_predict will return a different sample from the posterior probability distribution.

seed
@@ -91,19 +108,34 @@

Value

A matrix of size S x N containing samples from the posterior predictive distribution, where S is the number of samples drawn and N is the number of observations. If summary = TRUE, a data.frame with N rows and 3 columns is returned (with column names mu, lwr, and upr).

+
+

Details

+

The quick = FALSE method requires a call to Matrix::solve(I - rho * W) for each MCMC sample; the quick = TRUE method uses an approximation based on matrix powers (LeSage and Pace 2009).

+

Examples

-
 fit <- stan_glm(sents ~ offset(log(expected_sents)),
+    
E <- sentencing$expected_sents
+sentencing$log_E <- log(E)
+ fit <- stan_glm(sents ~ offset(log_E),
                   re = ~ name,
                   data = sentencing,
                   family = poisson(),
                   chains = 2, iter = 600) # for speed only
 
+
  yrep <- posterior_predict(fit, S = 65)
- plot(density(yrep[1,]))
- for (i in 2:nrow(yrep)) lines(density(yrep[i,]), col = 'gray30')
- lines(density(sentencing$sents), col = 'darkred', lwd = 2)
+ plot(density(yrep[1,] / E )) + for (i in 2:nrow(yrep)) lines(density(yrep[i,] / E), col = 'gray30') + lines(density(sentencing$sents / E), col = 'darkred', lwd = 2) + +sars <- prep_sar_data2(row = 9, col = 9) +W <- sars$W +y <- sim_sar(rho = .9, w = W) +fit <- stan_sar(y ~ 1, data = data.frame(y=y), sar = sars, + iter = 650, quiet = TRUE) +yrep <- posterior_predict(fit, S = 15) +

+
+

Source

+

Goulard, Michael, Thibault Laurent, and Christine Thomas-Agnan (2017). About predictions in spatial autoregressive models: optimal and almost optimal strategies. Spatial Economic Analysis 12 (2-3): 304-325.

+

LeSage, James (2014). What Regional Scientists Need to Know about Spatial Econometrics. The Review of Regional Science 44: 13-32 (2014 Southern Regional Science Association Fellows Address).

+

LeSage, James, & Robert kelley Pace (2009). Introduction to Spatial Econometrics. Chapman and Hall/CRC.

+

Arguments

object
@@ -75,15 +82,15 @@

Arguments

newdata
-

A data frame in which to look for variables with which to predict, presumably for the purpose of viewing marginal effects. Note that if the model formula includes an offset term, newdata must contain the offset. Note also that any spatially-lagged covariate terms will be ignored if they were provided using the slx argument. If covariates in the model were centered using the centerx argument, the predict.geostan_fit method will automatically center the predictors in newdata using the values stored in object$x_center. If newdata is missing, the fitted values of the model will be returned.

+

A data frame in which to look for variables with which to predict. Note that if the model formula includes an offset term, newdata must contain the offset column (see examples below). If covariates in the model were centered using the centerx argument, the predict.geostan_fit method will automatically center the predictors in newdata using the values stored in object$x_center. If newdata is missing, the fitted values of the model will be returned.

alpha
-

An N-by-1 matrix of MCMC samples for the intercept; this is provided by default. However, this argument might be used if there is a need to incorporate the spatial trend term, in which case it may be thought of as a spatially-varying intercept. If used, note that the intercept needs to be provided on the scale of the linear predictor.

+

An N-by-1 matrix of MCMC samples for the intercept; this is provided by default. If used, note that the intercept needs to be provided on the scale of the linear predictor. This argument might be used if there is a need to incorporate the spatial trend term (as a spatially-varying intercept).

center
-

Optional vector of numeric values or a logical scalar to pass to scale. Defaults to using object$x_center. If the model was fit using centerx = TRUE, then covariates were centered and their mean values are stored in object$x_center and the predict method will use them to automatically center newdata; if the model was fit with centerx = FALSE, then object$x_center = FALSE and newdata will not be centered.

+

Optional vector of numeric values or a logical scalar to pass to scale. Defaults to using object$x_center. If the model was fit using centerx = TRUE, then covariates were centered and their mean values are stored in object$x_center and the predict method will use them automatically to center newdata; if the model was fit with centerx = FALSE, then object$x_center = FALSE and newdata will not be centered.

summary
@@ -94,6 +101,10 @@

Arguments

By default, results from predict are on the scale of the linear predictor (type = "link")). The alternative (type = "response") is on the scale of the response variable. For example, the default return values for a Poisson model on the log scale, and using type = "response" will return the original scale of the outcome variable (by exponentiating the log values).

+
add_slx
+

Logical. If add_slx = TRUE, any spatially-lagged covariates that were specified through the 'slx' argument (of the model fitting function, e.g., stan_glm) will be added to the linear predictor. The spatial lag terms will be calculated internally using object$C, the spatial weights matrix used to fit the model. Hence, newdata must have N = object$N rows. Predictions from spatial lag models (SAR models of type 'SLM' and 'SDLM') always include the SLX terms (i.e., any value passed to add_slx will be overwritten with TRUE).

+ +
...

Not used

@@ -106,9 +117,19 @@

Value

Details

-

The purpose of the predict method is to explore marginal effects of (combinations of) covariates.

-

The model formula will be taken from object$formula, and then a model matrix will be created by passing newdata to the model.frame function (as in: model.frame(newdata, object$formula). Parameters are taken from as.matrix(object, pars = c("intercept", "beta")).

-

Be aware that in generalized linear models (such as Poisson and Binomial models) marginal effects plots on the response scale may be sensitive to the level of other covariates in the model. If the model includes any spatially-lagged covariates (introduced using the slx argument) or a spatial autocorrelation term (for example, you used a spatial CAR, SAR, or ESF model), these terms will essentially be fixed at zero for the purposes of calculating marginal effects. If you want to change this, you can introduce spatial trend values by specifying a varying intercept using the alpha argument.

+

The primary purpose of the predict method is to explore marginal effects of covariates.

+

The model formula will be taken from object$formula, and then a model matrix will be created by passing newdata to the model.frame function (as in: model.frame(newdata, object$formula). Parameters are taken from as.matrix(object, pars = c("intercept", "beta")). If add_slx = TRUE, SLX coefficients will be taken from as.matrix(object, pars = "gamma").

+

Spatially-lagged covariates added via the slx argument ('spillover effects') will be included if add_slx = TRUE or if a spatial lag model is provided (a SAR model of type 'SLM' or 'SDLM'). In either of those cases, newdata must have the same number of rows as were used to fit the original data. For details on these 'spillover effects', see LeSage and Pace (2009) and LeSage (2014).

+

Predictions for the spatial lag model (SAR models of type 'SLM') are equal to: +$$ + (I - \rho W)^{-1} X \beta +$$ +where \(X \beta\) contains the intercept and covariates. (For intercept-only models, the above term is equal to the constant intercept.) Predictions for the spatial Durbin lag model (SAR models of type 'SDLM') are equal to: +$$ + (I - \rho W)^{-1} (X \beta + WX \gamma) +$$ +where \(WX \gamma\) are spatially lagged covariates multiplied by their coefficients. The SLM and SDLM differ from all other model types in that the spatial component of the model cannot be separated from the linear predictor and is, therefore, automatically incorporated into the predictions.

+

In generalized linear models (such as Poisson and Binomial models) marginal effects plots on the response scale may be sensitive to the level of other covariates in the model and to location. If the model includes a spatial autocorrelation component (for example, you used a spatial CAR, SAR (error model), or ESF model), by default these terms will be fixed at zero for the purposes of calculating marginal effects. If you want to change this, you can introduce spatial trend values by specifying a varying intercept using the alpha argument.

@@ -146,7 +167,8 @@

Examples

preds$mean, type = "l", ylab = "Deaths per 10,000", - xlab = "Income ($1,000s)")
+ xlab = "Income ($1,000s)") +
-
row_standardize(C, warn = TRUE, msg = "Row standardizing connectivity matrix")
+
row_standardize(C, warn = FALSE, msg = "Row standardizing connectivity matrix")
@@ -66,11 +66,11 @@

Arguments

warn
-

Print msg if warn = TRUE.

+

Print message msg if warn = TRUE.

msg
-

A warning message to print.

+

A warning message; used internally by geostan.

@@ -82,13 +82,51 @@

Value

Examples

-
A <- shape2mat(georgia)
-head(Matrix::summary(A))
-Matrix::rowSums(A)
-
-W <- row_standardize(A)
-head(Matrix::summary(W))
-Matrix::rowSums(W)
+
A <- shape2mat(georgia)
+#> Contiguity condition: queen
+#> Number of neighbors per unit, summary:
+#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+#>   1.000   4.000   5.000   5.409   6.000  10.000 
+#> 
+#> Spatial weights, summary:
+#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+#>       1       1       1       1       1       1 
+head(Matrix::summary(A))
+#> 159 x 159 sparse Matrix of class "ngCMatrix", with 860 entries
+#>     i j
+#> 1  23 1
+#> 2  58 1
+#> 3  59 1
+#> 4 131 1
+#> 5 148 1
+#> 6 159 1
+Matrix::rowSums(A)
+#>   [1]  6  5  6  5  3  5  7  4  5  5  5  7  6  4  5  4  7  7  4  5  7  6  6  6  4
+#>  [26]  5  6  3  8  6  5  7  3  3  8  6  6  7  5  8  8  5  6  6  5  4  5  6  7  5
+#>  [51]  2  7  6  5  3  1  3  7  6  2  5  5  8  6  7  7  5  6  4  4  6  4  6 10  8
+#>  [76]  7  7  6  6  3  6 10  6  6  5  2  6  5  4  7  4  8  5  4  5  4  4  7  6  6
+#> [101]  6  5  4  4  6  8  4  6  6  4  5  4  4  4  9  3  4  7  7  4  3  7  4  6  4
+#> [126]  5  7  6  4  4  5  6  5  5  6  6  7  5  6  5  4  7  6  5  7  3  3  7  3  7
+#> [151]  5  6  5  4  6  3  5  7  8
+
+W <- row_standardize(A)
+head(Matrix::summary(W))
+#> 159 x 159 sparse Matrix of class "dgCMatrix", with 860 entries
+#>     i j         x
+#> 1  23 1 0.1666667
+#> 2  58 1 0.1428571
+#> 3  59 1 0.1666667
+#> 4 131 1 0.2000000
+#> 5 148 1 0.1428571
+#> 6 159 1 0.1250000
+Matrix::rowSums(W)
+#>   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+#>  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+#>  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+#> [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+#> [149] 1 1 1 1 1 1 1 1 1 1 1
+
+
-
sim_sar(m = 1, mu = rep(0, nrow(w)), w, rho, sigma = 1, ...)
+
sim_sar(
+  m = 1,
+  mu = rep(0, nrow(w)),
+  rho,
+  sigma = 1,
+  w,
+  type = c("SEM", "SLM"),
+  quick = FALSE,
+  K = 20,
+  ...
+)
+
+

Source

+

LeSage, J. and Pace, R. K. (2009). An Introduction to Spatial Econometrics. CRC Press.

+

Arguments

m
-

The number of samples required. Defaults to m=1 to return an n-length vector; if m>1, an m x n matrix is returned (i.e. each row will contain a sample of correlated values).

+

The number of samples required. Defaults to m=1 to return an n-length vector; if m>1, an m x n matrix is returned (i.e. each row will contain a sample of auto-correlated values).

mu

An n-length vector of mean values. Defaults to a vector of zeros with length equal to nrow(w).

+
rho
+

Spatial autocorrelation parameter in the range (-1, 1). A single numeric value.

+ + +
sigma
+

Scale parameter (standard deviation). Defaults to sigma = 1. A single numeric value.

+ +
w
-

Row-standardized n x n spatial weights matrix.

+

n x n spatial weights matrix; typically row-standardized.

-
rho
-

Spatial autocorrelation parameter in the range (-1, 1). Typically a scalar value; otherwise an n-length numeric vector.

+
type
+

Type of SAR model: spatial error model ("SEM") or spatial lag model ("SLM").

-
sigma
-

Scale parameter (standard deviation). Defaults to sigma = 1. Typically a scalar value; otherwise an n-length numeric vector.

+
quick
+

Use power of matrix W to approximate the inverse term?

+ + +
K
+

Number of matrix powers to use if quick = TRUE.

...
@@ -89,11 +115,12 @@

Arguments

Value

-

If m = 1 a vector of the same length as mu, otherwise an m x length(mu) matrix with one sample in each row.

+

If m = 1 then sim_sar returns a vector of the same length as mu, otherwise an m x length(mu) matrix with one sample in each row.

Details

-

Calls MASS::mvrnorm internally to draw from the multivariate normal distribution. The covariance matrix is specified following the simultaneous autoregressive (SAR, aka spatial error) model.

+

This function takes n = nrow(w) draws from the normal distribution using rnorm to obtain vector x; if type = 'SEM', it then pre-multiplies xby the inverse of the matrix(I - rho * W)to obtain spatially autocorrelated values. Fortype = 'SLM', the multiplier matrix is applied to x + mu to produce the desired values.

+

The quick method approximates the matrix inversion using the method described by LeSage and Pace (2009, p. 40).

See also

@@ -102,14 +129,45 @@

See also

Examples

-
data(georgia)
-w <- shape2mat(georgia, "W")
-x <- sim_sar(w = w, rho = 0.5)
-aple(x, w)
+    
# spatially autocorrelated data on a regular grid
+library(sf)
+row = 10
+col = 10
+sar_parts <- prep_sar_data2(row = row, col = col)
+w <- sar_parts$W
+x <- sim_sar(rho = 0.65, w = w)
+dat <- data.frame(x = x)
+
+# create grid 
+sfc = st_sfc(st_polygon(list(rbind(c(0,0), c(col,0), c(col,row), c(0,0)))))
+grid <- st_make_grid(sfc, cellsize = 1, square = TRUE)
+st_geometry(dat) <- grid
+plot(dat)
 
-x <- sim_sar(w = w, rho = 0.7, m = 10)
+# draw form SAR (SEM) model
+z <- sim_sar(rho = 0.9, w = w)
+moran_plot(z, w)
+grid$z <- z
+
+# multiple sets of observations
+# each row is one N-length draw from the SAR model
+x <- sim_sar(rho = 0.7, w = w, m = 4)
+nrow(w)
 dim(x)
-apply(x, 1, aple, w = w)
+apply(x, 1, aple, w = w) +apply(x, 1, mc, w = w) + +# Spatial lag model (SLM): y = rho*Wy + beta*x + epsilon +x <- sim_sar(rho = 0.5, w = w) +y <- sim_sar(mu = x, rho = 0.7, w = w, type = "SLM") + +# Spatial Durbin lag model (SLM with spatial lag of x) +# SDLM: y = rho*Wy + beta*x + gamma*Wx + epsilon +x = sim_sar(w = w, rho = 0.5) +mu <- -0.5*x + 0.5*(w %*% x)[,1] +y <- sim_sar(mu = mu, w = w, rho = 0.6, type = "SLM") + +
-

Fit data to an spatial Gaussian SAR (spatial error) model, or model a vector of spatially-autocorrelated parameters using a SAR prior model.

+

Fit data to a simultaneous spatial autoregressive (SAR) model, or use the SAR model as the prior model for a parameter vector in a hierarchical model.

@@ -62,8 +62,9 @@

Simultaneous autoregressive (SAR) models

re, data, C, - sar_parts = prep_sar_data(C), + sar_parts = prep_sar_data(C, quiet = TRUE), family = auto_gaussian(), + type = c("SEM", "SDEM", "SDLM", "SLM"), prior = NULL, ME = NULL, centerx = FALSE, @@ -114,15 +115,19 @@

Arguments

C
-

Spatial weights matrix (conventionally referred to as \(W\) in the SAR model). Typically, this will be created using geostan::shape2mat(shape, style = "W"). This will be passed internally to prep_sar_data, and will also be used to calculate residual spatial autocorrelation as well as any user specified slx terms; it will automatically be row-standardized before calculating slx terms. See shape2mat.

+

Spatial weights matrix (conventionally referred to as \(W\) in the SAR model). Typically, this will be created using geostan::shape2mat(shape, style = "W"). This will be passed internally to prep_sar_data, and will also be used to calculate residual spatial autocorrelation as well as any user specified slx terms. See shape2mat.

sar_parts
-

Optional. If not provided, then prep_sar_data will be used automatically to create sar_parts using the user-provided spatial weights matrix.

+

List of data constructed by prep_sar_data. If not provided, then C will automatically be passed to prep_sar_data to create sar_parts.

family
-

The likelihood function for the outcome variable. Current options are auto_gaussian(), binomial(link = "logit"), and poisson(link = "log"); if family = gaussian() is provided, it will automatically be converted to auto_gaussian().

+

The likelihood function for the outcome variable. Current options are auto_gaussian(), binomial() (with logit link function) and poisson() (with log link function); if family = gaussian() is provided, it will automatically be converted to auto_gaussian().

+ + +
type
+

Type of SAR model (character string): spatial error model ('SEM'), spatial Durbin error model ('SDEM'), spatial Durbin lag model ('SDLM'), or spatial lag model ('SLM'). see Details below.

prior
@@ -151,11 +156,11 @@

Arguments

ME
-

To model observational uncertainty (i.e. measurement or sampling error) in any or all of the covariates, provide a list of data as constructed by the prep_me_data function.

+

To model observational uncertainty in any or all of the covariates (i.e. measurement or sampling error), provide a list of data constructed by the prep_me_data function.

centerx
-

To center predictors on their mean values, use centerx = TRUE. If the ME argument is used, the modeled covariate (i.e., latent variable), rather than the raw observations, will be centered. When using the ME argument, this is the recommended method for centering the covariates.

+

To center predictors on their mean values, use centerx = TRUE. This increases sampling speed. If the ME argument is used, the modeled covariate (i.e., the latent variable), rather than the raw observations, will be centered.

prior_only
@@ -163,7 +168,7 @@

Arguments

censor_point
-

Integer value indicating the maximum censored value; this argument is for modeling censored (suppressed) outcome data, typically disease case counts or deaths.

+

Integer value indicating the maximum censored value; this argument is for modeling censored (suppressed) outcome data, typically disease case counts or deaths which are left-censored to protect confidentiality when case counts are very low.

chains
@@ -171,7 +176,7 @@

Arguments

iter
-

Number of samples per chain.

+

Number of MCMC samples per chain.

refresh
@@ -179,15 +184,15 @@

Arguments

keep_all
-

If keep_all = TRUE then samples for all parameters in the Stan model will be kept; this is necessary if you want to do model comparison with Bayes factors and the bridgesampling package.

+

If keep_all = TRUE then samples for all parameters in the Stan model will be kept; this is necessary if you want to do model comparison with Bayes factors using the bridgesampling package.

pars
-

Optional; specify any additional parameters you'd like stored from the Stan model.

+

Specify any additional parameters you'd like stored from the Stan model.

slim
-

If slim = TRUE, then the Stan model will not collect the most memory-intensive parameters (including n-length vectors of fitted values, log-likelihoods, and ME-modeled covariate values). This will disable many convenience functions that are otherwise available for fitted geostan models, such as the extraction of residuals, fitted values, and spatial trends, WAIC, and spatial diagnostics, and ME diagnostics; many quantities of interest, such as fitted values and spatial trends, can still be calculated manually using given parameter estimates. The "slim" option is designed for data-intensive routines, such as regression with raster data, Monte Carlo studies, and measurement error models. For more control over which parameters are kept or dropped, use the drop argument instead of slim.

+

If slim = TRUE, then the Stan model will not save the most memory-intensive parameters (including n-length vectors of fitted values, other 'random effects', and ME-modeled covariate values). This will disable some convenience functions that are otherwise available for fitted geostan models, such as the extraction of residuals, fitted values, and spatial trends, spatial diagnostics, and ME diagnostics. The "slim" option is designed for data-intensive routines, such as regression with raster data, Monte Carlo studies, and measurement error models.

drop
@@ -204,7 +209,7 @@

Arguments

N-length vector of 'latent'/modeled covariate values created for measurement error (ME) models.

-

Using drop = c('fitted', 'alpha_re', 'x_true') is equivalent to slim = TRUE. Note that if slim = TRUE, then drop will be ignored---so only use one or the other.

+

Using drop = c('fitted', 'alpha_re', 'x_true', 'log_lambda_mu') is equivalent to slim = TRUE. Note that if slim = TRUE, then drop will be ignored---so only use one or the other.

control
@@ -212,7 +217,7 @@

Arguments

quiet
-

Controls (most) automatic printing to the console. By default, any prior distributions that have not been assigned by the user are printed to the console. If quiet = TRUE, these will not be printed. Using quiet = TRUE will also force refresh = 0.

+

Controls (most) automatic printing to the console. By default, any prior distributions that have not been assigned by the user are printed to the console; if quiet = TRUE, these will not be printed. Using quiet = TRUE will also force refresh = 0.

...
@@ -227,7 +232,7 @@

Value

Summaries of the main parameters of interest; a data frame.

diagnostic
-

Widely Applicable Information Criteria (WAIC) with a measure of effective number of parameters (eff_pars) and mean log pointwise predictive density (lpd), and mean residual spatial autocorrelation as measured by the Moran coefficient.

+

Residual spatial autocorrelation as measured by the Moran coefficient.

stanfit

an object of class stanfit returned by rstan::stan

@@ -265,35 +270,47 @@

Value

C

Spatial weights matrix (in sparse matrix format).

+
sar_type
+

Type of SAR model: 'SEM', 'SDEM', 'SDLM', or 'SLM'.

+

Details

Discussions of SAR models may be found in Cliff and Ord (1981), Cressie (2015, Ch. 6), LeSage and Pace (2009), and LeSage (2014). The Stan implementation draws from Donegan (2021).

-

The general scheme of the SAR model for numeric vector \(y\) is -$$y = \mu + ( I - \rho W)^{-1} \epsilon$$ -$$\epsilon \sim Gauss(0, \sigma^2 I)$$ -where \(W\) is the spatial weights matrix, \(I\) is the n-by-n identity matrix, and \(\rho\) is a spatial autocorrelation parameter. In words, the errors of the regression equation are spatially autocorrelated.

+

There are two SAR specification options which are commonly known as the spatial error ('SEM') and the spatial lag ('SLM') models. When the spatial-lag of the covariates are included, then the model is referred to as a spatial Durbin model; depending on the model type, it becomes a spatial Durbin error model ('SDEM') or a spatial Durbin lag model ('SDLM').

+

The mathematics and typical interpretation of the SLM/SDLM is unusual and the conventional interpretation of regression coefficients does not apply! Use the impacts method to interpret results from the SLM and SDLM models (that is, granted that this model form is at least plausible for the application).

+

Use of the 'SDEM' and 'SDLM' options are for convenience only: you can also obtain the Durbin models using the slx (spatial lag of X) argument. The slx argument allows control over which covariates will be added in spatial-lag form; the Durbin options include the spatial lag of all covariates.

+

The spatial error specification ('SEM') is +$$y = \mu + ( I - \rho C)^{-1} \epsilon$$ +$$\epsilon \sim Gauss(0, \sigma^2)$$ +where \(C\) is the spatial connectivity matrix, \(I\) is the n-by-n identity matrix, and \(\rho\) is a spatial autocorrelation parameter. In words, the errors of the regression equation are spatially autocorrelated.

Re-arranging terms, the model can also be written as follows: -$$y = \mu + \rho W (y - \mu) + \epsilon$$ -which perhaps shows more intuitively the implicit spatial trend component, \(\rho W (y - \mu)\).

-

Most often, this model is applied directly to observations (referred to below as the auto-Gaussian model). The SAR model can also be applied to a vector of parameters inside a hierarchical model. The latter enables spatial autocorrelation to be modeled when the observations are discrete counts (e.g., disease incidence data).

-

A note on terminology: the spatial statistics literature conceptualizes the simultaneously-specified spatial autoregressive model (SAR) in relation to the conditionally-specified spatial autoregressive model (CAR) (see stan_car) (see Cliff and Ord 1981). The spatial econometrics literature, by contrast, refers to the simultaneously-specified spatial autoregressive (SAR) model as the spatial error model (SEM), and they contrast the SEM with the spatial lag model (which contains a spatially-lagged dependent variable on the right-hand-side of the regression equation) (see LeSage 2014).

-

Auto-Gaussian

+$$y = \mu + \rho C (y - \mu) + \epsilon$$ +which perhaps shows more intuitively the implicit spatial trend component, \(\rho C (y - \mu)\).

+

The second SAR specification type is the 'spatial lag of y' ('SLM'). This model describes a diffusion or contagion process: +$$y = \rho C y + \mu + \epsilon$$ +$$\epsilon \sim Gauss(0, \sigma^2)$$ +This is attractive for modeling actual contagion processes. Here the 'spatial trend' part is simply \(\rho C y\).

+

Both model types have a covariance matrix of:

+

$$\Sigma = \sigma^2 (I - \rho C)^{-1}(I - \rho C')^{-1}.$$

+

But the expected values of the models differ. The expected value for the SEM is the usual \(\mu\) (the intercept plus X*beta); the expected value of the SLM is \((I - rho C)^{-1} \mu\).

+

Most often, the SAR model is applied directly to observations (referred to below as the auto-normal or auto-Gaussian model). The SAR model can also be applied to a vector of parameters inside a hierarchical model. The latter enables spatial or network autocorrelation to be modeled when the observations are discrete counts (e.g., hierarchical models for disease incidence rates).

+

Auto-normal

When family = auto_gaussian(), the SAR model is specified as follows:

$$y \sim Gauss(\mu, \Sigma)$$ -$$\Sigma = \sigma^2 (I - \rho W)^{-1}(I - \rho W')^{-1}$$ -where \(\mu\) is the mean vector (with intercept, covariates, etc.), \(W\) is a spatial weights matrix (usually row-standardized), and \(\sigma\) is a scale parameter.

+$$\Sigma = \sigma^2 (I - \rho C)^{-1}(I - \rho C')^{-1}$$ +where \(\mu\) is the mean vector (with intercept, covariates, etc.), \(C\) is a spatial weights or connectivity matrix (usually row-standardized), and \(\sigma\) is a scale parameter.

The SAR model contains an implicit spatial trend (i.e., spatial autocorrelation) component \(\phi\) which is calculated as follows: $$ -\phi = \rho W (y - \mu) +\phi = \rho C (y - \mu) $$

This term can be extracted from a fitted auto-Gaussian model using the spatial method.

When applied to a fitted auto-Gaussian model, the residuals.geostan_fit method returns 'de-trended' residuals \(R\) by default. That is, $$ -R = y - \mu - \rho W (y - \mu). +R = y - \mu - \rho C (y - \mu). $$ To obtain "raw" residuals (\(y - \mu\)), use residuals(fit, detrend = FALSE). Similarly, the fitted values obtained from the fitted.geostan_fit will include the spatial trend term by default.

@@ -305,16 +322,16 @@

PoissonFor family = poisson(), the model is specified as:

$$y \sim Poisson(e^{O + \lambda})$$ $$\lambda \sim Gauss(\mu, \Sigma)$$ -$$\Sigma = \sigma^2 (I - \rho W)^{-1}(I - \rho W')^{-1}.$$

-

If the raw outcome consists of a rate \(\frac{y}{p}\) with observed counts \(y\) and denominator \(p\) (often this will be the size of the population at risk), then the offset term \(O=log(p)\) is the log of the denominator.

+$$\Sigma = \sigma^2 (I - \rho C)^{-1}(I - \rho C')^{-1}.$$

+

O is a constant/offset term. If the raw outcome consists of a rate \(\frac{y}{p}\) with observed counts \(y\) and denominator \(p\) (often this will be the size of the population at risk), then the offset term \(O=log(p)\) is the log of the denominator.

This is often written (equivalently) as:

$$y \sim Poisson(e^{O + \mu + \phi})$$ $$ \phi \sim Gauss(0, \Sigma) $$ -$$ \Sigma = \sigma^2 (I - \rho W)^{-1}(I - \rho W')^{-1}.$$

+$$ \Sigma = \sigma^2 (I - \rho C)^{-1}(I - \rho C')^{-1}.$$

For Poisson models, the spatial method returns the parameter vector \(\phi\).

-

In the Poisson SAR model, \(\phi\) contains a latent spatial trend as well as additional variation around it. If you would like to extract the latent/implicit spatial trend from \(\phi\), you can do so by calculating: +

In the Poisson SAR model, \(\phi\) contains a latent (smooth) spatial trend as well as additional variation around it. If you would like to extract the latent/implicit spatial trend from \(\phi\), you can do so by calculating: $$ - \rho W \phi. + \rho C \phi. $$

@@ -325,13 +342,13 @@

BinomialFor family = binomial(), the model is specified as:

$$y \sim Binomial(N, \lambda) $$ $$logit(\lambda) \sim Gauss(\mu, \Sigma) $$ -$$\Sigma = \sigma^2 (I - \rho W)^{-1}(I - \rho W')^{-1}.$$

+$$\Sigma = \sigma^2 (I - \rho C)^{-1}(I - \rho C')^{-1}.$$

where outcome data \(y\) are counts, \(N\) is the number of trials, and \(\lambda\) is the rate of 'success'. Note that the model formula should be structured as: cbind(sucesses, failures) ~ 1 (for an intercept-only model), such that trials = successes + failures.

For fitted Binomial models, the spatial method will return the parameter vector phi, equivalent to:

$$\phi = logit(\lambda) - \mu.$$

As is also the case for the Poisson model, \(\phi\) contains a latent spatial trend as well as additional variation around it. If you would like to extract the latent/implicit spatial trend from \(\phi\), you can do so by calculating: $$ -\rho W \phi. +\rho C \phi. $$

@@ -350,34 +367,253 @@

Author

Examples

-
# model mortality risk
-data(georgia)
-W <- shape2mat(georgia, style = "W")
-
-fit <- stan_sar(log(rate.male) ~ 1,
-                C = W,
-                data = georgia,
-                chains = 1, # for ex. speed only
-                iter = 700 
-                )
-
-rstan::stan_rhat(fit$stanfit)
-rstan::stan_mcse(fit$stanfit)
-print(fit)
-plot(fit)
-sp_diag(fit, georgia)
-
-# \donttest{
- # a more appropriate model for count data:
-fit2 <- stan_sar(deaths.male ~ offset(log(pop.at.risk.male)),
-                C = W,
-                data = georgia,
-                family = poisson(),
-                chains = 1, # for ex. speed only
-                iter = 700 
-                 )
-sp_diag(fit2, georgia)
-# }
+

+##
+## simulate SAR data on a regular grid
+##
+
+sars <- prep_sar_data2(row = 10, col = 10, quiet = TRUE)
+w <- sars$W
+
+# draw x
+x <- sim_sar(w = w, rho = 0.5)
+
+# draw y = mu + rho*W*(y - mu) + epsilon
+# beta = 0.5, rho = 0.5
+y <- sim_sar(w = w, rho = .5, mu = 0.5 * x)
+dat <- data.frame(y = y, x = x)
+
+##
+## fit SEM
+##
+
+fit_sem <- stan_sar(y ~ x, data = dat, sar = sars,
+                    chains = 1, iter = 800)
+#> 
+#> *Setting prior parameters for intercept
+#> Distribution: normal
+#>   location scale
+#> 1     0.28     5
+#> 
+#> *Setting prior parameters for beta
+#> Distribution: normal
+#>   location scale
+#> 1        0     5
+#> 
+#> *Setting prior for SAR scale parameter (sar_scale)
+#> Distribution: student_t
+#>   df location scale
+#> 1 10        0     3
+#> 
+#> *Setting prior for SAR spatial autocorrelation parameter (sar_rho)
+#> Distribution: uniform
+#>   lower upper
+#> 1    -1     1
+#> 
+#> SAMPLING FOR MODEL 'foundation' NOW (CHAIN 1).
+#> Chain 1: 
+#> Chain 1: Gradient evaluation took 5.7e-05 seconds
+#> Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.57 seconds.
+#> Chain 1: Adjust your expectations accordingly!
+#> Chain 1: 
+#> Chain 1: 
+#> Chain 1: Iteration:   1 / 800 [  0%]  (Warmup)
+#> Chain 1: Iteration: 401 / 800 [ 50%]  (Sampling)
+#> Chain 1: Iteration: 800 / 800 [100%]  (Sampling)
+#> Chain 1: 
+#> Chain 1:  Elapsed Time: 0.087 seconds (Warm-up)
+#> Chain 1:                0.069 seconds (Sampling)
+#> Chain 1:                0.156 seconds (Total)
+#> Chain 1: 
+print(fit_sem)
+#> Spatial Model Results 
+#> Formula: y ~ x
+#> <environment: 0x560bef75df08>
+#> Spatial method (outcome):  SAR (SEM) 
+#> Likelihood function:  auto_gaussian 
+#> Link function:  identity 
+#> Residual Moran Coefficient:  -0.0147 
+#> Observations:  100 
+#> 
+#> Inference for Stan model: foundation.
+#> 1 chains, each with iter=800; warmup=400; thin=1; 
+#> post-warmup draws per chain=400, total post-warmup draws=400.
+#> 
+#>            mean se_mean    sd   2.5%   20%   50%   80% 97.5% n_eff  Rhat
+#> intercept 0.196   0.009 0.176 -0.165 0.046 0.200 0.345 0.530   388 0.998
+#> x         0.451   0.005 0.098  0.269 0.363 0.450 0.532 0.647   456 0.998
+#> sar_rho   0.408   0.006 0.112  0.182 0.316 0.417 0.504 0.614   371 0.998
+#> sar_scale 1.014   0.004 0.072  0.891 0.956 1.006 1.070 1.163   277 1.001
+#> 
+#> Samples were drawn using NUTS(diag_e) at Tue Oct 29 10:41:09 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split chains (at 
+#> convergence, Rhat=1).
+
+##
+## data for SDEM
+##
+
+# mu = x*beta + wx*gamma; beta=1, gamma=-0.25
+x <- sim_sar(w = w, rho = 0.5)
+mu <- 1 * x - 0.25 * (w %*% x)[,1]
+y <- sim_sar(w = w, rho = .5, mu = mu)
+# or for SDLM:
+# y <- sim_sar(w = w, rho = 0.5, mu = mu, type = "SLM")
+dat <- data.frame(y=y, x=x)
+
+#
+## fit models
+##
+
+# DSEM
+# y = mu + rho*W*(y - mu) + epsilon
+# mu = beta*x + gamma*Wx
+fit_sdem <- stan_sar(y ~ x, data = dat,
+                    sar_parts = sars, type = "SDEM",
+                    iter = 800, chains = 1,
+                    quiet = TRUE)
+
+# SDLM
+# y = rho*Wy + beta*x + gamma*Wx + epsilon
+fit_sdlm <- stan_sar(y ~ x, data = dat,
+                    sar_parts = sars,
+                    type = "SDLM",
+                    iter = 800,
+                    chains = 1,
+                    quiet = TRUE)
+
+# compare by DIC
+dic(fit_sdem)
+#>     DIC penalty 
+#>   316.9     4.7 
+dic(fit_sdlm)
+#>     DIC penalty 
+#>   318.7     6.0 
+
+# \donttest{
+##
+## Modeling mortality rates
+##
+
+# simple spatial regression
+data(georgia)
+W <- shape2mat(georgia, style = "W")
+#> Contiguity condition: queen
+#> Number of neighbors per unit, summary:
+#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+#>   1.000   4.000   5.000   5.409   6.000  10.000 
+#> 
+#> Spatial weights, summary:
+#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+#>  0.1000  0.1429  0.1667  0.1849  0.2000  1.0000 
+
+fit <- stan_sar(log(rate.male) ~ 1,
+                C = W,
+                data = georgia,
+                iter = 900
+                )
+#> 
+#> *Setting prior parameters for intercept
+#> Distribution: normal
+#>   location scale
+#> 1     -4.2     5
+#> 
+#> *Setting prior for SAR scale parameter (sar_scale)
+#> Distribution: student_t
+#>   df location scale
+#> 1 10        0     3
+#> 
+#> *Setting prior for SAR spatial autocorrelation parameter (sar_rho)
+#> Distribution: uniform
+#>   lower upper
+#> 1  -1.7     1
+#> 
+#> SAMPLING FOR MODEL 'foundation' NOW (CHAIN 1).
+#> Chain 1: 
+#> Chain 1: Gradient evaluation took 9.7e-05 seconds
+#> Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.97 seconds.
+#> Chain 1: Adjust your expectations accordingly!
+#> Chain 1: 
+#> Chain 1: 
+#> Chain 1: Iteration:   1 / 900 [  0%]  (Warmup)
+#> Chain 1: Iteration: 451 / 900 [ 50%]  (Sampling)
+#> Chain 1: Iteration: 900 / 900 [100%]  (Sampling)
+#> Chain 1: 
+#> Chain 1:  Elapsed Time: 0.153 seconds (Warm-up)
+#> Chain 1:                0.098 seconds (Sampling)
+#> Chain 1:                0.251 seconds (Total)
+#> Chain 1: 
+#> 
+#> SAMPLING FOR MODEL 'foundation' NOW (CHAIN 2).
+#> Chain 2: 
+#> Chain 2: Gradient evaluation took 5.4e-05 seconds
+#> Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.54 seconds.
+#> Chain 2: Adjust your expectations accordingly!
+#> Chain 2: 
+#> Chain 2: 
+#> Chain 2: Iteration:   1 / 900 [  0%]  (Warmup)
+#> Chain 2: Iteration: 451 / 900 [ 50%]  (Sampling)
+#> Chain 2: Iteration: 900 / 900 [100%]  (Sampling)
+#> Chain 2: 
+#> Chain 2:  Elapsed Time: 0.143 seconds (Warm-up)
+#> Chain 2:                0.112 seconds (Sampling)
+#> Chain 2:                0.255 seconds (Total)
+#> Chain 2: 
+#> 
+#> SAMPLING FOR MODEL 'foundation' NOW (CHAIN 3).
+#> Chain 3: 
+#> Chain 3: Gradient evaluation took 5.5e-05 seconds
+#> Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.55 seconds.
+#> Chain 3: Adjust your expectations accordingly!
+#> Chain 3: 
+#> Chain 3: 
+#> Chain 3: Iteration:   1 / 900 [  0%]  (Warmup)
+#> Chain 3: Iteration: 451 / 900 [ 50%]  (Sampling)
+#> Chain 3: Iteration: 900 / 900 [100%]  (Sampling)
+#> Chain 3: 
+#> Chain 3:  Elapsed Time: 0.176 seconds (Warm-up)
+#> Chain 3:                0.116 seconds (Sampling)
+#> Chain 3:                0.292 seconds (Total)
+#> Chain 3: 
+#> 
+#> SAMPLING FOR MODEL 'foundation' NOW (CHAIN 4).
+#> Chain 4: 
+#> Chain 4: Gradient evaluation took 5.4e-05 seconds
+#> Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.54 seconds.
+#> Chain 4: Adjust your expectations accordingly!
+#> Chain 4: 
+#> Chain 4: 
+#> Chain 4: Iteration:   1 / 900 [  0%]  (Warmup)
+#> Chain 4: Iteration: 451 / 900 [ 50%]  (Sampling)
+#> Chain 4: Iteration: 900 / 900 [100%]  (Sampling)
+#> Chain 4: 
+#> Chain 4:  Elapsed Time: 0.17 seconds (Warm-up)
+#> Chain 4:                0.134 seconds (Sampling)
+#> Chain 4:                0.304 seconds (Total)
+#> Chain 4: 
+
+# view fitted vs. observed, etc.
+sp_diag(fit, georgia)
+
+
+# A more appropriate model for count data:
+# hierarchical spatial poisson model
+fit2 <- stan_sar(deaths.male ~ offset(log(pop.at.risk.male)),
+                C = W,
+                data = georgia,
+                family = poisson(),
+                chains = 1, # for ex. speed only
+                iter = 900,
+                quiet = TRUE
+                 )
+
+# view fitted vs. observed, etc.
+sp_diag(fit2, georgia)
+#> Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.
+
+# }
+