From 503be3109035c0b3e8eb604a6f574e9fa53f1031 Mon Sep 17 00:00:00 2001
From: ConnorDonegan
Date: Tue, 12 Nov 2024 15:07:21 -0600
Subject: [PATCH] predict.slm - approx inverse
---
NAMESPACE | 1 +
R/convenience-functions.R | 19 +-
R/geostan_fit-methods.R | 105 ++++++--
R/impacts.R | 41 +--
R/moran.R | 15 +-
R/stan_car.R | 41 +--
R/stan_esf.R | 11 +-
R/stan_glm.R | 6 +-
R/stan_icar.R | 3 +-
R/stan_sar.R | 35 ++-
README.Rmd | 120 ++++++++-
README.html | 255 +++++++++++-------
README.md | 236 +++++++++++-----
docs/index.html | 217 ++++++++++-----
docs/pkgdown.yml | 2 +-
.../figures/README-unnamed-chunk-3-1.png | Bin 51917 -> 53067 bytes
.../figures/README-unnamed-chunk-5-1.png | Bin 57590 -> 58415 bytes
.../figures/README-unnamed-chunk-8-1.png | Bin 0 -> 60194 bytes
.../figures/README-unnamed-chunk-9-1.png | Bin 0 -> 38008 bytes
docs/reference/impacts.html | 17 +-
docs/reference/index.html | 2 +-
docs/reference/predict.geostan_fit.html | 73 ++++-
docs/reference/sim_sar.html | 2 +-
docs/reference/sp_diag.html | 4 +-
docs/reference/stan_car.html | 14 +-
docs/reference/stan_esf.html | 4 +-
docs/reference/stan_glm.html | 2 +-
docs/reference/stan_icar.html | 2 +-
docs/reference/stan_sar.html | 4 +-
man/figures/README-unnamed-chunk-3-1.png | Bin 51917 -> 53067 bytes
man/figures/README-unnamed-chunk-5-1.png | Bin 57590 -> 58415 bytes
man/figures/README-unnamed-chunk-8-1.png | Bin 0 -> 60194 bytes
man/figures/README-unnamed-chunk-9-1.png | Bin 0 -> 38008 bytes
man/stan_esf.Rd | 4 +-
man/stan_icar.Rd | 2 +-
tests/impacts.R | 121 +++++++++
tests/missing-y.R | 2 +-
37 files changed, 983 insertions(+), 377 deletions(-)
create mode 100644 docs/reference/figures/README-unnamed-chunk-8-1.png
create mode 100644 docs/reference/figures/README-unnamed-chunk-9-1.png
create mode 100644 man/figures/README-unnamed-chunk-8-1.png
create mode 100644 man/figures/README-unnamed-chunk-9-1.png
create mode 100644 tests/impacts.R
diff --git a/NAMESPACE b/NAMESPACE
index c2289224..e0ed96a6 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,6 +8,7 @@ S3method(log_lik,geostan_fit)
S3method(plot,geostan_fit)
S3method(predict,geostan_fit)
S3method(print,geostan_fit)
+S3method(print,impacts_slm)
S3method(print,prior)
S3method(residuals,geostan_fit)
S3method(sp_diag,geostan_fit)
diff --git a/R/convenience-functions.R b/R/convenience-functions.R
index 7c9c0b5a..5cb85377 100644
--- a/R/convenience-functions.R
+++ b/R/convenience-functions.R
@@ -327,7 +327,7 @@ sp_diag.geostan_fit <- function(y,
plot = TRUE,
mc_style = c("scatter", "hist"),
style = c("W", "B"),
- w,
+ w = y$C,
rates = TRUE,
binwidth = function(x) 0.5 * stats::sd(x, na.rm = TRUE),
size = 0.1,
@@ -336,7 +336,7 @@ sp_diag.geostan_fit <- function(y,
if (inherits(y$C, "Matrix") | inherits(y$C, "matrix")) {
w <- y$C
} else {
- w <- shape2mat(shape, style = match.arg(style))
+ w <- shape2mat(shape, style = match.arg(style), quiet = TRUE)
}
}
mc_style <- match.arg(mc_style, c("scatter", "hist"))
@@ -344,11 +344,11 @@ sp_diag.geostan_fit <- function(y,
outcome <- y$data[,1]
fits <- fitted(y, summary = TRUE, rates = rates)
if (rates && y$family$family == "binomial") {
- message("Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.")
+ #message("Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.")
outcome <- outcome / (outcome + y$data[,2])
}
if (rates && y$family$family == "poisson" && "offset" %in% c(colnames(y$data))) {
- message("Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.")
+ #message("Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.")
log.at.risk <- y$data[, "offset"]
at.risk <- exp( log.at.risk )
outcome <- outcome / at.risk
@@ -365,8 +365,11 @@ sp_diag.geostan_fit <- function(y,
labs(x = "Observed",
y = "Fitted") +
theme_classic()
+
# map of marginal residuals
- marginal_residual <- apply(residuals(y, summary = FALSE, rates = rates, ...), 2, mean, na.rm = TRUE)
+ R <- residuals(y, summary = FALSE, rates = rates, ...)
+ marginal_residual <- apply(R, 2, mean, na.rm = TRUE)
+
map.y <- ggplot(shape) +
geom_sf(aes(fill = marginal_residual),
lwd = .05,
@@ -374,8 +377,8 @@ sp_diag.geostan_fit <- function(y,
scale_fill_gradient2(name = name,
label = signs::signs) +
theme_void()
+
# residual autocorrelation
- R <- residuals(y, summary = FALSE, rates = rates, ...)
R.mc <- apply(R, 1, mc, w = w, warn = FALSE, na.rm = TRUE)
if (mc_style == "scatter") {
g.mc <- moran_plot(marginal_residual, w, xlab = name, na.rm = TRUE)
@@ -391,9 +394,11 @@ sp_diag.geostan_fit <- function(y,
x = "Residual MC",
subtitle = paste0("MC (mean) = ", round(R.mc.mu, 2)))
}
+
if (length(unique(R.mc)) == 1) {
g.mc <- moran_plot(R[1,], w, xlab = name, na.rm = TRUE)
- }
+ }
+
if (plot) {
return( gridExtra::grid.arrange(ovf, g.mc, map.y, ncol = 3) )
} else {
diff --git a/R/geostan_fit-methods.R b/R/geostan_fit-methods.R
index 9413bcff..d0e5bc2e 100644
--- a/R/geostan_fit-methods.R
+++ b/R/geostan_fit-methods.R
@@ -55,17 +55,18 @@ print.geostan_fit <- function(x,
print(x$re$formula)
pars <- c(pars, "alpha_tau")
}
-
+ cat("Likelihood: ", x$family$family, "\n")
+ cat("Link: ", x$family$link, "\n")
+
meth <- x$spatial$method
if (meth == "SAR") meth <- paste0(meth, " (", x$sar_type, ")")
- cat("Spatial method (outcome): ", as.character(meth), "\n")
+ cat("Spatial method: ", as.character(meth), "\n")
if (x$spatial$method == "CAR") pars <- c(pars, "car_rho", "car_scale")
if (x$spatial$method == "SAR") pars <- c(pars, "sar_rho", "sar_scale")
if (x$spatial$method == "BYM2") pars <- c(pars, "rho", "spatial_scale")
if (x$spatial$method == "BYM") pars <- c(pars, "spatial_scale", "theta_scale")
if (x$spatial$method == "ICAR") pars <- c(pars, "spatial_scale")
- cat("Likelihood function: ", x$family$family, "\n")
- cat("Link function: ", x$family$link, "\n")
+
if (!is.null(x$diagnostic$Residual_MC)) cat("Residual Moran Coefficient: ", x$diagnostic$Residual_MC, "\n")
cat("Observations: ", x$N, "\n")
if (x$ME$has_me) {
@@ -408,6 +409,9 @@ as.array.geostan_fit <- function(x, ...){
#' @param type By default, results from `predict` are on the scale of the linear predictor (`type = "link")`). The alternative (`type = "response"`) is on the scale of the response variable. For example, the default return values for a Poisson model on the log scale, and using `type = "response"` will return the original scale of the outcome variable (by exponentiating the log values).
#'
#' @param add_slx Logical. If `add_slx = TRUE`, any spatially-lagged covariates that were specified through the 'slx' argument (of the model fitting function, e.g., `stan_glm`) will be added to the linear predictor. The spatial lag terms will be calculated internally using `object$C`, the spatial weights matrix used to fit the model. Hence, `newdata` must have `N = object$N` rows. Predictions from spatial lag models (SAR models of type 'SLM' and 'SDLM') always include the SLX terms (i.e., any value passed to `add_slx` will be overwritten with `TRUE`).
+#' @param approx For SAR models of type 'SLM' or 'SDLM' only; use an approximation for matrix inversion? See details below.
+#'
+#' @param K Number of matrix powers to use with \code{approx}.
#'
#' @param ... Not used
#'
@@ -415,21 +419,32 @@ as.array.geostan_fit <- function(x, ...){
#'
#' The primary purpose of the predict method is to explore marginal effects of covariates.
#'
-#' The model formula will be taken from `object$formula`, and then a model matrix will be created by passing `newdata` to the \link[stats]{model.frame} function (as in: \code{model.frame(newdata, object$formula}). Parameters are taken from `as.matrix(object, pars = c("intercept", "beta"))`. If `add_slx = TRUE`, SLX coefficients will be taken from `as.matrix(object, pars = "gamma")`.
+#' The model formula will be taken from `object$formula`, and then a model matrix will be created by passing `newdata` to the \link[stats]{model.frame} function (as in: \code{model.frame(object$formula, newdata)}. Parameters are taken from \code{as.matrix(object, pars = c("intercept", "beta"))}.
#'
-#' Spatially-lagged covariates added via the `slx` argument ('spillover effects') will be included if `add_slx = TRUE` or if a spatial lag model is provided (a SAR model of type 'SLM' or 'SDLM'). In either of those cases, `newdata` must have the same number of rows as were used to fit the original data. For details on these 'spillover effects', see LeSage and Pace (2009) and LeSage (2014).
+#' ## Spatial lag of X
+#'
+#' Spatially-lagged covariates which were included via the `slx` argument will, by default, not be included. They will be be included in predictions if `add_slx = TRUE` or if the fitted model is a SAR model of type 'SLM' or 'SDLM'. In either of those cases, `newdata` must have the same number of rows as were used to fit the original data.
+#'
+#' ## Spatial lag of Y
+#'
+#' The typical 'marginal effect' interpretation of the regression coefficients does not hold for the SAR models of type 'SLM' or 'SDLM'. For details on these 'spillover effects', see LeSage and Pace (2009), LeSage (2014), and `geostan::impacts`.
#'
#' Predictions for the spatial lag model (SAR models of type 'SLM') are equal to:
#' \deqn{
#' (I - \rho W)^{-1} X \beta
#' }
-#' where \eqn{X \beta} contains the intercept and covariates. (For intercept-only models, the above term is equal to the constant intercept.) Predictions for the spatial Durbin lag model (SAR models of type 'SDLM') are equal to:
+#' where \eqn{X \beta} contains the intercept and covariates. Predictions for the spatial Durbin lag model (SAR models of type 'SDLM') are equal to:
#' \deqn{
#' (I - \rho W)^{-1} (X \beta + WX \gamma)
#' }
-#' where \eqn{WX \gamma} are spatially lagged covariates multiplied by their coefficients. The SLM and SDLM differ from all other model types in that the spatial component of the model cannot be separated from the linear predictor and is, therefore, automatically incorporated into the predictions.
+#' where \eqn{WX \gamma} are spatially lagged covariates multiplied by their coefficients.
+#'
+#' The inverse of the matrix \eqn{(I - \rho W)} can be time consuming to compute (especially when iterating over MCMC samples). You can use `approx = TRUE` to approximate the inverse using a series of matrix powers. The argument \eqn{K} controls how many powers to use for the approximation. As a rule, higher values of \eqn{\rho} require larger \eqn{K}. Notice that \eqn{\rho^K} should be close to zero for the approximation to hold. For example, for \eqn{\rho = .5} a value of \eqn{K=8} may suffice (eqn{0.5^8 = 0.004}), but larger values of \eqn{\rho} require higher values of \eqn{K}.
+#'
+#'
+#' ## Generalized linear models
#'
-#' In generalized linear models (such as Poisson and Binomial models) marginal effects plots on the response scale may be sensitive to the level of other covariates in the model and to location. If the model includes a spatial autocorrelation component (for example, you used a spatial CAR, SAR (error model), or ESF model), by default these terms will be fixed at zero for the purposes of calculating marginal effects. If you want to change this, you can introduce spatial trend values by specifying a varying intercept using the `alpha` argument.
+#' In generalized linear models (such as Poisson and Binomial models) marginal effects plots on the response scale may be sensitive to the level of other covariates in the model and to geographic location. If the model includes a spatial autocorrelation component (for example, you used a spatial CAR, SAR, or ESF model), by default these terms will be fixed at zero for the purposes of calculating marginal effects. If you want to change this, you can introduce spatial trend values by specifying a varying intercept using the `alpha` argument.
#'
#' @return
#'
@@ -441,16 +456,17 @@ as.array.geostan_fit <- function(x, ...){
#'
#' fit <- stan_glm(deaths.male ~ offset(log(pop.at.risk.male)) + log(income),
#' data = georgia,
+#' re = ~ GEOID,
#' centerx = TRUE,
#' family = poisson(),
#' chains = 2, iter = 600) # for speed only
#'
-#' # note: pop.at.risk.male=1 leads to log(pop.at.risk.male)=0
+#' # note: pop.at.risk.male=1 leads to offset of log(pop.at.risk.male)=0
#' # so that the predicted values are rates
#' newdata <- data.frame(
#' income = seq(min(georgia$income),
#' max(georgia$income),
-#' length.out = 100),
+#' length.out = 200),
#' pop.at.risk.male = 1)
#'
#' preds <- predict(fit, newdata, type = "response")
@@ -465,12 +481,35 @@ as.array.geostan_fit <- function(x, ...){
#' newdata$pop.at.risk.male <- 10e3
#' preds <- predict(fit, newdata, type = "response")
#' head(preds)
+#'
+#' # plot range
+#' y_lim <- c(min(preds$`2.5%`), max(preds$`97.5%`))
+#'
+#' # plot line
#' plot(preds$income,
#' preds$mean,
-#' type = "l",
+#' type = "l",
#' ylab = "Deaths per 10,000",
-#' xlab = "Income ($1,000s)")
-#'
+#' xlab = "Income ($1,000s)",
+#' ylim = y_lim,
+#' axes = FALSE)
+#'
+#' # add shaded cred. interval
+#' x <- c(preds$income, rev(preds$income))
+#' y <- c(preds$`2.5%`, rev(preds$`97.5%`))
+#' polygon(x = x, y = y,
+#' col = rgb(0.1, 0.2, 0.3, 0.3),
+#' border = NA)
+#'
+#' # add axes
+#' yat = seq(0, 300, by = 20)
+#' axis(2, at = yat)
+#'
+#' xat = seq(0, 200, by = 10)
+#' axis(1, at = xat)
+#'
+#' # show county incomes
+#' rug(georgia$income)
#' @source
#'
#' Goulard, Michael, Thibault Laurent, and Christine Thomas-Agnan (2017). About predictions in spatial autoregressive models: optimal and almost optimal strategies. *Spatial Economic Analysis* 12 (2-3): 304-325.
@@ -490,6 +529,8 @@ predict.geostan_fit <- function(object,
summary = TRUE,
type = c("link", "response"),
add_slx = FALSE,
+ approx = FALSE,
+ K = 15,
...) {
type <- match.arg(type)
if (missing(newdata)) return (fitted(object, summary = summary, ...))
@@ -511,11 +552,10 @@ predict.geostan_fit <- function(object,
lag_y <- FALSE
if (object$spatial$method == "SAR") {
add_slx <- grepl("SDEM|SDLM", object$sar_type)
- lay_y <- grepl("SLM|SDLM", object$sar_type)
+ lag_y <- grepl("SLM|SDLM", object$sar_type)
}
- if (add_slx) {
-
+ if (add_slx) {
stopifnot( nrow(newdata) == nrow(object$C) )
Gamma <- as.matrix(object, pars = "gamma")
gnames <- gsub("^w.", "", colnames(Gamma))
@@ -524,8 +564,7 @@ predict.geostan_fit <- function(object,
X_tmp <- as.matrix(X[, idx])
if (length(idx) != ncol(Gamma)) stop("Mis-match of SLX coefficient names and X column names.")
WX <- W %*% X_tmp
- for (m in 1:M) P[m,] <- as.numeric( P[m,] + WX %*% as.matrix(Gamma[m,]) )
-
+ for (m in 1:M) P[m,] <- as.numeric( P[m,] + WX %*% as.matrix(Gamma[m,]) )
}
if (lag_y == TRUE) {
@@ -537,10 +576,30 @@ predict.geostan_fit <- function(object,
W <- object$C
I <- Matrix::Diagonal(N)
rho <- as.matrix(object, pars = "sar_rho")[,1]
-
- for (m in 1:M) {
- IMRWI <- Matrix::solve(I - rho[m] * W)
- P[m,] <- as.numeric( IMRWI %*% P[m,] )
+
+ if (approx) {
+ Q <- K + 1
+ powers = 0:K
+ Mlist <- list()
+ Mlist[[1]] <- I
+ Mlist[[2]] <- W
+ W_k <- W
+ for (q in 3:Q) {
+ W_k <- W %*% W_k
+ Mlist[[q]] <- W_k
+ }
+ for (m in 1:M) {
+ rho_powers <- rho[ m ]^powers
+ Mpowers <- lapply(seq(Q), function(j) Mlist[[j]] * rho_powers[j])
+ Multiplier <- Reduce(`+`, Mpowers)
+ P[ m, ] <- as.numeric( Multiplier %*% P[ m, ] )
+ }
+
+ } else {
+ for (m in 1:M) {
+ Multiplier <- Matrix::solve(I - rho[m] * W)
+ P[m,] <- as.numeric( Multiplier %*% P[m,] )
+ }
}
}
diff --git a/R/impacts.R b/R/impacts.R
index 7ce67655..f84dfa54 100644
--- a/R/impacts.R
+++ b/R/impacts.R
@@ -87,13 +87,6 @@ spill <- function(beta, gamma = 0, rho, W, method = c('quick', 'proper'), K = 20
return( impacts_multiplier(beta, gamma, rho, T, K) )
}
- # matrix powers: slower method of approximation
- ## M_tmp <- I + rho * W
- ## W_k <- W
- ## for (j in 2:K) {
- ## W_k <- W %*% W_k
- ## M_tmp = M_tmp + rho^j * W_k
-
N <- nrow(W)
I <- diag(rep(1, N))
imrw <- I - rho * W
@@ -137,7 +130,6 @@ impacts <- function(object, method = c('quick', 'proper'), K = 20) {
gamma_idx <- match( gsub("^w.", "", colnames(gamma)), Blabs )
for (j in seq_along(gamma_idx)) G[ , gamma_idx[j] ] <- gamma[ , j ]
}
-
impax <- vector("list", length = M)
@@ -145,8 +137,8 @@ impacts <- function(object, method = c('quick', 'proper'), K = 20) {
for (m in 1:M) {
impax[[m]] <- sapply(1:S, function(s)
- spill(beta = B[s, m],
- gamma = G[s, m],
+ spill(beta = as.numeric( B[s, m] ),
+ gamma = as.numeric( G[s, m] ),
rho = rho[s],
W = W,
method,
@@ -164,7 +156,7 @@ impacts <- function(object, method = c('quick', 'proper'), K = 20) {
for (m in 1:M) {
impax[[m]] <- sapply(1:S, function(s)
- impacts_multiplier(B[s,m], G[s,m], rho[s], T, K)) |>
+ impacts_multiplier(as.numeric( B[s,m] ), as.numeric( G[s,m] ), rho[s], T, K)) |>
t()
}
@@ -179,14 +171,31 @@ impacts <- function(object, method = c('quick', 'proper'), K = 20) {
upr = as.numeric(apply(impax[[m]], 2, quantile, probs = 0.975))
res <- cbind(mean = est, median = est2, sd, lwr, upr)
row.names(res) <- c('Direct', 'Indirect', 'Total')
- summary[[m]] <- res
- names(summary)[m] <- Blabs[m]
- }
+ summary[[m]] <- res
+ }
+
+ names(impax) <- Blabs
+ names(summary) <- Blabs
+ out <- list(summary = summary, samples = impax)
+ class(out) <- append("impacts_slm", class(out))
- return(list(summary = summary, samples = impax))
+ return(out)
}
+#' @export
+#'
+#' @param x An object of class 'impacts_slm', as returned by `geostan::impacts`
+#'
+#' @param digits Round results to this many digits
+#'
+#' @param ... Additional arguments will be passed to `base::print`
+#'
+#' @rdname impacts
+print.impacts_slm <- function(x, digits = 2, ...) {
+ print(x$summary, digits = digits, ...)
+}
+
#' After LeSage and Pace 2009 pp. 114--115
#' @noRd
impacts_multiplier <- function(beta, gamma, rho, T, K) {
@@ -206,7 +215,7 @@ impacts_multiplier <- function(beta, gamma, rho, T, K) {
# indirect
indirect <- total - direct
- return (c(direct = direct, indirect = indirect, total = total))
+ retunr (c(direct = direct, indirect = indirect, total = total))
}
#' diagonal entries of matrix powers e..g, diag( W^{20} )
diff --git a/R/moran.R b/R/moran.R
index 4c835524..52887a27 100644
--- a/R/moran.R
+++ b/R/moran.R
@@ -40,13 +40,13 @@ mc <- function(x, w, digits = 3, warn = TRUE, na.rm = FALSE) {
check_sa_data(x, w)
na_idx <- which(is.na(x))
if (na.rm == TRUE && length(na_idx) > 0) {
- if (warn) message(length(na_idx), " NA values found in x; they will be removed from the data before calculating the Moran coefficient. If matrix w was row-standardized, it may not longer be. You may want to use a binary connectivity matrix by using style = 'B' in shape2mat.")
+ if (warn) message(length(na_idx), " dropping NA values found in x (nb: this disrupts row-standardization of matrix w) ")
x <- x[-na_idx]
w <- w[-na_idx, -na_idx]
}
if (any(Matrix::rowSums(w) == 0)) {
zero.idx <- which(Matrix::rowSums(w) == 0)
- if (warn) message(length(zero.idx), " observations with no neighbors found. They will be removed from the data before calculating the Moran coefficient.")
+ if (warn) message(length(zero.idx), " dropping observations with zero neighbors ")
x <- x[-zero.idx]
w <- w[-zero.idx, -zero.idx]
}
@@ -102,11 +102,12 @@ moran_plot <- function(x, w, xlab = "x (centered)", ylab = "Spatial Lag", pch =
if (length(na_idx) > 0) {
if (na.rm == TRUE) {
msg <- paste(length(na_idx),
- "NA values found in x will be dropped from data x and matrix w"
+ "NA values found in x will be dropped from data x and from matrix w (nb: this disrupts row-standardization of w)"
)
- if (!inherits(w, "ngCMatrix")) msg <- paste(msg,
- "\n If you are using a row-standardized matrix: dropping the NA values means that some rows in matrix w do not sum to 1; you may want to remove missing observations manually and then make row-standardized w, or use a binary matrix (see ?shape2mat)."
- )
+ ## if (!inherits(w, "ngCMatrix")) msg <- paste(msg,
+ ## "\n If you are using a row-standardized matrix: dropping the NA values means that some rows in matrix w do not sum to 1; you may want to remove missing observations manually and then make row-standardized w, or use a binary matrix (see ?shape2mat)."
+ ## )
+
message(msg)
x <- x[-na_idx]
w <- w[-na_idx, -na_idx]
@@ -116,7 +117,7 @@ moran_plot <- function(x, w, xlab = "x (centered)", ylab = "Spatial Lag", pch =
}
if (any(Matrix::rowSums(w) == 0)) {
zero.idx <- which(Matrix::rowSums(w) == 0)
- message(length(zero.idx), " observations with no neighbors found. They will be dropped from the data before creating the Moran plot.")
+ message(length(zero.idx), " dropping observations with zero neighbors ")
x <- x[-zero.idx]
w <- w[-zero.idx, -zero.idx]
}
diff --git a/R/stan_car.R b/R/stan_car.R
index 8714d49c..420e44e9 100644
--- a/R/stan_car.R
+++ b/R/stan_car.R
@@ -15,9 +15,9 @@
#'
#' @param data A \code{data.frame} or an object coercible to a data frame by \code{as.data.frame} containing the model data.
#'
-#' @param car_parts A list of data for the CAR model, as returned by \code{\link[geostan]{prep_car_data}}.
+#' @param car_parts A list of data for the CAR model, as returned by \code{\link[geostan]{prep_car_data}}. If not provided by the user, then \code{C} will automatically be passed to \code{prep_car_data} to create it.
#'
-#' @param C Optional spatial connectivity matrix which will be used to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms; it will automatically be row-standardized before calculating \code{slx} terms. See \code{\link[geostan]{shape2mat}}.
+#' @param C Spatial connectivity matrix which will be used internally to create \code{car_parts} (if \code{car_parts} is missing); if the user provides an \code{slx} formula for the model, the required connectivity matrix will be taken from the \code{car_parts} list. See \code{\link[geostan]{shape2mat}}.
#'
#' @param family The likelihood function for the outcome variable. Current options are \code{auto_gaussian()}, \code{binomial(link = "logit")}, and \code{poisson(link = "log")}; if `family = gaussian()` is provided, it will automatically be converted to `auto_gaussian()`.
#'
@@ -195,8 +195,12 @@
#' C <- shape2mat(georgia, style = "B")
#' cars <- prep_car_data(C)
#'
+#' # MCMC specs: set for purpose of demo speed
+#' iter = 500
+#' chains = 2
+#'
#' fit <- stan_car(log(rate.male) ~ 1, data = georgia,
-#' car_parts = cars, iter = 400, quiet = TRUE)
+#' car_parts = cars, iter = iter, chains = chains)
#'
#' # model diagnostics
#' sp_diag(fit, georgia)
@@ -207,7 +211,7 @@
#' car_parts = cars,
#' data = georgia,
#' family = poisson(),
-#' iter = 400, quiet = TRUE)
+#' iter = iter, chains = chains)
#'
#' # model diagnostics
#' sp_diag(fit, georgia)
@@ -235,7 +239,7 @@
#' data = georgia,
#' car = dcars,
#' family = poisson(),
-#' iter = 400, quiet = TRUE)
+#' iter = iter, chains = chains)
#'
#' sp_diag(Dfit, georgia, dcars$C)
#' dic(Dfit); dic(fit)
@@ -278,15 +282,18 @@ stan_car <- function(formula,
check_car_parts(car_parts)
stopifnot(car_parts$n == nrow(data))
if (quiet) refresh <- 0
- if (!missing(C)) {
- stopifnot(inherits(C, "Matrix") | inherits(C, "matrix"))
- stopifnot(all(dim(C) == nrow(data)))
- } else {
- C <- car_parts$C
+ C <- car_parts$C
+
+ ## C [CAR: always take C from car_parts]
+ ## if (!missing(C)) {
+ ## stopifnot(inherits(C, "Matrix") | inherits(C, "matrix"))
+ ## stopifnot(all(dim(C) == nrow(data)))
+ ## } else {
+ ## C <- car_parts$C
# if (car_parts$WCAR == 0) {
# message("Since you did not provide C, calculation of residual SA and any spatial-lag of X terms will use the matrix found in car_parts$C.")
# }
- }
+ #}
# zero-mean constraint parameterization
car_parts$ZMP <- ifelse(missing(zmp), 0, zmp)
@@ -346,12 +353,12 @@ stan_car <- function(formula,
x_full <- xraw
} else {
stopifnot(inherits(slx, "formula"))
- W <- C
- if (!inherits(W, "sparseMatrix")) W <- as(W, "CsparseMatrix")
- xrs <- Matrix::rowSums(W)
- if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.")
+ ## W <- C
+ ## if (!inherits(W, "sparseMatrix")) W <- as(W, "CsparseMatrix")
+ ## xrs <- Matrix::rowSums(W)
+ ## if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.")
# efficient transform to CRS representation for W.list (via transpose)
- Wij <- as(W, "TsparseMatrix")
+ Wij <- as(C, "TsparseMatrix")
Tw <- Matrix::sparseMatrix(i = Wij@j + 1,
j = Wij@i + 1,
x = Wij@x,
@@ -359,7 +366,7 @@ stan_car <- function(formula,
W.list <- list(w = Tw@x,
v = Tw@i + 1,
u = Tw@p + 1)
- Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = W)
+ Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = C)
dwx <- ncol(Wx)
wx_idx <- as.array( which(paste0("w.", colnames(xraw)) %in% colnames(Wx)), dim = dwx )
x_full <- cbind(Wx, xraw)
diff --git a/R/stan_esf.R b/R/stan_esf.R
index c3198f94..184ac3ed 100644
--- a/R/stan_esf.R
+++ b/R/stan_esf.R
@@ -12,13 +12,13 @@
#' alpha_tau ~ Student_t(d.f., location, scale).
#' ```
#'
-#' @param C Spatial connectivity matrix which will be used to calculate eigenvectors, if `EV` is not provided by the user. Typically, the binary connectivity matrix is best for calculating eigenvectors (i.e., using `C = shape2mat(shape, style = "B")`). This matrix will also be used to calculate residual spatial autocorrelation and any user specified \code{slx} terms; it will be row-standardized before calculating \code{slx} terms. See \code{\link[geostan]{shape2mat}}.
+#' @param C Spatial connectivity matrix. This will be used to calculate eigenvectors if `EV` is not provided by the user. See \code{\link[geostan]{shape2mat}}. Use of row-normalization (as in `\code{shape2mat(shape, 'W')} is not recommended for creating \code{EV}. Matrix \code{C} will also be used ('as is') to create any user-specified \code{slx} terms.
#'
#' @param nsa Include eigenvectors representing negative spatial autocorrelation? Defaults to \code{nsa = FALSE}. This is ignored if \code{EV} is provided.
#'
#' @param threshold Eigenvectors with standardized Moran coefficient values below this `threshold` value will be excluded from the candidate set of eigenvectors, `EV`. This defaults to \code{threshold = 0.25}, and is ignored if \code{EV} is provided.
#'
-#' @param EV A matrix of eigenvectors from any (transformed) connectivity matrix, presumably spatial (see \code{\link[geostan]{make_EV}}). If `EV` is provided, still also provide a spatial weights matrix \code{C} for other purposes; `threshold` and `nsa` are ignored for user provided `EV`.
+#' @param EV A matrix of eigenvectors from any (transformed) connectivity matrix, presumably spatial or network-based (see \code{\link[geostan]{make_EV}}). If `EV` is provided, still also provide a spatial weights matrix \code{C} for other purposes; `threshold` and `nsa` are ignored for user provided `EV`.
#'
#' @param data A \code{data.frame} or an object coercible to a data frame by \code{as.data.frame} containing the model data.
#'
@@ -273,12 +273,11 @@ stan_esf <- function(formula,
x_full <- xraw
} else {
stopifnot(inherits(slx, "formula"))
- W <- row_standardize(C, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.")
- W.list <- rstan::extract_sparse_parts(W)
- Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = W)
+ #W <- row_standardize(C, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.")
+ W.list <- rstan::extract_sparse_parts(C)
+ Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = C)
dwx <- ncol(Wx)
wx_idx <- as.array( which(paste0("w.", colnames(xraw)) %in% colnames(Wx)), dim = dwx )
-
x_full <- cbind(Wx, xraw)
}
}
diff --git a/R/stan_glm.R b/R/stan_glm.R
index 1bb81128..69db18be 100644
--- a/R/stan_glm.R
+++ b/R/stan_glm.R
@@ -19,7 +19,7 @@
#'
#' @param data A \code{data.frame} or an object coercible to a data frame by \code{as.data.frame} containing the model data.
#'
-#' @param C Optional spatial connectivity matrix which will be used to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms; it will automatically be row-standardized before calculating \code{slx} terms. See \code{\link[geostan]{shape2mat}}.
+#' @param C Spatial connectivity matrix which will be used to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms. See \code{\link[geostan]{shape2mat}}.
#'
#' @param family The likelihood function for the outcome variable. Current options are \code{poisson(link = "log")}, \code{binomial(link = "logit")}, \code{student_t()}, and the default \code{gaussian()}.
#'
@@ -392,8 +392,8 @@ stan_glm <- function(formula,
stopifnot(inherits(slx, "formula"))
W <- C
if (!inherits(W, "sparseMatrix")) W <- as(W, "CsparseMatrix")
- xrs <- Matrix::rowSums(W)
- if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.")
+ ## xrs <- Matrix::rowSums(W)
+ ## if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.")
# efficient transform to CRS representation for W.list (via transpose)
Wij <- as(W, "TsparseMatrix")
Tw <- Matrix::sparseMatrix(i = Wij@j + 1,
diff --git a/R/stan_icar.R b/R/stan_icar.R
index eb0858cf..591a5cf2 100644
--- a/R/stan_icar.R
+++ b/R/stan_icar.R
@@ -15,7 +15,7 @@
#'
#' @param data A \code{data.frame} or an object coercible to a data frame by \code{as.data.frame} containing the model data.
#'
-#' @param C Spatial connectivity matrix which will be used to construct an edge list for the ICAR model, and to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms. It will automatically be row-standardized before calculating \code{slx} terms. \code{C} must be a binary symmetric \code{n x n} matrix.
+#' @param C Spatial connectivity matrix which will be used to construct an edge list for the ICAR model, and to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms. It will automatically be row-standardized before calculating \code{slx} terms (matching the ICAR model). \code{C} must be a binary symmetric \code{n x n} matrix.
#'
#' @param type Defaults to "icar" (partial pooling of neighboring observations through parameter \code{phi}); specify "bym" to add a second parameter vector \code{theta} to perform partial pooling across all observations; specify "bym2" for the innovation introduced by Riebler et al. (2016). See \code{Details} for more information.
#'
@@ -338,7 +338,6 @@ stan_icar <- function(formula,
Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = W)
dwx <- ncol(Wx)
wx_idx <- as.array( which(paste0("w.", colnames(xraw)) %in% colnames(Wx)), dim = dwx )
-
x_full <- cbind(Wx, xraw)
}
}
diff --git a/R/stan_sar.R b/R/stan_sar.R
index bcc3a85f..0ef00c06 100644
--- a/R/stan_sar.R
+++ b/R/stan_sar.R
@@ -15,9 +15,9 @@
#'
#' @param data A \code{data.frame} or an object coercible to a data frame by \code{as.data.frame} containing the model data.
#'
-#' @param C Spatial weights matrix (conventionally referred to as \eqn{W} in the SAR model). Typically, this will be created using `geostan::shape2mat(shape, style = "W")`. This will be passed internally to \code{\link[geostan]{prep_sar_data}}, and will also be used to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms. See \code{\link[geostan]{shape2mat}}.
-#'
#' @param sar_parts List of data constructed by \code{\link[geostan]{prep_sar_data}}. If not provided, then `C` will automatically be passed to \code{\link[geostan]{prep_sar_data}} to create `sar_parts`.
+#'
+#' @param C Spatial connectivity matrix which will be used internally to create \code{sar_parts} (if \code{sar_parts} is missing); if the user provides an \code{slx} formula for the model, the required connectivity matrix will be taken from the \code{sar_parts} list. See \code{\link[geostan]{shape2mat}}.
#'
#' @param family The likelihood function for the outcome variable. Current options are \code{auto_gaussian()}, \code{binomial()} (with logit link function) and \code{poisson()} (with log link function); if `family = gaussian()` is provided, it will automatically be converted to `auto_gaussian()`.
#'
@@ -262,7 +262,7 @@
#' ## fit models
#' ##
#'
-#' # DSEM
+#' # SDEM
#' # y = mu + rho*W*(y - mu) + epsilon
#' # mu = beta*x + gamma*Wx
#' fit_sdem <- stan_sar(y ~ x, data = dat,
@@ -358,13 +358,15 @@ stan_sar <- function(formula,
Durbin <- grepl('SDEM|SDLM', type)
if(grepl("SLM|SDLM", type) & family$family != "auto_gaussian") stop("SLM/SDLM are only available as auto-normal models (not Poisson or binomial models).")
#### SAR type [stop]
- # C
- if (!missing(C)) {
- stopifnot(inherits(C, "Matrix") | inherits(C, "matrix"))
- stopifnot(all(dim(C) == nrow(data)))
- } else {
- C <- sar_parts$W
- }
+ # C [SAR: always take C from sar_parts]
+ C <- sar_parts$W
+
+ ## if (!missing(C)) {
+ ## stopifnot(inherits(C, "Matrix") | inherits(C, "matrix"))
+ ## stopifnot(all(dim(C) == nrow(data)))
+ ## } else {
+ ## C <- sar_parts$W
+ #}
# zero-mean constraint parameterization
sar_parts$ZMP <- ifelse(missing(zmp), 0, zmp)
@@ -432,15 +434,12 @@ stan_sar <- function(formula,
}
slx = formula[-2]
}
-
- W <- C
- if (!inherits(W, "sparseMatrix")) W <- as(W, "CsparseMatrix")
- xrs <- Matrix::rowSums(W)
- if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.")
-
- # nb: in stan_sar, W is taken from sar_parts, not calculated here.
- Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = W, Durbin = Durbin)
+ ## xrs <- Matrix::rowSums(C)
+ ## if (!all(xrs == 1)) C <- row_standardize(C, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.")
+ # nb: in stan_sar, CSR rep of W is taken from sar_parts, not calculated here.
+
+ Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = C, Durbin = Durbin)
dwx <- ncol(Wx)
wx_idx <- as.array( which(paste0("w.", colnames(xraw)) %in% colnames(Wx)), dim = dwx )
x_full <- cbind(Wx, xraw)
diff --git a/README.Rmd b/README.Rmd
index 166b4b88..d2b9627e 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -30,7 +30,7 @@ Features include:
* **Observational uncertainty** Incorporate information on data reliability, such as standard errors of American Community Survey estimates, into any geostan model.
* **Missing and Censored observations** Vital statistics and disease surveillance systems like CDC Wonder censor case counts that fall below a threshold number; geostan can model disease or mortality risk for small areas with censored observations or with missing observations.
* **The RStan ecosystem** Interfaces easily with many high-quality R packages for Bayesian modeling.
-* **Custom spatial models** Tools for building custom spatial models in Stan.
+* **Custom spatial models** Tools for building custom spatial or network models in Stan.
For public health research, geostan complements the [surveil](https://connordonegan.github.io/surveil/) R package for the study of time trends in disease incidence or mortality data.
@@ -78,33 +78,47 @@ library(geostan)
data(georgia)
```
-This has county population and mortality data by sex for ages 55-64, and for the period 2014-2018. As is common for public access data, some of the observations missing because the CDC has censored them.
+This has county population and mortality data by sex for ages 55-64, and for the period 2014-2018. As is common for public access data, some of the observations missing because the CDC has censored them to protect privacy.
-The `sp_diag` function provides visual summaries of spatial data, including a histogram, Moran scatter plot, and map. Here is a visual summary of crude female mortality rates (as deaths per 10,000):
+The `sp_diag` function provides visual summaries of spatial data, including a histogram, Moran scatter plot, and map. The Moran scatter plot displays the values against a summary of their neighboring values, so that the slope of the regression line gives a measure of their degree of autocorrelation.
+
+Here is a quick visual summary of crude female mortality rates (as deaths per 10,000):
```{r fig.width = 8}
-A <- shape2mat(georgia, style = "B")
+# create adjacency matrix ("B" is for binary)
+C <- shape2mat(georgia, style = "B")
+
+# crude mortality rate per 10,000 at risk
mortality_rate <- georgia$rate.female * 10e3
-sp_diag(mortality_rate, georgia, w = A)
+
+# quick spatial diagnostics
+sp_diag(mortality_rate, georgia, w = C, name = "Mortality")
```
-The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. These models are used for estimating disease risk in small areas like counties, and for analyzing covariation of health outcomes with other area qualities. The R syntax for fitting the models is similar to using `lm` or `glm`. We provide the population at risk (the denominator for mortality rates) as an offset term, using the log-transform. In this case, three of the observations are missing because they have been censored; per CDC criteria, this means that there were 9 or fewer deaths in those counties. By using the `censor_point` argument and setting it to `censor_point = 9`, the model will account for the censoring process when providing estimates of the mortality rates:
+Mortality rates and other health statistics for counties are, in many cases, highly unstable estimates that cannot be relied upon for public advisories or inference (due to small population sizes). Hence, we need models to make inferences from small area data.
+
+The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. These models are used for estimating disease risk in small areas like counties, and for analyzing covariation of health outcomes with other area variables. The R syntax for fitting the models is similar to using `lm` or `glm`. We provide the population at risk (the denominator for mortality rates) as an offset term, using the log-transform.
+
+In our Georgia mortality data, three of the observations are missing because they have been censored; per CDC criteria, this means that there were 9 or fewer deaths in those counties. By using the `censor_point` argument and setting it to `censor_point = 9`, we can easily obtain estimates for the censored counties (along with all the others) using models account for the censoring process:
```{r}
-cars <- prep_car_data(A)
+# prepare a list of data for CAR models in Stan
+cars <- prep_car_data(C)
+
+# fit the model to female mortality rates
fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)),
censor_point = 9,
data = georgia,
car_parts = cars,
family = poisson(),
- cores = 4, # for multi-core processing
- refresh = 0) # to silence some printing
+ iter = 1e3, # no. MCMC samples
+ quiet = TRUE) # to silence printing
```
Passing a fitted model to the `sp_diag` function will return a set of diagnostics for spatial models:
```{r fig.width = 8}
-sp_diag(fit, georgia, w = A)
+sp_diag(fit, georgia)
```
The `print` method returns a summary of the probability distributions for model parameters, as well as Markov chain Monte Carlo (MCMC) diagnostics from Stan (Monte Carlo standard errors of the mean `se_mean`, effective sample size `n_eff`, and the R-hat statistic `Rhat`):
@@ -113,17 +127,97 @@ The `print` method returns a summary of the probability distributions for model
print(fit)
```
-Applying the `fitted` method to the fitted model will return the fitted values from the model - in this case, the fitted values are the estimates of the county mortality rates. Multiplying them by 10,000 gives mortality rate per 10,000 at risk:
+To extract estimates of the county mortality rates from this, we apply the `fitted` method - in this case, the fitted values from the model are the estimates of the county mortality rates. Multiplying them by 10,000 gives mortality rate per 10,000 at risk:
```{r}
+# mortality rates per 10,000 at risk
mortality_est <- fitted(fit) * 10e3
+
+# display rates with county names
county_name <- georgia$NAME
head( cbind(county_name, mortality_est) )
```
-The mortality estimates are stored in the column named "mean", and the limits of the 95\% credible interval are found in the columns "2.5%" and "97.5%".
+The mortality estimates are stored in the column named "mean", and the limits of the 95\% credible interval are found in the columns "2.5%" and "97.5%". Here we create a map of estimates (with some help from `sf` package):
+
+```{r fig.width = 7, fig.height = 4.5}
+library(sf)
+
+# put estimates into bins for map colors
+x <- mortality_est$mean
+brks <- quantile(x, probs = c(0, 0.2, 0.4, 0.6, 0.8, 1))
+est_cut <- cut(x, breaks = brks, include.lowest = TRUE)
+
+# assign colors to values
+rank <- as.numeric( est_cut )
+pal_fun <- colorRampPalette( c("#5D74A5FF", "gray90", "#A8554EFF") )
+pal <- pal_fun( max(rank) )
+colors <- pal[ rank ]
+
+# set plot margins
+og=par(mar=rep(1, 4))
+
+# get boundaries
+geom <- sf::st_geometry(georgia)
+
+# map estimates
+plot(geom,
+ lwd = 0.2,
+ col = colors)
+
+# legend
+legend("right",
+ fill = pal,
+ title = 'Mortality per 10,000',
+ legend = levels(est_cut),
+ bty = 'n'
+)
+
+mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 3, font = 2)
+# reset margins
+par(og)
+```
+
+Using the credible intervals, we can complement our map with a point-interval plot:
+
+```{r fig.height = 6.5}
+# order counties by mortality rate
+index <- order(mortality_est$mean, decreasing = TRUE)
+dat <- mortality_est[index, ]
+
+# gather estimate with credible interval (95%)
+est <- dat$mean
+lwr <- dat$`2.5%`
+upr <- dat$`97.5%`
+y <- seq_along(county_name)
+x_lim <- c(min(lwr), max(upr)) |>
+ round()
+
+og=par(mar = c(3, 0, 0, 0))
+
+# points
+plot(est,
+ y,
+ pch = 5,
+ col = 'gray50',
+ bty = 'L',
+ axes = FALSE,
+ xlim = x_lim,
+ ylab = NA,
+ xlab = NA)
+
+# intervals
+segments(x0 = lwr, x1 = upr,
+ y0 = y, y1 = y,
+ col = colors[ index ])
+
+# x axis
+axis(1, at = seq(x_lim[1], x_lim[2], by = 20))
+mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 1, line = 2)
+par(og)
+```
-Details and demonstrations can be found in the package [help pages](https://connordonegan.github.io/geostan/reference/index.html) and [vignettes](https://connordonegan.github.io/geostan/articles/index.html).
+More details and demonstrations can be found in the package [help pages](https://connordonegan.github.io/geostan/reference/index.html) and [vignettes](https://connordonegan.github.io/geostan/articles/index.html).
## Citing geostan
diff --git a/README.html b/README.html
index ed8a0bf3..f1bae065 100644
--- a/README.html
+++ b/README.html
@@ -610,12 +610,12 @@ geostan: Bayesian spatial analysisIntroductions to the software can be found at r-spatial.org and in the package vignettes.
Features include:
-- Disease mapping and spatial regression Statistical models for data recorded across areal units like states, counties, or census tracts.
+- Spatial regression and disease mapping Statistical models for data recorded across areal units (states, counties, or census tracts) or networks, including spatial econometric models.
- Spatial analysis tools Tools for visualizing and measuring spatial autocorrelation and map patterns, for exploratory analysis and model diagnostics.
- Observational uncertainty Incorporate information on data reliability, such as standard errors of American Community Survey estimates, into any geostan model.
- Missing and Censored observations Vital statistics and disease surveillance systems like CDC Wonder censor case counts that fall below a threshold number; geostan can model disease or mortality risk for small areas with censored observations or with missing observations.
- The RStan ecosystem Interfaces easily with many high-quality R packages for Bayesian modeling.
-- Custom spatial models Tools for building custom spatial models in Stan.
+- Custom spatial models Tools for building custom spatial or network models in Stan.
For public health research, geostan complements the surveil R package for the study of time trends in disease incidence or mortality data.
Installation
@@ -642,103 +642,174 @@ Usage
Load the package and the georgia
county mortality data set:
library(geostan)
data(georgia)
-This has county population and mortality data by sex for ages 55-64, and for the period 2014-2018. As is common for public access data, some of the observations missing because the CDC has censored them.
-The sp_diag
function provides visual summaries of spatial data, including a histogram, Moran scatter plot, and map. Here is a visual summary of crude female mortality rates (as deaths per 10,000):
-A <- shape2mat(georgia, style = "B")
-#> Contiguity condition: queen
-#> Number of neighbors per unit, summary:
-#> Min. 1st Qu. Median Mean 3rd Qu. Max.
-#> 1.000 4.000 5.000 5.409 6.000 10.000
-#>
-#> Spatial weights, summary:
-#> Min. 1st Qu. Median Mean 3rd Qu. Max.
-#> 1 1 1 1 1 1
-mortality_rate <- georgia$rate.female * 10e3
-sp_diag(mortality_rate, georgia, w = A)
-#> 3 NA values found in x will be dropped from data x and matrix w
-#> Warning: Removed 3 rows containing non-finite outside the scale
-#> range (`stat_bin()`).
-
+This has county population and mortality data by sex for ages 55-64, and for the period 2014-2018. As is common for public access data, some of the observations missing because the CDC has censored them to protect privacy.
+The sp_diag
function provides visual summaries of spatial data, including a histogram, Moran scatter plot, and map. The Moran scatter plot displays the values against a summary of their neighboring values, so that the slope of the regression line gives a measure of their degree of autocorrelation.
+Here is a quick visual summary of crude female mortality rates (as deaths per 10,000):
+# create adjacency matrix ("B" is for binary)
+C <- shape2mat(georgia, style = "B")
+#> Contiguity condition: queen
+#> Number of neighbors per unit, summary:
+#> Min. 1st Qu. Median Mean 3rd Qu. Max.
+#> 1.000 4.000 5.000 5.409 6.000 10.000
+#>
+#> Spatial weights, summary:
+#> Min. 1st Qu. Median Mean 3rd Qu. Max.
+#> 1 1 1 1 1 1
+
+# crude mortality rate per 10,000 at risk
+mortality_rate <- georgia$rate.female * 10e3
+
+# quick spatial diagnostics
+sp_diag(mortality_rate, georgia, w = C, name = "Mortality")
+#> 3 NA values found in x will be dropped from data x and from matrix w (nb: this disrupts row-standardization of w)
+#> Warning: Removed 3 rows containing non-finite outside the scale range
+#> (`stat_bin()`).
+
-The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. These models are used for estimating disease risk in small areas like counties, and for analyzing covariation of health outcomes with other area qualities. The R syntax for fitting the models is similar to using lm
or glm
. We provide the population at risk (the denominator for mortality rates) as an offset term, using the log-transform. In this case, three of the observations are missing because they have been censored; per CDC criteria, this means that there were 9 or fewer deaths in those counties. By using the censor_point
argument and setting it to censor_point = 9
, the model will account for the censoring process when providing estimates of the mortality rates:
-cars <- prep_car_data(A)
-#> Range of permissible rho values: -1.661, 1
-fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)),
- censor_point = 9,
- data = georgia,
- car_parts = cars,
- family = poisson(),
- cores = 4, # for multi-core processing
- refresh = 0) # to silence some printing
-#> 3 NA values identified in the outcome variable
-#> Found in rows: 55, 126, 157
-#>
-#> *Setting prior parameters for intercept
-#> Distribution: normal
-#> location scale
-#> 1 -4.7 5
-#>
-#> *Setting prior for CAR scale parameter (car_scale)
-#> Distribution: student_t
-#> df location scale
-#> 1 10 0 3
-#>
-#> *Setting prior for CAR spatial autocorrelation parameter (car_rho)
-#> Distribution: uniform
-#> lower upper
-#> 1 -1.7 1
+Mortality rates and other health statistics for counties are, in many cases, highly unstable estimates that cannot be relied upon for public advisories or inference (due to small population sizes). Hence, we need models to make inferences from small area data.
+The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. These models are used for estimating disease risk in small areas like counties, and for analyzing covariation of health outcomes with other area variables. The R syntax for fitting the models is similar to using lm
or glm
. We provide the population at risk (the denominator for mortality rates) as an offset term, using the log-transform.
+In our Georgia mortality data, three of the observations are missing because they have been censored; per CDC criteria, this means that there were 9 or fewer deaths in those counties. By using the censor_point
argument and setting it to censor_point = 9
, we can easily obtain estimates for the censored counties (along with all the others) using models account for the censoring process:
+# prepare a list of data for CAR models in Stan
+cars <- prep_car_data(C)
+#> Range of permissible rho values: -1.661, 1
+
+# fit the model to female mortality rates
+fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)),
+ censor_point = 9,
+ data = georgia,
+ car_parts = cars,
+ family = poisson(),
+ iter = 1e3, # no. MCMC samples
+ quiet = TRUE) # to silence printing
+#> 3 NA values identified in the outcome variable
+#> Found in rows: 55, 126, 157
Passing a fitted model to the sp_diag
function will return a set of diagnostics for spatial models:
-sp_diag(fit, georgia, w = A)
-#> Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.
-#> 3 NA values found in x will be dropped from data x and matrix w
-#> Warning: Removed 3 rows containing missing values or values
-#> outside the scale range (`geom_pointrange()`).
-
+sp_diag(fit, georgia)
+#> 3 NA values found in x will be dropped from data x and from matrix w (nb: this disrupts row-standardization of w)
+#> Warning: Removed 3 rows containing missing values or values outside the
+#> scale range (`geom_pointrange()`).
+
The print
method returns a summary of the probability distributions for model parameters, as well as Markov chain Monte Carlo (MCMC) diagnostics from Stan (Monte Carlo standard errors of the mean se_mean
, effective sample size n_eff
, and the R-hat statistic Rhat
):
print(fit)
#> Spatial Model Results
#> Formula: deaths.female ~ offset(log(pop.at.risk.female))
-#> Spatial method (outcome): CAR
-#> Likelihood function: poisson
-#> Link function: log
-#> Residual Moran Coefficient: 0.0011525
-#> WAIC: 1227.47
-#> Observations: 156
-#> Data models (ME): none
-#> Inference for Stan model: foundation.
-#> 4 chains, each with iter=2000; warmup=1000; thin=1;
-#> post-warmup draws per chain=1000, total post-warmup draws=4000.
-#>
-#> mean se_mean sd 2.5% 20% 50% 80% 97.5% n_eff Rhat
-#> intercept -4.674 0.002 0.089 -4.849 -4.730 -4.674 -4.621 -4.505 2362 1.000
-#> car_rho 0.923 0.001 0.058 0.778 0.879 0.937 0.973 0.995 3319 1.000
-#> car_scale 0.458 0.001 0.036 0.395 0.428 0.456 0.488 0.534 3618 0.999
-#>
-#> Samples were drawn using NUTS(diag_e) at Tue Sep 17 16:44:56 2024.
-#> For each parameter, n_eff is a crude measure of effective sample size,
-#> and Rhat is the potential scale reduction factor on split chains (at
-#> convergence, Rhat=1).
-Applying the fitted
method to the fitted model will return the fitted values from the model - in this case, the fitted values are the estimates of the county mortality rates. Multiplying them by 10,000 gives mortality rate per 10,000 at risk:
-mortality_est <- fitted(fit) * 10e3
-county_name <- georgia$NAME
-head( cbind(county_name, mortality_est) )
-#> county_name mean sd 2.5% 20% 50%
-#> fitted[1] Crisp 101.48785 9.604829 83.99009 93.31163 101.17610
-#> fitted[2] Candler 136.99885 15.905146 109.27395 123.11823 136.31355
-#> fitted[3] Barrow 94.25470 6.071597 82.80270 89.20105 94.16678
-#> fitted[4] DeKalb 59.76214 1.579194 56.72962 58.44624 59.75766
-#> fitted[5] Columbia 53.33958 3.257549 47.19615 50.56654 53.28387
-#> fitted[6] Cobb 54.12983 1.498260 51.24933 52.85101 54.10133
-#> 80% 97.5%
-#> fitted[1] 109.30723 121.16598
-#> fitted[2] 150.17348 169.77611
-#> fitted[3] 99.19399 106.44508
-#> fitted[4] 61.07091 62.86805
-#> fitted[5] 56.08790 59.78086
-#> fitted[6] 55.42278 57.02966
-The mortality estimates are stored in the column named “mean”, and the limits of the 95% credible interval are found in the columns “2.5%” and “97.5%”.
-Details and demonstrations can be found in the package help pages and vignettes.
+#> Likelihood: poisson
+#> Link: log
+#> Spatial method: CAR
+#> Residual Moran Coefficient: -0.0031375
+#> Observations: 156
+#>
+#> Inference for Stan model: foundation.
+#> 4 chains, each with iter=1000; warmup=500; thin=1;
+#> post-warmup draws per chain=500, total post-warmup draws=2000.
+#>
+#> mean se_mean sd 2.5% 20% 50% 80% 97.5% n_eff Rhat
+#> intercept -4.677 0.003 0.092 -4.871 -4.734 -4.677 -4.619 -4.513 861 1.004
+#> car_rho 0.924 0.001 0.058 0.784 0.883 0.936 0.973 0.996 1606 0.999
+#> car_scale 0.456 0.001 0.036 0.391 0.424 0.454 0.486 0.532 1899 1.002
+#>
+#> Samples were drawn using NUTS(diag_e) at Wed Nov 13 16:09:50 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split chains (at
+#> convergence, Rhat=1).
+To extract estimates of the county mortality rates from this, we apply the fitted
method - in this case, the fitted values from the model are the estimates of the county mortality rates. Multiplying them by 10,000 gives mortality rate per 10,000 at risk:
+# mortality rates per 10,000 at risk
+mortality_est <- fitted(fit) * 10e3
+
+# display rates with county names
+county_name <- georgia$NAME
+head( cbind(county_name, mortality_est) )
+#> county_name mean sd 2.5% 20% 50%
+#> fitted[1] Crisp 101.89147 9.784748 83.87184 93.37621 101.37227
+#> fitted[2] Candler 137.13841 15.643722 108.50033 123.57906 136.51359
+#> fitted[3] Barrow 94.35971 6.364805 82.62612 88.73920 94.14574
+#> fitted[4] DeKalb 59.74315 1.575741 56.77068 58.33325 59.71677
+#> fitted[5] Columbia 53.34581 3.207432 47.33439 50.61504 53.26339
+#> fitted[6] Cobb 54.12259 1.495041 51.24262 52.86109 54.12912
+#> 80% 97.5%
+#> fitted[1] 110.47617 122.04006
+#> fitted[2] 150.26114 169.29720
+#> fitted[3] 99.74677 107.51136
+#> fitted[4] 61.13469 62.84502
+#> fitted[5] 56.05293 59.83064
+#> fitted[6] 55.38574 57.00559
+The mortality estimates are stored in the column named “mean”, and the limits of the 95% credible interval are found in the columns “2.5%” and “97.5%”. Here we create a map of estimates (with some help from sf
package):
+library(sf)
+
+# put estimates into bins for map colors
+x <- mortality_est$mean
+brks <- quantile(x, probs = c(0, 0.2, 0.4, 0.6, 0.8, 1))
+est_cut <- cut(x, breaks = brks, include.lowest = TRUE)
+
+# assign colors to values
+rank <- as.numeric( est_cut )
+pal_fun <- colorRampPalette( c("#5D74A5FF", "gray90", "#A8554EFF") )
+pal <- pal_fun( max(rank) )
+colors <- pal[ rank ]
+
+# set plot margins
+og=par(mar=rep(1, 4))
+
+# get boundaries
+geom <- sf::st_geometry(georgia)
+
+# map estimates
+plot(geom,
+ lwd = 0.2,
+ col = colors)
+
+# legend
+legend("right",
+ fill = pal,
+ title = 'Mortality per 10,000',
+ legend = levels(est_cut),
+ bty = 'n'
+)
+
+mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 3, font = 2)
+
+
+
+Using the credible intervals, we can complement our map with a point-interval plot:
+# order counties by mortality rate
+index <- order(mortality_est$mean, decreasing = TRUE)
+dat <- mortality_est[index, ]
+
+# gather estimate with credible interval (95%)
+est <- dat$mean
+lwr <- dat$`2.5%`
+upr <- dat$`97.5%`
+y <- seq_along(county_name)
+x_lim <- c(min(lwr), max(upr)) |>
+ round()
+
+og=par(mar = c(3, 0, 0, 0))
+
+# points
+plot(est,
+ y,
+ pch = 5,
+ col = 'gray50',
+ bty = 'L',
+ axes = FALSE,
+ xlim = x_lim,
+ ylab = NA,
+ xlab = NA)
+
+# intervals
+segments(x0 = lwr, x1 = upr,
+ y0 = y, y1 = y,
+ col = colors[ index ])
+
+# x axis
+axis(1, at = seq(x_lim[1], x_lim[2], by = 20))
+mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 1, line = 2)
+
+
+
+More details and demonstrations can be found in the package help pages and vignettes.
Citing geostan
If you use geostan in published work, please include a citation.
Donegan, Connor (2022) “geostan: An R package for Bayesian spatial analysis” The Journal of Open Source Software. 7, no. 79: 4716. https://doi.org/10.21105/joss.04716.
diff --git a/README.md b/README.md
index 7227b471..1cddde97 100644
--- a/README.md
+++ b/README.md
@@ -19,9 +19,9 @@ and in the package
Features include:
- - **Disease mapping and spatial regression** Statistical models for
- data recorded across areal units like states, counties, or census
- tracts.
+ - **Spatial regression and disease mapping** Statistical models for
+ data recorded across areal units (states, counties, or census
+ tracts) or networks, including spatial econometric models.
- **Spatial analysis tools** Tools for visualizing and measuring
spatial autocorrelation and map patterns, for exploratory analysis
and model diagnostics.
@@ -35,8 +35,8 @@ Features include:
observations.
- **The RStan ecosystem** Interfaces easily with many high-quality R
packages for Bayesian modeling.
- - **Custom spatial models** Tools for building custom spatial models
- in Stan.
+ - **Custom spatial models** Tools for building custom spatial or
+ network models in Stan.
For public health research, geostan complements the
[surveil](https://connordonegan.github.io/surveil/) R package for the
@@ -110,14 +110,21 @@ data(georgia)
This has county population and mortality data by sex for ages 55-64, and
for the period 2014-2018. As is common for public access data, some of
-the observations missing because the CDC has censored them.
+the observations missing because the CDC has censored them to protect
+privacy.
The `sp_diag` function provides visual summaries of spatial data,
-including a histogram, Moran scatter plot, and map. Here is a visual
-summary of crude female mortality rates (as deaths per 10,000):
+including a histogram, Moran scatter plot, and map. The Moran scatter
+plot displays the values against a summary of their neighboring values,
+so that the slope of the regression line gives a measure of their degree
+of autocorrelation.
+
+Here is a quick visual summary of crude female mortality rates (as
+deaths per 10,000):
``` r
-A <- shape2mat(georgia, style = "B")
+# create adjacency matrix ("B" is for binary)
+C <- shape2mat(georgia, style = "B")
#> Contiguity condition: queen
#> Number of neighbors per unit, summary:
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
@@ -126,66 +133,64 @@ A <- shape2mat(georgia, style = "B")
#> Spatial weights, summary:
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 1 1 1 1 1 1
+
+# crude mortality rate per 10,000 at risk
mortality_rate <- georgia$rate.female * 10e3
-sp_diag(mortality_rate, georgia, w = A)
-#> 3 NA values found in x will be dropped from data x and matrix w
-#> Warning: Removed 3 rows containing non-finite outside the scale
-#> range (`stat_bin()`).
+
+# quick spatial diagnostics
+sp_diag(mortality_rate, georgia, w = C, name = "Mortality")
+#> 3 NA values found in x will be dropped from data x and from matrix w (nb: this disrupts row-standardization of w)
+#> Warning: Removed 3 rows containing non-finite outside the scale range
+#> (`stat_bin()`).
```
+Mortality rates and other health statistics for counties are, in many
+cases, highly unstable estimates that cannot be relied upon for public
+advisories or inference (due to small population sizes). Hence, we need
+models to make inferences from small area data.
+
The following code fits a spatial conditional autoregressive (CAR) model
to female county mortality data. These models are used for estimating
disease risk in small areas like counties, and for analyzing covariation
-of health outcomes with other area qualities. The R syntax for fitting
+of health outcomes with other area variables. The R syntax for fitting
the models is similar to using `lm` or `glm`. We provide the population
at risk (the denominator for mortality rates) as an offset term, using
-the log-transform. In this case, three of the observations are missing
+the log-transform.
+
+In our Georgia mortality data, three of the observations are missing
because they have been censored; per CDC criteria, this means that there
were 9 or fewer deaths in those counties. By using the `censor_point`
-argument and setting it to `censor_point = 9`, the model will account
-for the censoring process when providing estimates of the mortality
-rates:
+argument and setting it to `censor_point = 9`, we can easily obtain
+estimates for the censored counties (along with all the others) using
+models account for the censoring process:
``` r
-cars <- prep_car_data(A)
+# prepare a list of data for CAR models in Stan
+cars <- prep_car_data(C)
#> Range of permissible rho values: -1.661, 1
+
+# fit the model to female mortality rates
fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)),
censor_point = 9,
data = georgia,
car_parts = cars,
family = poisson(),
- cores = 4, # for multi-core processing
- refresh = 0) # to silence some printing
+ iter = 1e3, # no. MCMC samples
+ quiet = TRUE) # to silence printing
#> 3 NA values identified in the outcome variable
#> Found in rows: 55, 126, 157
-#>
-#> *Setting prior parameters for intercept
-#> Distribution: normal
-#> location scale
-#> 1 -4.7 5
-#>
-#> *Setting prior for CAR scale parameter (car_scale)
-#> Distribution: student_t
-#> df location scale
-#> 1 10 0 3
-#>
-#> *Setting prior for CAR spatial autocorrelation parameter (car_rho)
-#> Distribution: uniform
-#> lower upper
-#> 1 -1.7 1
```
Passing a fitted model to the `sp_diag` function will return a set of
diagnostics for spatial models:
``` r
-sp_diag(fit, georgia, w = A)
-#> Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.
-#> 3 NA values found in x will be dropped from data x and matrix w
-#> Warning: Removed 3 rows containing missing values or values
-#> outside the scale range (`geom_pointrange()`).
+sp_diag(fit, georgia)
+#> 3 NA values found in x will be dropped from data x and from matrix w (nb: this disrupts row-standardization of w)
+#> Warning: Removed 3 rows containing missing values or values outside the
+#> scale range (`geom_pointrange()`).
```
@@ -200,58 +205,149 @@ diagnostics from Stan (Monte Carlo standard errors of the mean
print(fit)
#> Spatial Model Results
#> Formula: deaths.female ~ offset(log(pop.at.risk.female))
-#> Spatial method (outcome): CAR
-#> Likelihood function: poisson
-#> Link function: log
-#> Residual Moran Coefficient: 0.0011525
-#> WAIC: 1227.47
+#> Likelihood: poisson
+#> Link: log
+#> Spatial method: CAR
+#> Residual Moran Coefficient: -0.0031375
#> Observations: 156
-#> Data models (ME): none
+#>
#> Inference for Stan model: foundation.
-#> 4 chains, each with iter=2000; warmup=1000; thin=1;
-#> post-warmup draws per chain=1000, total post-warmup draws=4000.
+#> 4 chains, each with iter=1000; warmup=500; thin=1;
+#> post-warmup draws per chain=500, total post-warmup draws=2000.
#>
#> mean se_mean sd 2.5% 20% 50% 80% 97.5% n_eff Rhat
-#> intercept -4.674 0.002 0.089 -4.849 -4.730 -4.674 -4.621 -4.505 2362 1.000
-#> car_rho 0.923 0.001 0.058 0.778 0.879 0.937 0.973 0.995 3319 1.000
-#> car_scale 0.458 0.001 0.036 0.395 0.428 0.456 0.488 0.534 3618 0.999
+#> intercept -4.677 0.003 0.092 -4.871 -4.734 -4.677 -4.619 -4.513 861 1.004
+#> car_rho 0.924 0.001 0.058 0.784 0.883 0.936 0.973 0.996 1606 0.999
+#> car_scale 0.456 0.001 0.036 0.391 0.424 0.454 0.486 0.532 1899 1.002
#>
-#> Samples were drawn using NUTS(diag_e) at Tue Sep 17 16:44:56 2024.
+#> Samples were drawn using NUTS(diag_e) at Wed Nov 13 16:09:50 2024.
#> For each parameter, n_eff is a crude measure of effective sample size,
#> and Rhat is the potential scale reduction factor on split chains (at
#> convergence, Rhat=1).
```
-Applying the `fitted` method to the fitted model will return the fitted
-values from the model - in this case, the fitted values are the
-estimates of the county mortality rates. Multiplying them by 10,000
+To extract estimates of the county mortality rates from this, we apply
+the `fitted` method - in this case, the fitted values from the model are
+the estimates of the county mortality rates. Multiplying them by 10,000
gives mortality rate per 10,000 at risk:
``` r
+# mortality rates per 10,000 at risk
mortality_est <- fitted(fit) * 10e3
+
+# display rates with county names
county_name <- georgia$NAME
head( cbind(county_name, mortality_est) )
#> county_name mean sd 2.5% 20% 50%
-#> fitted[1] Crisp 101.48785 9.604829 83.99009 93.31163 101.17610
-#> fitted[2] Candler 136.99885 15.905146 109.27395 123.11823 136.31355
-#> fitted[3] Barrow 94.25470 6.071597 82.80270 89.20105 94.16678
-#> fitted[4] DeKalb 59.76214 1.579194 56.72962 58.44624 59.75766
-#> fitted[5] Columbia 53.33958 3.257549 47.19615 50.56654 53.28387
-#> fitted[6] Cobb 54.12983 1.498260 51.24933 52.85101 54.10133
+#> fitted[1] Crisp 101.89147 9.784748 83.87184 93.37621 101.37227
+#> fitted[2] Candler 137.13841 15.643722 108.50033 123.57906 136.51359
+#> fitted[3] Barrow 94.35971 6.364805 82.62612 88.73920 94.14574
+#> fitted[4] DeKalb 59.74315 1.575741 56.77068 58.33325 59.71677
+#> fitted[5] Columbia 53.34581 3.207432 47.33439 50.61504 53.26339
+#> fitted[6] Cobb 54.12259 1.495041 51.24262 52.86109 54.12912
#> 80% 97.5%
-#> fitted[1] 109.30723 121.16598
-#> fitted[2] 150.17348 169.77611
-#> fitted[3] 99.19399 106.44508
-#> fitted[4] 61.07091 62.86805
-#> fitted[5] 56.08790 59.78086
-#> fitted[6] 55.42278 57.02966
+#> fitted[1] 110.47617 122.04006
+#> fitted[2] 150.26114 169.29720
+#> fitted[3] 99.74677 107.51136
+#> fitted[4] 61.13469 62.84502
+#> fitted[5] 56.05293 59.83064
+#> fitted[6] 55.38574 57.00559
```
The mortality estimates are stored in the column named “mean”, and the
limits of the 95% credible interval are found in the columns “2.5%” and
-“97.5%”.
+“97.5%”. Here we create a map of estimates (with some help from `sf`
+package):
+
+``` r
+library(sf)
+
+# put estimates into bins for map colors
+x <- mortality_est$mean
+brks <- quantile(x, probs = c(0, 0.2, 0.4, 0.6, 0.8, 1))
+est_cut <- cut(x, breaks = brks, include.lowest = TRUE)
+
+# assign colors to values
+rank <- as.numeric( est_cut )
+pal_fun <- colorRampPalette( c("#5D74A5FF", "gray90", "#A8554EFF") )
+pal <- pal_fun( max(rank) )
+colors <- pal[ rank ]
+
+# set plot margins
+og=par(mar=rep(1, 4))
+
+# get boundaries
+geom <- sf::st_geometry(georgia)
+
+# map estimates
+plot(geom,
+ lwd = 0.2,
+ col = colors)
+
+# legend
+legend("right",
+ fill = pal,
+ title = 'Mortality per 10,000',
+ legend = levels(est_cut),
+ bty = 'n'
+)
+
+mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 3, font = 2)
+```
+
+
+
+``` r
+# reset margins
+par(og)
+```
+
+Using the credible intervals, we can complement our map with a
+point-interval plot:
+
+``` r
+# order counties by mortality rate
+index <- order(mortality_est$mean, decreasing = TRUE)
+dat <- mortality_est[index, ]
+
+# gather estimate with credible interval (95%)
+est <- dat$mean
+lwr <- dat$`2.5%`
+upr <- dat$`97.5%`
+y <- seq_along(county_name)
+x_lim <- c(min(lwr), max(upr)) |>
+ round()
+
+og=par(mar = c(3, 0, 0, 0))
+
+# points
+plot(est,
+ y,
+ pch = 5,
+ col = 'gray50',
+ bty = 'L',
+ axes = FALSE,
+ xlim = x_lim,
+ ylab = NA,
+ xlab = NA)
+
+# intervals
+segments(x0 = lwr, x1 = upr,
+ y0 = y, y1 = y,
+ col = colors[ index ])
+
+# x axis
+axis(1, at = seq(x_lim[1], x_lim[2], by = 20))
+mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 1, line = 2)
+```
+
+
+
+``` r
+par(og)
+```
-Details and demonstrations can be found in the package [help
+More details and demonstrations can be found in the package [help
pages](https://connordonegan.github.io/geostan/reference/index.html) and
[vignettes](https://connordonegan.github.io/geostan/articles/index.html).
diff --git a/docs/index.html b/docs/index.html
index 3592e456..34cb49ad 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -85,7 +85,7 @@