diff --git a/NAMESPACE b/NAMESPACE index c2289224..e0ed96a6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ S3method(log_lik,geostan_fit) S3method(plot,geostan_fit) S3method(predict,geostan_fit) S3method(print,geostan_fit) +S3method(print,impacts_slm) S3method(print,prior) S3method(residuals,geostan_fit) S3method(sp_diag,geostan_fit) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 7c9c0b5a..5cb85377 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -327,7 +327,7 @@ sp_diag.geostan_fit <- function(y, plot = TRUE, mc_style = c("scatter", "hist"), style = c("W", "B"), - w, + w = y$C, rates = TRUE, binwidth = function(x) 0.5 * stats::sd(x, na.rm = TRUE), size = 0.1, @@ -336,7 +336,7 @@ sp_diag.geostan_fit <- function(y, if (inherits(y$C, "Matrix") | inherits(y$C, "matrix")) { w <- y$C } else { - w <- shape2mat(shape, style = match.arg(style)) + w <- shape2mat(shape, style = match.arg(style), quiet = TRUE) } } mc_style <- match.arg(mc_style, c("scatter", "hist")) @@ -344,11 +344,11 @@ sp_diag.geostan_fit <- function(y, outcome <- y$data[,1] fits <- fitted(y, summary = TRUE, rates = rates) if (rates && y$family$family == "binomial") { - message("Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.") + #message("Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.") outcome <- outcome / (outcome + y$data[,2]) } if (rates && y$family$family == "poisson" && "offset" %in% c(colnames(y$data))) { - message("Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.") + #message("Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.") log.at.risk <- y$data[, "offset"] at.risk <- exp( log.at.risk ) outcome <- outcome / at.risk @@ -365,8 +365,11 @@ sp_diag.geostan_fit <- function(y, labs(x = "Observed", y = "Fitted") + theme_classic() + # map of marginal residuals - marginal_residual <- apply(residuals(y, summary = FALSE, rates = rates, ...), 2, mean, na.rm = TRUE) + R <- residuals(y, summary = FALSE, rates = rates, ...) + marginal_residual <- apply(R, 2, mean, na.rm = TRUE) + map.y <- ggplot(shape) + geom_sf(aes(fill = marginal_residual), lwd = .05, @@ -374,8 +377,8 @@ sp_diag.geostan_fit <- function(y, scale_fill_gradient2(name = name, label = signs::signs) + theme_void() + # residual autocorrelation - R <- residuals(y, summary = FALSE, rates = rates, ...) R.mc <- apply(R, 1, mc, w = w, warn = FALSE, na.rm = TRUE) if (mc_style == "scatter") { g.mc <- moran_plot(marginal_residual, w, xlab = name, na.rm = TRUE) @@ -391,9 +394,11 @@ sp_diag.geostan_fit <- function(y, x = "Residual MC", subtitle = paste0("MC (mean) = ", round(R.mc.mu, 2))) } + if (length(unique(R.mc)) == 1) { g.mc <- moran_plot(R[1,], w, xlab = name, na.rm = TRUE) - } + } + if (plot) { return( gridExtra::grid.arrange(ovf, g.mc, map.y, ncol = 3) ) } else { diff --git a/R/geostan_fit-methods.R b/R/geostan_fit-methods.R index 9413bcff..d0e5bc2e 100644 --- a/R/geostan_fit-methods.R +++ b/R/geostan_fit-methods.R @@ -55,17 +55,18 @@ print.geostan_fit <- function(x, print(x$re$formula) pars <- c(pars, "alpha_tau") } - + cat("Likelihood: ", x$family$family, "\n") + cat("Link: ", x$family$link, "\n") + meth <- x$spatial$method if (meth == "SAR") meth <- paste0(meth, " (", x$sar_type, ")") - cat("Spatial method (outcome): ", as.character(meth), "\n") + cat("Spatial method: ", as.character(meth), "\n") if (x$spatial$method == "CAR") pars <- c(pars, "car_rho", "car_scale") if (x$spatial$method == "SAR") pars <- c(pars, "sar_rho", "sar_scale") if (x$spatial$method == "BYM2") pars <- c(pars, "rho", "spatial_scale") if (x$spatial$method == "BYM") pars <- c(pars, "spatial_scale", "theta_scale") if (x$spatial$method == "ICAR") pars <- c(pars, "spatial_scale") - cat("Likelihood function: ", x$family$family, "\n") - cat("Link function: ", x$family$link, "\n") + if (!is.null(x$diagnostic$Residual_MC)) cat("Residual Moran Coefficient: ", x$diagnostic$Residual_MC, "\n") cat("Observations: ", x$N, "\n") if (x$ME$has_me) { @@ -408,6 +409,9 @@ as.array.geostan_fit <- function(x, ...){ #' @param type By default, results from `predict` are on the scale of the linear predictor (`type = "link")`). The alternative (`type = "response"`) is on the scale of the response variable. For example, the default return values for a Poisson model on the log scale, and using `type = "response"` will return the original scale of the outcome variable (by exponentiating the log values). #' #' @param add_slx Logical. If `add_slx = TRUE`, any spatially-lagged covariates that were specified through the 'slx' argument (of the model fitting function, e.g., `stan_glm`) will be added to the linear predictor. The spatial lag terms will be calculated internally using `object$C`, the spatial weights matrix used to fit the model. Hence, `newdata` must have `N = object$N` rows. Predictions from spatial lag models (SAR models of type 'SLM' and 'SDLM') always include the SLX terms (i.e., any value passed to `add_slx` will be overwritten with `TRUE`). +#' @param approx For SAR models of type 'SLM' or 'SDLM' only; use an approximation for matrix inversion? See details below. +#' +#' @param K Number of matrix powers to use with \code{approx}. #' #' @param ... Not used #' @@ -415,21 +419,32 @@ as.array.geostan_fit <- function(x, ...){ #' #' The primary purpose of the predict method is to explore marginal effects of covariates. #' -#' The model formula will be taken from `object$formula`, and then a model matrix will be created by passing `newdata` to the \link[stats]{model.frame} function (as in: \code{model.frame(newdata, object$formula}). Parameters are taken from `as.matrix(object, pars = c("intercept", "beta"))`. If `add_slx = TRUE`, SLX coefficients will be taken from `as.matrix(object, pars = "gamma")`. +#' The model formula will be taken from `object$formula`, and then a model matrix will be created by passing `newdata` to the \link[stats]{model.frame} function (as in: \code{model.frame(object$formula, newdata)}. Parameters are taken from \code{as.matrix(object, pars = c("intercept", "beta"))}. #' -#' Spatially-lagged covariates added via the `slx` argument ('spillover effects') will be included if `add_slx = TRUE` or if a spatial lag model is provided (a SAR model of type 'SLM' or 'SDLM'). In either of those cases, `newdata` must have the same number of rows as were used to fit the original data. For details on these 'spillover effects', see LeSage and Pace (2009) and LeSage (2014). +#' ## Spatial lag of X +#' +#' Spatially-lagged covariates which were included via the `slx` argument will, by default, not be included. They will be be included in predictions if `add_slx = TRUE` or if the fitted model is a SAR model of type 'SLM' or 'SDLM'. In either of those cases, `newdata` must have the same number of rows as were used to fit the original data. +#' +#' ## Spatial lag of Y +#' +#' The typical 'marginal effect' interpretation of the regression coefficients does not hold for the SAR models of type 'SLM' or 'SDLM'. For details on these 'spillover effects', see LeSage and Pace (2009), LeSage (2014), and `geostan::impacts`. #' #' Predictions for the spatial lag model (SAR models of type 'SLM') are equal to: #' \deqn{ #' (I - \rho W)^{-1} X \beta #' } -#' where \eqn{X \beta} contains the intercept and covariates. (For intercept-only models, the above term is equal to the constant intercept.) Predictions for the spatial Durbin lag model (SAR models of type 'SDLM') are equal to: +#' where \eqn{X \beta} contains the intercept and covariates. Predictions for the spatial Durbin lag model (SAR models of type 'SDLM') are equal to: #' \deqn{ #' (I - \rho W)^{-1} (X \beta + WX \gamma) #' } -#' where \eqn{WX \gamma} are spatially lagged covariates multiplied by their coefficients. The SLM and SDLM differ from all other model types in that the spatial component of the model cannot be separated from the linear predictor and is, therefore, automatically incorporated into the predictions. +#' where \eqn{WX \gamma} are spatially lagged covariates multiplied by their coefficients. +#' +#' The inverse of the matrix \eqn{(I - \rho W)} can be time consuming to compute (especially when iterating over MCMC samples). You can use `approx = TRUE` to approximate the inverse using a series of matrix powers. The argument \eqn{K} controls how many powers to use for the approximation. As a rule, higher values of \eqn{\rho} require larger \eqn{K}. Notice that \eqn{\rho^K} should be close to zero for the approximation to hold. For example, for \eqn{\rho = .5} a value of \eqn{K=8} may suffice (eqn{0.5^8 = 0.004}), but larger values of \eqn{\rho} require higher values of \eqn{K}. +#' +#' +#' ## Generalized linear models #' -#' In generalized linear models (such as Poisson and Binomial models) marginal effects plots on the response scale may be sensitive to the level of other covariates in the model and to location. If the model includes a spatial autocorrelation component (for example, you used a spatial CAR, SAR (error model), or ESF model), by default these terms will be fixed at zero for the purposes of calculating marginal effects. If you want to change this, you can introduce spatial trend values by specifying a varying intercept using the `alpha` argument. +#' In generalized linear models (such as Poisson and Binomial models) marginal effects plots on the response scale may be sensitive to the level of other covariates in the model and to geographic location. If the model includes a spatial autocorrelation component (for example, you used a spatial CAR, SAR, or ESF model), by default these terms will be fixed at zero for the purposes of calculating marginal effects. If you want to change this, you can introduce spatial trend values by specifying a varying intercept using the `alpha` argument. #' #' @return #' @@ -441,16 +456,17 @@ as.array.geostan_fit <- function(x, ...){ #' #' fit <- stan_glm(deaths.male ~ offset(log(pop.at.risk.male)) + log(income), #' data = georgia, +#' re = ~ GEOID, #' centerx = TRUE, #' family = poisson(), #' chains = 2, iter = 600) # for speed only #' -#' # note: pop.at.risk.male=1 leads to log(pop.at.risk.male)=0 +#' # note: pop.at.risk.male=1 leads to offset of log(pop.at.risk.male)=0 #' # so that the predicted values are rates #' newdata <- data.frame( #' income = seq(min(georgia$income), #' max(georgia$income), -#' length.out = 100), +#' length.out = 200), #' pop.at.risk.male = 1) #' #' preds <- predict(fit, newdata, type = "response") @@ -465,12 +481,35 @@ as.array.geostan_fit <- function(x, ...){ #' newdata$pop.at.risk.male <- 10e3 #' preds <- predict(fit, newdata, type = "response") #' head(preds) +#' +#' # plot range +#' y_lim <- c(min(preds$`2.5%`), max(preds$`97.5%`)) +#' +#' # plot line #' plot(preds$income, #' preds$mean, -#' type = "l", +#' type = "l", #' ylab = "Deaths per 10,000", -#' xlab = "Income ($1,000s)") -#' +#' xlab = "Income ($1,000s)", +#' ylim = y_lim, +#' axes = FALSE) +#' +#' # add shaded cred. interval +#' x <- c(preds$income, rev(preds$income)) +#' y <- c(preds$`2.5%`, rev(preds$`97.5%`)) +#' polygon(x = x, y = y, +#' col = rgb(0.1, 0.2, 0.3, 0.3), +#' border = NA) +#' +#' # add axes +#' yat = seq(0, 300, by = 20) +#' axis(2, at = yat) +#' +#' xat = seq(0, 200, by = 10) +#' axis(1, at = xat) +#' +#' # show county incomes +#' rug(georgia$income) #' @source #' #' Goulard, Michael, Thibault Laurent, and Christine Thomas-Agnan (2017). About predictions in spatial autoregressive models: optimal and almost optimal strategies. *Spatial Economic Analysis* 12 (2-3): 304-325. @@ -490,6 +529,8 @@ predict.geostan_fit <- function(object, summary = TRUE, type = c("link", "response"), add_slx = FALSE, + approx = FALSE, + K = 15, ...) { type <- match.arg(type) if (missing(newdata)) return (fitted(object, summary = summary, ...)) @@ -511,11 +552,10 @@ predict.geostan_fit <- function(object, lag_y <- FALSE if (object$spatial$method == "SAR") { add_slx <- grepl("SDEM|SDLM", object$sar_type) - lay_y <- grepl("SLM|SDLM", object$sar_type) + lag_y <- grepl("SLM|SDLM", object$sar_type) } - if (add_slx) { - + if (add_slx) { stopifnot( nrow(newdata) == nrow(object$C) ) Gamma <- as.matrix(object, pars = "gamma") gnames <- gsub("^w.", "", colnames(Gamma)) @@ -524,8 +564,7 @@ predict.geostan_fit <- function(object, X_tmp <- as.matrix(X[, idx]) if (length(idx) != ncol(Gamma)) stop("Mis-match of SLX coefficient names and X column names.") WX <- W %*% X_tmp - for (m in 1:M) P[m,] <- as.numeric( P[m,] + WX %*% as.matrix(Gamma[m,]) ) - + for (m in 1:M) P[m,] <- as.numeric( P[m,] + WX %*% as.matrix(Gamma[m,]) ) } if (lag_y == TRUE) { @@ -537,10 +576,30 @@ predict.geostan_fit <- function(object, W <- object$C I <- Matrix::Diagonal(N) rho <- as.matrix(object, pars = "sar_rho")[,1] - - for (m in 1:M) { - IMRWI <- Matrix::solve(I - rho[m] * W) - P[m,] <- as.numeric( IMRWI %*% P[m,] ) + + if (approx) { + Q <- K + 1 + powers = 0:K + Mlist <- list() + Mlist[[1]] <- I + Mlist[[2]] <- W + W_k <- W + for (q in 3:Q) { + W_k <- W %*% W_k + Mlist[[q]] <- W_k + } + for (m in 1:M) { + rho_powers <- rho[ m ]^powers + Mpowers <- lapply(seq(Q), function(j) Mlist[[j]] * rho_powers[j]) + Multiplier <- Reduce(`+`, Mpowers) + P[ m, ] <- as.numeric( Multiplier %*% P[ m, ] ) + } + + } else { + for (m in 1:M) { + Multiplier <- Matrix::solve(I - rho[m] * W) + P[m,] <- as.numeric( Multiplier %*% P[m,] ) + } } } diff --git a/R/impacts.R b/R/impacts.R index 7ce67655..f84dfa54 100644 --- a/R/impacts.R +++ b/R/impacts.R @@ -87,13 +87,6 @@ spill <- function(beta, gamma = 0, rho, W, method = c('quick', 'proper'), K = 20 return( impacts_multiplier(beta, gamma, rho, T, K) ) } - # matrix powers: slower method of approximation - ## M_tmp <- I + rho * W - ## W_k <- W - ## for (j in 2:K) { - ## W_k <- W %*% W_k - ## M_tmp = M_tmp + rho^j * W_k - N <- nrow(W) I <- diag(rep(1, N)) imrw <- I - rho * W @@ -137,7 +130,6 @@ impacts <- function(object, method = c('quick', 'proper'), K = 20) { gamma_idx <- match( gsub("^w.", "", colnames(gamma)), Blabs ) for (j in seq_along(gamma_idx)) G[ , gamma_idx[j] ] <- gamma[ , j ] } - impax <- vector("list", length = M) @@ -145,8 +137,8 @@ impacts <- function(object, method = c('quick', 'proper'), K = 20) { for (m in 1:M) { impax[[m]] <- sapply(1:S, function(s) - spill(beta = B[s, m], - gamma = G[s, m], + spill(beta = as.numeric( B[s, m] ), + gamma = as.numeric( G[s, m] ), rho = rho[s], W = W, method, @@ -164,7 +156,7 @@ impacts <- function(object, method = c('quick', 'proper'), K = 20) { for (m in 1:M) { impax[[m]] <- sapply(1:S, function(s) - impacts_multiplier(B[s,m], G[s,m], rho[s], T, K)) |> + impacts_multiplier(as.numeric( B[s,m] ), as.numeric( G[s,m] ), rho[s], T, K)) |> t() } @@ -179,14 +171,31 @@ impacts <- function(object, method = c('quick', 'proper'), K = 20) { upr = as.numeric(apply(impax[[m]], 2, quantile, probs = 0.975)) res <- cbind(mean = est, median = est2, sd, lwr, upr) row.names(res) <- c('Direct', 'Indirect', 'Total') - summary[[m]] <- res - names(summary)[m] <- Blabs[m] - } + summary[[m]] <- res + } + + names(impax) <- Blabs + names(summary) <- Blabs + out <- list(summary = summary, samples = impax) + class(out) <- append("impacts_slm", class(out)) - return(list(summary = summary, samples = impax)) + return(out) } +#' @export +#' +#' @param x An object of class 'impacts_slm', as returned by `geostan::impacts` +#' +#' @param digits Round results to this many digits +#' +#' @param ... Additional arguments will be passed to `base::print` +#' +#' @rdname impacts +print.impacts_slm <- function(x, digits = 2, ...) { + print(x$summary, digits = digits, ...) +} + #' After LeSage and Pace 2009 pp. 114--115 #' @noRd impacts_multiplier <- function(beta, gamma, rho, T, K) { @@ -206,7 +215,7 @@ impacts_multiplier <- function(beta, gamma, rho, T, K) { # indirect indirect <- total - direct - return (c(direct = direct, indirect = indirect, total = total)) + retunr (c(direct = direct, indirect = indirect, total = total)) } #' diagonal entries of matrix powers e..g, diag( W^{20} ) diff --git a/R/moran.R b/R/moran.R index 4c835524..52887a27 100644 --- a/R/moran.R +++ b/R/moran.R @@ -40,13 +40,13 @@ mc <- function(x, w, digits = 3, warn = TRUE, na.rm = FALSE) { check_sa_data(x, w) na_idx <- which(is.na(x)) if (na.rm == TRUE && length(na_idx) > 0) { - if (warn) message(length(na_idx), " NA values found in x; they will be removed from the data before calculating the Moran coefficient. If matrix w was row-standardized, it may not longer be. You may want to use a binary connectivity matrix by using style = 'B' in shape2mat.") + if (warn) message(length(na_idx), " dropping NA values found in x (nb: this disrupts row-standardization of matrix w) ") x <- x[-na_idx] w <- w[-na_idx, -na_idx] } if (any(Matrix::rowSums(w) == 0)) { zero.idx <- which(Matrix::rowSums(w) == 0) - if (warn) message(length(zero.idx), " observations with no neighbors found. They will be removed from the data before calculating the Moran coefficient.") + if (warn) message(length(zero.idx), " dropping observations with zero neighbors ") x <- x[-zero.idx] w <- w[-zero.idx, -zero.idx] } @@ -102,11 +102,12 @@ moran_plot <- function(x, w, xlab = "x (centered)", ylab = "Spatial Lag", pch = if (length(na_idx) > 0) { if (na.rm == TRUE) { msg <- paste(length(na_idx), - "NA values found in x will be dropped from data x and matrix w" + "NA values found in x will be dropped from data x and from matrix w (nb: this disrupts row-standardization of w)" ) - if (!inherits(w, "ngCMatrix")) msg <- paste(msg, - "\n If you are using a row-standardized matrix: dropping the NA values means that some rows in matrix w do not sum to 1; you may want to remove missing observations manually and then make row-standardized w, or use a binary matrix (see ?shape2mat)." - ) + ## if (!inherits(w, "ngCMatrix")) msg <- paste(msg, + ## "\n If you are using a row-standardized matrix: dropping the NA values means that some rows in matrix w do not sum to 1; you may want to remove missing observations manually and then make row-standardized w, or use a binary matrix (see ?shape2mat)." + ## ) + message(msg) x <- x[-na_idx] w <- w[-na_idx, -na_idx] @@ -116,7 +117,7 @@ moran_plot <- function(x, w, xlab = "x (centered)", ylab = "Spatial Lag", pch = } if (any(Matrix::rowSums(w) == 0)) { zero.idx <- which(Matrix::rowSums(w) == 0) - message(length(zero.idx), " observations with no neighbors found. They will be dropped from the data before creating the Moran plot.") + message(length(zero.idx), " dropping observations with zero neighbors ") x <- x[-zero.idx] w <- w[-zero.idx, -zero.idx] } diff --git a/R/stan_car.R b/R/stan_car.R index 8714d49c..420e44e9 100644 --- a/R/stan_car.R +++ b/R/stan_car.R @@ -15,9 +15,9 @@ #' #' @param data A \code{data.frame} or an object coercible to a data frame by \code{as.data.frame} containing the model data. #' -#' @param car_parts A list of data for the CAR model, as returned by \code{\link[geostan]{prep_car_data}}. +#' @param car_parts A list of data for the CAR model, as returned by \code{\link[geostan]{prep_car_data}}. If not provided by the user, then \code{C} will automatically be passed to \code{prep_car_data} to create it. #' -#' @param C Optional spatial connectivity matrix which will be used to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms; it will automatically be row-standardized before calculating \code{slx} terms. See \code{\link[geostan]{shape2mat}}. +#' @param C Spatial connectivity matrix which will be used internally to create \code{car_parts} (if \code{car_parts} is missing); if the user provides an \code{slx} formula for the model, the required connectivity matrix will be taken from the \code{car_parts} list. See \code{\link[geostan]{shape2mat}}. #' #' @param family The likelihood function for the outcome variable. Current options are \code{auto_gaussian()}, \code{binomial(link = "logit")}, and \code{poisson(link = "log")}; if `family = gaussian()` is provided, it will automatically be converted to `auto_gaussian()`. #' @@ -195,8 +195,12 @@ #' C <- shape2mat(georgia, style = "B") #' cars <- prep_car_data(C) #' +#' # MCMC specs: set for purpose of demo speed +#' iter = 500 +#' chains = 2 +#' #' fit <- stan_car(log(rate.male) ~ 1, data = georgia, -#' car_parts = cars, iter = 400, quiet = TRUE) +#' car_parts = cars, iter = iter, chains = chains) #' #' # model diagnostics #' sp_diag(fit, georgia) @@ -207,7 +211,7 @@ #' car_parts = cars, #' data = georgia, #' family = poisson(), -#' iter = 400, quiet = TRUE) +#' iter = iter, chains = chains) #' #' # model diagnostics #' sp_diag(fit, georgia) @@ -235,7 +239,7 @@ #' data = georgia, #' car = dcars, #' family = poisson(), -#' iter = 400, quiet = TRUE) +#' iter = iter, chains = chains) #' #' sp_diag(Dfit, georgia, dcars$C) #' dic(Dfit); dic(fit) @@ -278,15 +282,18 @@ stan_car <- function(formula, check_car_parts(car_parts) stopifnot(car_parts$n == nrow(data)) if (quiet) refresh <- 0 - if (!missing(C)) { - stopifnot(inherits(C, "Matrix") | inherits(C, "matrix")) - stopifnot(all(dim(C) == nrow(data))) - } else { - C <- car_parts$C + C <- car_parts$C + + ## C [CAR: always take C from car_parts] + ## if (!missing(C)) { + ## stopifnot(inherits(C, "Matrix") | inherits(C, "matrix")) + ## stopifnot(all(dim(C) == nrow(data))) + ## } else { + ## C <- car_parts$C # if (car_parts$WCAR == 0) { # message("Since you did not provide C, calculation of residual SA and any spatial-lag of X terms will use the matrix found in car_parts$C.") # } - } + #} # zero-mean constraint parameterization car_parts$ZMP <- ifelse(missing(zmp), 0, zmp) @@ -346,12 +353,12 @@ stan_car <- function(formula, x_full <- xraw } else { stopifnot(inherits(slx, "formula")) - W <- C - if (!inherits(W, "sparseMatrix")) W <- as(W, "CsparseMatrix") - xrs <- Matrix::rowSums(W) - if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") + ## W <- C + ## if (!inherits(W, "sparseMatrix")) W <- as(W, "CsparseMatrix") + ## xrs <- Matrix::rowSums(W) + ## if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") # efficient transform to CRS representation for W.list (via transpose) - Wij <- as(W, "TsparseMatrix") + Wij <- as(C, "TsparseMatrix") Tw <- Matrix::sparseMatrix(i = Wij@j + 1, j = Wij@i + 1, x = Wij@x, @@ -359,7 +366,7 @@ stan_car <- function(formula, W.list <- list(w = Tw@x, v = Tw@i + 1, u = Tw@p + 1) - Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = W) + Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = C) dwx <- ncol(Wx) wx_idx <- as.array( which(paste0("w.", colnames(xraw)) %in% colnames(Wx)), dim = dwx ) x_full <- cbind(Wx, xraw) diff --git a/R/stan_esf.R b/R/stan_esf.R index c3198f94..184ac3ed 100644 --- a/R/stan_esf.R +++ b/R/stan_esf.R @@ -12,13 +12,13 @@ #' alpha_tau ~ Student_t(d.f., location, scale). #' ``` #' -#' @param C Spatial connectivity matrix which will be used to calculate eigenvectors, if `EV` is not provided by the user. Typically, the binary connectivity matrix is best for calculating eigenvectors (i.e., using `C = shape2mat(shape, style = "B")`). This matrix will also be used to calculate residual spatial autocorrelation and any user specified \code{slx} terms; it will be row-standardized before calculating \code{slx} terms. See \code{\link[geostan]{shape2mat}}. +#' @param C Spatial connectivity matrix. This will be used to calculate eigenvectors if `EV` is not provided by the user. See \code{\link[geostan]{shape2mat}}. Use of row-normalization (as in `\code{shape2mat(shape, 'W')} is not recommended for creating \code{EV}. Matrix \code{C} will also be used ('as is') to create any user-specified \code{slx} terms. #' #' @param nsa Include eigenvectors representing negative spatial autocorrelation? Defaults to \code{nsa = FALSE}. This is ignored if \code{EV} is provided. #' #' @param threshold Eigenvectors with standardized Moran coefficient values below this `threshold` value will be excluded from the candidate set of eigenvectors, `EV`. This defaults to \code{threshold = 0.25}, and is ignored if \code{EV} is provided. #' -#' @param EV A matrix of eigenvectors from any (transformed) connectivity matrix, presumably spatial (see \code{\link[geostan]{make_EV}}). If `EV` is provided, still also provide a spatial weights matrix \code{C} for other purposes; `threshold` and `nsa` are ignored for user provided `EV`. +#' @param EV A matrix of eigenvectors from any (transformed) connectivity matrix, presumably spatial or network-based (see \code{\link[geostan]{make_EV}}). If `EV` is provided, still also provide a spatial weights matrix \code{C} for other purposes; `threshold` and `nsa` are ignored for user provided `EV`. #' #' @param data A \code{data.frame} or an object coercible to a data frame by \code{as.data.frame} containing the model data. #' @@ -273,12 +273,11 @@ stan_esf <- function(formula, x_full <- xraw } else { stopifnot(inherits(slx, "formula")) - W <- row_standardize(C, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") - W.list <- rstan::extract_sparse_parts(W) - Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = W) + #W <- row_standardize(C, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") + W.list <- rstan::extract_sparse_parts(C) + Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = C) dwx <- ncol(Wx) wx_idx <- as.array( which(paste0("w.", colnames(xraw)) %in% colnames(Wx)), dim = dwx ) - x_full <- cbind(Wx, xraw) } } diff --git a/R/stan_glm.R b/R/stan_glm.R index 1bb81128..69db18be 100644 --- a/R/stan_glm.R +++ b/R/stan_glm.R @@ -19,7 +19,7 @@ #' #' @param data A \code{data.frame} or an object coercible to a data frame by \code{as.data.frame} containing the model data. #' -#' @param C Optional spatial connectivity matrix which will be used to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms; it will automatically be row-standardized before calculating \code{slx} terms. See \code{\link[geostan]{shape2mat}}. +#' @param C Spatial connectivity matrix which will be used to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms. See \code{\link[geostan]{shape2mat}}. #' #' @param family The likelihood function for the outcome variable. Current options are \code{poisson(link = "log")}, \code{binomial(link = "logit")}, \code{student_t()}, and the default \code{gaussian()}. #' @@ -392,8 +392,8 @@ stan_glm <- function(formula, stopifnot(inherits(slx, "formula")) W <- C if (!inherits(W, "sparseMatrix")) W <- as(W, "CsparseMatrix") - xrs <- Matrix::rowSums(W) - if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") + ## xrs <- Matrix::rowSums(W) + ## if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") # efficient transform to CRS representation for W.list (via transpose) Wij <- as(W, "TsparseMatrix") Tw <- Matrix::sparseMatrix(i = Wij@j + 1, diff --git a/R/stan_icar.R b/R/stan_icar.R index eb0858cf..591a5cf2 100644 --- a/R/stan_icar.R +++ b/R/stan_icar.R @@ -15,7 +15,7 @@ #' #' @param data A \code{data.frame} or an object coercible to a data frame by \code{as.data.frame} containing the model data. #' -#' @param C Spatial connectivity matrix which will be used to construct an edge list for the ICAR model, and to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms. It will automatically be row-standardized before calculating \code{slx} terms. \code{C} must be a binary symmetric \code{n x n} matrix. +#' @param C Spatial connectivity matrix which will be used to construct an edge list for the ICAR model, and to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms. It will automatically be row-standardized before calculating \code{slx} terms (matching the ICAR model). \code{C} must be a binary symmetric \code{n x n} matrix. #' #' @param type Defaults to "icar" (partial pooling of neighboring observations through parameter \code{phi}); specify "bym" to add a second parameter vector \code{theta} to perform partial pooling across all observations; specify "bym2" for the innovation introduced by Riebler et al. (2016). See \code{Details} for more information. #' @@ -338,7 +338,6 @@ stan_icar <- function(formula, Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = W) dwx <- ncol(Wx) wx_idx <- as.array( which(paste0("w.", colnames(xraw)) %in% colnames(Wx)), dim = dwx ) - x_full <- cbind(Wx, xraw) } } diff --git a/R/stan_sar.R b/R/stan_sar.R index bcc3a85f..0ef00c06 100644 --- a/R/stan_sar.R +++ b/R/stan_sar.R @@ -15,9 +15,9 @@ #' #' @param data A \code{data.frame} or an object coercible to a data frame by \code{as.data.frame} containing the model data. #' -#' @param C Spatial weights matrix (conventionally referred to as \eqn{W} in the SAR model). Typically, this will be created using `geostan::shape2mat(shape, style = "W")`. This will be passed internally to \code{\link[geostan]{prep_sar_data}}, and will also be used to calculate residual spatial autocorrelation as well as any user specified \code{slx} terms. See \code{\link[geostan]{shape2mat}}. -#' #' @param sar_parts List of data constructed by \code{\link[geostan]{prep_sar_data}}. If not provided, then `C` will automatically be passed to \code{\link[geostan]{prep_sar_data}} to create `sar_parts`. +#' +#' @param C Spatial connectivity matrix which will be used internally to create \code{sar_parts} (if \code{sar_parts} is missing); if the user provides an \code{slx} formula for the model, the required connectivity matrix will be taken from the \code{sar_parts} list. See \code{\link[geostan]{shape2mat}}. #' #' @param family The likelihood function for the outcome variable. Current options are \code{auto_gaussian()}, \code{binomial()} (with logit link function) and \code{poisson()} (with log link function); if `family = gaussian()` is provided, it will automatically be converted to `auto_gaussian()`. #' @@ -262,7 +262,7 @@ #' ## fit models #' ## #' -#' # DSEM +#' # SDEM #' # y = mu + rho*W*(y - mu) + epsilon #' # mu = beta*x + gamma*Wx #' fit_sdem <- stan_sar(y ~ x, data = dat, @@ -358,13 +358,15 @@ stan_sar <- function(formula, Durbin <- grepl('SDEM|SDLM', type) if(grepl("SLM|SDLM", type) & family$family != "auto_gaussian") stop("SLM/SDLM are only available as auto-normal models (not Poisson or binomial models).") #### SAR type [stop] - # C - if (!missing(C)) { - stopifnot(inherits(C, "Matrix") | inherits(C, "matrix")) - stopifnot(all(dim(C) == nrow(data))) - } else { - C <- sar_parts$W - } + # C [SAR: always take C from sar_parts] + C <- sar_parts$W + + ## if (!missing(C)) { + ## stopifnot(inherits(C, "Matrix") | inherits(C, "matrix")) + ## stopifnot(all(dim(C) == nrow(data))) + ## } else { + ## C <- sar_parts$W + #} # zero-mean constraint parameterization sar_parts$ZMP <- ifelse(missing(zmp), 0, zmp) @@ -432,15 +434,12 @@ stan_sar <- function(formula, } slx = formula[-2] } - - W <- C - if (!inherits(W, "sparseMatrix")) W <- as(W, "CsparseMatrix") - xrs <- Matrix::rowSums(W) - if (!all(xrs == 1)) W <- row_standardize(W, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") - - # nb: in stan_sar, W is taken from sar_parts, not calculated here. - Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = W, Durbin = Durbin) + ## xrs <- Matrix::rowSums(C) + ## if (!all(xrs == 1)) C <- row_standardize(C, warn = !quiet, msg = "Row standardizing matrix C for spatial lag of X calculations.") + # nb: in stan_sar, CSR rep of W is taken from sar_parts, not calculated here. + + Wx <- SLX(f = slx, DF = mod_frame, x = xraw, W = C, Durbin = Durbin) dwx <- ncol(Wx) wx_idx <- as.array( which(paste0("w.", colnames(xraw)) %in% colnames(Wx)), dim = dwx ) x_full <- cbind(Wx, xraw) diff --git a/README.Rmd b/README.Rmd index 166b4b88..d2b9627e 100644 --- a/README.Rmd +++ b/README.Rmd @@ -30,7 +30,7 @@ Features include: * **Observational uncertainty** Incorporate information on data reliability, such as standard errors of American Community Survey estimates, into any geostan model. * **Missing and Censored observations** Vital statistics and disease surveillance systems like CDC Wonder censor case counts that fall below a threshold number; geostan can model disease or mortality risk for small areas with censored observations or with missing observations. * **The RStan ecosystem** Interfaces easily with many high-quality R packages for Bayesian modeling. -* **Custom spatial models** Tools for building custom spatial models in Stan. +* **Custom spatial models** Tools for building custom spatial or network models in Stan. For public health research, geostan complements the [surveil](https://connordonegan.github.io/surveil/) R package for the study of time trends in disease incidence or mortality data. @@ -78,33 +78,47 @@ library(geostan) data(georgia) ``` -This has county population and mortality data by sex for ages 55-64, and for the period 2014-2018. As is common for public access data, some of the observations missing because the CDC has censored them. +This has county population and mortality data by sex for ages 55-64, and for the period 2014-2018. As is common for public access data, some of the observations missing because the CDC has censored them to protect privacy. -The `sp_diag` function provides visual summaries of spatial data, including a histogram, Moran scatter plot, and map. Here is a visual summary of crude female mortality rates (as deaths per 10,000): +The `sp_diag` function provides visual summaries of spatial data, including a histogram, Moran scatter plot, and map. The Moran scatter plot displays the values against a summary of their neighboring values, so that the slope of the regression line gives a measure of their degree of autocorrelation. + +Here is a quick visual summary of crude female mortality rates (as deaths per 10,000): ```{r fig.width = 8} -A <- shape2mat(georgia, style = "B") +# create adjacency matrix ("B" is for binary) +C <- shape2mat(georgia, style = "B") + +# crude mortality rate per 10,000 at risk mortality_rate <- georgia$rate.female * 10e3 -sp_diag(mortality_rate, georgia, w = A) + +# quick spatial diagnostics +sp_diag(mortality_rate, georgia, w = C, name = "Mortality") ``` -The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. These models are used for estimating disease risk in small areas like counties, and for analyzing covariation of health outcomes with other area qualities. The R syntax for fitting the models is similar to using `lm` or `glm`. We provide the population at risk (the denominator for mortality rates) as an offset term, using the log-transform. In this case, three of the observations are missing because they have been censored; per CDC criteria, this means that there were 9 or fewer deaths in those counties. By using the `censor_point` argument and setting it to `censor_point = 9`, the model will account for the censoring process when providing estimates of the mortality rates: +Mortality rates and other health statistics for counties are, in many cases, highly unstable estimates that cannot be relied upon for public advisories or inference (due to small population sizes). Hence, we need models to make inferences from small area data. + +The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. These models are used for estimating disease risk in small areas like counties, and for analyzing covariation of health outcomes with other area variables. The R syntax for fitting the models is similar to using `lm` or `glm`. We provide the population at risk (the denominator for mortality rates) as an offset term, using the log-transform. + +In our Georgia mortality data, three of the observations are missing because they have been censored; per CDC criteria, this means that there were 9 or fewer deaths in those counties. By using the `censor_point` argument and setting it to `censor_point = 9`, we can easily obtain estimates for the censored counties (along with all the others) using models account for the censoring process: ```{r} -cars <- prep_car_data(A) +# prepare a list of data for CAR models in Stan +cars <- prep_car_data(C) + +# fit the model to female mortality rates fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)), censor_point = 9, data = georgia, car_parts = cars, family = poisson(), - cores = 4, # for multi-core processing - refresh = 0) # to silence some printing + iter = 1e3, # no. MCMC samples + quiet = TRUE) # to silence printing ``` Passing a fitted model to the `sp_diag` function will return a set of diagnostics for spatial models: ```{r fig.width = 8} -sp_diag(fit, georgia, w = A) +sp_diag(fit, georgia) ``` The `print` method returns a summary of the probability distributions for model parameters, as well as Markov chain Monte Carlo (MCMC) diagnostics from Stan (Monte Carlo standard errors of the mean `se_mean`, effective sample size `n_eff`, and the R-hat statistic `Rhat`): @@ -113,17 +127,97 @@ The `print` method returns a summary of the probability distributions for model print(fit) ``` -Applying the `fitted` method to the fitted model will return the fitted values from the model - in this case, the fitted values are the estimates of the county mortality rates. Multiplying them by 10,000 gives mortality rate per 10,000 at risk: +To extract estimates of the county mortality rates from this, we apply the `fitted` method - in this case, the fitted values from the model are the estimates of the county mortality rates. Multiplying them by 10,000 gives mortality rate per 10,000 at risk: ```{r} +# mortality rates per 10,000 at risk mortality_est <- fitted(fit) * 10e3 + +# display rates with county names county_name <- georgia$NAME head( cbind(county_name, mortality_est) ) ``` -The mortality estimates are stored in the column named "mean", and the limits of the 95\% credible interval are found in the columns "2.5%" and "97.5%". +The mortality estimates are stored in the column named "mean", and the limits of the 95\% credible interval are found in the columns "2.5%" and "97.5%". Here we create a map of estimates (with some help from `sf` package): + +```{r fig.width = 7, fig.height = 4.5} +library(sf) + +# put estimates into bins for map colors +x <- mortality_est$mean +brks <- quantile(x, probs = c(0, 0.2, 0.4, 0.6, 0.8, 1)) +est_cut <- cut(x, breaks = brks, include.lowest = TRUE) + +# assign colors to values +rank <- as.numeric( est_cut ) +pal_fun <- colorRampPalette( c("#5D74A5FF", "gray90", "#A8554EFF") ) +pal <- pal_fun( max(rank) ) +colors <- pal[ rank ] + +# set plot margins +og=par(mar=rep(1, 4)) + +# get boundaries +geom <- sf::st_geometry(georgia) + +# map estimates +plot(geom, + lwd = 0.2, + col = colors) + +# legend +legend("right", + fill = pal, + title = 'Mortality per 10,000', + legend = levels(est_cut), + bty = 'n' +) + +mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 3, font = 2) +# reset margins +par(og) +``` + +Using the credible intervals, we can complement our map with a point-interval plot: + +```{r fig.height = 6.5} +# order counties by mortality rate +index <- order(mortality_est$mean, decreasing = TRUE) +dat <- mortality_est[index, ] + +# gather estimate with credible interval (95%) +est <- dat$mean +lwr <- dat$`2.5%` +upr <- dat$`97.5%` +y <- seq_along(county_name) +x_lim <- c(min(lwr), max(upr)) |> + round() + +og=par(mar = c(3, 0, 0, 0)) + +# points +plot(est, + y, + pch = 5, + col = 'gray50', + bty = 'L', + axes = FALSE, + xlim = x_lim, + ylab = NA, + xlab = NA) + +# intervals +segments(x0 = lwr, x1 = upr, + y0 = y, y1 = y, + col = colors[ index ]) + +# x axis +axis(1, at = seq(x_lim[1], x_lim[2], by = 20)) +mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 1, line = 2) +par(og) +``` -Details and demonstrations can be found in the package [help pages](https://connordonegan.github.io/geostan/reference/index.html) and [vignettes](https://connordonegan.github.io/geostan/articles/index.html). +More details and demonstrations can be found in the package [help pages](https://connordonegan.github.io/geostan/reference/index.html) and [vignettes](https://connordonegan.github.io/geostan/articles/index.html). ## Citing geostan diff --git a/README.html b/README.html index ed8a0bf3..f1bae065 100644 --- a/README.html +++ b/README.html @@ -610,12 +610,12 @@

geostan: Bayesian spatial analysisIntroductions to the software can be found at r-spatial.org and in the package vignettes.

Features include:

For public health research, geostan complements the surveil R package for the study of time trends in disease incidence or mortality data.

Installation

@@ -642,103 +642,174 @@

Usage

Load the package and the georgia county mortality data set:

library(geostan)
 data(georgia)
-

This has county population and mortality data by sex for ages 55-64, and for the period 2014-2018. As is common for public access data, some of the observations missing because the CDC has censored them.

-

The sp_diag function provides visual summaries of spatial data, including a histogram, Moran scatter plot, and map. Here is a visual summary of crude female mortality rates (as deaths per 10,000):

-
A <- shape2mat(georgia, style = "B")
-#> Contiguity condition: queen
-#> Number of neighbors per unit, summary:
-#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-#>   1.000   4.000   5.000   5.409   6.000  10.000
-#> 
-#> Spatial weights, summary:
-#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-#>       1       1       1       1       1       1
-mortality_rate <- georgia$rate.female * 10e3
-sp_diag(mortality_rate, georgia, w = A)
-#> 3 NA values found in x will be dropped from data x and matrix w
-#> Warning: Removed 3 rows containing non-finite outside the scale
-#> range (`stat_bin()`).
- +

This has county population and mortality data by sex for ages 55-64, and for the period 2014-2018. As is common for public access data, some of the observations missing because the CDC has censored them to protect privacy.

+

The sp_diag function provides visual summaries of spatial data, including a histogram, Moran scatter plot, and map. The Moran scatter plot displays the values against a summary of their neighboring values, so that the slope of the regression line gives a measure of their degree of autocorrelation.

+

Here is a quick visual summary of crude female mortality rates (as deaths per 10,000):

+
# create adjacency matrix ("B" is for binary)
+C <- shape2mat(georgia, style = "B")
+#> Contiguity condition: queen
+#> Number of neighbors per unit, summary:
+#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+#>   1.000   4.000   5.000   5.409   6.000  10.000
+#> 
+#> Spatial weights, summary:
+#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+#>       1       1       1       1       1       1
+
+# crude mortality rate per 10,000 at risk
+mortality_rate <- georgia$rate.female * 10e3
+
+# quick spatial diagnostics
+sp_diag(mortality_rate, georgia, w = C, name = "Mortality")
+#> 3 NA values found in x will be dropped from data x and from matrix w (nb: this disrupts row-standardization of w)
+#> Warning: Removed 3 rows containing non-finite outside the scale range
+#> (`stat_bin()`).
+ -

The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. These models are used for estimating disease risk in small areas like counties, and for analyzing covariation of health outcomes with other area qualities. The R syntax for fitting the models is similar to using lm or glm. We provide the population at risk (the denominator for mortality rates) as an offset term, using the log-transform. In this case, three of the observations are missing because they have been censored; per CDC criteria, this means that there were 9 or fewer deaths in those counties. By using the censor_point argument and setting it to censor_point = 9, the model will account for the censoring process when providing estimates of the mortality rates:

-
cars <- prep_car_data(A)
-#> Range of permissible rho values: -1.661, 1
-fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)),
-                censor_point = 9,
-        data = georgia,
-        car_parts = cars,
-        family = poisson(),
-        cores = 4, # for multi-core processing
-        refresh = 0) # to silence some printing
-#> 3 NA values identified in the outcome variable
-#> Found in rows: 55, 126, 157
-#> 
-#> *Setting prior parameters for intercept
-#> Distribution: normal
-#>   location scale
-#> 1     -4.7     5
-#> 
-#> *Setting prior for CAR scale parameter (car_scale)
-#> Distribution: student_t
-#>   df location scale
-#> 1 10        0     3
-#> 
-#> *Setting prior for CAR spatial autocorrelation parameter (car_rho)
-#> Distribution: uniform
-#>   lower upper
-#> 1  -1.7     1
+

Mortality rates and other health statistics for counties are, in many cases, highly unstable estimates that cannot be relied upon for public advisories or inference (due to small population sizes). Hence, we need models to make inferences from small area data.

+

The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. These models are used for estimating disease risk in small areas like counties, and for analyzing covariation of health outcomes with other area variables. The R syntax for fitting the models is similar to using lm or glm. We provide the population at risk (the denominator for mortality rates) as an offset term, using the log-transform.

+

In our Georgia mortality data, three of the observations are missing because they have been censored; per CDC criteria, this means that there were 9 or fewer deaths in those counties. By using the censor_point argument and setting it to censor_point = 9, we can easily obtain estimates for the censored counties (along with all the others) using models account for the censoring process:

+
# prepare a list of data for CAR models in Stan
+cars <- prep_car_data(C)
+#> Range of permissible rho values: -1.661, 1
+
+# fit the model to female mortality rates
+fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)),
+                censor_point = 9,
+        data = georgia,
+        car_parts = cars,
+        family = poisson(),
+        iter = 1e3, # no. MCMC samples
+        quiet = TRUE) # to silence printing
+#> 3 NA values identified in the outcome variable
+#> Found in rows: 55, 126, 157

Passing a fitted model to the sp_diag function will return a set of diagnostics for spatial models:

-
sp_diag(fit, georgia, w = A)
-#> Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.
-#> 3 NA values found in x will be dropped from data x and matrix w
-#> Warning: Removed 3 rows containing missing values or values
-#> outside the scale range (`geom_pointrange()`).
- +
sp_diag(fit, georgia)
+#> 3 NA values found in x will be dropped from data x and from matrix w (nb: this disrupts row-standardization of w)
+#> Warning: Removed 3 rows containing missing values or values outside the
+#> scale range (`geom_pointrange()`).
+

The print method returns a summary of the probability distributions for model parameters, as well as Markov chain Monte Carlo (MCMC) diagnostics from Stan (Monte Carlo standard errors of the mean se_mean, effective sample size n_eff, and the R-hat statistic Rhat):

print(fit)
 #> Spatial Model Results 
 #> Formula: deaths.female ~ offset(log(pop.at.risk.female))
-#> Spatial method (outcome):  CAR 
-#> Likelihood function:  poisson 
-#> Link function:  log 
-#> Residual Moran Coefficient:  0.0011525 
-#> WAIC:  1227.47 
-#> Observations:  156 
-#> Data models (ME): none
-#> Inference for Stan model: foundation.
-#> 4 chains, each with iter=2000; warmup=1000; thin=1; 
-#> post-warmup draws per chain=1000, total post-warmup draws=4000.
-#> 
-#>             mean se_mean    sd   2.5%    20%    50%    80%  97.5% n_eff  Rhat
-#> intercept -4.674   0.002 0.089 -4.849 -4.730 -4.674 -4.621 -4.505  2362 1.000
-#> car_rho    0.923   0.001 0.058  0.778  0.879  0.937  0.973  0.995  3319 1.000
-#> car_scale  0.458   0.001 0.036  0.395  0.428  0.456  0.488  0.534  3618 0.999
-#> 
-#> Samples were drawn using NUTS(diag_e) at Tue Sep 17 16:44:56 2024.
-#> For each parameter, n_eff is a crude measure of effective sample size,
-#> and Rhat is the potential scale reduction factor on split chains (at 
-#> convergence, Rhat=1).
-

Applying the fitted method to the fitted model will return the fitted values from the model - in this case, the fitted values are the estimates of the county mortality rates. Multiplying them by 10,000 gives mortality rate per 10,000 at risk:

-
mortality_est <- fitted(fit) * 10e3
-county_name <- georgia$NAME
-head( cbind(county_name, mortality_est) )
-#>           county_name      mean        sd      2.5%       20%       50%
-#> fitted[1]       Crisp 101.48785  9.604829  83.99009  93.31163 101.17610
-#> fitted[2]     Candler 136.99885 15.905146 109.27395 123.11823 136.31355
-#> fitted[3]      Barrow  94.25470  6.071597  82.80270  89.20105  94.16678
-#> fitted[4]      DeKalb  59.76214  1.579194  56.72962  58.44624  59.75766
-#> fitted[5]    Columbia  53.33958  3.257549  47.19615  50.56654  53.28387
-#> fitted[6]        Cobb  54.12983  1.498260  51.24933  52.85101  54.10133
-#>                 80%     97.5%
-#> fitted[1] 109.30723 121.16598
-#> fitted[2] 150.17348 169.77611
-#> fitted[3]  99.19399 106.44508
-#> fitted[4]  61.07091  62.86805
-#> fitted[5]  56.08790  59.78086
-#> fitted[6]  55.42278  57.02966
-

The mortality estimates are stored in the column named “mean”, and the limits of the 95% credible interval are found in the columns “2.5%” and “97.5%”.

-

Details and demonstrations can be found in the package help pages and vignettes.

+#> Likelihood: poisson +#> Link: log +#> Spatial method: CAR +#> Residual Moran Coefficient: -0.0031375 +#> Observations: 156 +#> +#> Inference for Stan model: foundation. +#> 4 chains, each with iter=1000; warmup=500; thin=1; +#> post-warmup draws per chain=500, total post-warmup draws=2000. +#> +#> mean se_mean sd 2.5% 20% 50% 80% 97.5% n_eff Rhat +#> intercept -4.677 0.003 0.092 -4.871 -4.734 -4.677 -4.619 -4.513 861 1.004 +#> car_rho 0.924 0.001 0.058 0.784 0.883 0.936 0.973 0.996 1606 0.999 +#> car_scale 0.456 0.001 0.036 0.391 0.424 0.454 0.486 0.532 1899 1.002 +#> +#> Samples were drawn using NUTS(diag_e) at Wed Nov 13 16:09:50 2024. +#> For each parameter, n_eff is a crude measure of effective sample size, +#> and Rhat is the potential scale reduction factor on split chains (at +#> convergence, Rhat=1). +

To extract estimates of the county mortality rates from this, we apply the fitted method - in this case, the fitted values from the model are the estimates of the county mortality rates. Multiplying them by 10,000 gives mortality rate per 10,000 at risk:

+
# mortality rates per 10,000 at risk
+mortality_est <- fitted(fit) * 10e3
+
+# display rates with county names
+county_name <- georgia$NAME
+head( cbind(county_name, mortality_est) )
+#>           county_name      mean        sd      2.5%       20%       50%
+#> fitted[1]       Crisp 101.89147  9.784748  83.87184  93.37621 101.37227
+#> fitted[2]     Candler 137.13841 15.643722 108.50033 123.57906 136.51359
+#> fitted[3]      Barrow  94.35971  6.364805  82.62612  88.73920  94.14574
+#> fitted[4]      DeKalb  59.74315  1.575741  56.77068  58.33325  59.71677
+#> fitted[5]    Columbia  53.34581  3.207432  47.33439  50.61504  53.26339
+#> fitted[6]        Cobb  54.12259  1.495041  51.24262  52.86109  54.12912
+#>                 80%     97.5%
+#> fitted[1] 110.47617 122.04006
+#> fitted[2] 150.26114 169.29720
+#> fitted[3]  99.74677 107.51136
+#> fitted[4]  61.13469  62.84502
+#> fitted[5]  56.05293  59.83064
+#> fitted[6]  55.38574  57.00559
+

The mortality estimates are stored in the column named “mean”, and the limits of the 95% credible interval are found in the columns “2.5%” and “97.5%”. Here we create a map of estimates (with some help from sf package):

+
library(sf)
+
+# put estimates into bins for map colors
+x <- mortality_est$mean
+brks <- quantile(x, probs = c(0, 0.2, 0.4, 0.6, 0.8, 1)) 
+est_cut <- cut(x, breaks = brks, include.lowest = TRUE)
+  
+# assign colors to values
+rank <- as.numeric( est_cut )  
+pal_fun <- colorRampPalette( c("#5D74A5FF", "gray90", "#A8554EFF") )
+pal <- pal_fun( max(rank) )
+colors <-  pal[ rank ]
+
+# set plot margins
+og=par(mar=rep(1, 4))
+
+# get boundaries
+geom <- sf::st_geometry(georgia)
+
+# map  estimates
+plot(geom,
+    lwd = 0.2,
+    col = colors)
+
+# legend
+legend("right",
+     fill = pal,
+     title = 'Mortality per 10,000',
+     legend = levels(est_cut),
+     bty = 'n'
+)
+
+mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 3, font = 2)
+ + +
# reset margins
+par(og)
+

Using the credible intervals, we can complement our map with a point-interval plot:

+
# order counties by mortality rate
+index <- order(mortality_est$mean, decreasing = TRUE)
+dat <- mortality_est[index, ]
+
+# gather estimate with credible interval (95%)
+est <- dat$mean
+lwr <- dat$`2.5%`
+upr <- dat$`97.5%`
+y <- seq_along(county_name)
+x_lim <- c(min(lwr), max(upr)) |>
+      round()
+
+og=par(mar = c(3, 0, 0, 0))
+
+# points
+plot(est,
+     y,
+     pch = 5,
+     col = 'gray50',
+     bty = 'L',
+     axes = FALSE,
+     xlim = x_lim,
+     ylab = NA,
+     xlab = NA)
+
+# intervals
+segments(x0 = lwr, x1 = upr,
+         y0 = y, y1 = y,
+     col = colors[ index ])
+
+# x axis
+axis(1, at = seq(x_lim[1], x_lim[2], by = 20))
+mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 1, line = 2)
+ + +
par(og)
+

More details and demonstrations can be found in the package help pages and vignettes.

Citing geostan

If you use geostan in published work, please include a citation.

Donegan, Connor (2022) “geostan: An R package for Bayesian spatial analysis” The Journal of Open Source Software. 7, no. 79: 4716. https://doi.org/10.21105/joss.04716.

diff --git a/README.md b/README.md index 7227b471..1cddde97 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,9 @@ and in the package Features include: - - **Disease mapping and spatial regression** Statistical models for - data recorded across areal units like states, counties, or census - tracts. + - **Spatial regression and disease mapping** Statistical models for + data recorded across areal units (states, counties, or census + tracts) or networks, including spatial econometric models. - **Spatial analysis tools** Tools for visualizing and measuring spatial autocorrelation and map patterns, for exploratory analysis and model diagnostics. @@ -35,8 +35,8 @@ Features include: observations. - **The RStan ecosystem** Interfaces easily with many high-quality R packages for Bayesian modeling. - - **Custom spatial models** Tools for building custom spatial models - in Stan. + - **Custom spatial models** Tools for building custom spatial or + network models in Stan. For public health research, geostan complements the [surveil](https://connordonegan.github.io/surveil/) R package for the @@ -110,14 +110,21 @@ data(georgia) This has county population and mortality data by sex for ages 55-64, and for the period 2014-2018. As is common for public access data, some of -the observations missing because the CDC has censored them. +the observations missing because the CDC has censored them to protect +privacy. The `sp_diag` function provides visual summaries of spatial data, -including a histogram, Moran scatter plot, and map. Here is a visual -summary of crude female mortality rates (as deaths per 10,000): +including a histogram, Moran scatter plot, and map. The Moran scatter +plot displays the values against a summary of their neighboring values, +so that the slope of the regression line gives a measure of their degree +of autocorrelation. + +Here is a quick visual summary of crude female mortality rates (as +deaths per 10,000): ``` r -A <- shape2mat(georgia, style = "B") +# create adjacency matrix ("B" is for binary) +C <- shape2mat(georgia, style = "B") #> Contiguity condition: queen #> Number of neighbors per unit, summary: #> Min. 1st Qu. Median Mean 3rd Qu. Max. @@ -126,66 +133,64 @@ A <- shape2mat(georgia, style = "B") #> Spatial weights, summary: #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 1 1 1 1 1 1 + +# crude mortality rate per 10,000 at risk mortality_rate <- georgia$rate.female * 10e3 -sp_diag(mortality_rate, georgia, w = A) -#> 3 NA values found in x will be dropped from data x and matrix w -#> Warning: Removed 3 rows containing non-finite outside the scale -#> range (`stat_bin()`). + +# quick spatial diagnostics +sp_diag(mortality_rate, georgia, w = C, name = "Mortality") +#> 3 NA values found in x will be dropped from data x and from matrix w (nb: this disrupts row-standardization of w) +#> Warning: Removed 3 rows containing non-finite outside the scale range +#> (`stat_bin()`). ``` +Mortality rates and other health statistics for counties are, in many +cases, highly unstable estimates that cannot be relied upon for public +advisories or inference (due to small population sizes). Hence, we need +models to make inferences from small area data. + The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. These models are used for estimating disease risk in small areas like counties, and for analyzing covariation -of health outcomes with other area qualities. The R syntax for fitting +of health outcomes with other area variables. The R syntax for fitting the models is similar to using `lm` or `glm`. We provide the population at risk (the denominator for mortality rates) as an offset term, using -the log-transform. In this case, three of the observations are missing +the log-transform. + +In our Georgia mortality data, three of the observations are missing because they have been censored; per CDC criteria, this means that there were 9 or fewer deaths in those counties. By using the `censor_point` -argument and setting it to `censor_point = 9`, the model will account -for the censoring process when providing estimates of the mortality -rates: +argument and setting it to `censor_point = 9`, we can easily obtain +estimates for the censored counties (along with all the others) using +models account for the censoring process: ``` r -cars <- prep_car_data(A) +# prepare a list of data for CAR models in Stan +cars <- prep_car_data(C) #> Range of permissible rho values: -1.661, 1 + +# fit the model to female mortality rates fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)), censor_point = 9, data = georgia, car_parts = cars, family = poisson(), - cores = 4, # for multi-core processing - refresh = 0) # to silence some printing + iter = 1e3, # no. MCMC samples + quiet = TRUE) # to silence printing #> 3 NA values identified in the outcome variable #> Found in rows: 55, 126, 157 -#> -#> *Setting prior parameters for intercept -#> Distribution: normal -#> location scale -#> 1 -4.7 5 -#> -#> *Setting prior for CAR scale parameter (car_scale) -#> Distribution: student_t -#> df location scale -#> 1 10 0 3 -#> -#> *Setting prior for CAR spatial autocorrelation parameter (car_rho) -#> Distribution: uniform -#> lower upper -#> 1 -1.7 1 ``` Passing a fitted model to the `sp_diag` function will return a set of diagnostics for spatial models: ``` r -sp_diag(fit, georgia, w = A) -#> Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE. -#> 3 NA values found in x will be dropped from data x and matrix w -#> Warning: Removed 3 rows containing missing values or values -#> outside the scale range (`geom_pointrange()`). +sp_diag(fit, georgia) +#> 3 NA values found in x will be dropped from data x and from matrix w (nb: this disrupts row-standardization of w) +#> Warning: Removed 3 rows containing missing values or values outside the +#> scale range (`geom_pointrange()`). ``` @@ -200,58 +205,149 @@ diagnostics from Stan (Monte Carlo standard errors of the mean print(fit) #> Spatial Model Results #> Formula: deaths.female ~ offset(log(pop.at.risk.female)) -#> Spatial method (outcome): CAR -#> Likelihood function: poisson -#> Link function: log -#> Residual Moran Coefficient: 0.0011525 -#> WAIC: 1227.47 +#> Likelihood: poisson +#> Link: log +#> Spatial method: CAR +#> Residual Moran Coefficient: -0.0031375 #> Observations: 156 -#> Data models (ME): none +#> #> Inference for Stan model: foundation. -#> 4 chains, each with iter=2000; warmup=1000; thin=1; -#> post-warmup draws per chain=1000, total post-warmup draws=4000. +#> 4 chains, each with iter=1000; warmup=500; thin=1; +#> post-warmup draws per chain=500, total post-warmup draws=2000. #> #> mean se_mean sd 2.5% 20% 50% 80% 97.5% n_eff Rhat -#> intercept -4.674 0.002 0.089 -4.849 -4.730 -4.674 -4.621 -4.505 2362 1.000 -#> car_rho 0.923 0.001 0.058 0.778 0.879 0.937 0.973 0.995 3319 1.000 -#> car_scale 0.458 0.001 0.036 0.395 0.428 0.456 0.488 0.534 3618 0.999 +#> intercept -4.677 0.003 0.092 -4.871 -4.734 -4.677 -4.619 -4.513 861 1.004 +#> car_rho 0.924 0.001 0.058 0.784 0.883 0.936 0.973 0.996 1606 0.999 +#> car_scale 0.456 0.001 0.036 0.391 0.424 0.454 0.486 0.532 1899 1.002 #> -#> Samples were drawn using NUTS(diag_e) at Tue Sep 17 16:44:56 2024. +#> Samples were drawn using NUTS(diag_e) at Wed Nov 13 16:09:50 2024. #> For each parameter, n_eff is a crude measure of effective sample size, #> and Rhat is the potential scale reduction factor on split chains (at #> convergence, Rhat=1). ``` -Applying the `fitted` method to the fitted model will return the fitted -values from the model - in this case, the fitted values are the -estimates of the county mortality rates. Multiplying them by 10,000 +To extract estimates of the county mortality rates from this, we apply +the `fitted` method - in this case, the fitted values from the model are +the estimates of the county mortality rates. Multiplying them by 10,000 gives mortality rate per 10,000 at risk: ``` r +# mortality rates per 10,000 at risk mortality_est <- fitted(fit) * 10e3 + +# display rates with county names county_name <- georgia$NAME head( cbind(county_name, mortality_est) ) #> county_name mean sd 2.5% 20% 50% -#> fitted[1] Crisp 101.48785 9.604829 83.99009 93.31163 101.17610 -#> fitted[2] Candler 136.99885 15.905146 109.27395 123.11823 136.31355 -#> fitted[3] Barrow 94.25470 6.071597 82.80270 89.20105 94.16678 -#> fitted[4] DeKalb 59.76214 1.579194 56.72962 58.44624 59.75766 -#> fitted[5] Columbia 53.33958 3.257549 47.19615 50.56654 53.28387 -#> fitted[6] Cobb 54.12983 1.498260 51.24933 52.85101 54.10133 +#> fitted[1] Crisp 101.89147 9.784748 83.87184 93.37621 101.37227 +#> fitted[2] Candler 137.13841 15.643722 108.50033 123.57906 136.51359 +#> fitted[3] Barrow 94.35971 6.364805 82.62612 88.73920 94.14574 +#> fitted[4] DeKalb 59.74315 1.575741 56.77068 58.33325 59.71677 +#> fitted[5] Columbia 53.34581 3.207432 47.33439 50.61504 53.26339 +#> fitted[6] Cobb 54.12259 1.495041 51.24262 52.86109 54.12912 #> 80% 97.5% -#> fitted[1] 109.30723 121.16598 -#> fitted[2] 150.17348 169.77611 -#> fitted[3] 99.19399 106.44508 -#> fitted[4] 61.07091 62.86805 -#> fitted[5] 56.08790 59.78086 -#> fitted[6] 55.42278 57.02966 +#> fitted[1] 110.47617 122.04006 +#> fitted[2] 150.26114 169.29720 +#> fitted[3] 99.74677 107.51136 +#> fitted[4] 61.13469 62.84502 +#> fitted[5] 56.05293 59.83064 +#> fitted[6] 55.38574 57.00559 ``` The mortality estimates are stored in the column named “mean”, and the limits of the 95% credible interval are found in the columns “2.5%” and -“97.5%”. +“97.5%”. Here we create a map of estimates (with some help from `sf` +package): + +``` r +library(sf) + +# put estimates into bins for map colors +x <- mortality_est$mean +brks <- quantile(x, probs = c(0, 0.2, 0.4, 0.6, 0.8, 1)) +est_cut <- cut(x, breaks = brks, include.lowest = TRUE) + +# assign colors to values +rank <- as.numeric( est_cut ) +pal_fun <- colorRampPalette( c("#5D74A5FF", "gray90", "#A8554EFF") ) +pal <- pal_fun( max(rank) ) +colors <- pal[ rank ] + +# set plot margins +og=par(mar=rep(1, 4)) + +# get boundaries +geom <- sf::st_geometry(georgia) + +# map estimates +plot(geom, + lwd = 0.2, + col = colors) + +# legend +legend("right", + fill = pal, + title = 'Mortality per 10,000', + legend = levels(est_cut), + bty = 'n' +) + +mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 3, font = 2) +``` + + + +``` r +# reset margins +par(og) +``` + +Using the credible intervals, we can complement our map with a +point-interval plot: + +``` r +# order counties by mortality rate +index <- order(mortality_est$mean, decreasing = TRUE) +dat <- mortality_est[index, ] + +# gather estimate with credible interval (95%) +est <- dat$mean +lwr <- dat$`2.5%` +upr <- dat$`97.5%` +y <- seq_along(county_name) +x_lim <- c(min(lwr), max(upr)) |> + round() + +og=par(mar = c(3, 0, 0, 0)) + +# points +plot(est, + y, + pch = 5, + col = 'gray50', + bty = 'L', + axes = FALSE, + xlim = x_lim, + ylab = NA, + xlab = NA) + +# intervals +segments(x0 = lwr, x1 = upr, + y0 = y, y1 = y, + col = colors[ index ]) + +# x axis +axis(1, at = seq(x_lim[1], x_lim[2], by = 20)) +mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 1, line = 2) +``` + + + +``` r +par(og) +``` -Details and demonstrations can be found in the package [help +More details and demonstrations can be found in the package [help pages](https://connordonegan.github.io/geostan/reference/index.html) and [vignettes](https://connordonegan.github.io/geostan/articles/index.html). diff --git a/docs/index.html b/docs/index.html index 3592e456..34cb49ad 100644 --- a/docs/index.html +++ b/docs/index.html @@ -85,7 +85,7 @@

geostan: Bayesian spatial analysisFeatures include:

For public health research, geostan complements the surveil R package for the study of time trends in disease incidence or mortality data.

@@ -142,10 +142,12 @@

Usage
 library(geostan)
 data(georgia)
-

This has county population and mortality data by sex for ages 55-64, and for the period 2014-2018. As is common for public access data, some of the observations missing because the CDC has censored them.

-

The sp_diag function provides visual summaries of spatial data, including a histogram, Moran scatter plot, and map. Here is a visual summary of crude female mortality rates (as deaths per 10,000):

+

This has county population and mortality data by sex for ages 55-64, and for the period 2014-2018. As is common for public access data, some of the observations missing because the CDC has censored them to protect privacy.

+

The sp_diag function provides visual summaries of spatial data, including a histogram, Moran scatter plot, and map. The Moran scatter plot displays the values against a summary of their neighboring values, so that the slope of the regression line gives a measure of their degree of autocorrelation.

+

Here is a quick visual summary of crude female mortality rates (as deaths per 10,000):

-A <- shape2mat(georgia, style = "B")
+# create adjacency matrix ("B" is for binary)
+C <- shape2mat(georgia, style = "B")
 #> Contiguity condition: queen
 #> Number of neighbors per unit, summary:
 #>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
@@ -154,94 +156,165 @@ 

Usage #> Spatial weights, summary: #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 1 1 1 1 1 1 + +# crude mortality rate per 10,000 at risk mortality_rate <- georgia$rate.female * 10e3 -sp_diag(mortality_rate, georgia, w = A) -#> 3 NA values found in x will be dropped from data x and matrix w -#> Warning: Removed 3 rows containing non-finite outside the scale -#> range (`stat_bin()`).

+ +# quick spatial diagnostics +sp_diag(mortality_rate, georgia, w = C, name = "Mortality") +#> 3 NA values found in x will be dropped from data x and from matrix w (nb: this disrupts row-standardization of w) +#> Warning: Removed 3 rows containing non-finite outside the scale range +#> (`stat_bin()`).

-

The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. These models are used for estimating disease risk in small areas like counties, and for analyzing covariation of health outcomes with other area qualities. The R syntax for fitting the models is similar to using lm or glm. We provide the population at risk (the denominator for mortality rates) as an offset term, using the log-transform. In this case, three of the observations are missing because they have been censored; per CDC criteria, this means that there were 9 or fewer deaths in those counties. By using the censor_point argument and setting it to censor_point = 9, the model will account for the censoring process when providing estimates of the mortality rates:

+

Mortality rates and other health statistics for counties are, in many cases, highly unstable estimates that cannot be relied upon for public advisories or inference (due to small population sizes). Hence, we need models to make inferences from small area data.

+

The following code fits a spatial conditional autoregressive (CAR) model to female county mortality data. These models are used for estimating disease risk in small areas like counties, and for analyzing covariation of health outcomes with other area variables. The R syntax for fitting the models is similar to using lm or glm. We provide the population at risk (the denominator for mortality rates) as an offset term, using the log-transform.

+

In our Georgia mortality data, three of the observations are missing because they have been censored; per CDC criteria, this means that there were 9 or fewer deaths in those counties. By using the censor_point argument and setting it to censor_point = 9, we can easily obtain estimates for the censored counties (along with all the others) using models account for the censoring process:

-cars <- prep_car_data(A)
+# prepare a list of data for CAR models in Stan
+cars <- prep_car_data(C)
 #> Range of permissible rho values: -1.661, 1
+
+# fit the model to female mortality rates
 fit <- stan_car(deaths.female ~ offset(log(pop.at.risk.female)),
                 censor_point = 9,
         data = georgia,
         car_parts = cars,
         family = poisson(),
-        cores = 4, # for multi-core processing
-        refresh = 0) # to silence some printing
+        iter = 1e3, # no. MCMC samples
+        quiet = TRUE) # to silence printing
 #> 3 NA values identified in the outcome variable
-#> Found in rows: 55, 126, 157
-#> 
-#> *Setting prior parameters for intercept
-#> Distribution: normal
-#>   location scale
-#> 1     -4.7     5
-#> 
-#> *Setting prior for CAR scale parameter (car_scale)
-#> Distribution: student_t
-#>   df location scale
-#> 1 10        0     3
-#> 
-#> *Setting prior for CAR spatial autocorrelation parameter (car_rho)
-#> Distribution: uniform
-#>   lower upper
-#> 1  -1.7     1
+#> Found in rows: 55, 126, 157

Passing a fitted model to the sp_diag function will return a set of diagnostics for spatial models:

-sp_diag(fit, georgia, w = A)
-#> Using sp_diag(y, shape, rates = TRUE, ...). To examine data as (unstandardized) counts, use rates = FALSE.
-#> 3 NA values found in x will be dropped from data x and matrix w
-#> Warning: Removed 3 rows containing missing values or values
-#> outside the scale range (`geom_pointrange()`).
+sp_diag(fit, georgia) +#> 3 NA values found in x will be dropped from data x and from matrix w (nb: this disrupts row-standardization of w) +#> Warning: Removed 3 rows containing missing values or values outside the +#> scale range (`geom_pointrange()`).

The print method returns a summary of the probability distributions for model parameters, as well as Markov chain Monte Carlo (MCMC) diagnostics from Stan (Monte Carlo standard errors of the mean se_mean, effective sample size n_eff, and the R-hat statistic Rhat):

 print(fit)
 #> Spatial Model Results 
 #> Formula: deaths.female ~ offset(log(pop.at.risk.female))
-#> Spatial method (outcome):  CAR 
-#> Likelihood function:  poisson 
-#> Link function:  log 
-#> Residual Moran Coefficient:  0.0011525 
-#> WAIC:  1227.47 
+#> Likelihood:  poisson 
+#> Link:  log 
+#> Spatial method:  CAR 
+#> Residual Moran Coefficient:  -0.0031375 
 #> Observations:  156 
-#> Data models (ME): none
+#> 
 #> Inference for Stan model: foundation.
-#> 4 chains, each with iter=2000; warmup=1000; thin=1; 
-#> post-warmup draws per chain=1000, total post-warmup draws=4000.
+#> 4 chains, each with iter=1000; warmup=500; thin=1; 
+#> post-warmup draws per chain=500, total post-warmup draws=2000.
 #> 
 #>             mean se_mean    sd   2.5%    20%    50%    80%  97.5% n_eff  Rhat
-#> intercept -4.674   0.002 0.089 -4.849 -4.730 -4.674 -4.621 -4.505  2362 1.000
-#> car_rho    0.923   0.001 0.058  0.778  0.879  0.937  0.973  0.995  3319 1.000
-#> car_scale  0.458   0.001 0.036  0.395  0.428  0.456  0.488  0.534  3618 0.999
+#> intercept -4.677   0.003 0.092 -4.871 -4.734 -4.677 -4.619 -4.513   861 1.004
+#> car_rho    0.924   0.001 0.058  0.784  0.883  0.936  0.973  0.996  1606 0.999
+#> car_scale  0.456   0.001 0.036  0.391  0.424  0.454  0.486  0.532  1899 1.002
 #> 
-#> Samples were drawn using NUTS(diag_e) at Tue Sep 17 16:44:56 2024.
+#> Samples were drawn using NUTS(diag_e) at Wed Nov 13 16:09:50 2024.
 #> For each parameter, n_eff is a crude measure of effective sample size,
 #> and Rhat is the potential scale reduction factor on split chains (at 
 #> convergence, Rhat=1).
-

Applying the fitted method to the fitted model will return the fitted values from the model - in this case, the fitted values are the estimates of the county mortality rates. Multiplying them by 10,000 gives mortality rate per 10,000 at risk:

+

To extract estimates of the county mortality rates from this, we apply the fitted method - in this case, the fitted values from the model are the estimates of the county mortality rates. Multiplying them by 10,000 gives mortality rate per 10,000 at risk:

-mortality_est <- fitted(fit) * 10e3
+# mortality rates per 10,000 at risk
+mortality_est <- fitted(fit) * 10e3
+
+# display rates with county names
 county_name <- georgia$NAME
 head( cbind(county_name, mortality_est) )
 #>           county_name      mean        sd      2.5%       20%       50%
-#> fitted[1]       Crisp 101.48785  9.604829  83.99009  93.31163 101.17610
-#> fitted[2]     Candler 136.99885 15.905146 109.27395 123.11823 136.31355
-#> fitted[3]      Barrow  94.25470  6.071597  82.80270  89.20105  94.16678
-#> fitted[4]      DeKalb  59.76214  1.579194  56.72962  58.44624  59.75766
-#> fitted[5]    Columbia  53.33958  3.257549  47.19615  50.56654  53.28387
-#> fitted[6]        Cobb  54.12983  1.498260  51.24933  52.85101  54.10133
+#> fitted[1]       Crisp 101.89147  9.784748  83.87184  93.37621 101.37227
+#> fitted[2]     Candler 137.13841 15.643722 108.50033 123.57906 136.51359
+#> fitted[3]      Barrow  94.35971  6.364805  82.62612  88.73920  94.14574
+#> fitted[4]      DeKalb  59.74315  1.575741  56.77068  58.33325  59.71677
+#> fitted[5]    Columbia  53.34581  3.207432  47.33439  50.61504  53.26339
+#> fitted[6]        Cobb  54.12259  1.495041  51.24262  52.86109  54.12912
 #>                 80%     97.5%
-#> fitted[1] 109.30723 121.16598
-#> fitted[2] 150.17348 169.77611
-#> fitted[3]  99.19399 106.44508
-#> fitted[4]  61.07091  62.86805
-#> fitted[5]  56.08790  59.78086
-#> fitted[6]  55.42278  57.02966
-

The mortality estimates are stored in the column named “mean”, and the limits of the 95% credible interval are found in the columns “2.5%” and “97.5%”.

-

Details and demonstrations can be found in the package help pages and vignettes.

+#> fitted[1] 110.47617 122.04006 +#> fitted[2] 150.26114 169.29720 +#> fitted[3] 99.74677 107.51136 +#> fitted[4] 61.13469 62.84502 +#> fitted[5] 56.05293 59.83064 +#> fitted[6] 55.38574 57.00559
+

The mortality estimates are stored in the column named “mean”, and the limits of the 95% credible interval are found in the columns “2.5%” and “97.5%”. Here we create a map of estimates (with some help from sf package):

+
+library(sf)
+
+# put estimates into bins for map colors
+x <- mortality_est$mean
+brks <- quantile(x, probs = c(0, 0.2, 0.4, 0.6, 0.8, 1)) 
+est_cut <- cut(x, breaks = brks, include.lowest = TRUE)
+  
+# assign colors to values
+rank <- as.numeric( est_cut )  
+pal_fun <- colorRampPalette( c("#5D74A5FF", "gray90", "#A8554EFF") )
+pal <- pal_fun( max(rank) )
+colors <-  pal[ rank ]
+
+# set plot margins
+og=par(mar=rep(1, 4))
+
+# get boundaries
+geom <- sf::st_geometry(georgia)
+
+# map  estimates
+plot(geom,
+    lwd = 0.2,
+    col = colors)
+
+# legend
+legend("right",
+     fill = pal,
+     title = 'Mortality per 10,000',
+     legend = levels(est_cut),
+     bty = 'n'
+)
+
+mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 3, font = 2)
+

+
+# reset margins
+par(og)
+

Using the credible intervals, we can complement our map with a point-interval plot:

+
+# order counties by mortality rate
+index <- order(mortality_est$mean, decreasing = TRUE)
+dat <- mortality_est[index, ]
+
+# gather estimate with credible interval (95%)
+est <- dat$mean
+lwr <- dat$`2.5%`
+upr <- dat$`97.5%`
+y <- seq_along(county_name)
+x_lim <- c(min(lwr), max(upr)) |>
+      round()
+
+og=par(mar = c(3, 0, 0, 0))
+
+# points
+plot(est,
+     y,
+     pch = 5,
+     col = 'gray50',
+     bty = 'L',
+     axes = FALSE,
+     xlim = x_lim,
+     ylab = NA,
+     xlab = NA)
+
+# intervals
+segments(x0 = lwr, x1 = upr,
+         y0 = y, y1 = y,
+     col = colors[ index ])
+
+# x axis
+axis(1, at = seq(x_lim[1], x_lim[2], by = 20))
+mtext('County mortality rates per 10,000, Georgia women ages 55-64', side = 1, line = 2)
+

+
+par(og)
+

More details and demonstrations can be found in the package help pages and vignettes.

Citing geostan @@ -249,16 +322,16 @@

Citing geostanIf you use geostan in published work, please include a citation.

Donegan, Connor (2022) “geostan: An R package for Bayesian spatial analysis” The Journal of Open Source Software. 7, no. 79: 4716. https://doi.org/10.21105/joss.04716.

DOI

-
@Article{,
-  title = {{geostan}: An {R} package for {B}ayesian spatial analysis},
-  author = {Connor Donegan},
-  journal = {The Journal of Open Source Software},
-  year = {2022},
-  volume = {7},
-  number = {79},
-  pages = {4716},
-  doi = {10.21105/joss.04716},
-}
+
@Article{,
+  title = {{geostan}: An {R} package for {B}ayesian spatial analysis},
+  author = {Connor Donegan},
+  journal = {The Journal of Open Source Software},
+  year = {2022},
+  volume = {7},
+  number = {79},
+  pages = {4716},
+  doi = {10.21105/joss.04716},
+}

diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 824e8eab..9ee08f28 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -8,7 +8,7 @@ articles: spatial-analysis: spatial-analysis.html spatial-me-models: spatial-me-models.html spatial-weights-matrix: spatial-weights-matrix.html -last_built: 2024-11-07T19:06Z +last_built: 2024-11-13T22:58Z urls: reference: https://connordonegan.github.io/geostan/reference article: https://connordonegan.github.io/geostan/articles diff --git a/docs/reference/figures/README-unnamed-chunk-3-1.png b/docs/reference/figures/README-unnamed-chunk-3-1.png index 927b4069..0c8b7c51 100644 Binary files a/docs/reference/figures/README-unnamed-chunk-3-1.png and b/docs/reference/figures/README-unnamed-chunk-3-1.png differ diff --git a/docs/reference/figures/README-unnamed-chunk-5-1.png b/docs/reference/figures/README-unnamed-chunk-5-1.png index f3570fa8..85bcdffa 100644 Binary files a/docs/reference/figures/README-unnamed-chunk-5-1.png and b/docs/reference/figures/README-unnamed-chunk-5-1.png differ diff --git a/docs/reference/figures/README-unnamed-chunk-8-1.png b/docs/reference/figures/README-unnamed-chunk-8-1.png new file mode 100644 index 00000000..665a79c7 Binary files /dev/null and b/docs/reference/figures/README-unnamed-chunk-8-1.png differ diff --git a/docs/reference/figures/README-unnamed-chunk-9-1.png b/docs/reference/figures/README-unnamed-chunk-9-1.png new file mode 100644 index 00000000..3ac77a84 Binary files /dev/null and b/docs/reference/figures/README-unnamed-chunk-9-1.png differ diff --git a/docs/reference/impacts.html b/docs/reference/impacts.html index 305cbbb4..b2643b19 100644 --- a/docs/reference/impacts.html +++ b/docs/reference/impacts.html @@ -58,7 +58,10 @@

Spillover/diffusion effects for spatial lag models

spill(beta, gamma = 0, rho, W, method = c("quick", "proper"), K = 20)
 
-impacts(object, method = c("quick", "proper"), K = 20)
+impacts(object, method = c("quick", "proper"), K = 20) + +# S3 method for impacts_slm +print(x, digits = 2, ...)
@@ -95,6 +98,18 @@

Arguments

object

A fitted spatial lag model (from stan_sar)

+ +
x
+

An object of class 'impacts_slm', as returned by geostan::impacts

+ + +
digits
+

Round results to this many digits

+ + +
...
+

Additional arguments will be passed to base::print

+

Details

diff --git a/docs/reference/index.html b/docs/reference/index.html index b0187c1e..71b8cb24 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -210,7 +210,7 @@

Methods and diagnostics spill() impacts()

+

spill() impacts() print(<impacts_slm>)

Spillover/diffusion effects for spatial lag models

diff --git a/docs/reference/predict.geostan_fit.html b/docs/reference/predict.geostan_fit.html index 9218d80f..904f4ac8 100644 --- a/docs/reference/predict.geostan_fit.html +++ b/docs/reference/predict.geostan_fit.html @@ -65,6 +65,8 @@

Predict method for geostan_fit models

summary = TRUE, type = c("link", "response"), add_slx = FALSE, + approx = FALSE, + K = 15, ... )
@@ -105,6 +107,14 @@

Arguments

Logical. If add_slx = TRUE, any spatially-lagged covariates that were specified through the 'slx' argument (of the model fitting function, e.g., stan_glm) will be added to the linear predictor. The spatial lag terms will be calculated internally using object$C, the spatial weights matrix used to fit the model. Hence, newdata must have N = object$N rows. Predictions from spatial lag models (SAR models of type 'SLM' and 'SDLM') always include the SLX terms (i.e., any value passed to add_slx will be overwritten with TRUE).

+
approx
+

For SAR models of type 'SLM' or 'SDLM' only; use an approximation for matrix inversion? See details below.

+ + +
K
+

Number of matrix powers to use with approx.

+ +
...

Not used

@@ -118,18 +128,37 @@

Value

Details

The primary purpose of the predict method is to explore marginal effects of covariates.

-

The model formula will be taken from object$formula, and then a model matrix will be created by passing newdata to the model.frame function (as in: model.frame(newdata, object$formula). Parameters are taken from as.matrix(object, pars = c("intercept", "beta")). If add_slx = TRUE, SLX coefficients will be taken from as.matrix(object, pars = "gamma").

-

Spatially-lagged covariates added via the slx argument ('spillover effects') will be included if add_slx = TRUE or if a spatial lag model is provided (a SAR model of type 'SLM' or 'SDLM'). In either of those cases, newdata must have the same number of rows as were used to fit the original data. For details on these 'spillover effects', see LeSage and Pace (2009) and LeSage (2014).

+

The model formula will be taken from object$formula, and then a model matrix will be created by passing newdata to the model.frame function (as in: model.frame(object$formula, newdata). Parameters are taken from as.matrix(object, pars = c("intercept", "beta")).

+

Spatial lag of X

+ + +

Spatially-lagged covariates which were included via the slx argument will, by default, not be included. They will be be included in predictions if add_slx = TRUE or if the fitted model is a SAR model of type 'SLM' or 'SDLM'. In either of those cases, newdata must have the same number of rows as were used to fit the original data.

+
+ +
+

Spatial lag of Y

+ + +

The typical 'marginal effect' interpretation of the regression coefficients does not hold for the SAR models of type 'SLM' or 'SDLM'. For details on these 'spillover effects', see LeSage and Pace (2009), LeSage (2014), and geostan::impacts.

Predictions for the spatial lag model (SAR models of type 'SLM') are equal to: $$ (I - \rho W)^{-1} X \beta $$ -where \(X \beta\) contains the intercept and covariates. (For intercept-only models, the above term is equal to the constant intercept.) Predictions for the spatial Durbin lag model (SAR models of type 'SDLM') are equal to: +where \(X \beta\) contains the intercept and covariates. Predictions for the spatial Durbin lag model (SAR models of type 'SDLM') are equal to: $$ (I - \rho W)^{-1} (X \beta + WX \gamma) $$ -where \(WX \gamma\) are spatially lagged covariates multiplied by their coefficients. The SLM and SDLM differ from all other model types in that the spatial component of the model cannot be separated from the linear predictor and is, therefore, automatically incorporated into the predictions.

-

In generalized linear models (such as Poisson and Binomial models) marginal effects plots on the response scale may be sensitive to the level of other covariates in the model and to location. If the model includes a spatial autocorrelation component (for example, you used a spatial CAR, SAR (error model), or ESF model), by default these terms will be fixed at zero for the purposes of calculating marginal effects. If you want to change this, you can introduce spatial trend values by specifying a varying intercept using the alpha argument.

+where \(WX \gamma\) are spatially lagged covariates multiplied by their coefficients.

+

The inverse of the matrix \((I - \rho W)\) can be time consuming to compute (especially when iterating over MCMC samples). You can use approx = TRUE to approximate the inverse using a series of matrix powers. The argument \(K\) controls how many powers to use for the approximation. As a rule, higher values of \(\rho\) require larger \(K\). Notice that \(\rho^K\) should be close to zero for the approximation to hold. For example, for \(\rho = .5\) a value of \(K=8\) may suffice (eqn0.5^8 = 0.004), but larger values of \(\rho\) require higher values of \(K\).

+
+ +
+

Generalized linear models

+ + +

In generalized linear models (such as Poisson and Binomial models) marginal effects plots on the response scale may be sensitive to the level of other covariates in the model and to geographic location. If the model includes a spatial autocorrelation component (for example, you used a spatial CAR, SAR, or ESF model), by default these terms will be fixed at zero for the purposes of calculating marginal effects. If you want to change this, you can introduce spatial trend values by specifying a varying intercept using the alpha argument.

+
+
@@ -139,16 +168,17 @@

Examples

fit <- stan_glm(deaths.male ~ offset(log(pop.at.risk.male)) + log(income), data = georgia, + re = ~ GEOID, centerx = TRUE, family = poisson(), chains = 2, iter = 600) # for speed only -# note: pop.at.risk.male=1 leads to log(pop.at.risk.male)=0 +# note: pop.at.risk.male=1 leads to offset of log(pop.at.risk.male)=0 # so that the predicted values are rates newdata <- data.frame( income = seq(min(georgia$income), max(georgia$income), - length.out = 100), + length.out = 200), pop.at.risk.male = 1) preds <- predict(fit, newdata, type = "response") @@ -163,12 +193,35 @@

Examples

newdata$pop.at.risk.male <- 10e3 preds <- predict(fit, newdata, type = "response") head(preds) + +# plot range +y_lim <- c(min(preds$`2.5%`), max(preds$`97.5%`)) + +# plot line plot(preds$income, preds$mean, - type = "l", + type = "l", ylab = "Deaths per 10,000", - xlab = "Income ($1,000s)") -
+ xlab = "Income ($1,000s)", + ylim = y_lim, + axes = FALSE) + +# add shaded cred. interval +x <- c(preds$income, rev(preds$income)) +y <- c(preds$`2.5%`, rev(preds$`97.5%`)) +polygon(x = x, y = y, + col = rgb(0.1, 0.2, 0.3, 0.3), + border = NA) + +# add axes +yat = seq(0, 300, by = 25) +axis(2, at = yat) + +xat = seq(0, 200, by = 20) +axis(1, at = xat) + +# show county incomes +rug(georgia$income, ticksize = 0.015)